add json and xml scraper

VinciGit00 · VinciGit00 · commit da2c82a2a2d8 · 2024-04-30T10:52:22.000+02:00
diff --git a/examples/gemini/json_scraper_gemini.py b/examples/gemini/json_scraper_gemini.py
@@ -0,0 +1,57 @@
+"""
+Basic example of scraping pipeline using SmartScraper from JSON documents
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import JsonScraperGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+load_dotenv()
+
+# ************************************************
+# Read the JSON file
+# ************************************************
+
+FILE_NAME = "inputs/example.json"
+curr_dir = os.path.dirname(os.path.realpath(__file__))
+file_path = os.path.join(curr_dir, FILE_NAME)
+
+with open(file_path, 'r', encoding="utf-8") as file:
+    text = file.read()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+gemini_key = os.getenv("GOOGLE_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": gemini_key,
+        "model": "gemini-pro",
+    },
+}
+
+# ************************************************
+# Create the JsonScraperGraph instance and run it
+# ************************************************
+
+smart_scraper_graph = JsonScraperGraph(
+    prompt="List me all the authors, title and genres of the books",
+    source=text,  # Pass the content of the file, not the file object
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json or csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
diff --git a/examples/gemini/script_generator_gemini.py b/examples/gemini/script_generator_gemini.py
@@ -19,7 +19,7 @@
 graph_config = {
     "llm": {
         "api_key": gemini_key,
-        "model": "gpt-3.5-turbo",
+        "model": "gemini-pro",
     },
     "library": "beautifoulsoup"
 }
diff --git a/examples/gemini/xml_scraper_openai.py b/examples/gemini/xml_scraper_openai.py
@@ -1,5 +1,5 @@
 """
-Basic example of scraping pipeline using SmartScraper from XML documents
+Basic example of scraping pipeline using XmlScraperGraph from XML documents
 """
 
 import os
@@ -28,7 +28,7 @@
 graph_config = {
     "llm": {
         "api_key": openai_key,
-        "model": "gpt-3.5-turbo",
+        "model": "gemini-pro",
     },
 }
 
diff --git a/examples/local_models/Docker/json_scraper_docker.py b/examples/local_models/Docker/json_scraper_docker.py
@@ -0,0 +1,61 @@
+"""
+Basic example of scraping pipeline using JsonScraperGraph from JSON documents
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import JsonScraperGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+load_dotenv()
+
+# ************************************************
+# Read the JSON file
+# ************************************************
+
+FILE_NAME = "inputs/example.json"
+curr_dir = os.path.dirname(os.path.realpath(__file__))
+file_path = os.path.join(curr_dir, FILE_NAME)
+
+with open(file_path, 'r', encoding="utf-8") as file:
+    text = file.read()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "model": "ollama/mistral",
+        "temperature": 0,
+        "format": "json",  # Ollama needs the format to be specified explicitly
+        # "model_tokens": 2000, # set context length arbitrarily
+    },
+    "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        "temperature": 0,
+    }
+}
+
+# ************************************************
+# Create the JsonScraperGraph instance and run it
+# ************************************************
+
+smart_scraper_graph = JsonScraperGraph(
+    prompt="List me all the authors, title and genres of the books",
+    source=text,  # Pass the content of the file, not the file object
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json or csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
diff --git a/examples/local_models/Docker/xml_scraper_docker.py b/examples/local_models/Docker/xml_scraper_docker.py
@@ -0,0 +1,61 @@
+"""
+Basic example of scraping pipeline using XmlScraperGraph from XML documents
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import XmlScraperGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+load_dotenv()
+
+# ************************************************
+# Read the XML file
+# ************************************************
+
+FILE_NAME = "inputs/books.xml"
+curr_dir = os.path.dirname(os.path.realpath(__file__))
+file_path = os.path.join(curr_dir, FILE_NAME)
+
+with open(file_path, 'r', encoding="utf-8") as file:
+    text = file.read()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "model": "ollama/mistral",
+        "temperature": 0,
+        "format": "json",  # Ollama needs the format to be specified explicitly
+        # "model_tokens": 2000, # set context length arbitrarily
+    },
+    "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        "temperature": 0,
+    }
+}
+
+# ************************************************
+# Create the XmlScraperGraph instance and run it
+# ************************************************
+
+smart_scraper_graph = XmlScraperGraph(
+    prompt="List me all the authors, title and genres of the books",
+    source=text,  # Pass the content of the file, not the file object
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json or csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
diff --git a/examples/local_models/Ollama/json_scraper_ollama.py b/examples/local_models/Ollama/json_scraper_ollama.py
@@ -0,0 +1,63 @@
+"""
+Basic example of scraping pipeline using JsonScraperGraph from JSON documents
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import JsonScraperGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+load_dotenv()
+
+# ************************************************
+# Read the JSON file
+# ************************************************
+
+FILE_NAME = "inputs/example.json"
+curr_dir = os.path.dirname(os.path.realpath(__file__))
+file_path = os.path.join(curr_dir, FILE_NAME)
+
+with open(file_path, 'r', encoding="utf-8") as file:
+    text = file.read()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "model": "ollama/mistral",
+        "temperature": 0,
+        "format": "json",  # Ollama needs the format to be specified explicitly
+        # "model_tokens": 2000, # set context length arbitrarily
+        "base_url": "http://localhost:11434",
+    },
+    "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        "temperature": 0,
+        "base_url": "http://localhost:11434",
+    }
+}
+
+# ************************************************
+# Create the XmlScraperGraph instance and run it
+# ************************************************
+
+smart_scraper_graph = JsonScraperGraph(
+    prompt="List me all the authors, title and genres of the books",
+    source=text,  # Pass the content of the file, not the file object
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json or csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
diff --git a/examples/local_models/Ollama/xml_scraper_ollama.py b/examples/local_models/Ollama/xml_scraper_ollama.py
@@ -0,0 +1,63 @@
+"""
+Basic example of scraping pipeline using XmlScraperGraph from XML documents
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import XmlScraperGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+load_dotenv()
+
+# ************************************************
+# Read the XML file
+# ************************************************
+
+FILE_NAME = "inputs/books.xml"
+curr_dir = os.path.dirname(os.path.realpath(__file__))
+file_path = os.path.join(curr_dir, FILE_NAME)
+
+with open(file_path, 'r', encoding="utf-8") as file:
+    text = file.read()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "model": "ollama/mistral",
+        "temperature": 0,
+        "format": "json",  # Ollama needs the format to be specified explicitly
+        # "model_tokens": 2000, # set context length arbitrarily
+        "base_url": "http://localhost:11434",
+    },
+    "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        "temperature": 0,
+        "base_url": "http://localhost:11434",
+    }
+}
+
+# ************************************************
+# Create the XmlScraperGraph instance and run it
+# ************************************************
+
+smart_scraper_graph = XmlScraperGraph(
+    prompt="List me all the authors, title and genres of the books",
+    source=text,  # Pass the content of the file, not the file object
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json or csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
diff --git a/examples/openai/json_scraper_openai.py b/examples/openai/json_scraper_openai.py
@@ -0,0 +1,57 @@
+"""
+Basic example of scraping pipeline using JsonScraperGraph from JSON documents
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import JsonScraperGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+load_dotenv()
+
+# ************************************************
+# Read the JSON file
+# ************************************************
+
+FILE_NAME = "inputs/example.json"
+curr_dir = os.path.dirname(os.path.realpath(__file__))
+file_path = os.path.join(curr_dir, FILE_NAME)
+
+with open(file_path, 'r', encoding="utf-8") as file:
+    text = file.read()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+openai_key = os.getenv("OPENAI_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": openai_key,
+        "model": "gpt-3.5-turbo",
+    },
+}
+
+# ************************************************
+# Create the XmlScraperGraph instance and run it
+# ************************************************
+
+smart_scraper_graph = JsonScraperGraph(
+    prompt="List me all the authors, title and genres of the books",
+    source=text,  # Pass the content of the file, not the file object
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json or csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
diff --git a/examples/openai/xml_scraper_openai.py b/examples/openai/xml_scraper_openai.py
diff --git a/scrapegraphai/graphs/json_scraper_graph.py b/scrapegraphai/graphs/json_scraper_graph.py
diff --git a/scrapegraphai/graphs/xml_scraper_graph.py b/scrapegraphai/graphs/xml_scraper_graph.py

Original file line number	Diff line number	Diff line change
`@@ -19,7 +19,7 @@`
`19`	`19`	`graph_config = {`
`20`	`20`	`"llm": {`
`21`	`21`	`"api_key": gemini_key,`
`22`		`- "model": "gpt-3.5-turbo",`
	`22`	`+ "model": "gemini-pro",`
`23`	`23`	`},`
`24`	`24`	`"library": "beautifoulsoup"`
`25`	`25`	`}`