Skip to content

Commit da2c82a

Browse files
committed
add json and xml scraper
1 parent f891732 commit da2c82a

11 files changed

+424
-5
lines changed
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""
2+
Basic example of scraping pipeline using JsonScraperGraph from JSON documents
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraphai.graphs import JsonScraperGraph
8+
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
9+
load_dotenv()
10+
11+
# ************************************************
12+
# Read the JSON file
13+
# ************************************************
14+
15+
FILE_NAME = "inputs/example.json"
16+
curr_dir = os.path.dirname(os.path.realpath(__file__))
17+
file_path = os.path.join(curr_dir, FILE_NAME)
18+
19+
with open(file_path, 'r', encoding="utf-8") as file:
20+
text = file.read()
21+
22+
# ************************************************
23+
# Define the configuration for the graph
24+
# ************************************************
25+
26+
gemini_key = os.getenv("GOOGLE_APIKEY")
27+
28+
graph_config = {
29+
"llm": {
30+
"api_key": gemini_key,
31+
"model": "gemini-pro",
32+
},
33+
}
34+
35+
# ************************************************
36+
# Create the JsonScraperGraph instance and run it
37+
# ************************************************
38+
39+
smart_scraper_graph = JsonScraperGraph(
40+
prompt="List me all the authors, title and genres of the books",
41+
source=text, # Pass the content of the file, not the file object
42+
config=graph_config
43+
)
44+
45+
result = smart_scraper_graph.run()
46+
print(result)
47+
48+
# ************************************************
49+
# Get graph execution info
50+
# ************************************************
51+
52+
graph_exec_info = smart_scraper_graph.get_execution_info()
53+
print(prettify_exec_info(graph_exec_info))
54+
55+
# Save the result to both CSV and JSON
56+
convert_to_csv(result, "result")
57+
convert_to_json(result, "result")

examples/gemini/script_generator_gemini.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
graph_config = {
2020
"llm": {
2121
"api_key": gemini_key,
22-
"model": "gpt-3.5-turbo",
22+
"model": "gemini-pro",
2323
},
2424
"library": "beautifoulsoup"
2525
}

examples/openai/scrape_xml_openai.py renamed to examples/gemini/xml_scraper_openai.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
Basic example of scraping pipeline using SmartScraper from XML documents
2+
Basic example of scraping pipeline using XmlScraperGraph from XML documents
33
"""
44

55
import os
@@ -28,7 +28,7 @@
2828
graph_config = {
2929
"llm": {
3030
"api_key": openai_key,
31-
"model": "gpt-3.5-turbo",
31+
"model": "gemini-pro",
3232
},
3333
}
3434

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
"""
2+
Basic example of scraping pipeline using JsonScraperGraph from JSON documents
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraphai.graphs import JsonScraperGraph
8+
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
9+
load_dotenv()
10+
11+
# ************************************************
12+
# Read the JSON file
13+
# ************************************************
14+
15+
FILE_NAME = "inputs/example.json"
16+
curr_dir = os.path.dirname(os.path.realpath(__file__))
17+
file_path = os.path.join(curr_dir, FILE_NAME)
18+
19+
with open(file_path, 'r', encoding="utf-8") as file:
20+
text = file.read()
21+
22+
# ************************************************
23+
# Define the configuration for the graph
24+
# ************************************************
25+
26+
graph_config = {
27+
"llm": {
28+
"model": "ollama/mistral",
29+
"temperature": 0,
30+
"format": "json", # Ollama needs the format to be specified explicitly
31+
# "model_tokens": 2000, # set context length arbitrarily
32+
},
33+
"embeddings": {
34+
"model": "ollama/nomic-embed-text",
35+
"temperature": 0,
36+
}
37+
}
38+
39+
# ************************************************
40+
# Create the JsonScraperGraph instance and run it
41+
# ************************************************
42+
43+
smart_scraper_graph = JsonScraperGraph(
44+
prompt="List me all the authors, title and genres of the books",
45+
source=text, # Pass the content of the file, not the file object
46+
config=graph_config
47+
)
48+
49+
result = smart_scraper_graph.run()
50+
print(result)
51+
52+
# ************************************************
53+
# Get graph execution info
54+
# ************************************************
55+
56+
graph_exec_info = smart_scraper_graph.get_execution_info()
57+
print(prettify_exec_info(graph_exec_info))
58+
59+
# Save the result to both CSV and JSON
60+
convert_to_csv(result, "result")
61+
convert_to_json(result, "result")
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
"""
2+
Basic example of scraping pipeline using XmlScraperGraph from XML documents
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraphai.graphs import XmlScraperGraph
8+
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
9+
load_dotenv()
10+
11+
# ************************************************
12+
# Read the XML file
13+
# ************************************************
14+
15+
FILE_NAME = "inputs/books.xml"
16+
curr_dir = os.path.dirname(os.path.realpath(__file__))
17+
file_path = os.path.join(curr_dir, FILE_NAME)
18+
19+
with open(file_path, 'r', encoding="utf-8") as file:
20+
text = file.read()
21+
22+
# ************************************************
23+
# Define the configuration for the graph
24+
# ************************************************
25+
26+
graph_config = {
27+
"llm": {
28+
"model": "ollama/mistral",
29+
"temperature": 0,
30+
"format": "json", # Ollama needs the format to be specified explicitly
31+
# "model_tokens": 2000, # set context length arbitrarily
32+
},
33+
"embeddings": {
34+
"model": "ollama/nomic-embed-text",
35+
"temperature": 0,
36+
}
37+
}
38+
39+
# ************************************************
40+
# Create the XmlScraperGraph instance and run it
41+
# ************************************************
42+
43+
smart_scraper_graph = XmlScraperGraph(
44+
prompt="List me all the authors, title and genres of the books",
45+
source=text, # Pass the content of the file, not the file object
46+
config=graph_config
47+
)
48+
49+
result = smart_scraper_graph.run()
50+
print(result)
51+
52+
# ************************************************
53+
# Get graph execution info
54+
# ************************************************
55+
56+
graph_exec_info = smart_scraper_graph.get_execution_info()
57+
print(prettify_exec_info(graph_exec_info))
58+
59+
# Save the result to both CSV and JSON
60+
convert_to_csv(result, "result")
61+
convert_to_json(result, "result")
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
"""
2+
Basic example of scraping pipeline using JsonScraperGraph from JSON documents
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraphai.graphs import JsonScraperGraph
8+
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
9+
load_dotenv()
10+
11+
# ************************************************
12+
# Read the JSON file
13+
# ************************************************
14+
15+
FILE_NAME = "inputs/example.json"
16+
curr_dir = os.path.dirname(os.path.realpath(__file__))
17+
file_path = os.path.join(curr_dir, FILE_NAME)
18+
19+
with open(file_path, 'r', encoding="utf-8") as file:
20+
text = file.read()
21+
22+
# ************************************************
23+
# Define the configuration for the graph
24+
# ************************************************
25+
26+
graph_config = {
27+
"llm": {
28+
"model": "ollama/mistral",
29+
"temperature": 0,
30+
"format": "json", # Ollama needs the format to be specified explicitly
31+
# "model_tokens": 2000, # set context length arbitrarily
32+
"base_url": "http://localhost:11434",
33+
},
34+
"embeddings": {
35+
"model": "ollama/nomic-embed-text",
36+
"temperature": 0,
37+
"base_url": "http://localhost:11434",
38+
}
39+
}
40+
41+
# ************************************************
42+
# Create the JsonScraperGraph instance and run it
43+
# ************************************************
44+
45+
smart_scraper_graph = JsonScraperGraph(
46+
prompt="List me all the authors, title and genres of the books",
47+
source=text, # Pass the content of the file, not the file object
48+
config=graph_config
49+
)
50+
51+
result = smart_scraper_graph.run()
52+
print(result)
53+
54+
# ************************************************
55+
# Get graph execution info
56+
# ************************************************
57+
58+
graph_exec_info = smart_scraper_graph.get_execution_info()
59+
print(prettify_exec_info(graph_exec_info))
60+
61+
# Save the result to both CSV and JSON
62+
convert_to_csv(result, "result")
63+
convert_to_json(result, "result")
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
"""
2+
Basic example of scraping pipeline using XmlScraperGraph from XML documents
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraphai.graphs import XmlScraperGraph
8+
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
9+
load_dotenv()
10+
11+
# ************************************************
12+
# Read the XML file
13+
# ************************************************
14+
15+
FILE_NAME = "inputs/books.xml"
16+
curr_dir = os.path.dirname(os.path.realpath(__file__))
17+
file_path = os.path.join(curr_dir, FILE_NAME)
18+
19+
with open(file_path, 'r', encoding="utf-8") as file:
20+
text = file.read()
21+
22+
# ************************************************
23+
# Define the configuration for the graph
24+
# ************************************************
25+
26+
graph_config = {
27+
"llm": {
28+
"model": "ollama/mistral",
29+
"temperature": 0,
30+
"format": "json", # Ollama needs the format to be specified explicitly
31+
# "model_tokens": 2000, # set context length arbitrarily
32+
"base_url": "http://localhost:11434",
33+
},
34+
"embeddings": {
35+
"model": "ollama/nomic-embed-text",
36+
"temperature": 0,
37+
"base_url": "http://localhost:11434",
38+
}
39+
}
40+
41+
# ************************************************
42+
# Create the XmlScraperGraph instance and run it
43+
# ************************************************
44+
45+
smart_scraper_graph = XmlScraperGraph(
46+
prompt="List me all the authors, title and genres of the books",
47+
source=text, # Pass the content of the file, not the file object
48+
config=graph_config
49+
)
50+
51+
result = smart_scraper_graph.run()
52+
print(result)
53+
54+
# ************************************************
55+
# Get graph execution info
56+
# ************************************************
57+
58+
graph_exec_info = smart_scraper_graph.get_execution_info()
59+
print(prettify_exec_info(graph_exec_info))
60+
61+
# Save the result to both CSV and JSON
62+
convert_to_csv(result, "result")
63+
convert_to_json(result, "result")
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""
2+
Basic example of scraping pipeline using JsonScraperGraph from JSON documents
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraphai.graphs import JsonScraperGraph
8+
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
9+
load_dotenv()
10+
11+
# ************************************************
12+
# Read the JSON file
13+
# ************************************************
14+
15+
FILE_NAME = "inputs/example.json"
16+
curr_dir = os.path.dirname(os.path.realpath(__file__))
17+
file_path = os.path.join(curr_dir, FILE_NAME)
18+
19+
with open(file_path, 'r', encoding="utf-8") as file:
20+
text = file.read()
21+
22+
# ************************************************
23+
# Define the configuration for the graph
24+
# ************************************************
25+
26+
openai_key = os.getenv("OPENAI_APIKEY")
27+
28+
graph_config = {
29+
"llm": {
30+
"api_key": openai_key,
31+
"model": "gpt-3.5-turbo",
32+
},
33+
}
34+
35+
# ************************************************
36+
# Create the JsonScraperGraph instance and run it
37+
# ************************************************
38+
39+
smart_scraper_graph = JsonScraperGraph(
40+
prompt="List me all the authors, title and genres of the books",
41+
source=text, # Pass the content of the file, not the file object
42+
config=graph_config
43+
)
44+
45+
result = smart_scraper_graph.run()
46+
print(result)
47+
48+
# ************************************************
49+
# Get graph execution info
50+
# ************************************************
51+
52+
graph_exec_info = smart_scraper_graph.get_execution_info()
53+
print(prettify_exec_info(graph_exec_info))
54+
55+
# Save the result to both CSV and JSON
56+
convert_to_csv(result, "result")
57+
convert_to_json(result, "result")

0 commit comments

Comments
 (0)