Skip to content

Commit d592d27

Browse files
authored
Merge pull request #115 from VinciGit00/101-scrape-json-files
feat: add xml scraper and json scraper
2 parents da95c18 + 8fba7e5 commit d592d27

22 files changed

+1682
-11
lines changed

.gitignore

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ venv/
2929
*.google-cookie
3030
examples/graph_examples/ScrapeGraphAI_generated_graph
3131
examples/**/*.csv
32-
examples/**/*.json
3332
main.py
3433
poetry.lock
3534

examples/gemini/inputs/example.json

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
{
2+
"kind":"youtube#searchListResponse",
3+
"etag":"q4ibjmYp1KA3RqMF4jFLl6PBwOg",
4+
"nextPageToken":"CAUQAA",
5+
"regionCode":"NL",
6+
"pageInfo":{
7+
"totalResults":1000000,
8+
"resultsPerPage":5
9+
},
10+
"items":[
11+
{
12+
"kind":"youtube#searchResult",
13+
"etag":"QCsHBifbaernVCbLv8Cu6rAeaDQ",
14+
"id":{
15+
"kind":"youtube#video",
16+
"videoId":"TvWDY4Mm5GM"
17+
},
18+
"snippet":{
19+
"publishedAt":"2023-07-24T14:15:01Z",
20+
"channelId":"UCwozCpFp9g9x0wAzuFh0hwQ",
21+
"title":"3 Football Clubs Kylian Mbappe Should Avoid Signing ✍️❌⚽️ #football #mbappe #shorts",
22+
"description":"",
23+
"thumbnails":{
24+
"default":{
25+
"url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/default.jpg",
26+
"width":120,
27+
"height":90
28+
},
29+
"medium":{
30+
"url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/mqdefault.jpg",
31+
"width":320,
32+
"height":180
33+
},
34+
"high":{
35+
"url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/hqdefault.jpg",
36+
"width":480,
37+
"height":360
38+
}
39+
},
40+
"channelTitle":"FC Motivate",
41+
"liveBroadcastContent":"none",
42+
"publishTime":"2023-07-24T14:15:01Z"
43+
}
44+
},
45+
{
46+
"kind":"youtube#searchResult",
47+
"etag":"0NG5QHdtIQM_V-DBJDEf-jK_Y9k",
48+
"id":{
49+
"kind":"youtube#video",
50+
"videoId":"aZM_42CcNZ4"
51+
},
52+
"snippet":{
53+
"publishedAt":"2023-07-24T16:09:27Z",
54+
"channelId":"UCM5gMM_HqfKHYIEJ3lstMUA",
55+
"title":"Which Football Club Could Cristiano Ronaldo Afford To Buy? 💰",
56+
"description":"Sign up to Sorare and get a FREE card: https://sorare.pxf.io/NellisShorts Give Soraredata a go for FREE: ...",
57+
"thumbnails":{
58+
"default":{
59+
"url":"https://i.ytimg.com/vi/aZM_42CcNZ4/default.jpg",
60+
"width":120,
61+
"height":90
62+
},
63+
"medium":{
64+
"url":"https://i.ytimg.com/vi/aZM_42CcNZ4/mqdefault.jpg",
65+
"width":320,
66+
"height":180
67+
},
68+
"high":{
69+
"url":"https://i.ytimg.com/vi/aZM_42CcNZ4/hqdefault.jpg",
70+
"width":480,
71+
"height":360
72+
}
73+
},
74+
"channelTitle":"John Nellis",
75+
"liveBroadcastContent":"none",
76+
"publishTime":"2023-07-24T16:09:27Z"
77+
}
78+
},
79+
{
80+
"kind":"youtube#searchResult",
81+
"etag":"WbBz4oh9I5VaYj91LjeJvffrBVY",
82+
"id":{
83+
"kind":"youtube#video",
84+
"videoId":"wkP3XS3aNAY"
85+
},
86+
"snippet":{
87+
"publishedAt":"2023-07-24T16:00:50Z",
88+
"channelId":"UC4EP1dxFDPup_aFLt0ElsDw",
89+
"title":"PAULO DYBALA vs THE WORLD'S LONGEST FREEKICK WALL",
90+
"description":"Can Paulo Dybala curl a football around the World's longest free kick wall? We met up with the World Cup winner and put him to ...",
91+
"thumbnails":{
92+
"default":{
93+
"url":"https://i.ytimg.com/vi/wkP3XS3aNAY/default.jpg",
94+
"width":120,
95+
"height":90
96+
},
97+
"medium":{
98+
"url":"https://i.ytimg.com/vi/wkP3XS3aNAY/mqdefault.jpg",
99+
"width":320,
100+
"height":180
101+
},
102+
"high":{
103+
"url":"https://i.ytimg.com/vi/wkP3XS3aNAY/hqdefault.jpg",
104+
"width":480,
105+
"height":360
106+
}
107+
},
108+
"channelTitle":"Shoot for Love",
109+
"liveBroadcastContent":"none",
110+
"publishTime":"2023-07-24T16:00:50Z"
111+
}
112+
},
113+
{
114+
"kind":"youtube#searchResult",
115+
"etag":"juxv_FhT_l4qrR05S1QTrb4CGh8",
116+
"id":{
117+
"kind":"youtube#video",
118+
"videoId":"rJkDZ0WvfT8"
119+
},
120+
"snippet":{
121+
"publishedAt":"2023-07-24T10:00:39Z",
122+
"channelId":"UCO8qj5u80Ga7N_tP3BZWWhQ",
123+
"title":"TOP 10 DEFENDERS 2023",
124+
"description":"SoccerKingz https://soccerkingz.nl Use code: 'ILOVEHOF' to get 10% off. TOP 10 DEFENDERS 2023 Follow us! • Instagram ...",
125+
"thumbnails":{
126+
"default":{
127+
"url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/default.jpg",
128+
"width":120,
129+
"height":90
130+
},
131+
"medium":{
132+
"url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/mqdefault.jpg",
133+
"width":320,
134+
"height":180
135+
},
136+
"high":{
137+
"url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/hqdefault.jpg",
138+
"width":480,
139+
"height":360
140+
}
141+
},
142+
"channelTitle":"Home of Football",
143+
"liveBroadcastContent":"none",
144+
"publishTime":"2023-07-24T10:00:39Z"
145+
}
146+
},
147+
{
148+
"kind":"youtube#searchResult",
149+
"etag":"wtuknXTmI1txoULeH3aWaOuXOow",
150+
"id":{
151+
"kind":"youtube#video",
152+
"videoId":"XH0rtu4U6SE"
153+
},
154+
"snippet":{
155+
"publishedAt":"2023-07-21T16:30:05Z",
156+
"channelId":"UCwozCpFp9g9x0wAzuFh0hwQ",
157+
"title":"3 Things You Didn't Know About Erling Haaland ⚽️🇳🇴 #football #haaland #shorts",
158+
"description":"",
159+
"thumbnails":{
160+
"default":{
161+
"url":"https://i.ytimg.com/vi/XH0rtu4U6SE/default.jpg",
162+
"width":120,
163+
"height":90
164+
},
165+
"medium":{
166+
"url":"https://i.ytimg.com/vi/XH0rtu4U6SE/mqdefault.jpg",
167+
"width":320,
168+
"height":180
169+
},
170+
"high":{
171+
"url":"https://i.ytimg.com/vi/XH0rtu4U6SE/hqdefault.jpg",
172+
"width":480,
173+
"height":360
174+
}
175+
},
176+
"channelTitle":"FC Motivate",
177+
"liveBroadcastContent":"none",
178+
"publishTime":"2023-07-21T16:30:05Z"
179+
}
180+
}
181+
]
182+
}
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""
2+
Basic example of scraping pipeline using JSONScraperGraph from JSON documents
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraphai.graphs import JSONScraperGraph
8+
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
9+
load_dotenv()
10+
11+
# ************************************************
12+
# Read the JSON file
13+
# ************************************************
14+
15+
FILE_NAME = "inputs/example.json"
16+
curr_dir = os.path.dirname(os.path.realpath(__file__))
17+
file_path = os.path.join(curr_dir, FILE_NAME)
18+
19+
with open(file_path, 'r', encoding="utf-8") as file:
20+
text = file.read()
21+
22+
# ************************************************
23+
# Define the configuration for the graph
24+
# ************************************************
25+
26+
gemini_key = os.getenv("GOOGLE_APIKEY")
27+
28+
graph_config = {
29+
"llm": {
30+
"api_key": gemini_key,
31+
"model": "gemini-pro",
32+
},
33+
}
34+
35+
# ************************************************
36+
# Create the JSONScraperGraph instance and run it
37+
# ************************************************
38+
39+
json_scraper_graph = JSONScraperGraph(
40+
prompt="List me all the authors, title and genres of the books",
41+
source=text, # Pass the content of the file, not the file object
42+
config=graph_config
43+
)
44+
45+
result = json_scraper_graph.run()
46+
print(result)
47+
48+
# ************************************************
49+
# Get graph execution info
50+
# ************************************************
51+
52+
graph_exec_info = json_scraper_graph.get_execution_info()
53+
print(prettify_exec_info(graph_exec_info))
54+
55+
# Save to json and csv
56+
convert_to_csv(result, "result")
57+
convert_to_json(result, "result")

examples/gemini/script_generator_gemini.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
graph_config = {
2020
"llm": {
2121
"api_key": gemini_key,
22-
"model": "gpt-3.5-turbo",
22+
"model": "gemini-pro",
2323
},
2424
"library": "beautifoulsoup"
2525
}

examples/openai/scrape_xml_openai.py renamed to examples/gemini/xml_scraper_openai.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
"""
2-
Basic example of scraping pipeline using SmartScraper from XML documents
2+
Basic example of scraping pipeline using XMLScraperGraph from XML documents
33
"""
44

55
import os
66
from dotenv import load_dotenv
7-
from scrapegraphai.graphs import SmartScraperGraph
7+
from scrapegraphai.graphs import XMLScraperGraph
88
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
99
load_dotenv()
1010

@@ -28,28 +28,28 @@
2828
graph_config = {
2929
"llm": {
3030
"api_key": openai_key,
31-
"model": "gpt-3.5-turbo",
31+
"model": "gemini-pro",
3232
},
3333
}
3434

3535
# ************************************************
36-
# Create the SmartScraperGraph instance and run it
36+
# Create the XMLScraperGraph instance and run it
3737
# ************************************************
3838

39-
smart_scraper_graph = SmartScraperGraph(
39+
xml_scraper_graph = XMLScraperGraph(
4040
prompt="List me all the authors, title and genres of the books",
4141
source=text, # Pass the content of the file, not the file object
4242
config=graph_config
4343
)
4444

45-
result = smart_scraper_graph.run()
45+
result = xml_scraper_graph.run()
4646
print(result)
4747

4848
# ************************************************
4949
# Get graph execution info
5050
# ************************************************
5151

52-
graph_exec_info = smart_scraper_graph.get_execution_info()
52+
graph_exec_info = xml_scraper_graph.get_execution_info()
5353
print(prettify_exec_info(graph_exec_info))
5454

5555
# Save to json or csv

0 commit comments

Comments
 (0)