Skip to content

Commit 674e642

Browse files
committed
add first new graphs
1 parent 45b2317 commit 674e642

File tree

3 files changed

+156
-0
lines changed

3 files changed

+156
-0
lines changed

scrapegraphai/graphs/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,5 @@
66
from .speech_graph import SpeechGraph
77
from .search_graph import SearchGraph
88
from .script_creator_graph import ScriptCreatorGraph
9+
from .xml_scraper_graph import XmlScraperGraph
10+
from .json_scraper_graph import JsonScraperGraph
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
"""
2+
Module for creating the smart scraper
3+
"""
4+
from .base_graph import BaseGraph
5+
from ..nodes import (
6+
FetchNode,
7+
ParseNode,
8+
RAGNode,
9+
GenerateAnswerNode
10+
)
11+
from .abstract_graph import AbstractGraph
12+
13+
14+
class JsonScraperGraph(AbstractGraph):
    """
    JsonScraperGraph is a scraping pipeline that extracts information from a
    JSON source (remote URL or local file/directory) using a natural language
    model to interpret and answer a user prompt.
    """

    def __init__(self, prompt: str, source: str, config: dict):
        """
        Initializes the JsonScraperGraph with a prompt, source, and configuration.

        Args:
            prompt: The natural-language question to answer from the source.
            source: A URL (anything starting with "http") or a local path.
            config: Configuration dict consumed by AbstractGraph (LLM model,
                embedder, token limits, ...).
        """
        super().__init__(prompt, config, source)

        # Route the source to the right FetchNode input channel:
        # remote sources go in as "url", everything else as "local_dir".
        self.input_key = "url" if source.startswith("http") else "local_dir"

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping.

        Returns:
            A BaseGraph wiring fetch -> parse -> RAG -> answer generation,
            with the fetch node as entry point.
        """
        fetch_node = FetchNode(
            input="url | local_dir",
            output=["doc"],
        )
        parse_node = ParseNode(
            input="doc",
            output=["parsed_doc"],
            # Chunk size is bounded by the configured model's token limit.
            node_config={"chunk_size": self.model_token}
        )
        rag_node = RAGNode(
            input="user_prompt & (parsed_doc | doc)",
            output=["relevant_chunks"],
            node_config={
                "llm": self.llm_model,
                "embedder_model": self.embedder_model
            }
        )
        generate_answer_node = GenerateAnswerNode(
            # Falls back to the parsed or raw document if RAG produced no chunks.
            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
            output=["answer"],
            node_config={"llm": self.llm_model},
        )

        return BaseGraph(
            nodes=[
                fetch_node,
                parse_node,
                rag_node,
                generate_answer_node,
            ],
            edges=[
                (fetch_node, parse_node),
                (parse_node, rag_node),
                (rag_node, generate_answer_node)
            ],
            entry_point=fetch_node
        )

    def run(self) -> str:
        """
        Executes the scraping workflow and returns the answer to the prompt.

        Returns:
            The generated answer, or "No answer found." if the graph
            produced no "answer" key in its final state.
        """
        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("answer", "No answer found.")
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
"""
2+
Module for creating the smart scraper
3+
"""
4+
from .base_graph import BaseGraph
5+
from ..nodes import (
6+
FetchNode,
7+
ParseNode,
8+
RAGNode,
9+
GenerateAnswerNode
10+
)
11+
from .abstract_graph import AbstractGraph
12+
13+
14+
class XmlScraperGraph(AbstractGraph):
    """
    XmlScraperGraph is a scraping pipeline that extracts information from an
    XML source (remote URL or local file/directory) using a natural language
    model to interpret and answer a user prompt.
    """

    def __init__(self, prompt: str, source: str, config: dict):
        """
        Initializes the XmlScraperGraph with a prompt, source, and configuration.

        Args:
            prompt: The natural-language question to answer from the source.
            source: A URL (anything starting with "http") or a local path.
            config: Configuration dict consumed by AbstractGraph (LLM model,
                embedder, token limits, ...).
        """
        super().__init__(prompt, config, source)

        # Route the source to the right FetchNode input channel:
        # remote sources go in as "url", everything else as "local_dir".
        self.input_key = "url" if source.startswith("http") else "local_dir"

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping.

        Returns:
            A BaseGraph wiring fetch -> parse -> RAG -> answer generation,
            with the fetch node as entry point.
        """
        fetch_node = FetchNode(
            input="url | local_dir",
            output=["doc"],
        )
        parse_node = ParseNode(
            input="doc",
            output=["parsed_doc"],
            # Chunk size is bounded by the configured model's token limit.
            node_config={"chunk_size": self.model_token}
        )
        rag_node = RAGNode(
            input="user_prompt & (parsed_doc | doc)",
            output=["relevant_chunks"],
            node_config={
                "llm": self.llm_model,
                "embedder_model": self.embedder_model
            }
        )
        generate_answer_node = GenerateAnswerNode(
            # Falls back to the parsed or raw document if RAG produced no chunks.
            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
            output=["answer"],
            node_config={"llm": self.llm_model},
        )

        return BaseGraph(
            nodes=[
                fetch_node,
                parse_node,
                rag_node,
                generate_answer_node,
            ],
            edges=[
                (fetch_node, parse_node),
                (parse_node, rag_node),
                (rag_node, generate_answer_node)
            ],
            entry_point=fetch_node
        )

    def run(self) -> str:
        """
        Executes the scraping workflow and returns the answer to the prompt.

        Returns:
            The generated answer, or "No answer found." if the graph
            produced no "answer" key in its final state.
        """
        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("answer", "No answer found.")

0 commit comments

Comments
 (0)