
Commit 5cf4e4f

fix: examples and graphs

1 parent ba2b24b · commit 5cf4e4f
12 files changed: 95 additions, 21 deletions

CONTRIBUTING.md

Lines changed: 1 addition & 0 deletions
@@ -51,6 +51,7 @@ Please make sure to format your code accordingly before submitting a pull request
 - [Style Guide for Python Code](https://www.python.org/dev/peps/pep-0008/)
 - [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html)
 - [The Hitchhiker's Guide to Python](https://docs.python-guide.org/writing/style/)
+- [Pylint style of code for the documentation](https://pylint.pycqa.org/en/1.6.0/tutorial.html)

 ## Submitting a Pull Request

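The Pylint tutorial linked above documents the docstring layout (Args / Returns sections) already used throughout this repository, including the graph classes touched in this commit. A minimal sketch of a docstring in that style, for a hypothetical helper that is not part of the codebase:

def fetch_titles(url: str, headless: bool = True) -> list:
    """
    Collects the titles found on a web page.

    Args:
        url (str): The address of the page to scrape.
        headless (bool): Whether to run the browser without a visible window.

    Returns:
        list: The titles extracted from the page.
    """
    raise NotImplementedError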

examples/single_node/fetch_node.py

Lines changed: 3 additions & 0 deletions
@@ -12,6 +12,9 @@
 robots_node = FetchNode(
     input="url | local_dir",
     output=["doc"],
+    node_config={
+        "headless": False
+    }
 )

 # ************************************************
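Setting "headless": False in the example is expected to launch a visible browser window during the fetch instead of running it in the background. A minimal sketch of how this single-node example might be driven end to end, assuming the node's execute() method accepts and returns a state dictionary keyed by the declared input and output names (the URL is a placeholder):

from scrapegraphai.nodes import FetchNode

fetch_node = FetchNode(
    input="url | local_dir",
    output=["doc"],
    node_config={
        "headless": False  # show the browser window while fetching
    }
)

# Hypothetical run: seed the state with a URL and read back the fetched document.
state = fetch_node.execute({"url": "https://example.com"})
print(state["doc"])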

examples/single_node/robot_node.py

Lines changed: 3 additions & 1 deletion
@@ -26,7 +26,9 @@
 robots_node = RobotsNode(
     input="url",
     output=["is_scrapable"],
-    node_config={"llm": llm_model}
+    node_config={"llm": llm_model,
+                 "headless": False
+                 }
 )

 # ************************************************

scrapegraphai/graphs/json_scraper_graph.py

Lines changed: 3 additions & 2 deletions
@@ -21,7 +21,8 @@ class JSONScraperGraph(AbstractGraph):
         source (str): The source of the graph.
         config (dict): Configuration parameters for the graph.
         llm_model: An instance of a language model client, configured for generating answers.
-        embedder_model: An instance of an embedding model client, configured for generating embeddings.
+        embedder_model: An instance of an embedding model client,
+            configured for generating embeddings.
         verbose (bool): A flag indicating whether to show print statements during execution.
         headless (bool): A flag indicating whether to run the graph in headless mode.
@@ -47,7 +48,7 @@ def __init__(self, prompt: str, source: str, config: dict):
     def _create_graph(self) -> BaseGraph:
         """
         Creates the graph of nodes representing the workflow for web scraping.
-        
+
         Returns:
             BaseGraph: A graph instance representing the web scraping workflow.
         """

scrapegraphai/graphs/script_creator_graph.py

Lines changed: 12 additions & 7 deletions
@@ -21,7 +21,8 @@ class ScriptCreatorGraph(AbstractGraph):
         source (str): The source of the graph.
         config (dict): Configuration parameters for the graph.
         llm_model: An instance of a language model client, configured for generating answers.
-        embedder_model: An instance of an embedding model client, configured for generating embeddings.
+        embedder_model: An instance of an embedding model client,
+            configured for generating embeddings.
         verbose (bool): A flag indicating whether to show print statements during execution.
         headless (bool): A flag indicating whether to run the graph in headless mode.
         model_token (int): The token limit for the language model.
@@ -44,7 +45,7 @@ class ScriptCreatorGraph(AbstractGraph):
     def __init__(self, prompt: str, source: str, config: dict):

         self.library = config['library']
-        
+
         super().__init__(prompt, config, source)

         self.input_key = "url" if source.startswith("http") else "local_dir"
@@ -61,25 +62,29 @@ def _create_graph(self) -> BaseGraph:
             input="url | local_dir",
             output=["doc"],
             node_config={
-                "headless": True if self.config is None else self.config.get("headless", True)}
+                "headless": True if self.config is None else self.config.get("headless", True),
+                "verbose": self.verbose}
         )
         parse_node = ParseNode(
             input="doc",
             output=["parsed_doc"],
-            node_config={"chunk_size": self.model_token}
+            node_config={"chunk_size": self.model_token,
+                         "verbose": self.verbose}
         )
         rag_node = RAGNode(
             input="user_prompt & (parsed_doc | doc)",
             output=["relevant_chunks"],
             node_config={
                 "llm": self.llm_model,
-                "embedder_model": self.embedder_model
+                "embedder_model": self.embedder_model,
+                "verbose": self.verbose
             }
         )
         generate_scraper_node = GenerateScraperNode(
             input="user_prompt & (relevant_chunks | parsed_doc | doc)",
             output=["answer"],
-            node_config={"llm": self.llm_model},
+            node_config={"llm": self.llm_model,
+                         "verbose": self.verbose},
             library=self.library,
             website=self.source
         )
@@ -106,7 +111,7 @@ def run(self) -> str:
         Returns:
             str: The answer to the prompt.
         """
-        
+
         inputs = {"user_prompt": self.prompt, self.input_key: self.source}
         self.final_state, self.execution_info = self.graph.execute(inputs)

scrapegraphai/graphs/smart_scraper_graph.py

Lines changed: 5 additions & 3 deletions
@@ -14,15 +14,17 @@

 class SmartScraperGraph(AbstractGraph):
     """
-    SmartScraper is a scraping pipeline that automates the process of extracting information from web pages
+    SmartScraper is a scraping pipeline that automates the process of
+    extracting information from web pages
     using a natural language model to interpret and answer prompts.

     Attributes:
         prompt (str): The prompt for the graph.
         source (str): The source of the graph.
         config (dict): Configuration parameters for the graph.
         llm_model: An instance of a language model client, configured for generating answers.
-        embedder_model: An instance of an embedding model client, configured for generating embeddings.
+        embedder_model: An instance of an embedding model client,
+            configured for generating embeddings.
         verbose (bool): A flag indicating whether to show print statements during execution.
         headless (bool): A flag indicating whether to run the graph in headless mode.
@@ -45,7 +47,7 @@ def __init__(self, prompt: str, source: str, config: dict):
         super().__init__(prompt, config, source)

         self.input_key = "url" if source.startswith("http") else "local_dir"
-        
+
     def _create_graph(self) -> BaseGraph:
         """
         Creates the graph of nodes representing the workflow for web scraping.

scrapegraphai/graphs/xml_scraper_graph.py

Lines changed: 4 additions & 3 deletions
@@ -22,7 +22,8 @@ class XMLScraperGraph(AbstractGraph):
         source (str): The source of the graph.
         config (dict): Configuration parameters for the graph.
         llm_model: An instance of a language model client, configured for generating answers.
-        embedder_model: An instance of an embedding model client, configured for generating embeddings.
+        embedder_model: An instance of an embedding model client,
+            configured for generating embeddings.
         verbose (bool): A flag indicating whether to show print statements during execution.
         headless (bool): A flag indicating whether to run the graph in headless mode.
         model_token (int): The token limit for the language model.
@@ -49,7 +50,7 @@ def __init__(self, prompt: str, source: str, config: dict):
     def _create_graph(self) -> BaseGraph:
         """
         Creates the graph of nodes representing the workflow for web scraping.
-        
+
         Returns:
             BaseGraph: A graph instance representing the web scraping workflow.
         """
@@ -110,7 +111,7 @@ def run(self) -> str:
         Returns:
             str: The answer to the prompt.
         """
-        
+
         inputs = {"user_prompt": self.prompt, self.input_key: self.source}
         self.final_state, self.execution_info = self.graph.execute(inputs)

tests/graphs/scrape_json_ollama.py

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+"""
+Module for scraping json documents
+"""
+import os
+import pytest
+from scrapegraphai.graphs import JSONScraperGraph
+
+
+@pytest.fixture
+def sample_json():
+    """
+    Example of text
+    """
+    file_name = "inputs/example.json"
+    curr_dir = os.path.dirname(os.path.realpath(__file__))
+    file_path = os.path.join(curr_dir, file_name)
+
+    with open(file_path, 'r', encoding="utf-8") as file:
+        text = file.read()
+
+    return text
+
+
+@pytest.fixture
+def graph_config():
+    """
+    Configuration of the graph
+    """
+    return {
+        "llm": {
+            "model": "ollama/mistral",
+            "temperature": 0,
+            "format": "json",
+            "base_url": "http://localhost:11434",
+        },
+        "embeddings": {
+            "model": "ollama/nomic-embed-text",
+            "temperature": 0,
+            "base_url": "http://localhost:11434",
+        }
+    }
+
+
+def test_scraping_pipeline(sample_json: str, graph_config: dict):
+    """
+    Start of the scraping pipeline
+    """
+    smart_scraper_graph = JSONScraperGraph(
+        prompt="List me all the titles",
+        source=sample_json,
+        config=graph_config
+    )
+
+    result = smart_scraper_graph.run()
+
+    assert result is not None
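Running this test assumes a local Ollama instance on http://localhost:11434 with the ollama/mistral and ollama/nomic-embed-text models available, plus the inputs/example.json fixture file next to the test module; without those prerequisites it is the graph run itself, not the final assertion, that will fail. Because the filename does not match pytest's default test_* / *_test patterns, invoke it by passing the file path to pytest explicitly.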

tests/graphs/scrape_xml_ollama_test.py

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@
 """
 import os
 import pytest
-from scrapegraphai.graphs import SmartScraperGraph
+from scrapegraphai.graphs import XMLScraperGraph


 @pytest.fixture
@@ -45,7 +45,7 @@ def test_scraping_pipeline(sample_xml: str, graph_config: dict):
     """
     Start of the scraping pipeline
     """
-    smart_scraper_graph = SmartScraperGraph(
+    smart_scraper_graph = XMLScraperGraph(
         prompt="List me all the authors, title and genres of the books",
         source=sample_xml,
         config=graph_config

tests/graphs/script_generator_test.py

Lines changed: 0 additions & 2 deletions
@@ -46,6 +46,4 @@ def test_script_creator_graph(graph_config: dict):

     assert graph_exec_info is not None

-    assert isinstance(graph_exec_info, dict)
-
     print(prettify_exec_info(graph_exec_info))

tests/nodes/fetch_node_test.py

Lines changed: 3 additions & 0 deletions
@@ -17,6 +17,9 @@ def setup():
     robots_node = FetchNode(
         input="url | local_dir",
         output=["doc"],
+        node_config={
+            "headless": False
+        }
     )

     return robots_node

tests/nodes/robot_node_test.py

Lines changed: 3 additions & 1 deletion
@@ -32,7 +32,9 @@ def setup():
     robots_node = RobotsNode(
         input="url",
         output=["is_scrapable"],
-        node_config={"llm": llm_model}
+        node_config={"llm": llm_model,
+                     "headless": False
+                     }
     )

     return robots_node
