Skip to content

Commit 674e642

Browse files
committed
add first new graphs
1 parent 45b2317 commit 674e642

File tree

3 files changed

+156
-0
lines changed

3 files changed

+156
-0
lines changed

scrapegraphai/graphs/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,5 @@
66
from .speech_graph import SpeechGraph
77
from .search_graph import SearchGraph
88
from .script_creator_graph import ScriptCreatorGraph
9+
from .xml_scraper_graph import XmlScraperGraph
10+
from .json_scraper_graph import JsonScraperGraph
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
"""
2+
Module for creating the smart scraper
3+
"""
4+
from .base_graph import BaseGraph
5+
from ..nodes import (
6+
FetchNode,
7+
ParseNode,
8+
RAGNode,
9+
GenerateAnswerNode
10+
)
11+
from .abstract_graph import AbstractGraph
12+
13+
14+
class JsonScraperGraph(AbstractGraph):
    """
    JsonScraperGraph is a scraping pipeline that extracts information from a
    JSON source (remote URL or local file/directory) using a natural language
    model to interpret and answer a user prompt.
    """

    def __init__(self, prompt: str, source: str, config: dict):
        """
        Initializes the JsonScraperGraph with a prompt, source, and configuration.

        Args:
            prompt: The natural-language question to answer from the source.
            source: A URL (anything starting with "http") or a local path.
            config: Configuration dict consumed by AbstractGraph (LLM model,
                embedder, token limits, ...).
        """
        super().__init__(prompt, config, source)

        # Route the source to the right FetchNode input channel:
        # remote sources go in as "url", everything else as "local_dir".
        self.input_key = "url" if source.startswith("http") else "local_dir"

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping.

        Returns:
            A BaseGraph wiring fetch -> parse -> RAG -> answer generation,
            with the fetch node as entry point.
        """
        fetch_node = FetchNode(
            input="url | local_dir",
            output=["doc"],
        )
        parse_node = ParseNode(
            input="doc",
            output=["parsed_doc"],
            # Chunk size is bounded by the configured model's token limit.
            node_config={"chunk_size": self.model_token}
        )
        rag_node = RAGNode(
            input="user_prompt & (parsed_doc | doc)",
            output=["relevant_chunks"],
            node_config={
                "llm": self.llm_model,
                "embedder_model": self.embedder_model
            }
        )
        generate_answer_node = GenerateAnswerNode(
            # Falls back to the parsed or raw document if RAG produced no chunks.
            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
            output=["answer"],
            node_config={"llm": self.llm_model},
        )

        return BaseGraph(
            nodes=[
                fetch_node,
                parse_node,
                rag_node,
                generate_answer_node,
            ],
            edges=[
                (fetch_node, parse_node),
                (parse_node, rag_node),
                (rag_node, generate_answer_node)
            ],
            entry_point=fetch_node
        )

    def run(self) -> str:
        """
        Executes the scraping workflow and returns the answer to the prompt.

        Returns:
            The generated answer, or "No answer found." if the graph
            produced no "answer" key in its final state.
        """
        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("answer", "No answer found.")
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
"""
2+
Module for creating the smart scraper
3+
"""
4+
from .base_graph import BaseGraph
5+
from ..nodes import (
6+
FetchNode,
7+
ParseNode,
8+
RAGNode,
9+
GenerateAnswerNode
10+
)
11+
from .abstract_graph import AbstractGraph
12+
13+
14+
class XmlScraperGraph(AbstractGraph):
    """
    XmlScraperGraph is a scraping pipeline that extracts information from an
    XML source (remote URL or local file/directory) using a natural language
    model to interpret and answer a user prompt.
    """

    def __init__(self, prompt: str, source: str, config: dict):
        """
        Initializes the XmlScraperGraph with a prompt, source, and configuration.

        Args:
            prompt: The natural-language question to answer from the source.
            source: A URL (anything starting with "http") or a local path.
            config: Configuration dict consumed by AbstractGraph (LLM model,
                embedder, token limits, ...).
        """
        super().__init__(prompt, config, source)

        # Route the source to the right FetchNode input channel:
        # remote sources go in as "url", everything else as "local_dir".
        self.input_key = "url" if source.startswith("http") else "local_dir"

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping.

        Returns:
            A BaseGraph wiring fetch -> parse -> RAG -> answer generation,
            with the fetch node as entry point.
        """
        fetch_node = FetchNode(
            input="url | local_dir",
            output=["doc"],
        )
        parse_node = ParseNode(
            input="doc",
            output=["parsed_doc"],
            # Chunk size is bounded by the configured model's token limit.
            node_config={"chunk_size": self.model_token}
        )
        rag_node = RAGNode(
            input="user_prompt & (parsed_doc | doc)",
            output=["relevant_chunks"],
            node_config={
                "llm": self.llm_model,
                "embedder_model": self.embedder_model
            }
        )
        generate_answer_node = GenerateAnswerNode(
            # Falls back to the parsed or raw document if RAG produced no chunks.
            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
            output=["answer"],
            node_config={"llm": self.llm_model},
        )

        return BaseGraph(
            nodes=[
                fetch_node,
                parse_node,
                rag_node,
                generate_answer_node,
            ],
            edges=[
                (fetch_node, parse_node),
                (parse_node, rag_node),
                (rag_node, generate_answer_node)
            ],
            entry_point=fetch_node
        )

    def run(self) -> str:
        """
        Executes the scraping workflow and returns the answer to the prompt.

        Returns:
            The generated answer, or "No answer found." if the graph
            produced no "answer" key in its final state.
        """
        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("answer", "No answer found.")

0 commit comments

Comments
 (0)