Commit 18d8a7f

Commit message: It is working now. v0.1.0
1 parent 62511a8 commit 18d8a7f

9 files changed: 95 additions, 33 deletions

9 files changed

+95
-33
lines changed

.env.example

Lines changed: 2 additions & 2 deletions
```diff
@@ -2,9 +2,9 @@ FIRECRAWL_KEY="YOUR_KEY"
 # If you want to use your self-hosted Firecrawl, add the following below:
 # FIRECRAWL_BASE_URL="http://localhost:3002"
 
-TAVILY_API_KEY="TAVILY_API_KEY"
+#TAVILY_API_KEY="TAVILY_API_KEY"
 
-OPENAI_MODEL="gpt-4o-mini"
+OPENAI_MODEL="gpt-4o"
 OPENAI_KEY="YOUR_KEY"
 CONTEXT_SIZE="128000"
 # If you want to use other OpenAI compatible API, add the following below:
```

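For context, a minimal sketch of how these variables could be read at startup. python-dotenv is added to the dependency list further down in this commit; the variable names come from .env.example, while the loading code itself is illustrative rather than the repo's actual configuration path.

```python
# Illustrative only: load the variables defined in .env.example.
# python-dotenv is in the new dependency list; whether open-deepsearch
# loads its config exactly this way is an assumption.
import os

from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory

openai_key = os.getenv("OPENAI_KEY")
openai_model = os.getenv("OPENAI_MODEL", "gpt-4o")       # default changed from gpt-4o-mini in this commit
context_size = int(os.getenv("CONTEXT_SIZE", "128000"))
firecrawl_key = os.getenv("FIRECRAWL_KEY")
firecrawl_base_url = os.getenv("FIRECRAWL_BASE_URL")     # optional, for self-hosted Firecrawl
tavily_key = os.getenv("TAVILY_API_KEY")                 # now commented out in the example
```
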
.gitignore

Lines changed: 2 additions & 0 deletions
```diff
@@ -1,3 +1,5 @@
+output.md
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
```

README.md

Lines changed: 19 additions & 4 deletions
````diff
@@ -1,23 +1,38 @@
 # open-deepsearch
+
 open-deepsearch ( Deep Research but Open-Sourced )
 
 Q&A for more details, research, and report generation.
 
 ## How to install in DEV environment after git clone
+
 ```bash
 python3 -m venv .venv
 source .venv/bin/activate
-#modify .env file and put in OPENAI_KEY
+#modify .env file and put in OPENAI_KEY and FIRECRAWL_KEY
 cp .env.example .env
 pip install -r requirements.txt
 pip install -e .
 deepsearch
 ```
-⭐ A python port from node.js version
+
+## How to install in PROD environment after pip install open-deepsearch
+
+```bash
+python3 -m venv .venv
+source .venv/bin/activate
+#modify .env file and put in OPENAI_KEY and FIRECRAWL_KEY
+cp .env.example .env
+pip install open-deepsearch
+deepsearch
+```
+
+## ⭐ A python port from node.js version
+
 <https://github.com/dzhng/deep-research>
 
-## As for now (2025-02-21, v0.0.3), it only uses OpenAI to produce output.md
+## It only uses OpenAI_KEY and FIRECRAWL_KEY to produce output.md
 
 ## Future work
 
-Try out FIRECRAWL or TAVILY to craw recent web data
+Fix some minor bugs
````

open_deepsearch/deep_research.py

Lines changed: 22 additions & 15 deletions
```diff
@@ -3,14 +3,13 @@
 from typing import List, Dict, Optional, Any
 from PyPDF2 import PdfFileReader
 from io import BytesIO
-import requests
+from pydantic import BaseModel
 
 from open_deepsearch.feedback import generate_object
-from research_progress_results import ResearchProgress, ResearchResult
-from prompt import system_prompt
-from output_manager import OutputManager
-from pydantic import BaseModel
-from ai.providers import custom_model, trim_prompt, WebFirecrawlApp, WebCrawlerApp, TavilySearch, SearchResponse
+from .research_progress_results import ResearchProgress, ResearchResult
+from .prompt import system_prompt
+from .output_manager import OutputManager
+from .ai.providers import custom_model, trim_prompt, WebFirecrawlApp, WebCrawlerApp, TavilySearch, SearchResponse
 
 output = OutputManager()
 
@@ -63,20 +62,20 @@ async def process_serp_result(query: str, result: SearchResponse, num_learnings:
         'prompt': f"""Given the following contents from a SERP search for the query <query>{query}</query>, generate a list of learnings from the contents. Return a maximum of {num_learnings} learnings, but feel free to return less if the contents are clear. Make sure each learning is unique and not similar to each other. The learnings should be concise and to the point, as detailed and information dense as possible. Make sure to include any entities like people, places, companies, products, things, etc in the learnings, as well as any exact metrics, numbers, or dates. The learnings will be used to research the topic further.\n\n<contents>{''.join([f'<content>\n{content}\n</content>' for content in contents])}</contents>""",
         'schema': SerpResultSchema
     }, is_getting_queries=False)
-    log(f"Created {len(res['object']['learnings'])} learnings", res['object']['learnings'])
+    #log(f"Created {len(res['object']['learnings'])} learnings", res['object']['learnings'])
     return res['object']
 
 async def write_final_report(prompt: str, learnings: List[str], visited_urls: List[str]) -> str:
     learnings_string = trim_prompt(''.join([f'<learning>\n{learning}\n</learning>' for learning in learnings]), 150000)
     res = await generate_object({
         'model': custom_model,
         'system': system_prompt(),
-        'prompt': f"""Given the following prompt from the user, write a final report on the topic using the learnings from research. Make it as as detailed as possible, aim for 3 or more pages, include ALL the learnings from research:\n\n<prompt>{prompt}</prompt>\n\nHere are all the learnings from previous research:\n\n<learnings>\n{learnings_string}\n</learnings>""",
+        'prompt': f"""Please write a final report on the topic using the learnings from research. ALL the learnings from research is below : {learnings_string}""",
         'schema': BaseModel
-    },is_getting_queries=False)
+    },is_getting_queries=False, is_final_report=True)
 
     urls_section = f"\n\n## Sources\n\n{''.join([f'- <{url}>\n' for url in visited_urls])}"
-    return '\n'.join(res['object']['learnings']) + urls_section
+    return (res['object']['content']) + urls_section
 
 async def process_serp_query(serp_query: Dict[str, str], breadth: int, depth: int, learnings: List[str], visited_urls: List[str], progress: ResearchProgress, report_progress: callable) -> Dict[str, List[str]]:
     try:
@@ -144,15 +143,23 @@ async def deep_research(query: str, breadth: int, depth: int, learnings: Optiona
     learnings = learnings or []
     visited_urls = visited_urls or []
     progress = ResearchProgress(current_depth=depth, total_depth=depth, current_breadth=breadth, total_breadth=breadth, total_queries=0, completed_queries=0)
-
+
     def report_progress(update: Dict[str, Any]) -> None:
-        print("report_progress called with:", update) # Add this line
+        print("report_progress called with:", update, "type:", type(update)) # Add type information
         try:
-            tmp = update.items()
+            if not isinstance(update, dict):
+                print(f"Warning: update is not a dictionary, it is {type(update)}")
+                return
+
             for key, value in update.items():
+                if not hasattr(progress, key):
+                    print(f"Warning: progress object has no attribute '{key}'")
+                    continue
                 setattr(progress, key, value)
-        except AttributeError as e:
-            print(f"Attribute ERROR: {e}")
+        except Exception as e:
+            print(f"Error in report_progress: {str(e)}")
+            print(f"Update object: {update}")
+            print(f"Progress object attributes: {dir(progress)}")
 
     if on_progress:
         on_progress(progress)
```

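The report_progress rewrite above replaces a bare attribute update with a defensive one: non-dict updates are rejected and unknown keys are skipped instead of raising. A standalone sketch of the same pattern, where a simple `Progress` dataclass stands in for ResearchProgress and only the update logic mirrors the diff:

```python
# Standalone illustration of the defensive progress-update pattern above.
# `Progress` is a stand-in for ResearchProgress; names are illustrative.
from dataclasses import dataclass
from typing import Any


@dataclass
class Progress:
    current_depth: int = 0
    completed_queries: int = 0


def apply_update(progress: Progress, update: Any) -> None:
    if not isinstance(update, dict):
        print(f"Warning: update is not a dictionary, it is {type(update)}")
        return
    for key, value in update.items():
        if not hasattr(progress, key):
            print(f"Warning: progress object has no attribute '{key}'")
            continue
        setattr(progress, key, value)


p = Progress()
apply_update(p, {"completed_queries": 3, "unknown_field": 1})  # warns on unknown_field, sets the known key
print(p)  # Progress(current_depth=0, completed_queries=3)
```
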
open_deepsearch/feedback.py

Lines changed: 14 additions & 5 deletions
```diff
@@ -1,5 +1,5 @@
-from ai.providers import custom_model
-from prompt import system_prompt
+from .ai.providers import custom_model
+from .prompt import system_prompt
 from pydantic import BaseModel
 from typing import List
 import re
@@ -18,7 +18,7 @@ async def generate_feedback(query: str, num_questions: int = 3) -> List[str]:
     })
     return user_feedback['object']['queries'][:num_questions]
 
-async def generate_object(params: Dict[str, Any], is_getting_queries: bool = True) -> Dict[str, Any]:
+async def generate_object(params: Dict[str, Any], is_getting_queries: bool = True, is_final_report: bool = False) -> Dict[str, Any]:
     response = openai.chat.completions.create(
         model=params['model'],
         messages=[
@@ -33,6 +33,9 @@ async def generate_object(params: Dict[str, Any], is_getting_queries: bool = Tru
     )
     content = response.choices[0].message.content.strip()
 
+    if is_final_report:
+        return {'object': {'content': content}}
+
     # Split the content by both '\n\n' and '\n \n'
     results = re.split(r'\s*\n', content)
     queries = []
@@ -47,6 +50,12 @@ async def generate_object(params: Dict[str, Any], is_getting_queries: bool = Tru
     if is_getting_queries:
         return {'object': {'queries': queries, 'researchGoal': research_goals}}
     else:
-        #followQuestions = generate_feedback(queries)
-        return {'object': {'learnings': queries, 'followUpQuestions': queries}}
+        if len(research_goals)==0:
+            if len(queries) == 0:
+                return {'object': {'learnings': results, 'followUpQuestions': []}}
+            else:
+                return {'object': {'learnings': queries, 'followUpQuestions': queries}}
+        else :
+            return {'object': {'learnings': research_goals, 'followUpQuestions': queries}}
+
 
```

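The behavioural change in generate_object is easiest to see as the shape of what it returns. A condensed sketch of the new branching (standalone, not the repo's code; `results`, `queries` and `research_goals` are assumed to come from the regex parsing earlier in the function):

```python
# Condensed sketch of generate_object's new result shaping in 0.1.0.
# `content` is the raw model output; the parsed lists are assumed inputs here.
from typing import Any, Dict, List


def shape_result(content: str, results: List[str], queries: List[str],
                 research_goals: List[str], is_getting_queries: bool,
                 is_final_report: bool) -> Dict[str, Any]:
    if is_final_report:
        # New in this commit: return the completion untouched so
        # write_final_report can use it directly as the report body.
        return {"object": {"content": content}}
    if is_getting_queries:
        return {"object": {"queries": queries, "researchGoal": research_goals}}
    if research_goals:
        return {"object": {"learnings": research_goals, "followUpQuestions": queries}}
    if queries:
        return {"object": {"learnings": queries, "followUpQuestions": queries}}
    # Nothing parsed: fall back to the raw split lines.
    return {"object": {"learnings": results, "followUpQuestions": []}}
```
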
open_deepsearch/output_manager.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,5 +1,5 @@
 from typing import Any
-from research_progress_results import ResearchProgress
+from .research_progress_results import ResearchProgress
 
 class OutputManager:
     def __init__(self):
```

open_deepsearch/run.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -1,8 +1,8 @@
 import asyncio
 from typing import Any
-from deep_research import deep_research, write_final_report
-from feedback import generate_feedback
-from output_manager import OutputManager
+from .deep_research import deep_research, write_final_report
+from .feedback import generate_feedback
+from .output_manager import OutputManager
 
 output = OutputManager()
 
```

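The common thread in deep_research.py, feedback.py, output_manager.py and run.py is the switch from bare imports to package-relative ones. A bare `from output_manager import ...` only resolves when the interpreter starts inside the source directory; once the package is installed and invoked through the `deepsearch` console script (open_deepsearch.run:main, see setup.py below), sibling modules are only reachable as open_deepsearch.<name> or via the dotted relative form. A small, purely illustrative check of that difference, assuming open-deepsearch is already installed:

```python
# Illustrative check of why the dotted imports are needed (assumes the
# open-deepsearch package is installed in the current environment).
import importlib

# Absolute package-qualified import: works anywhere once installed.
mod = importlib.import_module("open_deepsearch.output_manager")
print(mod.OutputManager)

# The pre-0.1.0 bare import style: fails outside the source tree.
try:
    importlib.import_module("output_manager")
except ModuleNotFoundError as exc:
    print("bare import fails outside the source tree:", exc)
```
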
pyproject.toml

Lines changed: 16 additions & 2 deletions
```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "open-deepsearch"
-version = "0.0.3"
+version = "0.1.0"
 description = "Deep Research but Open-Sourced, called open-deepsearch"
 readme = "README.md"
 authors = [{ name = "Jason Chuang", email = "chuangtcee@gmail.com" }]
@@ -18,7 +18,21 @@ classifiers = [
     "Operating System :: OS Independent",
 ]
 requires-python = ">=3.9"
-dependencies = []
+dependencies = [
+    "python-dotenv>=1.0.1",
+    "openai==1.63.2",
+    "aiohttp>=3.9.0",
+    "aiofiles>=23.2.1",
+    "tiktoken>=0.5.0",
+    "firecrawl-py>=1.12.0",
+    "tavily-python==0.5.1",
+    "Crawl4AI==0.4.248",
+    "typer>=0.9.0",
+    "prompt-toolkit>=3.0.0",
+    "pydantic==2.10.6",
+    "pypdf2==3.0.1",
+    "html2text==2024.2.26"
+]
 
 [tool.setuptools.packages]
 find = {}
```

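With the dependency list now declared (several pins are exact, e.g. openai==1.63.2 and pydantic==2.10.6), a quick post-install sanity check could look like the following; it is illustrative and assumes `pip install open-deepsearch==0.1.0` has already been run:

```python
# Illustrative post-install check that the pinned dependencies resolved.
from importlib.metadata import version

for dist in ("open-deepsearch", "openai", "pydantic", "tavily-python", "firecrawl-py"):
    print(dist, version(dist))
```
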
setup.py

Lines changed: 16 additions & 1 deletion
```diff
@@ -5,7 +5,7 @@
 
 setup(
     name='open-deepsearch',
-    version='0.0.3',
+    version='0.1.0',
     author='Jason Chuang',
     author_email='chuangtcee@gmail.com',
     description='Deep Research but Open-Sourced, called open-deepsearch',
@@ -28,6 +28,21 @@
             'deepsearch = open_deepsearch.run:main',
         ],
     },
+    install_requires=[
+        "python-dotenv>=1.0.1",
+        "openai==1.63.2",
+        "aiohttp>=3.9.0",
+        "aiofiles>=23.2.1",
+        "tiktoken>=0.5.0",
+        "firecrawl-py>=1.12.0",
+        "tavily-python==0.5.1",
+        "Crawl4AI==0.4.248",
+        "typer>=0.9.0",
+        "prompt-toolkit>=3.0.0",
+        "pydantic==2.10.6",
+        "pypdf2==3.0.1",
+        "html2text==2024.2.26"
+    ],
     # This line enables editable installs
     # With 'pip install -e .' equivalent
     # to install your package in editable mode
```
