Commit 18d8a7f

Commit message: It is working now. v0.1.0
1 parent 62511a8 commit 18d8a7f

9 files changed: 95 additions, 33 deletions

9 files changed

+95
-33
lines changed

.env.example

Lines changed: 2 additions & 2 deletions
```diff
@@ -2,9 +2,9 @@ FIRECRAWL_KEY="YOUR_KEY"
 # If you want to use your self-hosted Firecrawl, add the following below:
 # FIRECRAWL_BASE_URL="http://localhost:3002"
 
-TAVILY_API_KEY="TAVILY_API_KEY"
+#TAVILY_API_KEY="TAVILY_API_KEY"
 
-OPENAI_MODEL="gpt-4o-mini"
+OPENAI_MODEL="gpt-4o"
 OPENAI_KEY="YOUR_KEY"
 CONTEXT_SIZE="128000"
 # If you want to use other OpenAI compatible API, add the following below:
```

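For context, a minimal sketch of how these variables could be read at startup. python-dotenv is added to the dependency list further down in this commit; the variable names come from .env.example, while the loading code itself is illustrative rather than the repo's actual configuration path.

```python
# Illustrative only: load the variables defined in .env.example.
# python-dotenv is in the new dependency list; whether open-deepsearch
# loads its config exactly this way is an assumption.
import os

from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory

openai_key = os.getenv("OPENAI_KEY")
openai_model = os.getenv("OPENAI_MODEL", "gpt-4o")       # default changed from gpt-4o-mini in this commit
context_size = int(os.getenv("CONTEXT_SIZE", "128000"))
firecrawl_key = os.getenv("FIRECRAWL_KEY")
firecrawl_base_url = os.getenv("FIRECRAWL_BASE_URL")     # optional, for self-hosted Firecrawl
tavily_key = os.getenv("TAVILY_API_KEY")                 # now commented out in the example
```
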
.gitignore

Lines changed: 2 additions & 0 deletions
```diff
@@ -1,3 +1,5 @@
+output.md
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
```

README.md

Lines changed: 19 additions & 4 deletions
````diff
@@ -1,23 +1,38 @@
 # open-deepsearch
+
 open-deepsearch ( Deep Research but Open-Sourced )
 
 Q&A for more details, research, and report generation.
 
 ## How to install in DEV environment after git clone
+
 ```bash
 python3 -m venv .venv
 source .venv/bin/activate
-#modify .env file and put in OPENAI_KEY
+#modify .env file and put in OPENAI_KEY and FIRECRAWL_KEY
 cp .env.example .env
 pip install -r requirements.txt
 pip install -e .
 deepsearch
 ```
-⭐ A python port from node.js version
+
+## How to install in PROD environment after pip install open-deepsearch
+
+```bash
+python3 -m venv .venv
+source .venv/bin/activate
+#modify .env file and put in OPENAI_KEY and FIRECRAWL_KEY
+cp .env.example .env
+pip install open-deepsearch
+deepsearch
+```
+
+## ⭐ A python port from node.js version
+
 <https://github.com/dzhng/deep-research>
 
-## As for now (2025-02-21, v0.0.3), it only uses OpenAI to produce output.md
+## It only uses OpenAI_KEY and FIRECRAWL_KEY to produce output.md
 
 ## Future work
 
-Try out FIRECRAWL or TAVILY to craw recent web data
+Fix some minor bugs
````

open_deepsearch/deep_research.py

Lines changed: 22 additions & 15 deletions
```diff
@@ -3,14 +3,13 @@
 from typing import List, Dict, Optional, Any
 from PyPDF2 import PdfFileReader
 from io import BytesIO
-import requests
+from pydantic import BaseModel
 
 from open_deepsearch.feedback import generate_object
-from research_progress_results import ResearchProgress, ResearchResult
-from prompt import system_prompt
-from output_manager import OutputManager
-from pydantic import BaseModel
-from ai.providers import custom_model, trim_prompt, WebFirecrawlApp, WebCrawlerApp, TavilySearch, SearchResponse
+from .research_progress_results import ResearchProgress, ResearchResult
+from .prompt import system_prompt
+from .output_manager import OutputManager
+from .ai.providers import custom_model, trim_prompt, WebFirecrawlApp, WebCrawlerApp, TavilySearch, SearchResponse
 
 output = OutputManager()
 
@@ -63,20 +62,20 @@ async def process_serp_result(query: str, result: SearchResponse, num_learnings:
         'prompt': f"""Given the following contents from a SERP search for the query <query>{query}</query>, generate a list of learnings from the contents. Return a maximum of {num_learnings} learnings, but feel free to return less if the contents are clear. Make sure each learning is unique and not similar to each other. The learnings should be concise and to the point, as detailed and information dense as possible. Make sure to include any entities like people, places, companies, products, things, etc in the learnings, as well as any exact metrics, numbers, or dates. The learnings will be used to research the topic further.\n\n<contents>{''.join([f'<content>\n{content}\n</content>' for content in contents])}</contents>""",
         'schema': SerpResultSchema
     }, is_getting_queries=False)
-    log(f"Created {len(res['object']['learnings'])} learnings", res['object']['learnings'])
+    #log(f"Created {len(res['object']['learnings'])} learnings", res['object']['learnings'])
     return res['object']
 
 async def write_final_report(prompt: str, learnings: List[str], visited_urls: List[str]) -> str:
     learnings_string = trim_prompt(''.join([f'<learning>\n{learning}\n</learning>' for learning in learnings]), 150000)
     res = await generate_object({
         'model': custom_model,
         'system': system_prompt(),
-        'prompt': f"""Given the following prompt from the user, write a final report on the topic using the learnings from research. Make it as as detailed as possible, aim for 3 or more pages, include ALL the learnings from research:\n\n<prompt>{prompt}</prompt>\n\nHere are all the learnings from previous research:\n\n<learnings>\n{learnings_string}\n</learnings>""",
+        'prompt': f"""Please write a final report on the topic using the learnings from research. ALL the learnings from research is below : {learnings_string}""",
         'schema': BaseModel
-    },is_getting_queries=False)
+    },is_getting_queries=False, is_final_report=True)
 
     urls_section = f"\n\n## Sources\n\n{''.join([f'- <{url}>\n' for url in visited_urls])}"
-    return '\n'.join(res['object']['learnings']) + urls_section
+    return (res['object']['content']) + urls_section
 
 async def process_serp_query(serp_query: Dict[str, str], breadth: int, depth: int, learnings: List[str], visited_urls: List[str], progress: ResearchProgress, report_progress: callable) -> Dict[str, List[str]]:
     try:
@@ -144,15 +143,23 @@ async def deep_research(query: str, breadth: int, depth: int, learnings: Optiona
     learnings = learnings or []
     visited_urls = visited_urls or []
     progress = ResearchProgress(current_depth=depth, total_depth=depth, current_breadth=breadth, total_breadth=breadth, total_queries=0, completed_queries=0)
-
+
     def report_progress(update: Dict[str, Any]) -> None:
-        print("report_progress called with:", update) # Add this line
+        print("report_progress called with:", update, "type:", type(update)) # Add type information
         try:
-            tmp = update.items()
+            if not isinstance(update, dict):
+                print(f"Warning: update is not a dictionary, it is {type(update)}")
+                return
+
             for key, value in update.items():
+                if not hasattr(progress, key):
+                    print(f"Warning: progress object has no attribute '{key}'")
+                    continue
                 setattr(progress, key, value)
-        except AttributeError as e:
-            print(f"Attribute ERROR: {e}")
+        except Exception as e:
+            print(f"Error in report_progress: {str(e)}")
+            print(f"Update object: {update}")
+            print(f"Progress object attributes: {dir(progress)}")
 
     if on_progress:
         on_progress(progress)
```

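The report_progress rewrite above replaces a bare attribute update with a defensive one: non-dict updates are rejected and unknown keys are skipped instead of raising. A standalone sketch of the same pattern, where a simple `Progress` dataclass stands in for ResearchProgress and only the update logic mirrors the diff:

```python
# Standalone illustration of the defensive progress-update pattern above.
# `Progress` is a stand-in for ResearchProgress; names are illustrative.
from dataclasses import dataclass
from typing import Any


@dataclass
class Progress:
    current_depth: int = 0
    completed_queries: int = 0


def apply_update(progress: Progress, update: Any) -> None:
    if not isinstance(update, dict):
        print(f"Warning: update is not a dictionary, it is {type(update)}")
        return
    for key, value in update.items():
        if not hasattr(progress, key):
            print(f"Warning: progress object has no attribute '{key}'")
            continue
        setattr(progress, key, value)


p = Progress()
apply_update(p, {"completed_queries": 3, "unknown_field": 1})  # warns on unknown_field, sets the known key
print(p)  # Progress(current_depth=0, completed_queries=3)
```
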
open_deepsearch/feedback.py

Lines changed: 14 additions & 5 deletions
```diff
@@ -1,5 +1,5 @@
-from ai.providers import custom_model
-from prompt import system_prompt
+from .ai.providers import custom_model
+from .prompt import system_prompt
 from pydantic import BaseModel
 from typing import List
 import re
@@ -18,7 +18,7 @@ async def generate_feedback(query: str, num_questions: int = 3) -> List[str]:
     })
     return user_feedback['object']['queries'][:num_questions]
 
-async def generate_object(params: Dict[str, Any], is_getting_queries: bool = True) -> Dict[str, Any]:
+async def generate_object(params: Dict[str, Any], is_getting_queries: bool = True, is_final_report: bool = False) -> Dict[str, Any]:
     response = openai.chat.completions.create(
         model=params['model'],
         messages=[
@@ -33,6 +33,9 @@ async def generate_object(params: Dict[str, Any], is_getting_queries: bool = Tru
     )
     content = response.choices[0].message.content.strip()
 
+    if is_final_report:
+        return {'object': {'content': content}}
+
     # Split the content by both '\n\n' and '\n \n'
     results = re.split(r'\s*\n', content)
     queries = []
@@ -47,6 +50,12 @@ async def generate_object(params: Dict[str, Any], is_getting_queries: bool = Tru
     if is_getting_queries:
         return {'object': {'queries': queries, 'researchGoal': research_goals}}
     else:
-        #followQuestions = generate_feedback(queries)
-        return {'object': {'learnings': queries, 'followUpQuestions': queries}}
+        if len(research_goals)==0:
+            if len(queries) == 0:
+                return {'object': {'learnings': results, 'followUpQuestions': []}}
+            else:
+                return {'object': {'learnings': queries, 'followUpQuestions': queries}}
+        else :
+            return {'object': {'learnings': research_goals, 'followUpQuestions': queries}}
+
 
```

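The behavioural change in generate_object is easiest to see as the shape of what it returns. A condensed sketch of the new branching (standalone, not the repo's code; `results`, `queries` and `research_goals` are assumed to come from the regex parsing earlier in the function):

```python
# Condensed sketch of generate_object's new result shaping in 0.1.0.
# `content` is the raw model output; the parsed lists are assumed inputs here.
from typing import Any, Dict, List


def shape_result(content: str, results: List[str], queries: List[str],
                 research_goals: List[str], is_getting_queries: bool,
                 is_final_report: bool) -> Dict[str, Any]:
    if is_final_report:
        # New in this commit: return the completion untouched so
        # write_final_report can use it directly as the report body.
        return {"object": {"content": content}}
    if is_getting_queries:
        return {"object": {"queries": queries, "researchGoal": research_goals}}
    if research_goals:
        return {"object": {"learnings": research_goals, "followUpQuestions": queries}}
    if queries:
        return {"object": {"learnings": queries, "followUpQuestions": queries}}
    # Nothing parsed: fall back to the raw split lines.
    return {"object": {"learnings": results, "followUpQuestions": []}}
```
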
open_deepsearch/output_manager.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,5 +1,5 @@
 from typing import Any
-from research_progress_results import ResearchProgress
+from .research_progress_results import ResearchProgress
 
 class OutputManager:
     def __init__(self):
```

open_deepsearch/run.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -1,8 +1,8 @@
 import asyncio
 from typing import Any
-from deep_research import deep_research, write_final_report
-from feedback import generate_feedback
-from output_manager import OutputManager
+from .deep_research import deep_research, write_final_report
+from .feedback import generate_feedback
+from .output_manager import OutputManager
 
 output = OutputManager()
 
```

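The common thread in deep_research.py, feedback.py, output_manager.py and run.py is the switch from bare imports to package-relative ones. A bare `from output_manager import ...` only resolves when the interpreter starts inside the source directory; once the package is installed and invoked through the `deepsearch` console script (open_deepsearch.run:main, see setup.py below), sibling modules are only reachable as open_deepsearch.<name> or via the dotted relative form. A small, purely illustrative check of that difference, assuming open-deepsearch is already installed:

```python
# Illustrative check of why the dotted imports are needed (assumes the
# open-deepsearch package is installed in the current environment).
import importlib

# Absolute package-qualified import: works anywhere once installed.
mod = importlib.import_module("open_deepsearch.output_manager")
print(mod.OutputManager)

# The pre-0.1.0 bare import style: fails outside the source tree.
try:
    importlib.import_module("output_manager")
except ModuleNotFoundError as exc:
    print("bare import fails outside the source tree:", exc)
```
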
pyproject.toml

Lines changed: 16 additions & 2 deletions
```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "open-deepsearch"
-version = "0.0.3"
+version = "0.1.0"
 description = "Deep Research but Open-Sourced, called open-deepsearch"
 readme = "README.md"
 authors = [{ name = "Jason Chuang", email = "chuangtcee@gmail.com" }]
@@ -18,7 +18,21 @@ classifiers = [
     "Operating System :: OS Independent",
 ]
 requires-python = ">=3.9"
-dependencies = []
+dependencies = [
+    "python-dotenv>=1.0.1",
+    "openai==1.63.2",
+    "aiohttp>=3.9.0",
+    "aiofiles>=23.2.1",
+    "tiktoken>=0.5.0",
+    "firecrawl-py>=1.12.0",
+    "tavily-python==0.5.1",
+    "Crawl4AI==0.4.248",
+    "typer>=0.9.0",
+    "prompt-toolkit>=3.0.0",
+    "pydantic==2.10.6",
+    "pypdf2==3.0.1",
+    "html2text==2024.2.26"
+]
 
 [tool.setuptools.packages]
 find = {}
```

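With the dependency list now declared (several pins are exact, e.g. openai==1.63.2 and pydantic==2.10.6), a quick post-install sanity check could look like the following; it is illustrative and assumes `pip install open-deepsearch==0.1.0` has already been run:

```python
# Illustrative post-install check that the pinned dependencies resolved.
from importlib.metadata import version

for dist in ("open-deepsearch", "openai", "pydantic", "tavily-python", "firecrawl-py"):
    print(dist, version(dist))
```
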
setup.py

Lines changed: 16 additions & 1 deletion
```diff
@@ -5,7 +5,7 @@
 
 setup(
     name='open-deepsearch',
-    version='0.0.3',
+    version='0.1.0',
     author='Jason Chuang',
     author_email='chuangtcee@gmail.com',
     description='Deep Research but Open-Sourced, called open-deepsearch',
@@ -28,6 +28,21 @@
             'deepsearch = open_deepsearch.run:main',
         ],
     },
+    install_requires=[
+        "python-dotenv>=1.0.1",
+        "openai==1.63.2",
+        "aiohttp>=3.9.0",
+        "aiofiles>=23.2.1",
+        "tiktoken>=0.5.0",
+        "firecrawl-py>=1.12.0",
+        "tavily-python==0.5.1",
+        "Crawl4AI==0.4.248",
+        "typer>=0.9.0",
+        "prompt-toolkit>=3.0.0",
+        "pydantic==2.10.6",
+        "pypdf2==3.0.1",
+        "html2text==2024.2.26"
+    ],
     # This line enables editable installs
     # With 'pip install -e .' equivalent
     # to install your package in editable mode
```
