50 changes: 36 additions & 14 deletions gpt_researcher/actions/retriever.py
@@ -79,21 +79,43 @@ def get_retrievers(headers: dict[str, str], cfg: Config):
Returns:
list: A list of retriever classes to be used for searching.
"""
# Check headers first for multiple retrievers
if headers.get("retrievers"):
retrievers = headers.get("retrievers").split(",")
# If not found, check headers for a single retriever
elif headers.get("retriever"):
retrievers = [headers.get("retriever")]
# If not in headers, check config for multiple retrievers
elif cfg.retrievers:
retrievers = cfg.retrievers
# If not found, check config for a single retriever
elif cfg.retriever:
retrievers = [cfg.retriever]
# If still not set, use default retriever
retrievers = [] # Initialize an empty list

# Check if focus_academic_medical_sources is True in cfg
if getattr(cfg, 'focus_academic_medical_sources', False):
retrievers = ["semantic_scholar", "pubmed_central", "arxiv"]
else:
retrievers = [get_default_retriever().__name__]
# Fallback to existing logic if focus_academic_medical_sources is False or not set
# Check headers first for multiple retrievers
if headers.get("retrievers"):
retrievers = headers.get("retrievers").split(",")
# If not found, check headers for a single retriever
elif headers.get("retriever"):
retrievers = [headers.get("retriever")]
# If not in headers, check config for multiple retrievers
elif cfg.retrievers:
retrievers = cfg.retrievers
# If not found, check config for a single retriever
elif cfg.retriever:
retrievers = [cfg.retriever]
# If still not set, use default retriever
else:
# The comprehension below expects retriever names (strings), so fall
# back to the default retriever's name, "tavily", rather than the class.
retrievers = ["tavily"]

# Convert retriever names to actual retriever classes
# Use get_default_retriever() as a fallback for any invalid retriever names
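For reviewers, a minimal sketch of the selection flow this hunk implements (assumptions: get_retriever(name) returns a retriever class or None, and get_default_retriever() returns the Tavily class, as elsewhere in this module):

def select_retriever_names(headers, cfg):
    # Academic/medical mode pins the three scholarly retrievers.
    if getattr(cfg, "focus_academic_medical_sources", False):
        return ["semantic_scholar", "pubmed_central", "arxiv"]
    # Otherwise keep the pre-existing precedence: headers, then config.
    if headers.get("retrievers"):
        return headers["retrievers"].split(",")
    if headers.get("retriever"):
        return [headers["retriever"]]
    if cfg.retrievers:
        return cfg.retrievers
    if cfg.retriever:
        return [cfg.retriever]
    return ["tavily"]  # name of the default retriever

def select_retrievers(headers, cfg):
    # Unknown names fall back to the default retriever class.
    return [get_retriever(name) or get_default_retriever()
            for name in select_retriever_names(headers, cfg)]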
1 change: 1 addition & 0 deletions gpt_researcher/config/variables/base.py
@@ -21,6 +21,7 @@ class BaseConfig(TypedDict):
TOTAL_WORDS: int
REPORT_FORMAT: str
CURATE_SOURCES: bool
FOCUS_ACADEMIC_MEDICAL_SOURCES: bool
MAX_ITERATIONS: int
LANGUAGE: str
AGENT_ROLE: Union[str, None]
1 change: 1 addition & 0 deletions gpt_researcher/config/variables/default.py
@@ -12,6 +12,7 @@
"STRATEGIC_TOKEN_LIMIT": 4000,
"BROWSE_CHUNK_MAX_LENGTH": 8192,
"CURATE_SOURCES": False,
"FOCUS_ACADEMIC_MEDICAL_SOURCES": False,
"SUMMARY_TOKEN_LIMIT": 700,
"TEMPERATURE": 0.4,
"USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0",
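A hedged usage sketch for the new flag, assuming Config resolves keys case-insensitively and honors environment-variable overrides the way it does for the other defaults above:

import os

# Enable academic/medical source focus before the config is constructed.
# (Assumes Config parses the "true" string into a boolean.)
os.environ["FOCUS_ACADEMIC_MEDICAL_SOURCES"] = "true"

from gpt_researcher.config import Config

cfg = Config()
# Expected to print True if the override is picked up.
print(getattr(cfg, "focus_academic_medical_sources", False))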
75 changes: 46 additions & 29 deletions gpt_researcher/prompts.py
@@ -106,41 +106,59 @@ def generate_report_prompt(
"""

def curate_sources(query, sources, max_results=10):
return f"""Your goal is to evaluate and curate the provided scraped content for the research task: "{query}"
while prioritizing the inclusion of relevant and high-quality information, especially sources containing statistics, numbers, or concrete data.

The final curated list will be used as context for creating a research report, so prioritize:
- Retaining as much original information as possible, with extra emphasis on sources featuring quantitative data or unique insights
- Including a wide range of perspectives and insights
- Filtering out only clearly irrelevant or unusable content

EVALUATION GUIDELINES:
1. Assess each source based on:
- Relevance: Include sources directly or partially connected to the research query. Err on the side of inclusion.
- Credibility: Favor authoritative sources but retain others unless clearly untrustworthy.
- Currency: Prefer recent information unless older data is essential or valuable.
- Objectivity: Retain sources with bias if they provide a unique or complementary perspective.
- Quantitative Value: Give higher priority to sources with statistics, numbers, or other concrete data.
2. Source Selection:
- Include as many relevant sources as possible, up to {max_results}, focusing on broad coverage and diversity.
- Prioritize sources with statistics, numerical data, or verifiable facts.
- Overlapping content is acceptable if it adds depth, especially when data is involved.
- Exclude sources only if they are entirely irrelevant, severely outdated, or unusable due to poor content quality.
3. Content Retention:
- DO NOT rewrite, summarize, or condense any source content.
- Retain all usable information, cleaning up only clear garbage or formatting issues.
- Keep marginally relevant or incomplete sources if they contain valuable data or insights.
return f"""Your goal is to evaluate and curate the provided list of sources for the research task: "{query}".
This research task focuses on academic sources and validated medical literature. You must prioritize sources that align with this focus.

Each source in the list is a JSON object that may contain fields such as:
- `title`: Original title from the retriever.
- `scraped_title`: Title found during web scraping.
- `href`: URL of the source.
- `body`: Original abstract or summary from the retriever.
- `raw_content`: Full text content scraped from the web page.
- `retriever_name`: Identifier for the search service that found this source (e.g., "semantic_scholar", "pubmed_central", "arxiv", "google").
- `citation_count`: Number of citations (primarily from "semantic_scholar").
- `venue`: Publication venue (e.g., journal name from "semantic_scholar").
- `year`: Publication year.
- `journal_title`: Journal title (primarily from "pubmed_central").

EVALUATION AND CURATION GUIDELINES FOR ACADEMIC/MEDICAL RESEARCH:

1. **Prioritize by Retriever Source:**
* Strongly prefer sources where `retriever_name` is "semantic_scholar", "pubmed_central", or "arxiv". These are primary academic/medical databases.
* Treat sources from general web retrievers (e.g., "google", "bing", "duckduckgo") with caution. Only include them if their content is exceptionally relevant, clearly academic or medical in nature (e.g., a direct link to a university research paper or a well-known medical institution's publication), and supports the primary academic sources.

2. **Assess Academic Validity and Quality:**
* **For "semantic_scholar" sources:** Give very high priority to sources with a high `citation_count`. Also, consider the `venue` and `year`; more recent, highly cited articles in reputable venues are generally better.
* **For "pubmed_central" sources:** These are generally strong. Consider the `journal_title` for potential journal quality if available.
* **For "arxiv" sources:** These are pre-prints. They can be valuable for cutting-edge research but may not be peer-reviewed. Include if highly relevant.
* Evaluate the `raw_content` (or `body` if `raw_content` is minimal) for scientific rigor, appropriate methodology (if applicable), and clarity.
* Filter out sources that are clearly non-academic (e.g., news articles unless reporting on research, blog posts, forum discussions, commercial product pages) unless they provide specific, verifiable data or context directly supporting an academic point.

3. **Relevance to Query:**
* The source must be highly relevant to the research task: "{query}".

4. **Content Richness & Uniqueness:**
* Prefer sources with substantial `raw_content` that provides in-depth information.
* Among relevant and high-quality sources, aim for a diversity of information and perspectives.

5. **Source Selection and Filtering:**
* Include up to {max_results} of the best sources based on the above criteria.
* Be prepared to filter aggressively if many sources are from general web retrievers or are of low academic quality. The goal is a curated list of high-quality academic/medical literature.
* If a source has both `body` (abstract) and `raw_content` (full text), your evaluation should primarily be based on the `raw_content` as it's more comprehensive.

6. **Content Retention (Important for Output Format):**
* DO NOT rewrite, summarize, or condense any part of the source objects.
* Your task is to SELECT and REORDER the sources from the provided list.

SOURCES LIST TO EVALUATE:
{sources}

You MUST return your response in the EXACT sources JSON list format as the original sources.
The response MUST not contain any markdown format or additional text (like ```json), just the JSON list!
You MUST return your response as a JSON list of source objects, in the EXACT SAME FORMAT as the input sources list.
The returned list must contain only the sources you have selected and prioritized, up to {max_results}.
The response MUST NOT contain any markdown formatting or additional text (such as ```json); return just the JSON list itself!
"""




def generate_resource_report_prompt(
question, context, report_source: str, report_format="apa", tone=None, total_words=1000, language="english"
):
@@ -529,4 +547,3 @@ def get_prompt_by_report_type(report_type):
)
prompt_by_type = report_type_mapping.get(default_report_type)
return prompt_by_type

1 change: 1 addition & 0 deletions gpt_researcher/retrievers/arxiv/arxiv.py
@@ -35,6 +35,7 @@ def search(self, max_results=5):
"title": result.title,
"href": result.pdf_url,
"body": result.summary,
"retriever_name": "arxiv",
})

return search_result
1 change: 1 addition & 0 deletions gpt_researcher/retrievers/bing/bing.py
@@ -89,6 +89,7 @@ def search(self, max_results=7) -> list[dict[str]]:
"title": result["name"],
"href": result["url"],
"body": result["snippet"],
"retriever_name": "bing",
}
search_results.append(search_result)

9 changes: 8 additions & 1 deletion gpt_researcher/retrievers/custom/custom.py
@@ -46,7 +46,14 @@ def search(self, max_results: int = 5) -> Optional[List[Dict[str, Any]]]:
try:
response = requests.get(self.endpoint, params={**self.params, 'query': self.query})
response.raise_for_status()
return response.json()
results = response.json()
if results:
for result in results:
result["retriever_name"] = "custom"
return results
except requests.RequestException as e:
print(f"Failed to retrieve search results: {e}")
return None
except json.JSONDecodeError as e:
print(f"Failed to parse JSON response: {e}")
return None
6 changes: 5 additions & 1 deletion gpt_researcher/retrievers/duckduckgo/duckduckgo.py
@@ -22,7 +22,11 @@ def search(self, max_results=5):
"""
# TODO: Add support for query domains
try:
search_response = self.ddg.text(self.query, region='wt-wt', max_results=max_results)
results = self.ddg.text(self.query, region='wt-wt', max_results=max_results)
if results:
for result in results:
result["retriever_name"] = "duckduckgo"
search_response = results if results else []
except Exception as e:
print(f"Error: {e}. Failed fetching sources. Resulting in empty response.")
search_response = []
3 changes: 2 additions & 1 deletion gpt_researcher/retrievers/exa/exa.py
@@ -59,7 +59,8 @@ def search(
)

search_response = [
{"href": result.url, "body": result.text} for result in results.results
{"href": result.url, "body": result.text, "retriever_name": "exa"}
for result in results.results
]
return search_response

3 changes: 2 additions & 1 deletion gpt_researcher/retrievers/google/google.py
@@ -92,8 +92,9 @@ def search(self, max_results=7):
"title": result["title"],
"href": result["link"],
"body": result["snippet"],
"retriever_name": "google",
}
except:
except KeyError:  # skip results that are missing an expected field
continue
search_results.append(search_result)

35 changes: 25 additions & 10 deletions gpt_researcher/retrievers/pubmed_central/pubmed_central.py
@@ -69,12 +69,15 @@ def search(self, max_results=10):
if self.has_body_content(xml_content):
article_data = self.parse_xml(xml_content)
if article_data:
search_response.append(
{
"href": f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{article_id}/",
"body": f"{article_data['title']}\n\n{article_data['abstract']}\n\n{article_data['body'][:500]}...",
}
)
response_item = {
"href": f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{article_id}/",
"body": f"{article_data.get('title', 'No Title')}\n\n{article_data.get('abstract', 'Abstract not available')}\n\n{article_data.get('body', '')[:500]}...",
"retriever_name": "pubmed_central"
}
if "journal_title" in article_data:
response_item["journal_title"] = article_data["journal_title"]

search_response.append(response_item)

if len(search_response) >= max_results:
break
@@ -151,12 +154,19 @@ def parse_xml(self, xml_content):
return None

title = article.findtext(
".//title-group/article-title", default="", namespaces=ns
".//title-group/article-title", default="No Title", namespaces=ns
)

journal_meta = article.find(".//journal-meta", namespaces=ns)
journal_title = None
if journal_meta is not None:
journal_title_element = journal_meta.find(".//journal-title", namespaces=ns)
if journal_title_element is not None and journal_title_element.text:
journal_title = journal_title_element.text.strip()

abstract = article.find(".//abstract", namespaces=ns)
abstract_text = (
"".join(abstract.itertext()).strip() if abstract is not None else ""
"".join(abstract.itertext()).strip() if abstract is not None else "Abstract not available"
)

body = []
@@ -166,9 +176,14 @@ def parse_xml(self, xml_content):
if p.text:
body.append(p.text.strip())
else:
# Fallback if body directly under article is not found, check sections
for sec in article.findall(".//sec", namespaces=ns):
for p in sec.findall(".//p", namespaces=ns):
if p.text:
body.append(p.text.strip())

return {"title": title, "abstract": abstract_text, "body": "\n".join(body)}

parsed_data = {"title": title, "abstract": abstract_text, "body": "\n".join(body)}
if journal_title:
parsed_data["journal_title"] = journal_title

return parsed_data
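As a sanity check on the new journal-title lookup, a runnable sketch of the JATS-style XML shape parse_xml walks (namespaces omitted here; the real code passes the module's ns mapping, and the sample values are invented):

import xml.etree.ElementTree as ET

sample = """<article>
  <front>
    <journal-meta>
      <journal-title-group>
        <journal-title> The Lancet </journal-title>
      </journal-title-group>
    </journal-meta>
  </front>
</article>"""

article = ET.fromstring(sample)
journal_meta = article.find(".//journal-meta")
journal_title = None
if journal_meta is not None:
    element = journal_meta.find(".//journal-title")
    if element is not None and element.text:
        journal_title = element.text.strip()
print(journal_title)  # -> "The Lancet"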
1 change: 1 addition & 0 deletions gpt_researcher/retrievers/searchapi/searchapi.py
@@ -74,6 +74,7 @@ def search(self, max_results=7):
"title": result["title"],
"href": result["link"],
"body": result["snippet"],
"retriever_name": "searchapi",
}
search_response.append(search_result)
results_processed += 1
3 changes: 2 additions & 1 deletion gpt_researcher/retrievers/searx/searx.py
@@ -67,7 +67,8 @@ def search(self, max_results: int = 10) -> List[Dict[str, str]]:
for result in results.get('results', [])[:max_results]:
search_response.append({
"href": result.get('url', ''),
"body": result.get('content', '')
"body": result.get('content', ''),
"retriever_name": "searx"
})

return search_response
23 changes: 16 additions & 7 deletions gpt_researcher/retrievers/semantic_scholar/semantic_scholar.py
@@ -47,13 +47,22 @@ def search(self, max_results: int = 20) -> List[Dict[str, str]]:
search_result = []

for result in results:
href = None
if result.get("isOpenAccess") and result.get("openAccessPdf"):
search_result.append(
{
"title": result.get("title", "No Title"),
"href": result["openAccessPdf"].get("url", "No URL"),
"body": result.get("abstract", "Abstract not available"),
}
)
href = result["openAccessPdf"].get("url")
if not href:
href = result.get("url", "No URL")

search_result.append(
{
"title": result.get("title", "No Title"),
"href": href,
"body": result.get("abstract", "Abstract not available"),
"citation_count": result.get("citationCount"),
"venue": result.get("venue"),
"year": result.get("year"),
"retriever_name": "semantic_scholar",
}
)

return search_result
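For reference, a hedged sketch of one result dict this loop consumes (field names follow the accesses above; values are invented):

result = {
    "title": "Deep learning for ECG interpretation",
    "abstract": "We propose ...",
    "url": "https://www.semanticscholar.org/paper/abc123",
    "citationCount": 42,
    "venue": "Nature Medicine",
    "year": 2021,
    "isOpenAccess": True,
    "openAccessPdf": {"url": "https://example.org/open-access.pdf"},
}
# With isOpenAccess set, href resolves to the open-access PDF URL;
# otherwise it falls back to result["url"], then "No URL".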
1 change: 1 addition & 0 deletions gpt_researcher/retrievers/serpapi/serpapi.py
@@ -72,6 +72,7 @@ def search(self, max_results=7):
"title": result["title"],
"href": result["link"],
"body": result["snippet"],
"retriever_name": "serpapi",
}
search_response.append(search_result)
results_processed += 1
1 change: 1 addition & 0 deletions gpt_researcher/retrievers/serper/serper.py
@@ -78,6 +78,7 @@ def search(self, max_results=7):
"title": result["title"],
"href": result["link"],
"body": result["snippet"],
"retriever_name": "serper",
}
search_results.append(search_result)

2 changes: 1 addition & 1 deletion gpt_researcher/retrievers/tavily/tavily_search.py
@@ -113,7 +113,7 @@ def search(self, max_results=10):
raise Exception("No results found with Tavily API search.")
# Return the results
search_response = [
{"href": obj["url"], "body": obj["content"]} for obj in sources
{"href": obj["url"], "body": obj["content"], "retriever_name": "tavily"} for obj in sources
]
except Exception as e:
print(f"Error: {e}. Failed fetching sources. Resulting in empty response.")