50 changes: 36 additions & 14 deletions gpt_researcher/actions/retriever.py
@@ -79,21 +79,43 @@ def get_retrievers(headers: dict[str, str], cfg: Config):
Returns:
list: A list of retriever classes to be used for searching.
"""
# Check headers first for multiple retrievers
if headers.get("retrievers"):
retrievers = headers.get("retrievers").split(",")
# If not found, check headers for a single retriever
elif headers.get("retriever"):
retrievers = [headers.get("retriever")]
# If not in headers, check config for multiple retrievers
elif cfg.retrievers:
retrievers = cfg.retrievers
# If not found, check config for a single retriever
elif cfg.retriever:
retrievers = [cfg.retriever]
# If still not set, use default retriever
retrievers = [] # Initialize an empty list

# Check if focus_academic_medical_sources is True in cfg
if getattr(cfg, 'focus_academic_medical_sources', False):
retrievers = ["semantic_scholar", "pubmed_central", "arxiv"]
else:
retrievers = [get_default_retriever().__name__]
# Fallback to existing logic if focus_academic_medical_sources is False or not set
# Check headers first for multiple retrievers
if headers.get("retrievers"):
retrievers = headers.get("retrievers").split(",")
# If not found, check headers for a single retriever
elif headers.get("retriever"):
retrievers = [headers.get("retriever")]
# If not in headers, check config for multiple retrievers
elif cfg.retrievers:
retrievers = cfg.retrievers
# If not found, check config for a single retriever
elif cfg.retriever:
retrievers = [cfg.retriever]
# If still not set, use default retriever
else:
# The comprehension below expects retriever names (strings), so fall
# back to the default retriever's name, "tavily", rather than the class.
retrievers = ["tavily"]

# Convert retriever names to actual retriever classes
# Use get_default_retriever() as a fallback for any invalid retriever names
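For reviewers, a minimal sketch of the selection flow this hunk implements (assumptions: get_retriever(name) returns a retriever class or None, and get_default_retriever() returns the Tavily class, as elsewhere in this module):

def select_retriever_names(headers, cfg):
    # Academic/medical mode pins the three scholarly retrievers.
    if getattr(cfg, "focus_academic_medical_sources", False):
        return ["semantic_scholar", "pubmed_central", "arxiv"]
    # Otherwise keep the pre-existing precedence: headers, then config.
    if headers.get("retrievers"):
        return headers["retrievers"].split(",")
    if headers.get("retriever"):
        return [headers["retriever"]]
    if cfg.retrievers:
        return cfg.retrievers
    if cfg.retriever:
        return [cfg.retriever]
    return ["tavily"]  # name of the default retriever

def select_retrievers(headers, cfg):
    # Unknown names fall back to the default retriever class.
    return [get_retriever(name) or get_default_retriever()
            for name in select_retriever_names(headers, cfg)]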
1 change: 1 addition & 0 deletions gpt_researcher/config/variables/base.py
@@ -21,6 +21,7 @@ class BaseConfig(TypedDict):
TOTAL_WORDS: int
REPORT_FORMAT: str
CURATE_SOURCES: bool
FOCUS_ACADEMIC_MEDICAL_SOURCES: bool
MAX_ITERATIONS: int
LANGUAGE: str
AGENT_ROLE: Union[str, None]
1 change: 1 addition & 0 deletions gpt_researcher/config/variables/default.py
@@ -12,6 +12,7 @@
"STRATEGIC_TOKEN_LIMIT": 4000,
"BROWSE_CHUNK_MAX_LENGTH": 8192,
"CURATE_SOURCES": False,
"FOCUS_ACADEMIC_MEDICAL_SOURCES": False,
"SUMMARY_TOKEN_LIMIT": 700,
"TEMPERATURE": 0.4,
"USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0",
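A hedged usage sketch for the new flag, assuming Config resolves keys case-insensitively and honors environment-variable overrides the way it does for the other defaults above:

import os

# Enable academic/medical source focus before the config is constructed.
# (Assumes Config parses the "true" string into a boolean.)
os.environ["FOCUS_ACADEMIC_MEDICAL_SOURCES"] = "true"

from gpt_researcher.config import Config

cfg = Config()
# Expected to print True if the override is picked up.
print(getattr(cfg, "focus_academic_medical_sources", False))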
75 changes: 46 additions & 29 deletions gpt_researcher/prompts.py
@@ -106,41 +106,59 @@ def generate_report_prompt(
"""

def curate_sources(query, sources, max_results=10):
return f"""Your goal is to evaluate and curate the provided scraped content for the research task: "{query}"
while prioritizing the inclusion of relevant and high-quality information, especially sources containing statistics, numbers, or concrete data.

The final curated list will be used as context for creating a research report, so prioritize:
- Retaining as much original information as possible, with extra emphasis on sources featuring quantitative data or unique insights
- Including a wide range of perspectives and insights
- Filtering out only clearly irrelevant or unusable content

EVALUATION GUIDELINES:
1. Assess each source based on:
- Relevance: Include sources directly or partially connected to the research query. Err on the side of inclusion.
- Credibility: Favor authoritative sources but retain others unless clearly untrustworthy.
- Currency: Prefer recent information unless older data is essential or valuable.
- Objectivity: Retain sources with bias if they provide a unique or complementary perspective.
- Quantitative Value: Give higher priority to sources with statistics, numbers, or other concrete data.
2. Source Selection:
- Include as many relevant sources as possible, up to {max_results}, focusing on broad coverage and diversity.
- Prioritize sources with statistics, numerical data, or verifiable facts.
- Overlapping content is acceptable if it adds depth, especially when data is involved.
- Exclude sources only if they are entirely irrelevant, severely outdated, or unusable due to poor content quality.
3. Content Retention:
- DO NOT rewrite, summarize, or condense any source content.
- Retain all usable information, cleaning up only clear garbage or formatting issues.
- Keep marginally relevant or incomplete sources if they contain valuable data or insights.
return f"""Your goal is to evaluate and curate the provided list of sources for the research task: "{query}".
This research task focuses on academic sources and validated medical literature. You must prioritize sources that align with this focus.

Each source in the list is a JSON object that may contain fields such as:
- `title`: Original title from the retriever.
- `scraped_title`: Title found during web scraping.
- `href`: URL of the source.
- `body`: Original abstract or summary from the retriever.
- `raw_content`: Full text content scraped from the web page.
- `retriever_name`: Identifier for the search service that found this source (e.g., "semantic_scholar", "pubmed_central", "arxiv", "google").
- `citation_count`: Number of citations (primarily from "semantic_scholar").
- `venue`: Publication venue (e.g., journal name from "semantic_scholar").
- `year`: Publication year.
- `journal_title`: Journal title (primarily from "pubmed_central").

EVALUATION AND CURATION GUIDELINES FOR ACADEMIC/MEDICAL RESEARCH:

1. **Prioritize by Retriever Source:**
* Strongly prefer sources where `retriever_name` is "semantic_scholar", "pubmed_central", or "arxiv". These are primary academic/medical databases.
* Treat sources from general web retrievers (e.g., "google", "bing", "duckduckgo") with caution. Only include them if their content is exceptionally relevant, clearly academic or medical in nature (e.g., a direct link to a university research paper or a well-known medical institution's publication), and supports the primary academic sources.

2. **Assess Academic Validity and Quality:**
* **For "semantic_scholar" sources:** Give very high priority to sources with a high `citation_count`. Also, consider the `venue` and `year`; more recent, highly cited articles in reputable venues are generally better.
* **For "pubmed_central" sources:** These are generally strong. Consider the `journal_title` for potential journal quality if available.
* **For "arxiv" sources:** These are pre-prints. They can be valuable for cutting-edge research but may not be peer-reviewed. Include if highly relevant.
* Evaluate the `raw_content` (or `body` if `raw_content` is minimal) for scientific rigor, appropriate methodology (if applicable), and clarity.
* Filter out sources that are clearly non-academic (e.g., news articles unless reporting on research, blog posts, forum discussions, commercial product pages) unless they provide specific, verifiable data or context directly supporting an academic point.

3. **Relevance to Query:**
* The source must be highly relevant to the research task: "{query}".

4. **Content Richness & Uniqueness:**
* Prefer sources with substantial `raw_content` that provides in-depth information.
* Among relevant and high-quality sources, aim for a diversity of information and perspectives.

5. **Source Selection and Filtering:**
* Include up to {max_results} of the best sources based on the above criteria.
* Be prepared to filter aggressively if many sources are from general web retrievers or are of low academic quality. The goal is a curated list of high-quality academic/medical literature.
* If a source has both `body` (abstract) and `raw_content` (full text), your evaluation should primarily be based on the `raw_content` as it's more comprehensive.

6. **Content Retention (Important for Output Format):**
* DO NOT rewrite, summarize, or condense any part of the source objects.
* Your task is to SELECT and REORDER the sources from the provided list.

SOURCES LIST TO EVALUATE:
{sources}

You MUST return your response in the EXACT sources JSON list format as the original sources.
The response MUST not contain any markdown format or additional text (like ```json), just the JSON list!
You MUST return your response as a JSON list of source objects, in the EXACT SAME FORMAT as the input sources list.
The returned list must contain only the sources you have selected and prioritized, up to {max_results}.
The response MUST NOT contain any markdown formatting or additional text (such as ```json); return just the JSON list itself!
"""




def generate_resource_report_prompt(
question, context, report_source: str, report_format="apa", tone=None, total_words=1000, language="english"
):
@@ -529,4 +547,3 @@ def get_prompt_by_report_type(report_type):
)
prompt_by_type = report_type_mapping.get(default_report_type)
return prompt_by_type

1 change: 1 addition & 0 deletions gpt_researcher/retrievers/arxiv/arxiv.py
@@ -35,6 +35,7 @@ def search(self, max_results=5):
"title": result.title,
"href": result.pdf_url,
"body": result.summary,
"retriever_name": "arxiv",
})

return search_result
1 change: 1 addition & 0 deletions gpt_researcher/retrievers/bing/bing.py
@@ -89,6 +89,7 @@ def search(self, max_results=7) -> list[dict[str]]:
"title": result["name"],
"href": result["url"],
"body": result["snippet"],
"retriever_name": "bing",
}
search_results.append(search_result)

9 changes: 8 additions & 1 deletion gpt_researcher/retrievers/custom/custom.py
@@ -46,7 +46,14 @@ def search(self, max_results: int = 5) -> Optional[List[Dict[str, Any]]]:
try:
response = requests.get(self.endpoint, params={**self.params, 'query': self.query})
response.raise_for_status()
return response.json()
results = response.json()
if results:
for result in results:
result["retriever_name"] = "custom"
return results
except requests.RequestException as e:
print(f"Failed to retrieve search results: {e}")
return None
except json.JSONDecodeError as e:
print(f"Failed to parse JSON response: {e}")
return None
6 changes: 5 additions & 1 deletion gpt_researcher/retrievers/duckduckgo/duckduckgo.py
@@ -22,7 +22,11 @@ def search(self, max_results=5):
"""
# TODO: Add support for query domains
try:
search_response = self.ddg.text(self.query, region='wt-wt', max_results=max_results)
results = self.ddg.text(self.query, region='wt-wt', max_results=max_results)
if results:
for result in results:
result["retriever_name"] = "duckduckgo"
search_response = results if results else []
except Exception as e:
print(f"Error: {e}. Failed fetching sources. Resulting in empty response.")
search_response = []
3 changes: 2 additions & 1 deletion gpt_researcher/retrievers/exa/exa.py
@@ -59,7 +59,8 @@ def search(
)

search_response = [
{"href": result.url, "body": result.text} for result in results.results
{"href": result.url, "body": result.text, "retriever_name": "exa"}
for result in results.results
]
return search_response

3 changes: 2 additions & 1 deletion gpt_researcher/retrievers/google/google.py
@@ -92,8 +92,9 @@ def search(self, max_results=7):
"title": result["title"],
"href": result["link"],
"body": result["snippet"],
"retriever_name": "google",
}
except:
except KeyError:  # skip results that are missing an expected field
continue
search_results.append(search_result)

35 changes: 25 additions & 10 deletions gpt_researcher/retrievers/pubmed_central/pubmed_central.py
@@ -69,12 +69,15 @@ def search(self, max_results=10):
if self.has_body_content(xml_content):
article_data = self.parse_xml(xml_content)
if article_data:
search_response.append(
{
"href": f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{article_id}/",
"body": f"{article_data['title']}\n\n{article_data['abstract']}\n\n{article_data['body'][:500]}...",
}
)
response_item = {
"href": f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{article_id}/",
"body": f"{article_data.get('title', 'No Title')}\n\n{article_data.get('abstract', 'Abstract not available')}\n\n{article_data.get('body', '')[:500]}...",
"retriever_name": "pubmed_central"
}
if "journal_title" in article_data:
response_item["journal_title"] = article_data["journal_title"]

search_response.append(response_item)

if len(search_response) >= max_results:
break
@@ -151,12 +154,19 @@ def parse_xml(self, xml_content):
return None

title = article.findtext(
".//title-group/article-title", default="", namespaces=ns
".//title-group/article-title", default="No Title", namespaces=ns
)

journal_meta = article.find(".//journal-meta", namespaces=ns)
journal_title = None
if journal_meta is not None:
journal_title_element = journal_meta.find(".//journal-title", namespaces=ns)
if journal_title_element is not None and journal_title_element.text:
journal_title = journal_title_element.text.strip()

abstract = article.find(".//abstract", namespaces=ns)
abstract_text = (
"".join(abstract.itertext()).strip() if abstract is not None else ""
"".join(abstract.itertext()).strip() if abstract is not None else "Abstract not available"
)

body = []
@@ -166,9 +176,14 @@ def parse_xml(self, xml_content):
if p.text:
body.append(p.text.strip())
else:
# Fallback if body directly under article is not found, check sections
for sec in article.findall(".//sec", namespaces=ns):
for p in sec.findall(".//p", namespaces=ns):
if p.text:
body.append(p.text.strip())

return {"title": title, "abstract": abstract_text, "body": "\n".join(body)}

parsed_data = {"title": title, "abstract": abstract_text, "body": "\n".join(body)}
if journal_title:
parsed_data["journal_title"] = journal_title

return parsed_data
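As a sanity check on the new journal-title lookup, a runnable sketch of the JATS-style XML shape parse_xml walks (namespaces omitted here; the real code passes the module's ns mapping, and the sample values are invented):

import xml.etree.ElementTree as ET

sample = """<article>
  <front>
    <journal-meta>
      <journal-title-group>
        <journal-title> The Lancet </journal-title>
      </journal-title-group>
    </journal-meta>
  </front>
</article>"""

article = ET.fromstring(sample)
journal_meta = article.find(".//journal-meta")
journal_title = None
if journal_meta is not None:
    element = journal_meta.find(".//journal-title")
    if element is not None and element.text:
        journal_title = element.text.strip()
print(journal_title)  # -> "The Lancet"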
1 change: 1 addition & 0 deletions gpt_researcher/retrievers/searchapi/searchapi.py
@@ -74,6 +74,7 @@ def search(self, max_results=7):
"title": result["title"],
"href": result["link"],
"body": result["snippet"],
"retriever_name": "searchapi",
}
search_response.append(search_result)
results_processed += 1
3 changes: 2 additions & 1 deletion gpt_researcher/retrievers/searx/searx.py
@@ -67,7 +67,8 @@ def search(self, max_results: int = 10) -> List[Dict[str, str]]:
for result in results.get('results', [])[:max_results]:
search_response.append({
"href": result.get('url', ''),
"body": result.get('content', '')
"body": result.get('content', ''),
"retriever_name": "searx"
})

return search_response
23 changes: 16 additions & 7 deletions gpt_researcher/retrievers/semantic_scholar/semantic_scholar.py
@@ -47,13 +47,22 @@ def search(self, max_results: int = 20) -> List[Dict[str, str]]:
search_result = []

for result in results:
href = None
if result.get("isOpenAccess") and result.get("openAccessPdf"):
search_result.append(
{
"title": result.get("title", "No Title"),
"href": result["openAccessPdf"].get("url", "No URL"),
"body": result.get("abstract", "Abstract not available"),
}
)
href = result["openAccessPdf"].get("url")
if not href:
href = result.get("url", "No URL")

search_result.append(
{
"title": result.get("title", "No Title"),
"href": href,
"body": result.get("abstract", "Abstract not available"),
"citation_count": result.get("citationCount"),
"venue": result.get("venue"),
"year": result.get("year"),
"retriever_name": "semantic_scholar",
}
)

return search_result
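For reference, a hedged sketch of one result dict this loop consumes (field names follow the accesses above; values are invented):

result = {
    "title": "Deep learning for ECG interpretation",
    "abstract": "We propose ...",
    "url": "https://www.semanticscholar.org/paper/abc123",
    "citationCount": 42,
    "venue": "Nature Medicine",
    "year": 2021,
    "isOpenAccess": True,
    "openAccessPdf": {"url": "https://example.org/open-access.pdf"},
}
# With isOpenAccess set, href resolves to the open-access PDF URL;
# otherwise it falls back to result["url"], then "No URL".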
1 change: 1 addition & 0 deletions gpt_researcher/retrievers/serpapi/serpapi.py
@@ -72,6 +72,7 @@ def search(self, max_results=7):
"title": result["title"],
"href": result["link"],
"body": result["snippet"],
"retriever_name": "serpapi",
}
search_response.append(search_result)
results_processed += 1
1 change: 1 addition & 0 deletions gpt_researcher/retrievers/serper/serper.py
@@ -78,6 +78,7 @@ def search(self, max_results=7):
"title": result["title"],
"href": result["link"],
"body": result["snippet"],
"retriever_name": "serper",
}
search_results.append(search_result)

2 changes: 1 addition & 1 deletion gpt_researcher/retrievers/tavily/tavily_search.py
@@ -113,7 +113,7 @@ def search(self, max_results=10):
raise Exception("No results found with Tavily API search.")
# Return the results
search_response = [
{"href": obj["url"], "body": obj["content"]} for obj in sources
{"href": obj["url"], "body": obj["content"], "retriever_name": "tavily"} for obj in sources
]
except Exception as e:
print(f"Error: {e}. Failed fetching sources. Resulting in empty response.")