"""
research_web module
"""
import re
from typing import List

# NOTE: the next two imports are assumed (not shown in this diff); they
# provide the google_search and DuckDuckGoSearchResults names used below.
from langchain_community.tools import DuckDuckGoSearchResults
from googlesearch import search as google_search

import requests
from bs4 import BeautifulSoup


def search_on_web(query: str, search_engine: str = "Google",
                  max_results: int = 10, port: int = 8080,
                  timeout: int = 10) -> List[str]:
    """
    Searches the web for a given query using the specified search
    engine and filters out PDF links from the results.

    Args:
        query (str): The search query to find on the internet.
        search_engine (str, optional): The search engine to use; options are
            'Google', 'DuckDuckGo', 'Bing', and 'SearXNG'. Default is 'Google'.
        max_results (int, optional): The maximum number of search results to return. Default is 10.
        port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.
        timeout (int, optional): The number of seconds to wait for a response
            from a request. Default is 10.

    Returns:
        List[str]: A list of URLs as strings that are the search results, excluding any PDF links.

    Raises:
        ValueError: If the specified search engine is not supported.
        requests.exceptions.Timeout: If a request times out.
        requests.exceptions.HTTPError: If a request returns an unsuccessful status code.

    Example:
        >>> search_on_web("example query", search_engine="Google", max_results=5)
        ['http://example.com', 'http://example.org', ...]
    """

    def filter_pdf_links(links: List[str]) -> List[str]:
        """
        Filters out any links that point to PDF files.

        Args:
            links (List[str]): A list of URLs as strings.

        Returns:
            List[str]: A list of URLs excluding any that end with '.pdf'.
        """
        # Only the URL suffix is checked, so PDF links carrying query
        # strings (e.g. '.pdf?download=1') are not filtered.
        return [link for link in links if not link.lower().endswith('.pdf')]
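
    # Google: google_search is a generator of result URLs; stop=max_results
    # caps how many results are consumed.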
    if search_engine.lower() == "google":
        res = list(google_search(query, stop=max_results))
        return filter_pdf_links(res)
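
    # DuckDuckGo: LangChain's tool returns one formatted string, so the
    # URLs are pulled out with a regex.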
    elif search_engine.lower() == "duckduckgo":
        research = DuckDuckGoSearchResults(max_results=max_results)
        res = research.run(query)
        links = re.findall(r'https?://[^\s,\]]+', res)
        return filter_pdf_links(links)
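
    # Bing: no official API is used; the HTML results page is scraped, with
    # organic results sitting in <li class="b_algo"> elements.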
    elif search_engine.lower() == "bing":
        # Header values must not contain newlines, so keep the UA on one line.
        headers = {
            "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                           "AppleWebKit/537.36 (KHTML, like Gecko) "
                           "Chrome/91.0.4472.124 Safari/537.36")
        }
        search_url = "https://www.bing.com/search"
        # Pass the query via params so requests URL-encodes it.
        response = requests.get(search_url, params={"q": query},
                                headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        search_results = []
        for result in soup.find_all('li', class_='b_algo', limit=max_results):
            link = result.find('a')['href']
            search_results.append(link)
        return filter_pdf_links(search_results)
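
    # SearXNG: query a self-hosted metasearch instance over its JSON API.
    # Depending on the instance, the endpoint may be /search rather than
    # the bare root URL used here.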
    elif search_engine.lower() == "searxng":
        url = f"http://localhost:{port}"
        params = {"q": query, "format": "json",
                  "engines": "google,duckduckgo,brave,qwant,bing"}
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        limited_results = [result['url'] for result in data["results"][:max_results]]
        return filter_pdf_links(limited_results)

    else:
        raise ValueError("The only search engines available are "
                         "DuckDuckGo, Google, Bing, or SearXNG")
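

# Minimal usage sketch (illustrative addition, not part of the module);
# assumes network access for the chosen engine.
if __name__ == "__main__":
    for link in search_on_web("web scraping tutorial",
                              search_engine="DuckDuckGo", max_results=5):
        print(link)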