Skip to content

Commit ede68a4

Browse files
authored
Update find_urls.py
1 parent 1284c7c commit ede68a4

File tree

1 file changed

+20
-12
lines changed

1 file changed

+20
-12
lines changed

find_urls.py

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,13 @@
22
from bs4 import BeautifulSoup
33
import os
44

5-
# Sites to crawl: maps each start page to the URL prefix that marks an
# in-scope article link on that site.
# Add more sites here in the format: "URL": "URL_PREFIX"
TARGET_SITES = {
    "https://en.wikipedia.org/wiki/Main_Page": "https://en.wikipedia.org/wiki/",
    "https://example.com/blog/": "https://example.com/blog/",
}

# File where every discovered URL is persisted, one per line.
URLS_FILE = "urls.txt"
913

1014
def find_new_urls(target_url, url_prefix):
@@ -19,9 +23,10 @@ def find_new_urls(target_url, url_prefix):
1923
found_urls = set()
2024
for link in soup.find_all('a', href=True):
2125
href = link.get('href')
22-
# Check for the correct prefix and ignore special/system links
23-
if href and href.startswith(url_prefix) and ":" not in href and "Main_Page" not in href:
24-
full_url = "https://en.wikipedia.org" + href
26+
# Check for correct prefix and ignore special/system links
27+
if href and href.startswith(url_prefix):
28+
# Clean up relative URLs and add them to the set
29+
full_url = href if href.startswith('http') else url_prefix.rstrip('/') + href
2530
found_urls.add(full_url)
2631
return found_urls
2732
except requests.exceptions.RequestException as e:
@@ -35,7 +40,6 @@ def update_urls_file(new_urls):
3540
with open(URLS_FILE, "r") as f:
3641
existing_urls = {line.strip() for line in f.readlines()}
3742

38-
# Find the URLs that are new
3943
new_and_unique = new_urls - existing_urls
4044

4145
if new_and_unique:
@@ -50,11 +54,15 @@ def update_urls_file(new_urls):
5054

5155
def main():
    """Discover article URLs across all configured sites and record them.

    Iterates every (start page, prefix) pair in TARGET_SITES, pools the
    links each crawl finds, and appends the genuinely new ones to the
    URLs file.

    Returns:
        0 if the URLs file gained new entries (changes to commit),
        1 if nothing new was found.
    """
    pooled_urls = set()
    for site_url, prefix in TARGET_SITES.items():
        pooled_urls |= find_new_urls(site_url, prefix)

    # Only touch the file when at least one URL was discovered; a write
    # that adds nothing still means there is no change to commit.
    if pooled_urls and update_urls_file(pooled_urls):
        return 0  # Success, changes made
    return 1  # No changes, nothing to commit
5866

5967
if __name__ == "__main__":
6068
import sys

0 commit comments

Comments
 (0)