from bs4 import BeautifulSoup
import os
+from urllib.parse import urljoin

-# Configuration for the target website
-TARGET_URL = "https://en.wikipedia.org/wiki/Main_Page"  # Main page of Wikipedia
-URL_PREFIX = "/wiki/"  # The prefix for all valid Wikipedia article URLs
+# Configuration for multiple target websites
+TARGET_SITES = {
+    "https://en.wikipedia.org/wiki/Main_Page": "https://en.wikipedia.org/wiki/",
+    "https://example.com/blog/": "https://example.com/blog/",
+    # Add more sites here in the format: "URL": "URL_PREFIX"
+}
+
URLS_FILE = "urls.txt"

def find_new_urls(target_url, url_prefix):
@@ -19,9 +24,10 @@ def find_new_urls(target_url, url_prefix):
        found_urls = set()
        for link in soup.find_all('a', href=True):
            href = link.get('href')
-            # Check for the correct prefix and ignore special/system links
-            if href and href.startswith(url_prefix) and ":" not in href and "Main_Page" not in href:
-                full_url = "https://en.wikipedia.org" + href
+            # Resolve relative hrefs (e.g. "/wiki/Foo") to absolute URLs
+            full_url = urljoin(target_url, href)
+            # Keep only links that live under the configured prefix
+            if href and full_url.startswith(url_prefix):
                found_urls.add(full_url)
        return found_urls
    except requests.exceptions.RequestException as e:
@@ -35,7 +41,6 @@ def update_urls_file(new_urls):
    with open(URLS_FILE, "r") as f:
        existing_urls = {line.strip() for line in f.readlines()}

-    # Find the URLs that are new
    new_and_unique = new_urls - existing_urls

    if new_and_unique:
@@ -50,11 +55,15 @@ def update_urls_file(new_urls):

def main():
    """Main function to discover and update URLs."""
-    discovered_urls = find_new_urls(TARGET_URL, URL_PREFIX)
-    if discovered_urls:
-        if update_urls_file(discovered_urls):
-            return 0  # Success, changes made
-    return 1  # No changes, nothing to commit
+    total_discovered_urls = set()
+    for target_url, url_prefix in TARGET_SITES.items():
+        discovered_urls = find_new_urls(target_url, url_prefix)
+        total_discovered_urls.update(discovered_urls)
+
+    if total_discovered_urls:
+        if update_urls_file(total_discovered_urls):
+            return 0  # Success, changes made
+    return 1  # No changes, nothing to commit

if __name__ == "__main__":
    import sys
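
For reference, a quick standalone check of the link normalization used in the hunk above (urljoin comes from the Python standard library; the URLs are illustrative, not taken from the scraped pages):

    from urllib.parse import urljoin

    # Relative hrefs are resolved against the page that was fetched
    urljoin("https://en.wikipedia.org/wiki/Main_Page", "/wiki/Python_(programming_language)")
    # -> 'https://en.wikipedia.org/wiki/Python_(programming_language)'

    # Absolute hrefs pass through unchanged, so the prefix filter still applies
    urljoin("https://en.wikipedia.org/wiki/Main_Page", "https://example.com/blog/post-1")
    # -> 'https://example.com/blog/post-1'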