1
1
import requests
2
2
from bs4 import BeautifulSoup
3
3
import os
4
- import json
5
4
from urllib .parse import urljoin , urlparse
6
5
7
- # Load target sites from JSON
8
- with open ("target_sites.json" , "r" ) as f :
9
- TARGET_SITES = json .load (f )
6
# Configuration for multiple target websites.
#
# Each entry maps a start URL (the page that is fetched) to a base URL for
# that site.
# NOTE(review): the base URL is presumably what the same-domain check in
# find_new_urls() is scoped to -- confirm against the code that consumes it.
TARGET_SITES = {
    # Academic and Research
    "https://arxiv.org/": "https://arxiv.org/",
    "https://eric.ed.gov/": "https://eric.ed.gov/",
    "https://www.jstor.org/": "https://www.jstor.org/",
    "https://pubmed.ncbi.nlm.nih.gov/": "https://pubmed.ncbi.nlm.nih.gov/",
    "https://www.researchgate.net/": "https://www.researchgate.net/",
    "https://scholar.google.com/": "https://scholar.google.com/",
    "https://www.semanticscholar.org/": "https://www.semanticscholar.org/",
    "https://www.nature.com/": "https://www.nature.com/",
    "https://www.science.org/": "https://www.science.org/",
    "https://plos.org/": "https://plos.org/",
    "https://www.springer.com/": "https://www.springer.com/",

    # Blogs and Forums
    "https://www.blogger.com/": "https://www.blogger.com/",
    "https://dev.to/": "https://dev.to/",
    "https://hashnode.com/": "https://hashnode.com/",
    "https://medium.com/": "https://medium.com/",
    "https://quora.com/": "https://quora.com/",
    "https://www.reddit.com/r/programming/": "https://www.reddit.com/",
    "https://www.tripadvisor.com/": "https://www.tripadvisor.com/",
    "https://substack.com/": "https://substack.com/",
    "https://www.tumblr.com/": "https://www.tumblr.com/",
    "https://wordpress.com/": "https://wordpress.com/",
    "https://ghost.org/": "https://ghost.org/",

    # E-commerce and Popular Sites
    "https://www.amazon.com/": "https://www.amazon.com/",
    "https://www.ebay.com/": "https://www.ebay.com/",
    "https://www.target.com/": "https://www.target.com/",
    "https://www.walmart.com/": "https://www.walmart.com/",
    "https://www.bestbuy.com/": "https://www.bestbuy.com/",
    "https://www.costco.com/": "https://www.costco.com/",
    "https://www.homedepot.com/": "https://www.homedepot.com/",
    "https://www.lowes.com/": "https://www.lowes.com/",
    "https://www.etsy.com/": "https://www.etsy.com/",
    "https://www.shopify.com/": "https://www.shopify.com/",
    "https://www.alibaba.com/": "https://www.alibaba.com/",
    "https://www.aliexpress.com/": "https://www.aliexpress.com/",

    # Educational and Reference
    "https://www.codecademy.com/": "https://www.codecademy.com/",
    "https://www.coursera.org/": "https://www.coursera.org/",
    "https://developer.mozilla.org/en-US/": "https://developer.mozilla.org/",
    "https://docs.python.org/3/": "https://docs.python.org/",
    "https://www.edx.org/": "https://www.edx.org/",
    "https://www.freecodecamp.org/": "https://www.freecodecamp.org/",
    "https://www.geeksforgeeks.org/": "https://www.geeksforgeeks.org/",
    "https://www.khanacademy.org/": "https://www.khanacademy.org/",
    "https://www.udemy.com/": "https://www.udemy.com/",
    "https://www.w3schools.com/": "https://www.w3schools.com/",
    "https://www.pluralsight.com/": "https://www.pluralsight.com/",
    "https://www.skillshare.com/": "https://www.skillshare.com/",
    "https://www.lynda.com/": "https://www.lynda.com/",
    "https://www.udacity.com/": "https://www.udacity.com/",
    "https://www.masterclass.com/": "https://www.masterclass.com/",
    "https://brilliant.org/": "https://brilliant.org/",
    "https://www.duolingo.com/": "https://www.duolingo.com/",

    # Entertainment and Social
    "https://www.cnet.com/": "https://www.cnet.com/",
    "https://www.gamespot.com/": "https://www.gamespot.com/",
    "https://www.ign.com/": "https://www.ign.com/",
    "https://www.imdb.com/": "https://www.imdb.com/",
    "https://www.instagram.com/": "https://www.instagram.com/",
    "https://www.pinterest.com/": "https://www.pinterest.com/",
    "https://www.rottentomatoes.com/": "https://www.rottentomatoes.com/",
    "https://www.tiktok.com/": "https://www.tiktok.com/",
    "https://www.youtube.com/": "https://www.youtube.com/",
    "https://www.twitch.tv/": "https://www.twitch.tv/",
    "https://www.spotify.com/": "https://www.spotify.com/",
    "https://www.netflix.com/": "https://www.netflix.com/",
    "https://www.hulu.com/": "https://www.hulu.com/",
    "https://www.discord.com/": "https://www.discord.com/",
    # Steam's storefront lives on steampowered.com; steam.com is not Valve's.
    "https://store.steampowered.com/": "https://store.steampowered.com/",
    "https://www.epicgames.com/": "https://www.epicgames.com/",
    "https://www.polygon.com/": "https://www.polygon.com/",
    "https://kotaku.com/": "https://kotaku.com/",

    # News and Media
    "https://abcnews.go.com/": "https://abcnews.go.com/",
    "https://www.apnews.com/": "https://www.apnews.com/",
    "https://www.bbc.com/news": "https://www.bbc.com/",
    "https://www.bloomberg.com/": "https://www.bloomberg.com/",
    "https://www.cbsnews.com/": "https://www.cbsnews.com/",
    "https://www.cnn.com/": "https://www.cnn.com/",
    "https://www.cnbc.com/": "https://www.cnbc.com/",
    "https://www.forbes.com/": "https://www.forbes.com/",
    "https://www.huffpost.com/": "https://www.huffpost.com/",
    "https://www.nytimes.com/": "https://www.nytimes.com/",
    "https://www.reuters.com/": "https://www.reuters.com/",
    "https://www.usatoday.com/": "https://www.usatoday.com/",
    "https://www.washingtonpost.com/": "https://www.washingtonpost.com/",
    "https://www.wsj.com/": "https://www.wsj.com/",
    "https://www.npr.org/": "https://www.npr.org/",
    "https://www.pbs.org/": "https://www.pbs.org/",
    "https://www.time.com/": "https://www.time.com/",
    "https://www.newsweek.com/": "https://www.newsweek.com/",
    "https://www.axios.com/": "https://www.axios.com/",
    "https://www.politico.com/": "https://www.politico.com/",
    "https://www.thehill.com/": "https://www.thehill.com/",
    "https://www.vox.com/": "https://www.vox.com/",
    "https://www.vice.com/": "https://www.vice.com/",
    "https://www.buzzfeed.com/": "https://www.buzzfeed.com/",

    # Technology and Programming Sites
    "https://arstechnica.com/": "https://arstechnica.com/",
    "https://www.bleepingcomputer.com/": "https://www.bleepingcomputer.com/",
    # Canonical GitHub host rather than a third-party mirror domain.
    "https://github.com/trending": "https://github.com/",
    # NOTE(review): start URL and base URL use different hosts here
    # (kernel.org vs www.kernel.org) -- confirm the same-domain filter
    # still accepts links found on the start page.
    "https://kernel.org/": "https://www.kernel.org/",
    "https://www.linuxquestions.org/": "https://www.linuxquestions.org/",
    "https://www.maketecheasier.com/": "https://www.maketecheasier.com/",
    "https://news.ycombinator.com/": "https://news.ycombinator.com/",
    "https://stackoverflow.com/": "https://stackoverflow.com/",
    "https://techcrunch.com/": "https://techcrunch.com/",
    "https://www.theverge.com/": "https://www.theverge.com/",
    "https://en.wikipedia.org/wiki/Main_Page": "https://en.wikipedia.org/wiki/",
    "https://xda-developers.com/": "https://xda-developers.com/",
    "https://www.wired.com/": "https://www.wired.com/",
    "https://www.engadget.com/": "https://www.engadget.com/",
    "https://www.tomshardware.com/": "https://www.tomshardware.com/",
    "https://www.anandtech.com/": "https://www.anandtech.com/",
    "https://www.pcworld.com/": "https://www.pcworld.com/",
    "https://www.computerworld.com/": "https://www.computerworld.com/",
    "https://www.infoworld.com/": "https://www.infoworld.com/",
    "https://www.zdnet.com/": "https://www.zdnet.com/",
    "https://www.techmeme.com/": "https://www.techmeme.com/",
    "https://slashdot.org/": "https://slashdot.org/",
    "https://www.hackernoon.com/": "https://www.hackernoon.com/",
    "https://www.dzone.com/": "https://www.dzone.com/",
    "https://css-tricks.com/": "https://css-tricks.com/",
    "https://codepen.io/": "https://codepen.io/",
    "https://jsfiddle.net/": "https://jsfiddle.net/",
    "https://replit.com/": "https://replit.com/",
    "https://codesandbox.io/": "https://codesandbox.io/",
    "https://glitch.com/": "https://glitch.com/",
    "https://www.hackerrank.com/": "https://www.hackerrank.com/",
    "https://leetcode.com/": "https://leetcode.com/",
    "https://www.codewars.com/": "https://www.codewars.com/",
    "https://www.topcoder.com/": "https://www.topcoder.com/",
    "https://codeforces.com/": "https://codeforces.com/",

    # AI and Machine Learning
    "https://openai.com/": "https://openai.com/",
    "https://www.anthropic.com/": "https://www.anthropic.com/",
    "https://huggingface.co/": "https://huggingface.co/",
    "https://www.tensorflow.org/": "https://www.tensorflow.org/",
    "https://pytorch.org/": "https://pytorch.org/",
    "https://scikit-learn.org/": "https://scikit-learn.org/",
    "https://www.kaggle.com/": "https://www.kaggle.com/",
    # Papers with Code is a single hostname (no dot after "papers").
    "https://paperswithcode.com/": "https://paperswithcode.com/",
    "https://distill.pub/": "https://distill.pub/",
    "https://towardsdatascience.com/": "https://towardsdatascience.com/",
    "https://machinelearningmastery.com/": "https://machinelearningmastery.com/",
    "https://www.deeplearning.ai/": "https://www.deeplearning.ai/",

    # Cloud and DevOps
    "https://aws.amazon.com/": "https://aws.amazon.com/",
    "https://cloud.google.com/": "https://cloud.google.com/",
    "https://azure.microsoft.com/": "https://azure.microsoft.com/",
    "https://www.digitalocean.com/": "https://www.digitalocean.com/",
    "https://www.linode.com/": "https://www.linode.com/",
    "https://www.vultr.com/": "https://www.vultr.com/",
    "https://www.heroku.com/": "https://www.heroku.com/",
    "https://vercel.com/": "https://vercel.com/",
    "https://netlify.com/": "https://netlify.com/",
    "https://www.docker.com/": "https://www.docker.com/",
    "https://kubernetes.io/": "https://kubernetes.io/",
    "https://www.jenkins.io/": "https://www.jenkins.io/",
    # Canonical GitHub host rather than a third-party mirror domain.
    "https://github.com/actions": "https://github.com/",
    "https://gitlab.com/": "https://gitlab.com/",
    "https://bitbucket.org/": "https://bitbucket.org/",

    # Design and Creative
    "https://www.behance.net/": "https://www.behance.net/",
    "https://dribbble.com/": "https://dribbble.com/",
    "https://www.figma.com/": "https://www.figma.com/",
    "https://www.adobe.com/": "https://www.adobe.com/",
    "https://www.canva.com/": "https://www.canva.com/",
    "https://unsplash.com/": "https://unsplash.com/",
    "https://www.pexels.com/": "https://www.pexels.com/",
    "https://pixabay.com/": "https://pixabay.com/",
    "https://www.shutterstock.com/": "https://www.shutterstock.com/",
    "https://www.gettyimages.com/": "https://www.gettyimages.com/",

    # Finance and Business
    "https://www.investopedia.com/": "https://www.investopedia.com/",
    "https://finance.yahoo.com/": "https://finance.yahoo.com/",
    "https://www.marketwatch.com/": "https://www.marketwatch.com/",
    "https://www.fool.com/": "https://www.fool.com/",
    "https://www.morningstar.com/": "https://www.morningstar.com/",
    "https://www.sec.gov/": "https://www.sec.gov/",
    "https://www.nasdaq.com/": "https://www.nasdaq.com/",
    "https://www.nyse.com/": "https://www.nyse.com/",
    "https://www.entrepreneur.com/": "https://www.entrepreneur.com/",
    "https://www.inc.com/": "https://www.inc.com/",
    "https://hbr.org/": "https://hbr.org/",
    "https://www.fastcompany.com/": "https://www.fastcompany.com/",
    "https://www.businessinsider.com/": "https://www.businessinsider.com/",

    # Health and Lifestyle
    "https://www.webmd.com/": "https://www.webmd.com/",
    "https://www.mayoclinic.org/": "https://www.mayoclinic.org/",
    "https://www.healthline.com/": "https://www.healthline.com/",
    "https://www.medicalnewstoday.com/": "https://www.medicalnewstoday.com/",
    "https://www.nih.gov/": "https://www.nih.gov/",
    "https://www.cdc.gov/": "https://www.cdc.gov/",
    "https://www.who.int/": "https://www.who.int/",
    "https://www.goodhousekeeping.com/": "https://www.goodhousekeeping.com/",
    "https://www.allrecipes.com/": "https://www.allrecipes.com/",
    "https://www.foodnetwork.com/": "https://www.foodnetwork.com/",
    "https://www.tasteofhome.com/": "https://www.tasteofhome.com/",
    "https://www.epicurious.com/": "https://www.epicurious.com/",

    # Travel and Geography
    "https://www.booking.com/": "https://www.booking.com/",
    "https://www.expedia.com/": "https://www.expedia.com/",
    "https://www.airbnb.com/": "https://www.airbnb.com/",
    "https://www.kayak.com/": "https://www.kayak.com/",
    "https://www.skyscanner.com/": "https://www.skyscanner.com/",
    "https://www.lonelyplanet.com/": "https://www.lonelyplanet.com/",
    "https://www.nationalgeographic.com/": "https://www.nationalgeographic.com/",
    "https://www.atlasobscura.com/": "https://www.atlasobscura.com/",

    # Government and Legal
    "https://www.usa.gov/": "https://www.usa.gov/",
    "https://www.congress.gov/": "https://www.congress.gov/",
    "https://www.whitehouse.gov/": "https://www.whitehouse.gov/",
    "https://www.supremecourt.gov/": "https://www.supremecourt.gov/",
    "https://www.fbi.gov/": "https://www.fbi.gov/",
    "https://www.irs.gov/": "https://www.irs.gov/",
    "https://www.fda.gov/": "https://www.fda.gov/",
    "https://www.epa.gov/": "https://www.epa.gov/",

    # Utilities and Tools
    "https://archive.org/": "https://archive.org/",
    "https://translate.google.com/": "https://translate.google.com/",
    "https://maps.google.com/": "https://maps.google.com/",
    "https://www.google.com/": "https://www.google.com/",
    "https://www.bing.com/": "https://www.bing.com/",
    "https://duckduckgo.com/": "https://duckduckgo.com/",
    "https://www.wolframalpha.com/": "https://www.wolframalpha.com/",
    "https://www.mathway.com/": "https://www.mathway.com/",
    "https://www.grammarly.com/": "https://www.grammarly.com/",
    "https://pastebin.com/": "https://pastebin.com/",
    "https://gist.github.com/": "https://gist.github.com/",
}
10
255
11
256
# Name of the URL list file.
# NOTE(review): presumably the file update_urls_file() maintains -- confirm against its definition.
URLS_FILE = "urls.txt"
12
257
@@ -24,10 +269,13 @@ def find_new_urls(target_url):
24
269
25
270
for link in soup .find_all ('a' , href = True ):
26
271
href = link .get ('href' )
272
+ # Handle different types of links
27
273
full_url = urljoin (target_url , href )
28
274
parsed_url = urlparse (full_url )
29
275
276
+ # Check for same domain and valid scheme
30
277
if parsed_url .netloc == base_domain and parsed_url .scheme in ['http' , 'https' ]:
278
+ # Clean up the URL by removing fragments (#)
31
279
clean_url = full_url .split ('#' )[0 ]
32
280
found_urls .add (clean_url )
33
281
@@ -64,8 +312,8 @@ def main():
64
312
65
313
if total_discovered_urls :
66
314
if update_urls_file (total_discovered_urls ):
67
- return 0
68
- return 1
315
+ return 0 # Success, changes made
316
+ return 1 # No changes, nothing to commit
69
317
70
318
if __name__ == "__main__" :
71
319
import sys
0 commit comments