Skip to content

Commit f5abcc0

Browse files
authored
Update find_urls.py
1 parent 3d2e60d commit f5abcc0

File tree

1 file changed

+254
-6
lines changed

1 file changed

+254
-6
lines changed

find_urls.py

Lines changed: 254 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,257 @@
11
import requests
22
from bs4 import BeautifulSoup
33
import os
4-
import json
54
from urllib.parse import urljoin, urlparse
65

7-
# Load target sites from JSON
8-
with open("target_sites.json", "r") as f:
9-
TARGET_SITES = json.load(f)
6+
# Configuration for multiple target websites
7+
TARGET_SITES = {
8+
# Academic and Research
9+
"https://arxiv.org/": "https://arxiv.org/",
10+
"https://eric.ed.gov/": "https://eric.ed.gov/",
11+
"https://www.jstor.org/": "https://www.jstor.org/",
12+
"https://pubmed.ncbi.nlm.nih.gov/": "https://pubmed.ncbi.nlm.nih.gov/",
13+
"https://www.researchgate.net/": "https://www.researchgate.net/",
14+
"https://scholar.google.com/": "https://scholar.google.com/",
15+
"https://www.semanticscholar.org/": "https://www.semanticscholar.org/",
16+
"https://www.nature.com/": "https://www.nature.com/",
17+
"https://www.science.org/": "https://www.science.org/",
18+
"https://plos.org/": "https://plos.org/",
19+
"https://www.springer.com/": "https://www.springer.com/",
20+
21+
# Blogs and Forums
22+
"https://www.blogger.com/": "https://www.blogger.com/",
23+
"https://dev.to/": "https://dev.to/",
24+
"https://hashnode.com/": "https://hashnode.com/",
25+
"https://medium.com/": "https://medium.com/",
26+
"https://quora.com/": "https://quora.com/",
27+
"https://www.reddit.com/r/programming/": "https://www.reddit.com/",
28+
"https://www.tripadvisor.com/": "https://www.tripadvisor.com/",
29+
"https://substack.com/": "https://substack.com/",
30+
"https://www.tumblr.com/": "https://www.tumblr.com/",
31+
"https://wordpress.com/": "https://wordpress.com/",
32+
"https://ghost.org/": "https://ghost.org/",
33+
34+
# E-commerce and Popular Sites
35+
"https://www.amazon.com/": "https://www.amazon.com/",
36+
"https://www.ebay.com/": "https://www.ebay.com/",
37+
"https://www.target.com/": "https://www.target.com/",
38+
"https://www.walmart.com/": "https://www.walmart.com/",
39+
"https://www.bestbuy.com/": "https://www.bestbuy.com/",
40+
"https://www.costco.com/": "https://www.costco.com/",
41+
"https://www.homedepot.com/": "https://www.homedepot.com/",
42+
"https://www.lowes.com/": "https://www.lowes.com/",
43+
"https://www.etsy.com/": "https://www.etsy.com/",
44+
"https://www.shopify.com/": "https://www.shopify.com/",
45+
"https://www.alibaba.com/": "https://www.alibaba.com/",
46+
"https://www.aliexpress.com/": "https://www.aliexpress.com/",
47+
48+
# Educational and Reference
49+
"https://www.codecademy.com/": "https://www.codecademy.com/",
50+
"https://www.coursera.org/": "https://www.coursera.org/",
51+
"https://developer.mozilla.org/en-US/": "https://developer.mozilla.org/",
52+
"https://docs.python.org/3/": "https://docs.python.org/",
53+
"https://www.edx.org/": "https://www.edx.org/",
54+
"https://www.freecodecamp.org/": "https://www.freecodecamp.org/",
55+
"https://www.geeksforgeeks.org/": "https://www.geeksforgeeks.org/",
56+
"https://www.khanacademy.org/": "https://www.khanacademy.org/",
57+
"https://www.udemy.com/": "https://www.udemy.com/",
58+
"https://www.w3schools.com/": "https://www.w3schools.com/",
59+
"https://www.pluralsight.com/": "https://www.pluralsight.com/",
60+
"https://www.skillshare.com/": "https://www.skillshare.com/",
61+
"https://www.lynda.com/": "https://www.lynda.com/",
62+
"https://www.udacity.com/": "https://www.udacity.com/",
63+
"https://www.masterclass.com/": "https://www.masterclass.com/",
64+
"https://brilliant.org/": "https://brilliant.org/",
65+
"https://www.duolingo.com/": "https://www.duolingo.com/",
66+
67+
# Entertainment and Social
68+
"https://www.cnet.com/": "https://www.cnet.com/",
69+
"https://www.gamespot.com/": "https://www.gamespot.com/",
70+
"https://www.ign.com/": "https://www.ign.com/",
71+
"https://www.imdb.com/": "https://www.imdb.com/",
72+
"https://www.instagram.com/": "https://www.instagram.com/",
73+
"https://www.pinterest.com/": "https://www.pinterest.com/",
74+
"https://www.rottentomatoes.com/": "https://www.rottentomatoes.com/",
75+
"https://www.tiktok.com/": "https://www.tiktok.com/",
76+
"https://www.youtube.com/": "https://www.youtube.com/",
77+
"https://www.twitch.tv/": "https://www.twitch.tv/",
78+
"https://www.spotify.com/": "https://www.spotify.com/",
79+
"https://www.netflix.com/": "https://www.netflix.com/",
80+
"https://www.hulu.com/": "https://www.hulu.com/",
81+
"https://www.discord.com/": "https://www.discord.com/",
82+
"https://www.steam.com/": "https://www.steam.com/",
83+
"https://www.epicgames.com/": "https://www.epicgames.com/",
84+
"https://www.polygon.com/": "https://www.polygon.com/",
85+
"https://kotaku.com/": "https://kotaku.com/",
86+
87+
# News and Media
88+
"https://abcnews.go.com/": "https://abcnews.go.com/",
89+
"https://www.apnews.com/": "https://www.apnews.com/",
90+
"https://www.bbc.com/news": "https://www.bbc.com/",
91+
"https://www.bloomberg.com/": "https://www.bloomberg.com/",
92+
"https://www.cbsnews.com/": "https://www.cbsnews.com/",
93+
"https://www.cnn.com/": "https://www.cnn.com/",
94+
"https://www.cnbc.com/": "https://www.cnbc.com/",
95+
"https://www.forbes.com/": "https://www.forbes.com/",
96+
"https://www.huffpost.com/": "https://www.huffpost.com/",
97+
"https://www.nytimes.com/": "https://www.nytimes.com/",
98+
"https://www.reuters.com/": "https://www.reuters.com/",
99+
"https://www.usatoday.com/": "https://www.usatoday.com/",
100+
"https://www.washingtonpost.com/": "https://www.washingtonpost.com/",
101+
"https://www.wsj.com/": "https://www.wsj.com/",
102+
"https://www.npr.org/": "https://www.npr.org/",
103+
"https://www.pbs.org/": "https://www.pbs.org/",
104+
"https://www.time.com/": "https://www.time.com/",
105+
"https://www.newsweek.com/": "https://www.newsweek.com/",
106+
"https://www.axios.com/": "https://www.axios.com/",
107+
"https://www.politico.com/": "https://www.politico.com/",
108+
"https://www.thehill.com/": "https://www.thehill.com/",
109+
"https://www.vox.com/": "https://www.vox.com/",
110+
"https://www.vice.com/": "https://www.vice.com/",
111+
"https://www.buzzfeed.com/": "https://www.buzzfeed.com/",
112+
113+
# Technology and Programming Sites
114+
"https://arstechnica.com/": "https://arstechnica.com/",
115+
"https://www.bleepingcomputer.com/": "https://www.bleepingcomputer.com/",
116+
"https://github.com/trending": "https://github.com/",
117+
"https://kernel.org/": "https://www.kernel.org/",
118+
"https://www.linuxquestions.org/": "https://www.linuxquestions.org/",
119+
"https://www.maketecheasier.com/": "https://www.maketecheasier.com/",
120+
"https://news.ycombinator.com/": "https://news.ycombinator.com/",
121+
"https://stackoverflow.com/": "https://stackoverflow.com/",
122+
"https://techcrunch.com/": "https://techcrunch.com/",
123+
"https://www.theverge.com/": "https://www.theverge.com/",
124+
"https://en.wikipedia.org/wiki/Main_Page": "https://en.wikipedia.org/wiki/",
125+
"https://xda-developers.com/": "https://xda-developers.com/",
126+
"https://www.wired.com/": "https://www.wired.com/",
127+
"https://www.engadget.com/": "https://www.engadget.com/",
128+
"https://www.tomshardware.com/": "https://www.tomshardware.com/",
129+
"https://www.anandtech.com/": "https://www.anandtech.com/",
130+
"https://www.pcworld.com/": "https://www.pcworld.com/",
131+
"https://www.computerworld.com/": "https://www.computerworld.com/",
132+
"https://www.infoworld.com/": "https://www.infoworld.com/",
133+
"https://www.zdnet.com/": "https://www.zdnet.com/",
134+
"https://www.techmeme.com/": "https://www.techmeme.com/",
135+
"https://slashdot.org/": "https://slashdot.org/",
136+
"https://www.hackernoon.com/": "https://www.hackernoon.com/",
137+
"https://www.dzone.com/": "https://www.dzone.com/",
138+
"https://css-tricks.com/": "https://css-tricks.com/",
139+
"https://codepen.io/": "https://codepen.io/",
140+
"https://jsfiddle.net/": "https://jsfiddle.net/",
141+
"https://replit.com/": "https://replit.com/",
142+
"https://codesandbox.io/": "https://codesandbox.io/",
143+
"https://glitch.com/": "https://glitch.com/",
144+
"https://www.hackerrank.com/": "https://www.hackerrank.com/",
145+
"https://leetcode.com/": "https://leetcode.com/",
146+
"https://www.codewars.com/": "https://www.codewars.com/",
147+
"https://www.topcoder.com/": "https://www.topcoder.com/",
148+
"https://codeforces.com/": "https://codeforces.com/",
149+
150+
# AI and Machine Learning
151+
"https://openai.com/": "https://openai.com/",
152+
"https://www.anthropic.com/": "https://www.anthropic.com/",
153+
"https://huggingface.co/": "https://huggingface.co/",
154+
"https://www.tensorflow.org/": "https://www.tensorflow.org/",
155+
"https://pytorch.org/": "https://pytorch.org/",
156+
"https://scikit-learn.org/": "https://scikit-learn.org/",
157+
"https://www.kaggle.com/": "https://www.kaggle.com/",
158+
"https://papers.withcode.com/": "https://papers.withcode.com/",
159+
"https://distill.pub/": "https://distill.pub/",
160+
"https://towardsdatascience.com/": "https://towardsdatascience.com/",
161+
"https://machinelearningmastery.com/": "https://machinelearningmastery.com/",
162+
"https://www.deeplearning.ai/": "https://www.deeplearning.ai/",
163+
164+
# Cloud and DevOps
165+
"https://aws.amazon.com/": "https://aws.amazon.com/",
166+
"https://cloud.google.com/": "https://cloud.google.com/",
167+
"https://azure.microsoft.com/": "https://azure.microsoft.com/",
168+
"https://www.digitalocean.com/": "https://www.digitalocean.com/",
169+
"https://www.linode.com/": "https://www.linode.com/",
170+
"https://www.vultr.com/": "https://www.vultr.com/",
171+
"https://www.heroku.com/": "https://www.heroku.com/",
172+
"https://vercel.com/": "https://vercel.com/",
173+
"https://netlify.com/": "https://netlify.com/",
174+
"https://www.docker.com/": "https://www.docker.com/",
175+
"https://kubernetes.io/": "https://kubernetes.io/",
176+
"https://www.jenkins.io/": "https://www.jenkins.io/",
177+
"https://github.com/actions": "https://github.com/",
178+
"https://gitlab.com/": "https://gitlab.com/",
179+
"https://bitbucket.org/": "https://bitbucket.org/",
180+
181+
# Design and Creative
182+
"https://www.behance.net/": "https://www.behance.net/",
183+
"https://dribbble.com/": "https://dribbble.com/",
184+
"https://www.figma.com/": "https://www.figma.com/",
185+
"https://www.adobe.com/": "https://www.adobe.com/",
186+
"https://www.canva.com/": "https://www.canva.com/",
187+
"https://unsplash.com/": "https://unsplash.com/",
188+
"https://www.pexels.com/": "https://www.pexels.com/",
189+
"https://pixabay.com/": "https://pixabay.com/",
190+
"https://www.shutterstock.com/": "https://www.shutterstock.com/",
191+
"https://www.gettyimages.com/": "https://www.gettyimages.com/",
192+
193+
# Finance and Business
194+
"https://www.investopedia.com/": "https://www.investopedia.com/",
195+
"https://finance.yahoo.com/": "https://finance.yahoo.com/",
196+
"https://www.marketwatch.com/": "https://www.marketwatch.com/",
197+
"https://www.fool.com/": "https://www.fool.com/",
198+
"https://www.morningstar.com/": "https://www.morningstar.com/",
199+
"https://www.sec.gov/": "https://www.sec.gov/",
200+
"https://www.nasdaq.com/": "https://www.nasdaq.com/",
201+
"https://www.nyse.com/": "https://www.nyse.com/",
202+
"https://www.entrepreneur.com/": "https://www.entrepreneur.com/",
203+
"https://www.inc.com/": "https://www.inc.com/",
204+
"https://hbr.org/": "https://hbr.org/",
205+
"https://www.fastcompany.com/": "https://www.fastcompany.com/",
206+
"https://www.businessinsider.com/": "https://www.businessinsider.com/",
207+
208+
# Health and Lifestyle
209+
"https://www.webmd.com/": "https://www.webmd.com/",
210+
"https://www.mayoclinic.org/": "https://www.mayoclinic.org/",
211+
"https://www.healthline.com/": "https://www.healthline.com/",
212+
"https://www.medicalnewstoday.com/": "https://www.medicalnewstoday.com/",
213+
"https://www.nih.gov/": "https://www.nih.gov/",
214+
"https://www.cdc.gov/": "https://www.cdc.gov/",
215+
"https://www.who.int/": "https://www.who.int/",
216+
"https://www.goodhousekeeping.com/": "https://www.goodhousekeeping.com/",
217+
"https://www.allrecipes.com/": "https://www.allrecipes.com/",
218+
"https://www.foodnetwork.com/": "https://www.foodnetwork.com/",
219+
"https://www.tasteofhome.com/": "https://www.tasteofhome.com/",
220+
"https://www.epicurious.com/": "https://www.epicurious.com/",
221+
222+
# Travel and Geography
223+
"https://www.booking.com/": "https://www.booking.com/",
224+
"https://www.expedia.com/": "https://www.expedia.com/",
225+
"https://www.airbnb.com/": "https://www.airbnb.com/",
226+
"https://www.kayak.com/": "https://www.kayak.com/",
227+
"https://www.skyscanner.com/": "https://www.skyscanner.com/",
228+
"https://www.lonelyplanet.com/": "https://www.lonelyplanet.com/",
229+
"https://www.nationalgeographic.com/": "https://www.nationalgeographic.com/",
230+
"https://www.atlasobscura.com/": "https://www.atlasobscura.com/",
231+
232+
# Government and Legal
233+
"https://www.usa.gov/": "https://www.usa.gov/",
234+
"https://www.congress.gov/": "https://www.congress.gov/",
235+
"https://www.whitehouse.gov/": "https://www.whitehouse.gov/",
236+
"https://www.supremecourt.gov/": "https://www.supremecourt.gov/",
237+
"https://www.fbi.gov/": "https://www.fbi.gov/",
238+
"https://www.irs.gov/": "https://www.irs.gov/",
239+
"https://www.fda.gov/": "https://www.fda.gov/",
240+
"https://www.epa.gov/": "https://www.epa.gov/",
241+
242+
# Utilities and Tools
243+
"https://archive.org/": "https://archive.org/",
244+
"https://translate.google.com/": "https://translate.google.com/",
245+
"https://maps.google.com/": "https://maps.google.com/",
246+
"https://www.google.com/": "https://www.google.com/",
247+
"https://www.bing.com/": "https://www.bing.com/",
248+
"https://duckduckgo.com/": "https://duckduckgo.com/",
249+
"https://www.wolframalpha.com/": "https://www.wolframalpha.com/",
250+
"https://www.mathway.com/": "https://www.mathway.com/",
251+
"https://www.grammarly.com/": "https://www.grammarly.com/",
252+
"https://pastebin.com/": "https://pastebin.com/",
253+
"https://gist.github.com/": "https://gist.github.com/",
254+
}
10255

11256
URLS_FILE = "urls.txt"
12257

@@ -24,10 +269,13 @@ def find_new_urls(target_url):
24269

25270
for link in soup.find_all('a', href=True):
26271
href = link.get('href')
272+
# Resolve the href (relative or absolute) against the target URL
27273
full_url = urljoin(target_url, href)
28274
parsed_url = urlparse(full_url)
29275

276+
# Check for same domain and valid scheme
30277
if parsed_url.netloc == base_domain and parsed_url.scheme in ['http', 'https']:
278+
# Clean up the URL by removing fragments (#)
31279
clean_url = full_url.split('#')[0]
32280
found_urls.add(clean_url)
33281

@@ -64,8 +312,8 @@ def main():
64312

65313
if total_discovered_urls:
66314
if update_urls_file(total_discovered_urls):
67-
return 0
68-
return 1
315+
return 0 # Success, changes made
316+
return 1 # No changes, nothing to commit
69317

70318
if __name__ == "__main__":
71319
import sys

0 commit comments

Comments
 (0)