
Commit cff511a

Merge pull request #68 from my-dev-app/scraper/additional-parsers

Scraper/additional parsers

2 parents 2239247 + b5c8a13 commit cff511a

10 files changed: +356 −4 lines

aproxyrelay/core.py

Lines changed: 3 additions & 0 deletions

@@ -78,6 +78,9 @@ async def get_proxies(self) -> None:
             self.logger.info('Scraper: Skip discovery of new proxy servers ...')

         if self.filter and self.scrape:
+            self.logger.info(
+                f'Validating: Proxies ({self._queue_filter.qsize()}), checking if proxies meet connection requirements ...'
+            )
             async with ClientSession(conn_timeout=15) as session:
                 await self._test_all_proxies(session)
             self.logger.info(f'Filter: Found {self._filtered_failed} incompetent and {self._filtered_available} available proxy servers in {datetime.now(UTC) - self.started}')  # noqa: B950

aproxyrelay/req.py

Lines changed: 2 additions & 0 deletions

@@ -81,6 +81,8 @@ async def _test_all_proxies(self, session):
             _target['proxy'] = f"{_target['protocol'].replace('https', 'http')}://{_target['ip']}:{_target['port']}"
             to_filter.append(_target)

+        # Remove duplicate entries
+        to_filter = [dict(x) for x in list(set([tuple(item.items()) for item in to_filter]))]
         tasks = [self._test_proxy_link(proxy['proxy'], proxy, session) for proxy in to_filter]
         await gather(*tasks)
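For context, here is a minimal standalone sketch (illustrative, not part of the commit) of the dedup idiom the new line relies on: dicts are unhashable, so each dict is flattened to a tuple of its items, passed through a set, and rebuilt. It assumes all values are hashable, and since tuple(item.items()) preserves insertion order, two equal dicts built with different key orders would not collapse.

    # Illustrative sketch of the tuple-of-items dedup used above
    to_filter = [
        {'ip': '1.2.3.4', 'port': '8080', 'protocol': 'http'},
        {'ip': '1.2.3.4', 'port': '8080', 'protocol': 'http'},  # exact duplicate
        {'ip': '5.6.7.8', 'port': '1080', 'protocol': 'socks5'},
    ]
    deduped = [dict(t) for t in set(tuple(item.items()) for item in to_filter)]
    assert len(deduped) == 2  # the duplicate collapses; input ordering is not preserved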

aproxyrelay/scrapers/__init__.py

Lines changed: 30 additions & 0 deletions

@@ -23,6 +23,12 @@
 from .parser_spys_nl import ParserSpysNL
 from .parser_spys_us import ParserSpysUS
 from .parser_ssl_proxies import ParserSSLProxies
+from .parser_sunny9577_proxy_scraper import ParserSunnyProxyScraper
+from .parser_roosterkid_openproxylist_socks4 import ParserRoosterkidOpenproxylistSocks4
+from .parser_roosterkid_openproxylist_socks5 import ParserRoosterkidOpenproxylistSocks5
+from .parser_murongpig_proxy_master_http import ParserMurongpigProxyMasterHttp
+from .parser_murongpig_proxy_master_socks4 import ParserMurongpigProxyMasterSocks4
+from .parser_murongpig_proxy_master_socks5 import ParserMurongpigProxyMasterSocks5


 proxy_list = [

@@ -78,4 +84,28 @@
         'url': 'https://gg.my-dev.app/api/v1/proxies/available?zone=nl&anonimity=all&protocol=all&page=1&size=1000',
         'parser': ParserGGMyDevApp,
     },
+    {
+        'url': 'https://raw.githubusercontent.com/sunny9577/proxy-scraper/master/proxies.json',
+        'parser': ParserSunnyProxyScraper,
+    },
+    {
+        'url': 'https://raw.githubusercontent.com/roosterkid/openproxylist/main/SOCKS4_RAW.txt',
+        'parser': ParserRoosterkidOpenproxylistSocks4,
+    },
+    {
+        'url': 'https://raw.githubusercontent.com/roosterkid/openproxylist/main/SOCKS5_RAW.txt',
+        'parser': ParserRoosterkidOpenproxylistSocks5,
+    },
+    {
+        'url': 'https://raw.githubusercontent.com/MuRongPIG/Proxy-Master/main/http.txt',
+        'parser': ParserMurongpigProxyMasterHttp,
+    },
+    {
+        'url': 'https://raw.githubusercontent.com/MuRongPIG/Proxy-Master/main/socks4.txt',
+        'parser': ParserMurongpigProxyMasterSocks4,
+    },
+    {
+        'url': 'https://raw.githubusercontent.com/MuRongPIG/Proxy-Master/main/socks5.txt',
+        'parser': ParserMurongpigProxyMasterSocks5,
+    },
 ]
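For orientation, a hypothetical driver loop (not in the commit) showing how an entry from proxy_list plausibly flows through its parser, based on the interface the new parser files expose: format_url rewrites the URL and records the zone, the body is fetched, format_raw parses it, and format_data pushes each row onto the queue. The run_entry name and the raw_text stand-in are assumptions for illustration.

    import asyncio
    from queue import Queue

    async def run_entry(entry: dict, raw_text: str) -> Queue:
        # Hypothetical driver, not part of aproxyrelay's actual core loop
        parser = entry['parser']
        url = await parser.format_url(entry['url'], zone='us')  # parser may adjust the URL
        # ... fetch `url` here; `raw_text` stands in for the response body ...
        queue = Queue()
        for row in await parser.format_raw(raw_text):
            await parser.format_data('us', row, queue)
        return queue

    # e.g. asyncio.run(run_entry(proxy_list[-1], '1.2.3.4:8080\n'))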
aproxyrelay/scrapers/parser_murongpig_proxy_master_http.py

Lines changed: 47 additions & 0 deletions

@@ -0,0 +1,47 @@
# -*- mode: python ; coding: utf-8 -*-
"""
[ASCII-art banner]
By undeƒined
------------

Parser for the MuRongPIG/Proxy-Master HTTP proxy list
"""
from queue import Queue

from .parser import MainScraper


class ParserMurongpigProxyMasterHttp(MainScraper):
    def __init__(self) -> None:
        MainScraper.__init__(self)
        self.zone = None

    @classmethod
    async def format_url(cls, url, *args, **kwargs) -> str:
        """Formats the URL before scraping; lets each parser adjust its query parameters"""
        cls.zone = kwargs.get("zone", "us")
        return url

    @classmethod
    async def format_raw(cls, html: str) -> list:
        """Parse the raw ip:port text, customized for this source"""
        return [
            {
                'zone': cls.zone.upper(),
                'method': 'http',
                'anonymity': 'unknown',
                'protocol': 'https',
                'port': item.split(':')[1],
                'ip': item.split(':')[0],
            } for item in html.split('\n') if item
        ]

    @classmethod
    async def format_data(cls, zone: str, data: dict, queue: Queue) -> None:
        """Data formatter, formats data and returns it back in the process Queue"""
        queue.put(data)
        return queue
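As an illustration (not part of the commit), feeding two ip:port lines through this parser, assuming format_url has been awaited first so cls.zone is set:

    import asyncio

    async def demo() -> list:
        await ParserMurongpigProxyMasterHttp.format_url('unused', zone='us')
        return await ParserMurongpigProxyMasterHttp.format_raw('1.2.3.4:8080\n5.6.7.8:3128\n')

    rows = asyncio.run(demo())
    # rows[0] == {'zone': 'US', 'method': 'http', 'anonymity': 'unknown',
    #             'protocol': 'https', 'port': '8080', 'ip': '1.2.3.4'}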
aproxyrelay/scrapers/parser_murongpig_proxy_master_socks4.py

Lines changed: 47 additions & 0 deletions

@@ -0,0 +1,47 @@
# -*- mode: python ; coding: utf-8 -*-
"""
[ASCII-art banner]
By undeƒined
------------

Parser for the MuRongPIG/Proxy-Master SOCKS4 proxy list
"""
from queue import Queue

from .parser import MainScraper


class ParserMurongpigProxyMasterSocks4(MainScraper):
    def __init__(self) -> None:
        MainScraper.__init__(self)
        self.zone = None

    @classmethod
    async def format_url(cls, url, *args, **kwargs) -> str:
        """Formats the URL before scraping; lets each parser adjust its query parameters"""
        cls.zone = kwargs.get("zone", "us")
        return url

    @classmethod
    async def format_raw(cls, html: str) -> list:
        """Parse the raw ip:port text, customized for this source"""
        return [
            {
                'zone': cls.zone.upper(),
                'method': 'socks4',
                'anonymity': 'unknown',
                'protocol': 'socks4',
                'port': item.split(':')[1],
                'ip': item.split(':')[0],
            } for item in html.split('\n') if item
        ]

    @classmethod
    async def format_data(cls, zone: str, data: dict, queue: Queue) -> None:
        """Data formatter, formats data and returns it back in the process Queue"""
        queue.put(data)
        return queue
aproxyrelay/scrapers/parser_murongpig_proxy_master_socks5.py

Lines changed: 47 additions & 0 deletions

@@ -0,0 +1,47 @@
# -*- mode: python ; coding: utf-8 -*-
"""
[ASCII-art banner]
By undeƒined
------------

Parser for the MuRongPIG/Proxy-Master SOCKS5 proxy list
"""
from queue import Queue

from .parser import MainScraper


class ParserMurongpigProxyMasterSocks5(MainScraper):
    def __init__(self) -> None:
        MainScraper.__init__(self)
        self.zone = None

    @classmethod
    async def format_url(cls, url, *args, **kwargs) -> str:
        """Formats the URL before scraping; lets each parser adjust its query parameters"""
        cls.zone = kwargs.get("zone", "us")
        return url

    @classmethod
    async def format_raw(cls, html: str) -> list:
        """Parse the raw ip:port text, customized for this source"""
        return [
            {
                'zone': cls.zone.upper(),
                'method': 'socks5',
                'anonymity': 'unknown',
                'protocol': 'socks5',
                'port': item.split(':')[1],
                'ip': item.split(':')[0],
            } for item in html.split('\n') if item
        ]

    @classmethod
    async def format_data(cls, zone: str, data: dict, queue: Queue) -> None:
        """Data formatter, formats data and returns it back in the process Queue"""
        queue.put(data)
        return queue
aproxyrelay/scrapers/parser_roosterkid_openproxylist_socks4.py

Lines changed: 47 additions & 0 deletions

@@ -0,0 +1,47 @@
# -*- mode: python ; coding: utf-8 -*-
"""
[ASCII-art banner]
By undeƒined
------------

Parser for the roosterkid/openproxylist SOCKS4 list
"""
from queue import Queue

from .parser import MainScraper


class ParserRoosterkidOpenproxylistSocks4(MainScraper):
    def __init__(self) -> None:
        MainScraper.__init__(self)
        self.zone = None

    @classmethod
    async def format_url(cls, url, *args, **kwargs) -> str:
        """Formats the URL before scraping; lets each parser adjust its query parameters"""
        cls.zone = kwargs.get("zone", "us")
        return url

    @classmethod
    async def format_raw(cls, html: str) -> list:
        """Parse the raw ip:port text, customized for this source"""
        return [
            {
                'zone': cls.zone.upper(),
                'method': 'socks4',
                'anonymity': 'unknown',
                'protocol': 'socks4',
                'port': item.split(':')[1],
                'ip': item.split(':')[0],
            } for item in html.split('\n') if item  # skip blank lines so split(':')[1] cannot raise IndexError
        ]

    @classmethod
    async def format_data(cls, zone: str, data: dict, queue: Queue) -> None:
        """Data formatter, formats data and returns it back in the process Queue"""
        queue.put(data)
        return queue
aproxyrelay/scrapers/parser_roosterkid_openproxylist_socks5.py

Lines changed: 47 additions & 0 deletions

@@ -0,0 +1,47 @@
# -*- mode: python ; coding: utf-8 -*-
"""
[ASCII-art banner]
By undeƒined
------------

Parser for the roosterkid/openproxylist SOCKS5 list
"""
from queue import Queue

from .parser import MainScraper


class ParserRoosterkidOpenproxylistSocks5(MainScraper):
    def __init__(self) -> None:
        MainScraper.__init__(self)
        self.zone = None

    @classmethod
    async def format_url(cls, url, *args, **kwargs) -> str:
        """Formats the URL before scraping; lets each parser adjust its query parameters"""
        cls.zone = kwargs.get("zone", "us")
        return url

    @classmethod
    async def format_raw(cls, html: str) -> list:
        """Parse the raw ip:port text, customized for this source"""
        return [
            {
                'zone': cls.zone.upper(),
                'method': 'socks5',
                'anonymity': 'unknown',
                'protocol': 'socks5',
                'port': item.split(':')[1],
                'ip': item.split(':')[0],
            } for item in html.split('\n') if item  # skip blank lines so split(':')[1] cannot raise IndexError
        ]

    @classmethod
    async def format_data(cls, zone: str, data: dict, queue: Queue) -> None:
        """Data formatter, formats data and returns it back in the process Queue"""
        queue.put(data)
        return queue
aproxyrelay/scrapers/parser_sunny9577_proxy_scraper.py

Lines changed: 84 additions & 0 deletions

@@ -0,0 +1,84 @@
# -*- mode: python ; coding: utf-8 -*-
"""
[ASCII-art banner]
By undeƒined
------------

Parser for the sunny9577/proxy-scraper proxies.json feed
"""
from queue import Queue

import ast

from .parser import MainScraper


class ParserSunnyProxyScraper(MainScraper):
    def __init__(self) -> None:
        MainScraper.__init__(self)
        self.zone = None

    @classmethod
    async def format_url(cls, url, *args, **kwargs) -> str:
        """Formats the URL before scraping; lets each parser adjust its query parameters"""
        cls.zone = kwargs.get("zone", "us")
        return url

    @classmethod
    def generate_method(cls, target_method) -> str:
        if 'socks4' in target_method.lower():
            return 'socks4'
        elif 'socks5' in target_method.lower():
            return 'socks5'
        elif 'http' in target_method.lower():
            return 'https'
        return 'unknown'

    @classmethod
    def generate_protocol(cls, target_protocol) -> str:
        if 'socks4' in target_protocol.lower():
            return 'socks4'
        elif 'socks5' in target_protocol.lower():
            return 'socks5'
        elif 'https' in target_protocol.lower():
            return 'https'
        elif 'http' in target_protocol.lower():
            return 'http'
        return 'unknown'

    @classmethod
    def generate_anonymity(cls, target_anonymity) -> str:
        if target_anonymity.lower() in (
            'anonymous',
            'elite',
        ):
            return 'anonymous'
        elif target_anonymity.lower() in (
            'transparent',
        ):
            return 'transparent'
        return 'unknown'

    @classmethod
    async def format_raw(cls, html: str) -> list:
        """Parse the proxies.json payload, customized for this source"""
        return [
            {
                'zone': cls.zone.upper(),
                'method': cls.generate_method(item['type']),
                'anonymity': cls.generate_anonymity(item['anonymity']),
                'protocol': cls.generate_protocol(item['type']),
                'port': item['port'],
                'ip': item['ip'],
            } for item in ast.literal_eval(html)
        ]

    @classmethod
    async def format_data(cls, zone: str, data: dict, queue: Queue) -> None:
        """Data formatter, formats data and returns it back in the process Queue"""
        queue.put(data)
        return queue
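Illustrative only: a tiny payload shaped like the upstream proxies.json (the exact upstream field names and values are an assumption here), run through this parser's ast.literal_eval-based format_raw:

    import asyncio

    sample = "[{'ip': '1.2.3.4', 'port': '1080', 'type': 'SOCKS5', 'anonymity': 'Elite'}]"

    async def demo() -> list:
        await ParserSunnyProxyScraper.format_url('unused', zone='nl')
        return await ParserSunnyProxyScraper.format_raw(sample)

    rows = asyncio.run(demo())
    # rows[0] == {'zone': 'NL', 'method': 'socks5', 'anonymity': 'anonymous',
    #             'protocol': 'socks5', 'port': '1080', 'ip': '1.2.3.4'}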
