-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextractor_engine.py
More file actions
55 lines (48 loc) · 1.86 KB
/
extractor_engine.py
File metadata and controls
55 lines (48 loc) · 1.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import extruct, json, re, phonenumbers, spacy
from bs4 import BeautifulSoup
from w3lib.html import get_base_url
from email_validator import validate_email, EmailNotValidError
# Shared spaCy pipeline, loaded once when this module is imported.
# extract_about_text() uses it to split page text into sentences.
nlp = spacy.load("en_core_web_sm")
def extract_structured(html, url):
    """Extract JSON-LD and microdata markup embedded in a page.

    Args:
        html: The page's HTML markup.
        url: The URL the page was fetched from; passed to ``get_base_url``
            together with ``html`` to determine the base URL.

    Returns:
        The result of ``extruct.extract`` restricted to the ``json-ld`` and
        ``microdata`` syntaxes (presumably a dict keyed by syntax name --
        confirm against the extruct documentation).
    """
    return extruct.extract(
        html,
        base_url=get_base_url(html, url),
        syntaxes=['json-ld', 'microdata'],
    )
def extract_emails(text):
    """Return the unique, valid e-mail addresses found in ``text``.

    Candidates are located with a permissive regular expression; each
    distinct candidate is then passed to ``validate_email`` and dropped if
    that raises ``EmailNotValidError``.

    Args:
        text: Plain text to scan. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of the matched address strings exactly as they appear in
        ``text``, without duplicates, in order of first appearance.
    """
    if not text:
        return []

    candidates = re.findall(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}', text)

    valid = []
    # dict.fromkeys() de-duplicates while keeping first-seen order, so each
    # distinct address is validated only once and the result order is
    # stable (set iteration order is not guaranteed).
    for candidate in dict.fromkeys(candidates):
        try:
            validate_email(candidate)
        except EmailNotValidError:
            continue
        valid.append(candidate)
    return valid
def extract_phones(text, region="US"):
    """Return the unique phone numbers found in ``text``.

    Numbers are located with ``phonenumbers.PhoneNumberMatcher`` and
    rendered in the library's INTERNATIONAL format.

    Args:
        text: Plain text to scan. ``None`` or an empty string yields ``[]``.
        region: Region code handed to ``PhoneNumberMatcher`` as the default
            region. Defaults to ``"US"``, the value that used to be
            hard-coded, so existing callers are unaffected.

    Returns:
        A list of formatted phone-number strings without duplicates, in
        order of first appearance.
    """
    if not text:
        return []

    intl_format = phonenumbers.PhoneNumberFormat.INTERNATIONAL
    formatted = (
        phonenumbers.format_number(match.number, intl_format)
        for match in phonenumbers.PhoneNumberMatcher(text, region)
    )
    # dict.fromkeys() removes duplicates but, unlike set(), keeps a stable
    # first-seen ordering.
    return list(dict.fromkeys(formatted))
def extract_about_text(html):
    """Return a short "about"-style description pulled from an HTML page.

    The page's ``<p>`` and ``<div>`` elements whose text is longer than 50
    characters are collected. Blocks mentioning "about" or "mission"
    (case-insensitive) are preferred; if there are none, the collected text
    is split into sentences with spaCy and sentences of more than 10 words
    are used instead.

    Args:
        html: The page's HTML markup. Empty or ``None`` yields ``""``.

    Returns:
        Up to three candidates (text blocks, or sentences in the fallback
        case) joined by single spaces; ``""`` if nothing qualifies.
    """
    if not html:
        return ""

    soup = BeautifulSoup(html, "lxml")

    # A wrapper <div> repeats the text of every block nested inside it, so
    # taking every long <p>/<div> returns the same content once per nesting
    # level. Keep only the innermost long blocks: any element that has a
    # long <p>/<div> somewhere beneath it is skipped.
    # NOTE(review): text sitting directly in a wrapper next to a long child
    # block is dropped by this rule.
    long_blocks = []
    has_long_descendant = set()
    for tag in soup.find_all(['p', 'div']):
        if len(tag.get_text(strip=True)) <= 50:
            continue
        long_blocks.append(tag)
        has_long_descendant.update(id(parent) for parent in tag.parents)

    # dict.fromkeys() drops verbatim repeats while preserving document order.
    text_blocks = list(dict.fromkeys(
        tag.get_text(" ", strip=True)
        for tag in long_blocks
        if id(tag) not in has_long_descendant
    ))

    about_candidates = []
    for block in text_blocks:
        lowered = block.lower()
        if "about" in lowered or "mission" in lowered:
            about_candidates.append(block)

    if not about_candidates:
        # Fallback: no block mentions "about"/"mission", so use the longer
        # sentences instead. The input is capped at nlp.max_length because
        # spaCy refuses to process longer texts.
        joined = " ".join(text_blocks)
        doc = nlp(joined[:nlp.max_length])
        about_candidates = [sent.text for sent in doc.sents if len(sent.text.split()) > 10]

    return " ".join(about_candidates[:3])  # first 3 blocks (or sentences)
def extract_contacts(html):
    """Collect the e-mail addresses and phone numbers visible in a page.

    The markup is parsed with BeautifulSoup (lxml parser) and reduced to
    its text, with pieces separated by single spaces, before scanning.

    Args:
        html: The page's HTML markup.

    Returns:
        A dict with two keys: ``"emails"`` (result of ``extract_emails``)
        and ``"phones"`` (result of ``extract_phones``).
    """
    soup = BeautifulSoup(html, "lxml")
    page_text = soup.get_text(" ", strip=True)

    contacts = {}
    contacts["emails"] = extract_emails(page_text)
    contacts["phones"] = extract_phones(page_text)
    return contacts
def extract_all(html, url):
    """Run every extractor on one page and merge the results.

    Args:
        html: The page's HTML markup.
        url: The URL the page was fetched from.

    Returns:
        A dict containing ``"url"``, ``"structured"`` (from
        ``extract_structured``), ``"about_text"`` (from
        ``extract_about_text``) and every key returned by
        ``extract_contacts``.
    """
    result = {
        "url": url,
        "structured": extract_structured(html, url),
        "about_text": extract_about_text(html),
    }
    # Contact keys are merged last, matching the original key order.
    result.update(extract_contacts(html))
    return result