Skip to content

Commit 6c334ce

Browse files
committed
Updates regex expressions
1 parent a498d0e commit 6c334ce

File tree

9 files changed

+37
-70
lines changed

9 files changed

+37
-70
lines changed

extractors/numbers/percentage_extraction/__init__.py

Lines changed: 10 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -8,42 +8,25 @@
88
"spacyTokenizer": "en_core_web_sm",
99
}
1010

11-
1211
class PercentageExtractionModel(BaseModel):
1312
text: str
14-
regex: str = r"(-?\d+(?:[.,]\d*)?|-?[.,]\d+)%"
1513
spacyTokenizer: str = "en_core_web_sm"
16-
yourLabel: str = "percentage"
1714

1815
class Config:
1916
schema_extra = {"example": INPUT_EXAMPLE}
2017

2118

2219
def percentage_extraction(request: PercentageExtractionModel):
23-
"""Extracts percentages from a given text."""
24-
nlp = SpacySingleton.get_nlp(request.spacyTokenizer)
25-
doc = nlp(request.text)
26-
27-
matches = []
28-
29-
def regex_search(pattern, string):
30-
"""
31-
some helper function to easily iterate over regex matches
32-
"""
33-
prev_end = 0
34-
while True:
35-
match = re.search(pattern, string)
36-
if not match:
37-
break
20+
"""Extracts the Percentages from a text"""
3821

39-
start, end = match.span()
40-
yield start + prev_end, end + prev_end
41-
42-
prev_end += end
43-
string = string[end:]
22+
text = request.text
23+
nlp = SpacySingleton.get_nlp(request.spacyTokenizer)
24+
doc = nlp(text)
25+
regex = re.compile(r"(?:[\d-]{17}|[\d-]{13})")
4426

45-
for start, end in regex_search(request.regex, request.text):
27+
p = []
28+
for match in regex.finditer(text):
29+
start, end = match.span()
4630
span = doc.char_span(start, end, alignment_mode="expand")
47-
matches.append([request.yourLabel, span.start, span.end])
48-
49-
return {f"{request.yourLabel}s": matches}
31+
p.append([span.start, span.end, span.text])
32+
return {"percentages": p}

extractors/numbers/percentage_extraction/code_snippet_common.md

Lines changed: 16 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -3,44 +3,39 @@ import re
33
import spacy
44
from typing import List, Tuple
55

6-
def percentage_extraction(text: str, extraction_keyword: str, regex_pattern: str) -> List[Tuple[str, int]]:
7-
8-
def regex_search(pattern, string):
9-
prev_end = 0
10-
while True:
11-
match = re.search(pattern, string)
12-
if not match:
13-
break
14-
15-
start_, end_ = match.span()
16-
yield start_ + prev_end, end_ + prev_end
17-
18-
prev_end += end_
19-
string = string[end_:]
20-
6+
def percentage_extraction(text: str, extraction_keyword:str) -> List[Tuple[str, int, int]]:
7+
"""
8+
@param text: the input text
9+
@param extraction_keyword: the label that is assigned to extracted words
10+
@return: positions of extracted percentages
11+
"""
2112
nlp = spacy.load("en_core_web_sm")
2213
doc = nlp(text)
2314

24-
percentage_positions = []
25-
for start, end in regex_search(regex_pattern, text):
15+
regex = re.compile(r"(-?\d+(?:[.,]\d*)?|-?[.,]\d+)%")
16+
17+
percentage_positions = []
18+
for match in regex.finditer(text):
19+
start, end = match.span()
2620
span = doc.char_span(start, end, alignment_mode="expand")
27-
percentage_positions.append((extraction_keyword, span.start, span.end))
28-
return percentage_positions
21+
percentage_positions.append((extraction_keyword, span.start, span.end))
22+
return percentage_positions
2923

3024
# ↑ necessary bricks function
3125
# -----------------------------------------------------------------------------------------
3226
# ↓ example implementation
3327

3428
def example_integration():
3529
texts = ["percentages 110% are found -.5% at 42,13% positions 1, 5 and 8", "Apple stock fell today."]
36-
regex_pattern = r"(-?\d+(?:[.,]\d*)?|-?[.,]\d+)%"
3730
extraction_keyword = "percentage"
3831
for text in texts:
39-
found = percentage_extraction(text, regex_pattern, extraction_keyword)
32+
found = percentage_extraction(text, extraction_keyword)
4033
if found:
4134
print(f"text: \"{text}\" has {extraction_keyword} -> \"{found}\"")
4235
else:
4336
print(f"text: \"{text}\" doesn't have {extraction_keyword}")
4437

4538
example_integration()
39+
40+
4641
```

extractors/numbers/percentage_extraction/code_snippet_refinery.md

Lines changed: 5 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,25 +2,14 @@
22
import re
33

44
ATTRIBUTE: str = "text" # only text attributes
5-
REGEX: str = r"(-?\d+(?:[.,]\d*)?|-?[.,]\d+)%" # this will capture all percentages
6-
LABEL: str = "percentage" # Choose any available label here
5+
LABEL: str = "percentage"
76

87
def percentage_extraction(record):
8+
regex = re.compile(r"(-?\d+(?:[.,]\d*)?|-?[.,]\d+)%")
9+
text = record[ATTRIBUTE].text # SpaCy doc, hence we need to use .text to get the string
910

10-
def regex_search(pattern, string):
11-
prev_end = 0
12-
while True:
13-
match = re.search(pattern, string)
14-
if not match:
15-
break
16-
17-
start_, end_ = match.span()
18-
yield start_ + prev_end, end_ + prev_end
19-
20-
prev_end += end_
21-
string = string[end_:]
22-
23-
for start, end in regex_search(REGEX, record[ATTRIBUTE].text):
11+
for match in regex.finditer(text):
12+
start, end = match.span()
2413
span = record[ATTRIBUTE].char_span(start, end, alignment_mode="expand")
2514
yield LABEL, span.start, span.end
2615
```

extractors/paths/url_extraction/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def url_extraction(request: UrlExtractionModel):
2222
nlp = SpacySingleton.get_nlp(request.spacyTokenizer)
2323
doc = nlp(text)
2424

25-
regex_pattern = re.compile(r"(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-&?=%.]+")
25+
regex_pattern = re.compile(r"(?:(?:(?:https?|ftp):\/\/){1})?[\w\-\/?=%.]{3,}\.[\/\w\-&?=%.]{2,}")
2626
regex_pattern.findall(text)
2727

2828
urls = []

extractors/paths/url_extraction/code_snippet_common.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ def url_extraction(text: str, extraction_keyword: str) -> List[Tuple[str, int]]:
1212
npl = spacy.load("en_core_web_sm")
1313
doc = npl(text)
1414

15-
regex_pattern = re.compile(r"(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-&?=%.]+")
15+
regex_pattern = re.compile(r"(?:(?:(?:https?|ftp):\/\/){1})?[\w\-\/?=%.]{3,}\.[\/\w\-&?=%.]{2,}")
1616

1717
url_positions = []
1818
for match in regex_pattern.finditer(text):

extractors/paths/url_extraction/code_snippet_refinery.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ LABEL: str = "url"
66

77
def url_extraction(record):
88
text = record[ATTRIBUTE].text # SpaCy doc, hence we need to use .text to get the string.
9-
regex_pattern = re.compile(r"(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-&?=%.]+")
9+
regex_pattern = re.compile(r"(?:(?:(?:https?|ftp):\/\/){1})?[\w\-\/?=%.]{3,}\.[\/\w\-&?=%.]{2,}")
1010

1111
for match in regex_pattern.finditer(text):
1212
start, end = match.span()

extractors/words/goodbye_extraction/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def goodbye_extraction(request: GoodbyeExtractionModel):
2525
text = request.text
2626
nlp = SpacySingleton.get_nlp(request.spacyTokenizer)
2727
doc = nlp(text)
28-
regex = re.compile(r"((?:((?i)good)(?:[ ])?)?((?i)bye)|(?i)Ciao|(?:((?i)see you)(?:[ ]?)((?i)tomorrow|later|soon)?))")
28+
regex = re.compile(r"((?:(good)(?:[ ])?)?(bye)|Ciao|(?:(see you)(?:[ ]?)(tomorrow|later|soon)?))", re.IGNORECASE)
2929

3030
farewell = []
3131
for match in regex.finditer(text):

extractors/words/goodbye_extraction/code_snippet_common.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ def goodbye_extraction(text: str, extraction_keyword: str) -> List[Tuple[str,int
1212
nlp = spacy.load("en_core_web_sm")
1313
doc = nlp(text)
1414

15-
regex = re.compile(r"((?:((?i)good)(?:[ ])?)?((?i)bye)|(?i)Ciao|(?:((?i)see you)(?:[ ]?)((?i)tomorrow|later|soon)?))")
15+
regex = re.compile(r"((?:(good)(?:[ ])?)?(bye)|Ciao|(?:(see you)(?:[ ]?)(tomorrow|later|soon)?))", re.IGNORECASE)
1616

1717
goodbye_positions = []
1818
for match in regex.finditer(text):

extractors/words/goodbye_extraction/code_snippet_refinery.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ ATTRIBUTE: str = "text" # only text attributes
55
LABEL: str = "goodbye"
66

77
def goodbye_extraction(record):
8-
regex = re.compile(r"((?:((?i)good)(?:[ ])?)?((?i)bye)|(?i)Ciao|(?:((?i)see you)(?:[ ]?)((?i)tomorrow|later|soon)?))")
8+
regex = re.compile(r"((?:(good)(?:[ ])?)?(bye)|Ciao|(?:(see you)(?:[ ]?)(tomorrow|later|soon)?))", re.IGNORECASE)
99
text = record[ATTRIBUTE].text # SpaCy doc, hence we need to use .text to get the string.
1010

1111
for match in regex.finditer(text):

0 commit comments

Comments
 (0)