Commit e060620

Merge pull request #388 from code-kern-ai/regex-changes
Updates regex expressions
2 parents 56328f6 + 4f9eeba

File tree

12 files changed: +38 additions, -80 deletions

extractors/numbers/ip_extraction/__init__.py
Lines changed: 0 additions & 1 deletion

@@ -22,7 +22,6 @@ def ip_extraction(request: IpExtractionModel):
     nlp = SpacySingleton.get_nlp(request.spacyTokenizer)
     doc = nlp(text)
     regex = re.compile(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b")
-    regex.findall(text)
 
     ip_addresses = []
     for match in regex.finditer(text):
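Note: the deleted `regex.findall(text)` call computed a result that was never used; the `finditer` loop below it already does the matching. A minimal sketch of the retained behaviour, with an illustrative input string:

```python
import re

# Same pattern as kept in the diff; finditer alone yields every dotted-quad match,
# so the discarded findall() result added nothing.
regex = re.compile(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b")
text = "Ping 192.168.0.1, then fall back to 10.0.0.254."
print([m.group() for m in regex.finditer(text)])  # ['192.168.0.1', '10.0.0.254']
```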

extractors/numbers/isbn_extraction/__init__.py
Lines changed: 1 addition & 1 deletion

@@ -28,5 +28,5 @@ def isbn_extraction(request: IsbnExtractionModel):
     for match in regex.finditer(text):
         start, end = match.span()
         span = doc.char_span(start, end, alignment_mode="expand")
-        isbn.append([span.start, span.end, span.text])
+        isbn.append(["isbn", span.start, span.end])
     return {"isbn": isbn}
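Note: each appended entry is now label-first (["isbn", token_start, token_end]), matching the shape returned by the other extractors, instead of ending with the raw span text. A minimal sketch of the new output shape; the ISBN pattern below is a simplified, hypothetical stand-in, since the real one is defined earlier in the module and is not part of this hunk:

```python
import re
import spacy

nlp = spacy.load("en_core_web_sm")
text = "Order ISBN 978-3-16-148410-0 today."
doc = nlp(text)
pattern = re.compile(r"\b97[89][- ]?\d{1,5}[- ]?\d+[- ]?\d+[- ]?\d\b")  # hypothetical stand-in

isbn = []
for match in pattern.finditer(text):
    start, end = match.span()
    span = doc.char_span(start, end, alignment_mode="expand")
    isbn.append(["isbn", span.start, span.end])  # label first, then token indices
print({"isbn": isbn})
```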
extractors/numbers/percentage_extraction/__init__.py
Lines changed: 10 additions & 28 deletions

@@ -1,4 +1,3 @@
-
 from pydantic import BaseModel
 from extractors.util.spacy import SpacySingleton
 import re
@@ -11,39 +10,22 @@
 
 class PercentageExtractionModel(BaseModel):
     text: str
-    regex: str = r"(-?\d+(?:[.,]\d*)?|-?[.,]\d+)%"
     spacyTokenizer: str = "en_core_web_sm"
-    yourLabel: str = "percentage"
 
     class Config:
         schema_extra = {"example": INPUT_EXAMPLE}
 
 
 def percentage_extraction(request: PercentageExtractionModel):
-    """Extracts percentages from a given text."""
-    nlp = SpacySingleton.get_nlp(request.spacyTokenizer)
-    doc = nlp(request.text)
-
-    matches = []
-
-    def regex_search(pattern, string):
-        """
-        some helper function to easily iterate over regex matches
-        """
-        prev_end = 0
-        while True:
-            match = re.search(pattern, string)
-            if not match:
-                break
+    """Extracts the Percentages from a text"""
 
-            start, end = match.span()
-            yield start + prev_end, end + prev_end
-
-            prev_end += end
-            string = string[end:]
-
-    for start, end in regex_search(request.regex, request.text):
+    text = request.text
+    nlp = SpacySingleton.get_nlp(request.spacyTokenizer)
+    doc = nlp(text)
+    regex = re.compile(r"(-?\d+(?:[.,]\d*)?|-?[.,]\d+)\s*%")
+    percentages = []
+    for match in regex.finditer(text):
+        start, end = match.span()
         span = doc.char_span(start, end, alignment_mode="expand")
-        matches.append([request.yourLabel, span.start, span.end])
-
-    return {f"{request.yourLabel}s": matches}
+        percentages.append(["percentage", span.start, span.end])
+    return {"percentages": percentages}
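Note: the refactor drops the hand-rolled `regex_search` generator and the configurable `regex`/`yourLabel` request fields in favour of a fixed pattern and `re.finditer`; the added `\s*` also tolerates whitespace before the percent sign. A small sketch of what the new pattern captures, with an illustrative input:

```python
import re

# The pattern now hard-coded in percentage_extraction; \s* allows "42,13 %"-style spacing.
regex = re.compile(r"(-?\d+(?:[.,]\d*)?|-?[.,]\d+)\s*%")
text = "Up 110%, down -.5%, roughly 42,13 % overall."
print([m.group() for m in regex.finditer(text)])  # ['110%', '-.5%', '42,13 %']
```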

extractors/numbers/percentage_extraction/code_snippet_common.md
Lines changed: 14 additions & 19 deletions

@@ -3,26 +3,20 @@ import re
 import spacy
 from typing import List, Tuple
 
-def percentage_extraction(text: str, extraction_keyword: str, regex_pattern: str) -> List[Tuple[str, int]]:
-
-    def regex_search(pattern, string):
-        prev_end = 0
-        while True:
-            match = re.search(pattern, string)
-            if not match:
-                break
-
-            start_, end_ = match.span()
-            yield start_ + prev_end, end_ + prev_end
-
-            prev_end += end_
-            string = string[end_:]
-
+def percentage_extraction(text: str, extraction_keyword:str) -> List[Tuple[str, int, int]]:
+    """
+    @param text: the input text
+    @param extraction_keyword: the label that is assigned to extracted words
+    @return: positions of extracted percentages
+    """
     nlp = spacy.load("en_core_web_sm")
     doc = nlp(text)
 
-    percentage_positions = []
-    for start, end in regex_search(regex_pattern, text):
+    regex = re.compile(r"(-?\d+(?:[.,]\d*)?|-?[.,]\d+)\s*%")
+
+    percentage_positions = []
+    for match in regex.finditer(text):
+        start, end = match.span()
         span = doc.char_span(start, end, alignment_mode="expand")
         percentage_positions.append((extraction_keyword, span.start, span.end))
     return percentage_positions
@@ -33,14 +27,15 @@ def percentage_extraction(text: str, extraction_keyword: str, regex_pattern: str
 
 def example_integration():
     texts = ["percentages 110% are found -.5% at 42,13% positions 1, 5 and 8", "Apple stock fell today."]
-    regex_pattern = r"(-?\d+(?:[.,]\d*)?|-?[.,]\d+)%"
     extraction_keyword = "percentage"
     for text in texts:
-        found = percentage_extraction(text, regex_pattern, extraction_keyword)
+        found = percentage_extraction(text, extraction_keyword)
         if found:
             print(f"text: \"{text}\" has {extraction_keyword} -> \"{found}\"")
         else:
             print(f"text: \"{text}\" doesn't have {extraction_keyword}")
 
 example_integration()
+
+
 ```
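Note: the removed `regex_search` helper re-implemented, by slicing the string and tracking `prev_end`, what `re.finditer` already provides: successive non-overlapping matches with absolute offsets into the original string. A quick illustration:

```python
import re

# Match spans from finditer are already absolute positions in the input,
# so no manual offset bookkeeping is needed.
text = "10% now, 25% later"
for m in re.finditer(r"\d+%", text):
    print(m.span(), m.group())  # (0, 3) 10%  then  (9, 12) 25%
```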

extractors/numbers/percentage_extraction/code_snippet_refinery.md
Lines changed: 5 additions & 16 deletions

@@ -2,25 +2,14 @@
 import re
 
 ATTRIBUTE: str = "text" # only text attributes
-REGEX: str = r"(-?\d+(?:[.,]\d*)?|-?[.,]\d+)%" # this will capture all percentages
-LABEL: str = "percentage" # Choose any available label here
+LABEL: str = "percentage"
 
 def percentage_extraction(record):
+    regex = re.compile(r"(-?\d+(?:[.,]\d*)?|-?[.,]\d+)\s*%")
+    text = record[ATTRIBUTE].text # SpaCy doc, hence we need to use .text to get the string
 
-    def regex_search(pattern, string):
-        prev_end = 0
-        while True:
-            match = re.search(pattern, string)
-            if not match:
-                break
-
-            start_, end_ = match.span()
-            yield start_ + prev_end, end_ + prev_end
-
-            prev_end += end_
-            string = string[end_:]
-
-    for start, end in regex_search(REGEX, record[ATTRIBUTE].text):
+    for match in regex.finditer(text):
+        start, end = match.span()
         span = record[ATTRIBUTE].char_span(start, end, alignment_mode="expand")
         yield LABEL, span.start, span.end
 ```
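A side note on the `char_span` call these snippets rely on: with `alignment_mode="expand"`, character offsets that start or end inside a token are widened to whole-token boundaries instead of returning None. A minimal sketch, using the same small English model the common snippet loads:

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Growth of 12.5 % expected.")
# Offsets 13-16 start in the middle of the token "12.5"; "expand" widens the span
# to full tokens, so downstream code always gets clean token indices.
span = doc.char_span(13, 16, alignment_mode="expand")
print(span.text, span.start, span.end)  # "12.5 %" plus its token start/end
```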

extractors/numbers/percentage_extraction/config.py
Lines changed: 1 addition & 8 deletions

@@ -31,20 +31,13 @@ def get_config():
         },
         "LABEL": {
             "selectionType": SelectionType.CHOICE.value,
-            "defaultValue": "isbn",
+            "defaultValue": "percentage",
             "optional": "false",
             "addInfo": [
                 BricksVariableType.LABEL.value,
                 BricksVariableType.GENERIC_STRING.value,
             ],
         },
-        "REGEX": {
-            "selectionType": SelectionType.STRING.value,
-            "defaultValue": "(-?\d+(?:[.,]\d*)?|-?[.,]\d+)%",
-            "description": "Choose any regex here",
-            "optional": "false",
-            "addInfo": [BricksVariableType.REGEX.value],
-        },
     },
 },
 )

extractors/paths/url_extraction/__init__.py
Lines changed: 1 addition & 1 deletion

@@ -22,7 +22,7 @@ def url_extraction(request: UrlExtractionModel):
     nlp = SpacySingleton.get_nlp(request.spacyTokenizer)
     doc = nlp(text)
 
-    regex_pattern = re.compile(r"(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-&?=%.]+")
+    regex_pattern = re.compile(r"(?:(?:(?:https?|ftp):\/\/){1})?[\w\-\/?=%.]{3,}\.[\/\w\-&?=%.]{2,}")
     regex_pattern.findall(text)
 
     urls = []
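Note: the tightened pattern requires at least three characters before a dot and two after it ({3,} and {2,}), which filters out short false positives such as abbreviations while still matching real URLs. A before/after sketch on an illustrative string:

```python
import re

old = re.compile(r"(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-&?=%.]+")
new = re.compile(r"(?:(?:(?:https?|ftp):\/\/){1})?[\w\-\/?=%.]{3,}\.[\/\w\-&?=%.]{2,}")

text = "See https://kern.ai, e.g. the docs at docs.kern.ai/bricks"
print(old.findall(text))  # picks up the abbreviation 'e.g.' as a URL
print(new.findall(text))  # the length requirements skip it; real URLs still match
```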

extractors/paths/url_extraction/code_snippet_common.md
Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@ def url_extraction(text: str, extraction_keyword: str) -> List[Tuple[str, int]]:
     npl = spacy.load("en_core_web_sm")
     doc = npl(text)
 
-    regex_pattern = re.compile(r"(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-&?=%.]+")
+    regex_pattern = re.compile(r"(?:(?:(?:https?|ftp):\/\/){1})?[\w\-\/?=%.]{3,}\.[\/\w\-&?=%.]{2,}")
 
     url_positions = []
     for match in regex_pattern.finditer(text):

extractors/paths/url_extraction/code_snippet_refinery.md
Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@ LABEL: str = "url"
 
 def url_extraction(record):
     text = record[ATTRIBUTE].text # SpaCy doc, hence we need to use .text to get the string.
-    regex_pattern = re.compile(r"(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-&?=%.]+")
+    regex_pattern = re.compile(r"(?:(?:(?:https?|ftp):\/\/){1})?[\w\-\/?=%.]{3,}\.[\/\w\-&?=%.]{2,}")
 
     for match in regex_pattern.finditer(text):
         start, end = match.span()

extractors/words/goodbye_extraction/__init__.py
Lines changed: 2 additions & 2 deletions

@@ -25,12 +25,12 @@ def goodbye_extraction(request: GoodbyeExtractionModel):
     text = request.text
     nlp = SpacySingleton.get_nlp(request.spacyTokenizer)
     doc = nlp(text)
-    regex = re.compile(r"((?:((?i)good)(?:[ ])?)?((?i)bye)|(?i)Ciao|(?:((?i)see you)(?:[ ]?)((?i)tomorrow|later|soon)?))")
+    regex = re.compile(r"((?:(good)(?:[ ])?)?(bye)|Ciao|(?:(see you)(?:[ ]?)(tomorrow|later|soon)?))", re.IGNORECASE)
 
     farewell = []
     for match in regex.finditer(text):
         start, end = match.span()
         span = doc.char_span(start, end, alignment_mode="expand")
-        farewell.append(["span", span.start, span.end])
+        farewell.append(["farewellWords", span.start, span.end])
 
     return {"farewellWords": farewell}
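Note: compiling with re.IGNORECASE replaces the inline (?i) markers that were scattered through the pattern; in Python 3.11 and later, global inline flags are only accepted at the very start of an expression, so the old pattern no longer compiles there. The second fix makes the appended label match the "farewellWords" key of the returned dict. A quick check of the new behaviour on an illustrative sentence:

```python
import re

regex = re.compile(
    r"((?:(good)(?:[ ])?)?(bye)|Ciao|(?:(see you)(?:[ ]?)(tomorrow|later|soon)?))",
    re.IGNORECASE,
)
text = "Goodbye everyone, see you tomorrow. ciao!"
print([m.group() for m in regex.finditer(text)])  # ['Goodbye', 'see you tomorrow', 'ciao']
```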
