Skip to content

Commit 87b784f

Browse files
Merge pull request #371 from code-kern-ai/location-extraction
Location extraction
2 parents 043a08c + 67f9c6e commit 87b784f

File tree

7 files changed

+128
-1
lines changed

7 files changed

+128
-1
lines changed

extractors/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
)
4545

4646
from .personal_identifiers import (
47+
location_extraction,
4748
address_extraction,
4849
email_extraction,
4950
person_extraction,
@@ -109,6 +110,7 @@
109110
bic_extraction,
110111
deberta_ner_extraction,
111112
bert_ner_extraction,
113+
location_extraction,
112114
]:
113115
module_name = module.__name__.split(".")[-1]
114116
model_name = (

extractors/personal_identifiers/address_extraction/config.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ def get_config():
1111
issue_id=62,
1212
tabler_icon="AddressBook",
1313
min_refinery_version="1.7.0",
14-
state=State.DRAFT.value,
14+
state=State.PUBLIC.value,
1515
type="python_function",
1616
available_for=["refinery", "common"],
1717
part_of_group=[
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Uses spaCy to extract locations: geopolitical entities such as countries, cities, and states (GPE), as well as other named places like mountain ranges and rivers (LOC).
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
from pydantic import BaseModel
2+
from extractors.util.spacy import SpacySingleton
3+
4+
INPUT_EXAMPLE = {
5+
"text": "Tokyo is a beautiful city, which is not located in Kansas, USA.",
6+
"spacyTokenizer": "en_core_web_sm",
7+
}
8+
9+
10+
class LocationExtractionModel(BaseModel):
    """Request body accepted by ``location_extraction``."""

    # Text that is searched for location entities.
    text: str
    # Name of the spaCy pipeline passed to ``SpacySingleton.get_nlp``.
    spacyTokenizer: str = "en_core_web_sm"

    class Config:
        # Example payload registered through pydantic's ``schema_extra``.
        schema_extra = {"example": INPUT_EXAMPLE}
16+
17+
18+
def location_extraction(req: LocationExtractionModel):
    """Extract location entities from ``req.text`` with spaCy.

    The pipeline named by ``req.spacyTokenizer`` is applied to the text and
    every entity labelled ``GPE`` or ``LOC`` is kept.

    Returns a dict ``{"locations": [...]}`` whose entries are
    ``["location", ent.start, ent.end]`` lists, one per matching entity
    (``start``/``end`` are the values reported by the spaCy entity span).
    """
    source_text = req.text
    pipeline = SpacySingleton.get_nlp(req.spacyTokenizer)
    location_spans = [
        ["location", entity.start, entity.end]
        for entity in pipeline(source_text).ents
        if entity.label_ in ("GPE", "LOC")
    ]
    return {"locations": location_spans}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
```python
2+
import spacy
3+
from typing import List, Tuple
4+
5+
loaded_models = {}
6+
def load_spacy(spacy_model):
    """Return the spaCy pipeline for ``spacy_model``, loading it only on first use.

    Loaded pipelines are stored in the module-level ``loaded_models`` dict,
    keyed by model name, and handed back on later calls.
    """
    if spacy_model in loaded_models:
        return loaded_models[spacy_model]
    pipeline = loaded_models[spacy_model] = spacy.load(spacy_model)
    return pipeline
10+
11+
12+
def location_extraction(text: str, extraction_keyword: str, spacy_model: str = "en_core_web_sm") -> List[Tuple[str, int, int]]:
    """
    @param text: the input text
    @param extraction_keyword: the label that is assigned to extracted words
    @param spacy_model: name of the spaCy model used to process the text
    @return: (extraction_keyword, start, end) tuples, one per extracted location
    """
    doc = load_spacy(spacy_model)(text)

    # GPE and LOC are the two entity labels treated as locations here.
    return [
        (extraction_keyword, ent.start, ent.end)
        for ent in doc.ents
        if ent.label_ in ("GPE", "LOC")
    ]
26+
27+
28+
# ↑ necessary bricks function
29+
# -----------------------------------------------------------------------------------------
30+
# ↓ example implementation
31+
32+
def example_integration():
    """Run ``location_extraction`` over a few sample sentences and print the result of each."""
    extraction_keyword = "location"
    texts = [
        "Tokyo is a place in Japan.",
        "My hometown is Cologne in Northrhine-Westphalia.",
        "She's from Berlin and likes EDM.",
        "Man I love pasta.",
    ]
    for text in texts:
        found = location_extraction(text, extraction_keyword)
        if not found:
            # Nothing extracted for this sentence.
            print(f"text: \"{text}\" doesn't have {extraction_keyword}")
            continue
        print(f"text: \"{text}\" has {extraction_keyword} -> {found}")
41+
42+
example_integration()
43+
```
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
```python
2+
ATTRIBUTE: str = "text" # only text attributes
LABEL: str = "location"

def location_extraction(record):
    """Yield ``(LABEL, start, end)`` for every GPE or LOC entity in ``record[ATTRIBUTE]``.

    ``record[ATTRIBUTE]`` must expose an ``ents`` iterable whose items carry
    ``label_``, ``start`` and ``end`` attributes.
    """
    entities = record[ATTRIBUTE].ents
    yield from (
        (LABEL, entity.start, entity.end)
        for entity in entities
        if entity.label_ in ("GPE", "LOC")
    )
9+
```
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
from util.configs import build_extractor_function_config
2+
from util.enums import State, RefineryDataType, BricksVariableType, SelectionType
3+
from . import location_extraction, INPUT_EXAMPLE
4+
5+
6+
def get_config():
    """Return the extractor function config for ``location_extraction``."""
    # bricks integrator information
    attribute_variable = {
        "selectionType": SelectionType.CHOICE.value,
        "addInfo": [
            BricksVariableType.ATTRIBUTE.value,
            BricksVariableType.GENERIC_STRING.value,
        ],
    }
    label_variable = {
        "selectionType": SelectionType.CHOICE.value,
        "defaultValue": "location",
        "addInfo": [
            BricksVariableType.LABEL.value,
            BricksVariableType.GENERIC_STRING.value,
        ],
    }
    integrator_inputs = {
        "name": "location_extraction",
        "refineryDataType": RefineryDataType.TEXT.value,
        "variables": {
            "ATTRIBUTE": attribute_variable,
            "LABEL": label_variable,
        },
    }

    return build_extractor_function_config(
        function=location_extraction,
        input_example=INPUT_EXAMPLE,
        issue_id=369,
        tabler_icon="Location",
        min_refinery_version="1.7.0",
        state=State.PUBLIC.value,
        type="python_function",
        available_for=["refinery", "common"],
        # first entry should be parent directory
        part_of_group=["personal_identifiers"],
        cognition_init_mapping={"@@LABEL@@": "Location"},
        integrator_inputs=integrator_inputs,
    )

0 commit comments

Comments
 (0)