Merge pull request #371 from code-kern-ai/location-extraction

LeonardPuettmannKern · web-flow · commit 87b784f4a498 · 2023-10-17T17:12:52.000+02:00
Location extraction
diff --git a/extractors/__init__.py b/extractors/__init__.py
@@ -44,6 +44,7 @@
 )
 
 from .personal_identifiers import (
+    location_extraction,
     address_extraction,
     email_extraction,
     person_extraction,
@@ -109,6 +110,7 @@
     bic_extraction,
     deberta_ner_extraction,
     bert_ner_extraction,
+    location_extraction,
 ]:
     module_name = module.__name__.split(".")[-1]
     model_name = (
diff --git a/extractors/personal_identifiers/address_extraction/config.py b/extractors/personal_identifiers/address_extraction/config.py
@@ -11,7 +11,7 @@ def get_config():
         issue_id=62,
         tabler_icon="AddressBook",
         min_refinery_version="1.7.0",
-        state=State.DRAFT.value,
+        state=State.PUBLIC.value,
         type="python_function",
         available_for=["refinery", "common"],
         part_of_group=[
diff --git a/extractors/personal_identifiers/location_extraction/README.md b/extractors/personal_identifiers/location_extraction/README.md
@@ -0,0 +1 @@
+Uses spaCy to extract locations such as cities, states and countries (GPE) as well as other named places like mountains and rivers (LOC).
diff --git a/extractors/personal_identifiers/location_extraction/__init__.py b/extractors/personal_identifiers/location_extraction/__init__.py
@@ -0,0 +1,28 @@
+from pydantic import BaseModel
+from extractors.util.spacy import SpacySingleton
+
+INPUT_EXAMPLE = {
+    "text": "Tokyo is a beautiful city, which is not located in Kansas, USA.",
+    "spacyTokenizer": "en_core_web_sm",
+}
+
+
+class LocationExtractionModel(BaseModel):
+    text: str
+    spacyTokenizer: str = "en_core_web_sm"
+
+    class Config:
+        schema_extra = {"example": INPUT_EXAMPLE}
+
+
+def location_extraction(req: LocationExtractionModel):
+    """Uses spaCy to extract locations (GPE and LOC entities) from a text."""
+    text = req.text
+    nlp = SpacySingleton.get_nlp(req.spacyTokenizer)
+    doc = nlp(text)
+
+    names = []
+    for ent in doc.ents:
+        if ent.label_ == "GPE" or ent.label_ == "LOC":
+            names.append(["location", ent.start, ent.end])
+    return {"locations": names}
diff --git a/extractors/personal_identifiers/location_extraction/code_snippet_common.md b/extractors/personal_identifiers/location_extraction/code_snippet_common.md
@@ -0,0 +1,43 @@
+```python
+import spacy
+from typing import List, Tuple
+
+loaded_models = {}
+def load_spacy(spacy_model):
+    if spacy_model not in loaded_models:  
+        loaded_models[spacy_model] = spacy.load(spacy_model)
+    return loaded_models[spacy_model]
+
+
+def location_extraction(text: str, extraction_keyword: str, spacy_model: str = "en_core_web_sm") -> List[Tuple[str, int, int]]:
+    """
+    @param text: the input text
+    @param extraction_keyword: the label that is assigned to extracted words
+    @return: (extraction_keyword, start, end) tuples for each GPE/LOC entity; start and end are spaCy token offsets
+    """
+    nlp = load_spacy(spacy_model)
+    doc = nlp(text)
+
+    name_positions = []
+    for ent in doc.ents:
+        if ent.label_ == "GPE" or ent.label_ == "LOC":
+            name_positions.append((extraction_keyword, ent.start, ent.end))
+    return name_positions
+
+
+# ↑ necessary bricks function 
+# -----------------------------------------------------------------------------------------
+# ↓ example implementation
+
+def example_integration():
+    texts = ["Tokyo is a place in Japan.", "My hometown is Cologne in Northrhine-Westphalia.", "She's from Berlin and likes EDM.", "Man I love pasta."]
+    extraction_keyword = "location"
+    for text in texts:
+        found = location_extraction(text, extraction_keyword)
+        if found:
+            print(f"text: \"{text}\" has {extraction_keyword} -> {found}")
+        else:
+            print(f"text: \"{text}\" doesn't have {extraction_keyword}")
+
+example_integration()
+```
diff --git a/extractors/personal_identifiers/location_extraction/code_snippet_refinery.md b/extractors/personal_identifiers/location_extraction/code_snippet_refinery.md
@@ -0,0 +1,9 @@
+```python
+ATTRIBUTE: str = "text" # only text attributes
+LABEL: str = "location"
+
+def location_extraction(record):
+    for ent in record[ATTRIBUTE].ents:
+        if ent.label_ == "GPE" or ent.label_ == "LOC":
+            yield LABEL, ent.start, ent.end
+```
diff --git a/extractors/personal_identifiers/location_extraction/config.py b/extractors/personal_identifiers/location_extraction/config.py
@@ -0,0 +1,44 @@
+from util.configs import build_extractor_function_config
+from util.enums import State, RefineryDataType, BricksVariableType, SelectionType
+from . import location_extraction, INPUT_EXAMPLE
+
+
+def get_config():
+    return build_extractor_function_config(
+        function=location_extraction,
+        input_example=INPUT_EXAMPLE,
+        issue_id=369,
+        tabler_icon="Location",
+        min_refinery_version="1.7.0",
+        state=State.PUBLIC.value,
+        type="python_function",
+        available_for=["refinery", "common"],
+        part_of_group=[
+            "personal_identifiers",
+        ],  # first entry should be parent directory
+        # bricks integrator information 
+        cognition_init_mapping={
+            "@@LABEL@@": "Location"
+        },       
+        integrator_inputs={
+            "name": "location_extraction",
+            "refineryDataType": RefineryDataType.TEXT.value,
+            "variables": {
+                "ATTRIBUTE": {
+                    "selectionType": SelectionType.CHOICE.value,
+                    "addInfo": [
+                        BricksVariableType.ATTRIBUTE.value,
+                        BricksVariableType.GENERIC_STRING.value,
+                    ],
+                },
+                "LABEL": {
+                    "selectionType": SelectionType.CHOICE.value,
+                    "defaultValue": "location",
+                    "addInfo": [
+                        BricksVariableType.LABEL.value,
+                        BricksVariableType.GENERIC_STRING.value,
+                    ],
+                },
+            },
+        },
+    )

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Uses spaCy to extract locations such as cities, states and countries (GPE) as well as other named places like mountains and rivers (LOC).`