Skip to content

Commit c22cfb6

Browse files
Merge pull request #379 from code-kern-ai/max-sentence-complexity
Maximum sentence complexity
2 parents c4c6ec7 + 0a150a7 commit c22cfb6

File tree

6 files changed

+212
-11
lines changed

6 files changed

+212
-11
lines changed

classifiers/__init__.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,23 @@
11
from fastapi import APIRouter
22

33
from .llm import (
4-
gpt_classifier,
4+
gpt_classifier,
55
deberta_review_classifier,
66
bert_sentiment_german,
7-
distilbert_stock_news_classifier
7+
distilbert_stock_news_classifier,
88
)
99

1010
from .lookup_lists import lookup_list
1111

1212
from .reference_complexity import (
13+
maximum_sentence_complexity,
1314
tiktoken_length_classifier,
14-
chunked_sentence_complexity
15+
chunked_sentence_complexity,
1516
)
1617

17-
from .question_type import (
18-
question_type_classifier
19-
)
18+
from .question_type import question_type_classifier
2019

21-
from .communication_style import (
22-
communication_style_classifier
23-
)
20+
from .communication_style import communication_style_classifier
2421

2522
from .reference_quality import (
2623
word_count_classifier,
@@ -68,14 +65,15 @@
6865
textblob_subjectivity,
6966
distilbert_stock_news_classifier,
7067
workday_classifier,
71-
deberta_review_classifier,
68+
deberta_review_classifier,
7269
bert_sentiment_german,
7370
tiktoken_length_classifier,
7471
word_count_classifier,
7572
special_character_classifier,
7673
chunked_sentence_complexity,
74+
maximum_sentence_complexity,
7775
question_type_classifier,
78-
communication_style_classifier
76+
communication_style_classifier,
7977
]:
8078
module_name = module.__name__.split(".")[-1]
8179
model_name = (
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
This is similar to the standard sentence complexity brick, with the difference that this brick returns the highest sentence complexity found in a text. The formula for calculating the complexity is as follows: Flesch Reading Ease = 206.835 − (1.015 × Average Sentence Length) − (84.6 × Average Syllables Per Word). The higher the score, the easier the content is to read and understand. Average sentence length is calculated by dividing the number of words by the number of sentences.
2+
The score is categorized, where 0 is the most difficult and 122 is the easiest. The coefficients used in the formula were chosen to match a scale where a very easy text has a score of 100 and a really difficult one has a score of 2. The coefficients were determined through trial and error to best fit the data available at the time.
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
from pydantic import BaseModel
2+
from typing import Optional
3+
from extractors.util.spacy import SpacySingleton
4+
from collections import Counter
5+
import textstat
6+
7+
# Example request payload shown in the API docs / bricks integrator.
INPUT_EXAMPLE = {
    "text": "An easy sentence. Despite the rains persistence, the resilient team continued their expedition, undeterred by the relentless downpour.",
    "language": "en",
}

# ISO language code -> spaCy pipeline used for sentence segmentation.
# Unknown codes fall back to "en_core_web_sm" at the call site.
MODELS = {
    "en": "en_core_web_sm",
    "de": "de_core_news_sm"
}
16+
17+
18+
class MaximumSentenceComplexityModel(BaseModel):
    """Request payload for the maximum_sentence_complexity brick."""

    # Text whose sentences are scored for reading complexity.
    text: str
    # Optional ISO language code (e.g. "en", "de"); selects the spaCy
    # pipeline via MODELS and the textstat language rules.
    language: Optional[str] = None

    class Config:
        # Example rendered in the generated OpenAPI schema.
        schema_extra = {"example": INPUT_EXAMPLE}
24+
25+
def get_mapping_complexity(score):
    """Map a Flesch reading-ease score to a human-readable difficulty label.

    Lower scores mean harder text; 90 and above reads as "very easy".
    """
    # (exclusive upper bound, label) pairs, checked in ascending order.
    bands = (
        (30, "very difficult"),
        (50, "difficult"),
        (60, "fairly difficult"),
        (70, "standard"),
        (80, "fairly easy"),
        (90, "easy"),
    )
    for upper_bound, label in bands:
        if score < upper_bound:
            return label
    return "very easy"
39+
40+
def maximum_sentence_complexity(req: MaximumSentenceComplexityModel):
    """Return the difficulty label of the hardest sentence in a text.

    Splits the text into sentences with spaCy, scores each one with the
    Flesch reading-ease formula and maps the lowest (i.e. hardest) score
    to a categorical label.
    """
    # `language` is Optional and defaults to None; fall back to English so
    # textstat.set_lang never receives None.
    textstat.set_lang(req.language or "en")

    nlp = SpacySingleton.get_nlp(MODELS.get(req.language, "en_core_web_sm"))  # defaults to "en_core_web_sm"
    doc = nlp(req.text)

    complexities = [textstat.flesch_reading_ease(sent.text) for sent in doc.sents]
    if not complexities:
        # No sentences detected (e.g. empty input) — avoid min() on an
        # empty sequence and report the easiest category.
        return {"overall_text_complexity": "very easy"}
    return {"overall_text_complexity": get_mapping_complexity(min(complexities))}
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
```python
2+
import textstat
3+
import spacy
4+
5+
def get_mapping_complexity(score: float) -> str:
    """Map a Flesch reading-ease score to a difficulty label.

    Lower scores mean harder text; 90 and above reads as "very easy".
    (flesch_reading_ease returns a float, hence the float annotation.)
    """
    if score < 30:
        return "very difficult"
    if score < 50:
        return "difficult"
    if score < 60:
        return "fairly difficult"
    if score < 70:
        return "standard"
    if score < 80:
        return "fairly easy"
    if score < 90:
        return "easy"
    return "very easy"
19+
20+
# Cache of loaded spaCy pipelines, keyed by model name, so every model is
# loaded from disk at most once per process.
spacy_models_loaded = {}

def get_spacy(spacy_model: str):
    """Return a cached spaCy pipeline, loading it on first use."""
    try:
        return spacy_models_loaded[spacy_model]
    except KeyError:
        nlp = spacy.load(spacy_model)
        spacy_models_loaded[spacy_model] = nlp
        return nlp
27+
28+
def maximum_sentence_complexity(text: str, language: str = "en", spacy_model: str = "en_core_web_sm") -> str:
    """
    @param text: text whose sentences are scored
    @param language: iso language code
    @param spacy_model: name of a language model from SpaCy
    @return: string label of the aggregated sentence complexity of a text
    """
    textstat.set_lang(language)
    nlp = get_spacy(spacy_model)
    doc = nlp(text)

    # Score every sentence; min() selects the hardest (lowest Flesch) one.
    complexities = [textstat.flesch_reading_ease(sent.text) for sent in doc.sents]
    return get_mapping_complexity(min(complexities))
41+
42+
43+
# ↑ necessary bricks function
44+
# -----------------------------------------------------------------------------------------
45+
# ↓ example implementation
46+
47+
48+
def example_integration():
    """Run maximum_sentence_complexity on a sample story and print the label."""
    texts = [
        """
In a small town, there lived a humble baker named Thomas. He was known for his delicious pastries, which were loved by everyone in the town. Every morning, he would wake up early to prepare the dough for his pastries. He would then bake them in his old but reliable oven.
One day, a stranger came to the town. He had heard about Thomas's pastries and wanted to try them. He went to the bakery and ordered a pastry. As he took his first bite, his eyes lit up with delight. He praised Thomas for his skill and promised to spread the word about his bakery.
Word of Thomas's pastries spread far and wide. People from neighboring towns started visiting his bakery. Despite the increase in customers, Thomas remained humble. He continued to wake up early every morning to prepare his pastries, ensuring that each one was made with care.
Thomas's story is a reminder that passion and dedication can lead to success. It shows that humility and hard work are respected and rewarded. His delicious pastries were not just food items but a source of joy for everyone who tasted them.
"""
    ]
    language = "en" # other languages: de, es, fr, it, nl, ru
    spacy_model = "en_core_web_sm"
    for text in texts:
        print(f"The highest complexity in \"{text}\" is {maximum_sentence_complexity(text, language, spacy_model)}")

example_integration()
63+
```
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
```python
2+
import textstat
3+
4+
ATTRIBUTE: str = "text" # only text attributes
TARGET_LANGUAGE: str = "en" # iso codes

# Configure textstat's readability rules for the target language once at
# module load time.
if TARGET_LANGUAGE is not None:
    textstat.set_lang(TARGET_LANGUAGE)
9+
10+
def maximum_sentence_complexity(record):
    """Label a record by the complexity of its hardest sentence."""
    # NOTE(review): record[ATTRIBUTE] is iterated via .sents, so it is
    # presumably a spaCy Doc — confirm against the refinery runtime.
    hardest_score = min(
        textstat.flesch_reading_ease(sentence.text)
        for sentence in record[ATTRIBUTE].sents
    )
    return get_mapping_complexity(hardest_score)
13+
14+
def get_mapping_complexity(score):
    """Convert a Flesch reading-ease score into a readability label."""
    # Scan bands from easiest downward; first lower bound reached wins.
    bands = (
        (90, "very easy"),
        (80, "easy"),
        (70, "fairly easy"),
        (60, "standard"),
        (50, "fairly difficult"),
        (30, "difficult"),
    )
    for lower_bound, label in bands:
        if score >= lower_bound:
            return label
    return "very difficult"
28+
```
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
from util.configs import build_classifier_function_config
2+
from util.enums import State, RefineryDataType, BricksVariableType, SelectionType
3+
from . import maximum_sentence_complexity, INPUT_EXAMPLE
4+
5+
6+
def get_config():
    """Bricks-integrator metadata for the maximum_sentence_complexity brick."""
    return build_classifier_function_config(
        function=maximum_sentence_complexity,
        input_example=INPUT_EXAMPLE,
        issue_id=378,
        tabler_icon="ThreeDCubeSphere",
        min_refinery_version="1.7.0",
        state=State.PUBLIC.value,
        type="python_function",
        available_for=["refinery", "common"],
        part_of_group=[
            "reference_complexity",
            "question_complexity",
            "text_analysis"
        ], # first entry should be parent directory
        # bricks integrator information
        # Collapse the fine-grained labels onto cognition's 3-level scale.
        cognition_init_mapping={
            "very easy": "Low",
            "easy": "Low",
            "fairly easy": "Low",
            "standard": "Medium",
            "fairly difficult": "Medium",
            "difficult": "High",
            "very difficult": "High",
        },
        integrator_inputs={
            "name": "maximum_sentence_complexity",
            "refineryDataType": RefineryDataType.TEXT.value,
            # Every label the classifier can emit (see get_mapping_complexity).
            "outputs": [
                "very easy",
                "easy",
                "fairly easy",
                "standard",
                "fairly difficult",
                "difficult",
                "very difficult",
            ],
            # User-configurable template variables for the generated brick.
            "variables": {
                "ATTRIBUTE": {
                    "selectionType": SelectionType.CHOICE.value,
                    "addInfo": [
                        BricksVariableType.ATTRIBUTE.value,
                        BricksVariableType.GENERIC_STRING.value,
                    ],
                },
                "TARGET_LANGUAGE": {
                    "selectionType": SelectionType.CHOICE.value,
                    "description": "supported iso codes",
                    "defaultValue": "en",
                    "allowedValues": ["en", "de", "es", "fr", "it", "nl", "ru"],
                    "addInfo": [
                        BricksVariableType.GENERIC_STRING.value,
                    ],
                },
            },
        },
    )

0 commit comments

Comments
 (0)