Skip to content

Maximum sentence complexity #379

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Oct 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 9 additions & 11 deletions classifiers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,23 @@
from fastapi import APIRouter

from .llm import (
gpt_classifier,
gpt_classifier,
deberta_review_classifier,
bert_sentiment_german,
distilbert_stock_news_classifier
distilbert_stock_news_classifier,
)

from .lookup_lists import lookup_list

from .reference_complexity import (
maximum_sentence_complexity,
tiktoken_length_classifier,
chunked_sentence_complexity
chunked_sentence_complexity,
)

from .question_type import (
question_type_classifier
)
from .question_type import question_type_classifier

from .communication_style import (
communication_style_classifier
)
from .communication_style import communication_style_classifier

from .reference_quality import (
word_count_classifier,
Expand Down Expand Up @@ -68,14 +65,15 @@
textblob_subjectivity,
distilbert_stock_news_classifier,
workday_classifier,
deberta_review_classifier,
deberta_review_classifier,
bert_sentiment_german,
tiktoken_length_classifier,
word_count_classifier,
special_character_classifier,
chunked_sentence_complexity,
maximum_sentence_complexity,
question_type_classifier,
communication_style_classifier
communication_style_classifier,
]:
module_name = module.__name__.split(".")[-1]
model_name = (
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
This is similar to the standard sentence complexity brick, with the difference that this brick returns the highest sentence complexity found in a text. The formula for calculating the complexity is as follows: Flesch Reading Ease = 206.835 - (1.015 x Average Sentence Length) - (84.6 x Average Syllables Per Word). The higher the score, the easier the content is to read and understand. Average sentence length is calculated by dividing the number of words by the number of sentences.
The score is categorized, where 0 is the most difficult and 122 is the easiest. The coefficients used in the formula were chosen to match a scale where a very easy text has a score of 100 and a really difficult one has a score of 2. The coefficients were determined through trial and error to best fit the data available at the time.
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from pydantic import BaseModel
from typing import Optional
from extractors.util.spacy import SpacySingleton
from collections import Counter
import textstat

# Example request payload shown in the API docs (see MaximumSentenceComplexityModel.Config).
INPUT_EXAMPLE = {
    "text": "An easy sentence. Despite the rains persistence, the resilient team continued their expedition, undeterred by the relentless downpour.",
    "language": "en",
}

# ISO language code -> spaCy pipeline used for sentence segmentation.
MODELS = {
    "en": "en_core_web_sm",
    "de": "de_core_news_sm"
}


class MaximumSentenceComplexityModel(BaseModel):
    """Request body for the maximum sentence complexity classifier."""

    text: str  # free text; complexity is computed per sentence
    language: Optional[str] = None  # ISO code, e.g. "en" or "de"

    class Config:
        # example payload surfaced in the generated OpenAPI schema
        schema_extra = {"example": INPUT_EXAMPLE}

def get_mapping_complexity(score):
    """Map a Flesch Reading Ease score onto a human-readable difficulty label."""
    # Each entry is (exclusive upper bound, label); scores of 90 and above
    # fall through to "very easy".
    bands = (
        (30, "very difficult"),
        (50, "difficult"),
        (60, "fairly difficult"),
        (70, "standard"),
        (80, "fairly easy"),
        (90, "easy"),
    )
    for upper_bound, label in bands:
        if score < upper_bound:
            return label
    return "very easy"

def maximum_sentence_complexity(req: MaximumSentenceComplexityModel):
    """Return the difficulty label of the hardest sentence in the text.

    Splits the text into sentences with spaCy, scores each sentence with
    the Flesch Reading Ease formula (lower score = harder), and maps the
    lowest score to a label via get_mapping_complexity.
    """
    # The model declares language as Optional; fall back to English instead
    # of passing None to textstat.set_lang, which would fail.
    language = req.language or "en"
    textstat.set_lang(language)

    # Unknown languages default to the English spaCy pipeline.
    nlp = SpacySingleton.get_nlp(MODELS.get(language, "en_core_web_sm"))
    doc = nlp(req.text)

    complexities = [textstat.flesch_reading_ease(sent.text) for sent in doc.sents]
    return {"overall_text_complexity": get_mapping_complexity(min(complexities))}
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
```python
import textstat
import spacy

def get_mapping_complexity(score: float) -> str:
    """Map a Flesch Reading Ease score onto a human-readable difficulty label.

    textstat.flesch_reading_ease returns a float, so the parameter is typed
    as float (int arguments still work).
    """
    if score < 30:
        return "very difficult"
    if score < 50:
        return "difficult"
    if score < 60:
        return "fairly difficult"
    if score < 70:
        return "standard"
    if score < 80:
        return "fairly easy"
    if score < 90:
        return "easy"
    return "very easy"

# Cache of loaded spaCy pipelines, keyed by model name, so each model is
# loaded from disk at most once per process.
spacy_models_loaded = {}

def get_spacy(spacy_model: str):
    """Load the named spaCy model once and reuse it on subsequent calls."""
    # Item assignment only mutates the module-level dict (no rebinding of the
    # name), so no `global` declaration is needed.
    if spacy_model not in spacy_models_loaded:
        spacy_models_loaded[spacy_model] = spacy.load(spacy_model)
    return spacy_models_loaded[spacy_model]

def maximum_sentence_complexity(text: str, language: str = "en", spacy_model: str = "en_core_web_sm") -> str:
    """
    Label a text by the complexity of its hardest sentence.

    @param text: text to analyze
    @param language: iso language code
    @param spacy_model: name of a language model from SpaCy
    @return: string label of the aggregated sentence complexity of a text
    """
    textstat.set_lang(language)
    nlp = get_spacy(spacy_model)
    doc = nlp(text)

    # Lower Flesch Reading Ease = harder sentence, so the hardest sentence
    # is the minimum score.
    complexities = [textstat.flesch_reading_ease(sent.text) for sent in doc.sents]
    return get_mapping_complexity(min(complexities))


# ↑ necessary bricks function
# -----------------------------------------------------------------------------------------
# ↓ example implementation


def example_integration():
    """Demo: run maximum_sentence_complexity on a sample text and print the label."""
    texts = [
        """
In a small town, there lived a humble baker named Thomas. He was known for his delicious pastries, which were loved by everyone in the town. Every morning, he would wake up early to prepare the dough for his pastries. He would then bake them in his old but reliable oven.
One day, a stranger came to the town. He had heard about Thomas's pastries and wanted to try them. He went to the bakery and ordered a pastry. As he took his first bite, his eyes lit up with delight. He praised Thomas for his skill and promised to spread the word about his bakery.
Word of Thomas's pastries spread far and wide. People from neighboring towns started visiting his bakery. Despite the increase in customers, Thomas remained humble. He continued to wake up early every morning to prepare his pastries, ensuring that each one was made with care.
Thomas's story is a reminder that passion and dedication can lead to success. It shows that humility and hard work are respected and rewarded. His delicious pastries were not just food items but a source of joy for everyone who tasted them.
"""
    ]
    language = "en"  # other languages: de, es, fr, it, nl, ru
    spacy_model = "en_core_web_sm"
    for text in texts:
        print(f"The highest complexity in \"{text}\" is {maximum_sentence_complexity(text, language, spacy_model)}")

example_integration()
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
```python
import textstat

ATTRIBUTE: str = "text"  # only text attributes
TARGET_LANGUAGE: str = "en"  # iso codes

# Configure textstat's language rules once at module load.
if TARGET_LANGUAGE is not None:
    textstat.set_lang(TARGET_LANGUAGE)

def maximum_sentence_complexity(record):
    # NOTE(review): record[ATTRIBUTE] must expose .sents, i.e. presumably a
    # spaCy doc provided by the refinery runtime — confirm upstream.
    # Lower Flesch Reading Ease = harder sentence, hence min().
    complexities = [textstat.flesch_reading_ease(sent.text) for sent in record[ATTRIBUTE].sents]
    return get_mapping_complexity(min(complexities))

def get_mapping_complexity(score):
    """Translate a Flesch Reading Ease score into a readability label."""
    # Ascending exclusive upper bounds; a score of 90+ maps to the final label.
    limits = [30, 50, 60, 70, 80, 90]
    labels = [
        "very difficult",
        "difficult",
        "fairly difficult",
        "standard",
        "fairly easy",
        "easy",
        "very easy",
    ]
    position = next(
        (idx for idx, limit in enumerate(limits) if score < limit),
        len(limits),
    )
    return labels[position]
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from util.configs import build_classifier_function_config
from util.enums import State, RefineryDataType, BricksVariableType, SelectionType
from . import maximum_sentence_complexity, INPUT_EXAMPLE


def get_config():
    """Bricks metadata/config for the maximum_sentence_complexity classifier."""
    return build_classifier_function_config(
        function=maximum_sentence_complexity,
        input_example=INPUT_EXAMPLE,
        issue_id=378,
        tabler_icon="ThreeDCubeSphere",
        min_refinery_version="1.7.0",
        state=State.PUBLIC.value,
        type="python_function",
        available_for=["refinery", "common"],
        part_of_group=[
            "reference_complexity",
            "question_complexity",
            "text_analysis"
        ],  # first entry should be parent directory
        # bricks integrator information
        # Collapses the fine-grained readability labels onto the coarse
        # Low/Medium/High scale used by cognition.
        cognition_init_mapping={
            "very easy": "Low",
            "easy": "Low",
            "fairly easy": "Low",
            "standard": "Medium",
            "fairly difficult": "Medium",
            "difficult": "High",
            "very difficult": "High",
        },
        integrator_inputs={
            "name": "maximum_sentence_complexity",
            "refineryDataType": RefineryDataType.TEXT.value,
            # All labels the classifier can emit, hardest last.
            "outputs": [
                "very easy",
                "easy",
                "fairly easy",
                "standard",
                "fairly difficult",
                "difficult",
                "very difficult",
            ],
            # Template variables the integrator substitutes into the snippet.
            "variables": {
                "ATTRIBUTE": {
                    "selectionType": SelectionType.CHOICE.value,
                    "addInfo": [
                        BricksVariableType.ATTRIBUTE.value,
                        BricksVariableType.GENERIC_STRING.value,
                    ],
                },
                "TARGET_LANGUAGE": {
                    "selectionType": SelectionType.CHOICE.value,
                    "description": "supported iso codes",
                    "defaultValue": "en",
                    "allowedValues": ["en", "de", "es", "fr", "it", "nl", "ru"],
                    "addInfo": [
                        BricksVariableType.GENERIC_STRING.value,
                    ],
                },
            },
        },
    )