Skip to content

Commit c22cfb6

Browse files
Merge pull request #379 from code-kern-ai/max-sentence-complexity
Maximum sentence complexity
2 parents c4c6ec7 + 0a150a7 commit c22cfb6

File tree

6 files changed

+212
-11
lines changed

6 files changed

+212
-11
lines changed

classifiers/__init__.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,23 @@
11
from fastapi import APIRouter
22

33
from .llm import (
4-
gpt_classifier,
4+
gpt_classifier,
55
deberta_review_classifier,
66
bert_sentiment_german,
7-
distilbert_stock_news_classifier
7+
distilbert_stock_news_classifier,
88
)
99

1010
from .lookup_lists import lookup_list
1111

1212
from .reference_complexity import (
13+
maximum_sentence_complexity,
1314
tiktoken_length_classifier,
14-
chunked_sentence_complexity
15+
chunked_sentence_complexity,
1516
)
1617

17-
from .question_type import (
18-
question_type_classifier
19-
)
18+
from .question_type import question_type_classifier
2019

21-
from .communication_style import (
22-
communication_style_classifier
23-
)
20+
from .communication_style import communication_style_classifier
2421

2522
from .reference_quality import (
2623
word_count_classifier,
@@ -68,14 +65,15 @@
6865
textblob_subjectivity,
6966
distilbert_stock_news_classifier,
7067
workday_classifier,
71-
deberta_review_classifier,
68+
deberta_review_classifier,
7269
bert_sentiment_german,
7370
tiktoken_length_classifier,
7471
word_count_classifier,
7572
special_character_classifier,
7673
chunked_sentence_complexity,
74+
maximum_sentence_complexity,
7775
question_type_classifier,
78-
communication_style_classifier
76+
communication_style_classifier,
7977
]:
8078
module_name = module.__name__.split(".")[-1]
8179
model_name = (
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
This is similar to the standard sentence complexity brick, with the difference that this brick returns the highest sentence complexity found in a text. The formula for calculating the complexity is as follows: Flesch Reading Ease = 206.835 − (1.015 × Average Sentence Length) − (84.6 × Average Syllables Per Word). The higher the score, the easier the content is to read and understand. Average sentence length is calculated by dividing the number of words by the number of sentences.
2+
The score is categorized, where 0 is the most difficult and 122 is the easiest. The coefficients used in the formula were chosen to match a scale where a very easy text has a score of 100 and a really difficult one has a score of 2. The coefficients were determined through trial and error to best fit the data available at the time.
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
from pydantic import BaseModel
2+
from typing import Optional
3+
from extractors.util.spacy import SpacySingleton
4+
from collections import Counter
5+
import textstat
6+
7+
# Example request payload shown in the API docs / bricks integrator.
INPUT_EXAMPLE = {
    "text": "An easy sentence. Despite the rains persistence, the resilient team continued their expedition, undeterred by the relentless downpour.",
    "language": "en",
}

# ISO language code -> spaCy pipeline used for sentence segmentation.
# Unknown codes fall back to "en_core_web_sm" at the call site.
MODELS = {
    "en": "en_core_web_sm",
    "de": "de_core_news_sm"
}
16+
17+
18+
class MaximumSentenceComplexityModel(BaseModel):
    """Request payload for the maximum_sentence_complexity brick."""

    # Text whose sentences are scored for reading complexity.
    text: str
    # Optional ISO language code (e.g. "en", "de"); selects the spaCy
    # pipeline via MODELS and the textstat language rules.
    language: Optional[str] = None

    class Config:
        # Example rendered in the generated OpenAPI schema.
        schema_extra = {"example": INPUT_EXAMPLE}
24+
25+
def get_mapping_complexity(score):
    """Map a Flesch reading-ease score to a human-readable difficulty label.

    Lower scores mean harder text; 90 and above reads as "very easy".
    """
    # (exclusive upper bound, label) pairs, checked in ascending order.
    bands = (
        (30, "very difficult"),
        (50, "difficult"),
        (60, "fairly difficult"),
        (70, "standard"),
        (80, "fairly easy"),
        (90, "easy"),
    )
    for upper_bound, label in bands:
        if score < upper_bound:
            return label
    return "very easy"
39+
40+
def maximum_sentence_complexity(req: MaximumSentenceComplexityModel):
    """Return the difficulty label of the hardest sentence in a text.

    Splits the text into sentences with spaCy, scores each one with the
    Flesch reading-ease formula and maps the lowest (i.e. hardest) score
    to a categorical label.
    """
    # `language` is Optional and defaults to None; fall back to English so
    # textstat.set_lang never receives None.
    textstat.set_lang(req.language or "en")

    nlp = SpacySingleton.get_nlp(MODELS.get(req.language, "en_core_web_sm"))  # defaults to "en_core_web_sm"
    doc = nlp(req.text)

    complexities = [textstat.flesch_reading_ease(sent.text) for sent in doc.sents]
    if not complexities:
        # No sentences detected (e.g. empty input) — avoid min() on an
        # empty sequence and report the easiest category.
        return {"overall_text_complexity": "very easy"}
    return {"overall_text_complexity": get_mapping_complexity(min(complexities))}
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
```python
2+
import textstat
3+
import spacy
4+
5+
def get_mapping_complexity(score: float) -> str:
    """Map a Flesch reading-ease score to a difficulty label.

    Lower scores mean harder text; 90 and above reads as "very easy".
    (flesch_reading_ease returns a float, hence the float annotation.)
    """
    if score < 30:
        return "very difficult"
    if score < 50:
        return "difficult"
    if score < 60:
        return "fairly difficult"
    if score < 70:
        return "standard"
    if score < 80:
        return "fairly easy"
    if score < 90:
        return "easy"
    return "very easy"
19+
20+
# Cache of loaded spaCy pipelines, keyed by model name, so every model is
# loaded from disk at most once per process.
spacy_models_loaded = {}

def get_spacy(spacy_model: str):
    """Return a cached spaCy pipeline, loading it on first use."""
    try:
        return spacy_models_loaded[spacy_model]
    except KeyError:
        nlp = spacy.load(spacy_model)
        spacy_models_loaded[spacy_model] = nlp
        return nlp
27+
28+
def maximum_sentence_complexity(text: str, language: str = "en", spacy_model: str = "en_core_web_sm") -> str:
    """
    @param text: text whose sentences are scored
    @param language: iso language code
    @param spacy_model: name of a language model from SpaCy
    @return: string label of the aggregated sentence complexity of a text
    """
    textstat.set_lang(language)
    nlp = get_spacy(spacy_model)
    doc = nlp(text)

    # Score every sentence; min() selects the hardest (lowest Flesch) one.
    complexities = [textstat.flesch_reading_ease(sent.text) for sent in doc.sents]
    return get_mapping_complexity(min(complexities))
41+
42+
43+
# ↑ necessary bricks function
44+
# -----------------------------------------------------------------------------------------
45+
# ↓ example implementation
46+
47+
48+
def example_integration():
    """Run maximum_sentence_complexity on a sample story and print the label."""
    texts = [
        """
In a small town, there lived a humble baker named Thomas. He was known for his delicious pastries, which were loved by everyone in the town. Every morning, he would wake up early to prepare the dough for his pastries. He would then bake them in his old but reliable oven.
One day, a stranger came to the town. He had heard about Thomas's pastries and wanted to try them. He went to the bakery and ordered a pastry. As he took his first bite, his eyes lit up with delight. He praised Thomas for his skill and promised to spread the word about his bakery.
Word of Thomas's pastries spread far and wide. People from neighboring towns started visiting his bakery. Despite the increase in customers, Thomas remained humble. He continued to wake up early every morning to prepare his pastries, ensuring that each one was made with care.
Thomas's story is a reminder that passion and dedication can lead to success. It shows that humility and hard work are respected and rewarded. His delicious pastries were not just food items but a source of joy for everyone who tasted them.
"""
    ]
    language = "en" # other languages: de, es, fr, it, nl, ru
    spacy_model = "en_core_web_sm"
    for text in texts:
        print(f"The highest complexity in \"{text}\" is {maximum_sentence_complexity(text, language, spacy_model)}")

example_integration()
63+
```
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
```python
2+
import textstat
3+
4+
ATTRIBUTE: str = "text" # only text attributes
TARGET_LANGUAGE: str = "en" # iso codes

# Configure textstat's readability rules for the target language once at
# module load time.
if TARGET_LANGUAGE is not None:
    textstat.set_lang(TARGET_LANGUAGE)
9+
10+
def maximum_sentence_complexity(record):
    """Label a record by the complexity of its hardest sentence."""
    # NOTE(review): record[ATTRIBUTE] is iterated via .sents, so it is
    # presumably a spaCy Doc — confirm against the refinery runtime.
    hardest_score = min(
        textstat.flesch_reading_ease(sentence.text)
        for sentence in record[ATTRIBUTE].sents
    )
    return get_mapping_complexity(hardest_score)
13+
14+
def get_mapping_complexity(score):
    """Convert a Flesch reading-ease score into a readability label."""
    # Scan bands from easiest downward; first lower bound reached wins.
    bands = (
        (90, "very easy"),
        (80, "easy"),
        (70, "fairly easy"),
        (60, "standard"),
        (50, "fairly difficult"),
        (30, "difficult"),
    )
    for lower_bound, label in bands:
        if score >= lower_bound:
            return label
    return "very difficult"
28+
```
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
from util.configs import build_classifier_function_config
2+
from util.enums import State, RefineryDataType, BricksVariableType, SelectionType
3+
from . import maximum_sentence_complexity, INPUT_EXAMPLE
4+
5+
6+
def get_config():
    """Bricks-integrator metadata for the maximum_sentence_complexity brick."""
    return build_classifier_function_config(
        function=maximum_sentence_complexity,
        input_example=INPUT_EXAMPLE,
        issue_id=378,
        tabler_icon="ThreeDCubeSphere",
        min_refinery_version="1.7.0",
        state=State.PUBLIC.value,
        type="python_function",
        available_for=["refinery", "common"],
        part_of_group=[
            "reference_complexity",
            "question_complexity",
            "text_analysis"
        ], # first entry should be parent directory
        # bricks integrator information
        # Collapse the fine-grained labels onto cognition's 3-level scale.
        cognition_init_mapping={
            "very easy": "Low",
            "easy": "Low",
            "fairly easy": "Low",
            "standard": "Medium",
            "fairly difficult": "Medium",
            "difficult": "High",
            "very difficult": "High",
        },
        integrator_inputs={
            "name": "maximum_sentence_complexity",
            "refineryDataType": RefineryDataType.TEXT.value,
            # Every label the classifier can emit (see get_mapping_complexity).
            "outputs": [
                "very easy",
                "easy",
                "fairly easy",
                "standard",
                "fairly difficult",
                "difficult",
                "very difficult",
            ],
            # User-configurable template variables for the generated brick.
            "variables": {
                "ATTRIBUTE": {
                    "selectionType": SelectionType.CHOICE.value,
                    "addInfo": [
                        BricksVariableType.ATTRIBUTE.value,
                        BricksVariableType.GENERIC_STRING.value,
                    ],
                },
                "TARGET_LANGUAGE": {
                    "selectionType": SelectionType.CHOICE.value,
                    "description": "supported iso codes",
                    "defaultValue": "en",
                    "allowedValues": ["en", "de", "es", "fr", "it", "nl", "ru"],
                    "addInfo": [
                        BricksVariableType.GENERIC_STRING.value,
                    ],
                },
            },
        },
    )

0 commit comments

Comments
 (0)