Skip to content

Commit 2c80e77

Browse files
Merge pull request #364 from code-kern-ai/word-count-classifier
Word count classifier
2 parents 24bebe5 + 0741971 commit 2c80e77

File tree

6 files changed

+111
-0
lines changed

6 files changed

+111
-0
lines changed

classifiers/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
)
1515

1616
from .reference_quality import (
17+
word_count_classifier,
1718
special_character_classifier,
1819
chunked_sentence_complexity,
1920
)
@@ -65,6 +66,8 @@
6566
workday_classifier,
6667
deberta_review_classifier,
6768
bert_sentiment_german,
69+
word_count_classifier,
70+
chunked_sentence_complexity,
6871
special_character_classifier,
6972
chunked_sentence_complexity,
7073
question_type_classifier
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
This simple brick counts the number of words in a string by splitting it on whitespace. If fewer than five words are found, it returns "short". If fewer than 20 are found, it returns "medium"
2+
and with 20 or more words, it returns "long". This brick can be used to check the quality of references for RAG (Retrieval Augmented Generation) use cases.
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from pydantic import BaseModel
2+
3+
# Example request body; also used as the schema example of WordCountClassifierModel.
INPUT_EXAMPLE = {"text": "This is too short!"}
4+
5+
6+
class WordCountClassifierModel(BaseModel):
    """Request body of the word count classifier: one ``text`` string."""

    # The text whose words are counted.
    text: str

    class Config:
        # Attach INPUT_EXAMPLE as the "example" entry of the model's schema.
        schema_extra = {"example": INPUT_EXAMPLE}
11+
12+
13+
# Exclusive upper word-count bounds for the "short" and "medium" labels.
_SHORT_BELOW = 5
_MEDIUM_BELOW = 20


def word_count_classifier(req: WordCountClassifierModel):
    """Classify the length of ``req.text`` by its number of words.

    Words are the whitespace-separated tokens produced by ``str.split()``,
    so an empty or whitespace-only text counts zero words.

    Returns a dict with the single key ``"text_length"`` whose value is
    ``"short"`` (fewer than 5 words), ``"medium"`` (5 to 19 words) or
    ``"long"`` (20 words or more).
    """
    word_count = len(req.text.split())
    if word_count < _SHORT_BELOW:
        label = "short"
    elif word_count < _MEDIUM_BELOW:
        label = "medium"
    else:
        label = "long"
    return {"text_length": label}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
```python
2+
def word_count_classifier(text: str, short_below: int = 5, medium_below: int = 20) -> str:
    """
    @param text: text to check the length of.
    @param short_below: texts with fewer words than this are 'short' (default 5).
    @param medium_below: texts that are not 'short' and have fewer words than this are 'medium' (default 20).
    @return: either 'short', 'medium' or 'long' depending on the counted words.
    """
    # str.split() without arguments splits on any whitespace run, so an
    # empty or whitespace-only text yields zero words.
    word_count = len(text.split())
    if word_count < short_below:
        return "short"
    if word_count < medium_below:
        return "medium"
    return "long"
15+
16+
# ↑ necessary bricks function
17+
# -----------------------------------------------------------------------------------------
18+
# ↓ example implementation
19+
20+
def example_integration():
    """Print the length label of three sample texts of increasing length."""
    samples = (
        "This is short.",
        "This is a text with medium length.",
        "This is a longer text with many more words. There is even a second sentence with extra words. Splendid, what a joyful day!",
    )
    for sample in samples:
        label = word_count_classifier(sample)
        print(f"\"{sample}\" is -> {label}")
24+
25+
# Run the example when this snippet is executed.
example_integration()
26+
```
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
```python
2+
ATTRIBUTE: str = "text" # key of the record attribute to classify; only text attributes
3+
4+
def word_count_classifier(record):
    """Label the text stored under ATTRIBUTE in the record by its word count.

    Returns "short" for fewer than 5 words, "medium" for 5 to 19 words and
    "long" for 20 words or more.
    """
    word_total = len(record[ATTRIBUTE].text.split())
    if word_total >= 20:
        return "long"
    if word_total >= 5:
        return "medium"
    return "short"
13+
```
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
from util.configs import build_classifier_function_config
2+
from util.enums import State, RefineryDataType, SelectionType, BricksVariableType
3+
from . import word_count_classifier, INPUT_EXAMPLE
4+
5+
6+
def get_config():
    """Build the brick configuration for the word count classifier."""
    # Mapping of the brick's output labels for cognition.
    cognition_mapping = {
        "short": "Needs fix",
        "medium": "null",
        "long": "null",
    }

    # Bricks integrator information.
    attribute_variable = {
        "selectionType": SelectionType.STRING.value,
        "addInfo": [
            BricksVariableType.ATTRIBUTE.value,
            BricksVariableType.GENERIC_STRING.value,
        ],
    }
    integrator_settings = {
        "name": "word_count_classifier",
        "refineryDataType": RefineryDataType.TEXT.value,
        "outputs": ["short", "medium", "long"],
        "variables": {"ATTRIBUTE": attribute_variable},
    }

    return build_classifier_function_config(
        function=word_count_classifier,
        input_example=INPUT_EXAMPLE,
        issue_id=348,
        tabler_icon="RulerMeasure",
        min_refinery_version="1.7.0",
        state=State.PUBLIC.value,
        type="python_function",
        available_for=["refinery", "common"],
        # first entry should be parent directory
        part_of_group=["reference_quality", "text_analysis"],
        cognition_init_mapping=cognition_mapping,
        integrator_inputs=integrator_settings,
    )

0 commit comments

Comments
 (0)