Merge pull request #364 from code-kern-ai/word-count-classifier

LeonardPuettmannKern · web-flow · commit 2c80e776a5b5 · 2023-10-17T14:12:19.000+02:00
Word count classifier
diff --git a/classifiers/__init__.py b/classifiers/__init__.py
@@ -14,6 +14,7 @@
 )
 
 from .reference_quality import (
+    word_count_classifier,
     special_character_classifier,
     chunked_sentence_complexity,
 )
@@ -65,6 +66,8 @@
     workday_classifier,
     deberta_review_classifier, 
     bert_sentiment_german,
+    word_count_classifier,
+    chunked_sentence_complexity,
     special_character_classifier,
     chunked_sentence_complexity,
     question_type_classifier
diff --git a/classifiers/reference_quality/word_count_classifier/README.md b/classifiers/reference_quality/word_count_classifier/README.md
@@ -0,0 +1,2 @@
+This simple brick counts the number of words in a string by splitting it on whitespace. If fewer than five words are found, it returns "short". If there are fewer than 20, it returns "medium",
+and with 20 or more, it returns "long". This brick can be used to check the quality of references for RAG (Retrieval Augmented Generation) use cases.
diff --git a/classifiers/reference_quality/word_count_classifier/__init__.py b/classifiers/reference_quality/word_count_classifier/__init__.py
@@ -0,0 +1,22 @@
+from pydantic import BaseModel
+
+# Example request payload; it has four words, so the classifier labels it "short".
+INPUT_EXAMPLE = {"text": "This is too short!"}
+
+
+class WordCountClassifierModel(BaseModel):
+    """Request model for the word count classifier."""
+
+    # Text whose words are counted.
+    text: str
+
+    class Config:
+        # Attach the example payload to the model's schema.
+        schema_extra = {"example": INPUT_EXAMPLE}
+
+
+def word_count_classifier(req: WordCountClassifierModel):
+    """Classify a text as short, medium or long based on its word count.
+
+    Words are the whitespace-separated tokens of ``req.text``. Fewer than
+    5 words yields "short", fewer than 20 yields "medium", and 20 or more
+    yields "long". The label is returned under the "text_length" key.
+    """
+    word_count = len(req.text.split())
+    if word_count >= 20:
+        label = "long"
+    elif word_count >= 5:
+        label = "medium"
+    else:
+        label = "short"
+    return {"text_length": label}
diff --git a/classifiers/reference_quality/word_count_classifier/code_snippet_common.md b/classifiers/reference_quality/word_count_classifier/code_snippet_common.md
@@ -0,0 +1,26 @@
+```python
+def word_count_classifier(text: str) -> str:
+    """
+    Classify a text by its number of whitespace-separated words.
+
+    @param text: text to check the length of.
+    @return: 'short' for fewer than 5 words, 'medium' for fewer than 20, otherwise 'long'.
+    """
+    word_count = len(text.split())
+    if word_count < 5:
+        return "short"
+    if word_count < 20:
+        return "medium"
+    return "long"
+
+# ↑ necessary bricks function 
+# -----------------------------------------------------------------------------------------
+# ↓ example implementation 
+
+def example_integration():
+    """Print the label assigned to one short, one medium and one long example text."""
+    samples = [
+        "This is short.",
+        "This is a text with medium length.",
+        "This is a longer text with many more words. There is even a second sentence with extra words. Splendid, what a joyful day!",
+    ]
+    for text in samples:
+        print(f"\"{text}\" is -> {word_count_classifier(text)}")
+
+example_integration()
+```
diff --git a/classifiers/reference_quality/word_count_classifier/code_snippet_refinery.md b/classifiers/reference_quality/word_count_classifier/code_snippet_refinery.md
@@ -0,0 +1,13 @@
+```python
+ATTRIBUTE: str = "text" # only text attributes
+
+def word_count_classifier(record):
+    """Label the configured attribute of a record as short, medium or long by word count."""
+    word_count = len(record[ATTRIBUTE].text.split())
+    # Exclusive upper bounds, checked in ascending order; anything at or above the last bound is "long".
+    for upper_bound, label in ((5, "short"), (20, "medium")):
+        if word_count < upper_bound:
+            return label
+    return "long"
+```
diff --git a/classifiers/reference_quality/word_count_classifier/config.py b/classifiers/reference_quality/word_count_classifier/config.py
@@ -0,0 +1,45 @@
+from util.configs import build_classifier_function_config
+from util.enums import State, RefineryDataType, SelectionType, BricksVariableType
+from . import word_count_classifier, INPUT_EXAMPLE
+
+
+def get_config():
+    """Build the bricks config for the word count classifier.
+
+    All metadata is handed to ``build_classifier_function_config`` and its
+    result is returned unchanged.
+    """
+    # mapping labels for cognition
+    cognition_init_mapping = {
+        "short": "Needs fix",
+        "medium": "null",
+        "long": "null",
+    }
+    # bricks integrator information
+    integrator_inputs = {
+        "name": "word_count_classifier",
+        "refineryDataType": RefineryDataType.TEXT.value,
+        "outputs": ["short", "medium", "long"],
+        "variables": {
+            "ATTRIBUTE": {
+                "selectionType": SelectionType.STRING.value,
+                "addInfo": [
+                    BricksVariableType.ATTRIBUTE.value,
+                    BricksVariableType.GENERIC_STRING.value,
+                ],
+            }
+        },
+    }
+    return build_classifier_function_config(
+        function=word_count_classifier,
+        input_example=INPUT_EXAMPLE,
+        issue_id=348,
+        tabler_icon="RulerMeasure",
+        min_refinery_version="1.7.0",
+        state=State.PUBLIC.value,
+        type="python_function",
+        available_for=["refinery", "common"],
+        # first entry should be parent directory
+        part_of_group=["reference_quality", "text_analysis"],
+        cognition_init_mapping=cognition_init_mapping,
+        integrator_inputs=integrator_inputs,
+    )

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+This simple brick counts the number of words in a string by splitting it on whitespace. If fewer than five words are found, it returns "short". If there are fewer than 20, it returns "medium",`
	`2`	`+and with 20 or more, it returns "long". This brick can be used to check the quality of references for RAG (Retrieval Augmented Generation) use cases.`