Commit eaaef97

Merge remote-tracking branch 'origin/main' into word-count-classifier
2 parents 94254a8 + b1a8392

File tree: 10 files changed, +178 −6 lines

classifiers/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -11,6 +11,7 @@
 
 from .reference_quality import (
     word_count_classifier,
+    special_character_classifier,
     chunked_sentence_complexity,
 )
 

@@ -63,6 +64,8 @@
     bert_sentiment_german,
     word_count_classifier,
     chunked_sentence_complexity,
+    special_character_classifier,
+    chunked_sentence_complexity
 ]:
     module_name = module.__name__.split(".")[-1]
     model_name = (

classifiers/reference_quality/chunked_sentence_complexity/config.py

Lines changed: 9 additions & 0 deletions
@@ -18,6 +18,15 @@ def get_config():
             "text_analysis"
         ], # first entry should be parent directory
         # bricks integrator information
+        cognition_init_mapping={
+            "very easy": "null",
+            "easy": "null",
+            "fairly easy": "null",
+            "standard": "null",
+            "fairly difficult": "Needs fix",
+            "difficult": "Needs fix",
+            "very difficult": "Needs fix",
+        },
         integrator_inputs={
             "name": "chunked_sentence_complexity",
             "refineryDataType": RefineryDataType.TEXT.value,
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+The purpose of this brick is to identify whether the given text contains any unusual characters. This can be useful for text preprocessing tasks, especially for checking reference material in RAG (Retrieval-Augmented Generation) use cases where you want to filter out text that contains unusual or unexpected characters.
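
As a rough sketch of the use case described above, reference chunks could be pre-filtered with the brick before indexing. The import path follows the package layout in this commit, while the example chunks and the filtering loop are made up for illustration.

```python
# Hypothetical RAG preprocessing step: drop reference chunks that contain
# characters outside the brick's default allowed range. The chunks below are
# invented; only the classifier and its response shape come from this commit.
from classifiers.reference_quality.special_character_classifier import (
    SpecialCharacterClassifierModel,
    special_character_classifier,
)

chunks = [
    "Plain reference text that should pass the check.",
    "Reference text with a stray control character \x07 that should be dropped.",
]

clean_chunks = [
    chunk
    for chunk in chunks
    if not special_character_classifier(SpecialCharacterClassifierModel(text=chunk))[
        "contains_special_char"
    ]
]
print(clean_chunks)  # only the first chunk remains
```
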
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+import unicodedata
+from typing import Optional, List, Tuple
+from pydantic import BaseModel
+
+INPUT_EXAMPLE = {
+    "text": "Super funny haha 😀.",
+    "allowedRange": None
+}
+
+ALLOWED_RANGE = set(range(32, 127)).union(  # Basic Latin
+    set(range(160, 255)),  # Latin-1 Supplement
+    set(range(256, 384)),  # Latin Extended-A
+    set(range(384, 592)),  # Latin Extended-B
+    set(range(8192, 8303)),  # General Punctuation
+    set(range(8352, 8399)),  # Currency Symbols
+    set([ord("\t"), ord("\n"), ord("\r")])  # common stop chars
+)
+
+class SpecialCharacterClassifierModel(BaseModel):
+    text: str
+    allowedRange: Optional[List[int]] = None
+
+    class Config:
+        schema_extra = {"example": INPUT_EXAMPLE}
+
+
+def special_character_classifier(req: SpecialCharacterClassifierModel):
+    """Checks if a string contains special characters"""
+    text = req.text
+    allowed_range = req.allowedRange
+    if allowed_range is None:
+        allowed_range = ALLOWED_RANGE
+
+    for char in text:
+        if ord(char) not in allowed_range and unicodedata.category(char) != "Zs":
+            return {"contains_special_char": True}
+    return {"contains_special_char": False}
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+```python
+import unicodedata
+from typing import List, Tuple
+
+DEFAULT_ALLOWED_RANGE = set(range(32, 127)).union(  # Basic Latin
+    set(range(160, 255)),  # Latin-1 Supplement
+    set(range(256, 384)),  # Latin Extended-A
+    set(range(384, 592)),  # Latin Extended-B
+    set(range(8192, 8303)),  # General Punctuation
+    set(range(8352, 8399)),  # Currency Symbols
+    set([ord("\t"), ord("\n"), ord("\r")])  # common stop chars
+)
+
+
+def special_character_classifier(text: str, allowed_range: List[int] = None) -> bool:
+    """
+    @param text: text to detect special characters in
+    @param allowed_range: whitelist of allowed Unicode code points
+    @return: True if the text contains special characters, else False
+    """
+
+    if allowed_range is None:
+        allowed_range = DEFAULT_ALLOWED_RANGE
+
+    for char in text:
+        if ord(char) not in allowed_range and unicodedata.category(char) != "Zs":
+            return True
+    return False
+
+
+# ↑ necessary bricks function
+# -----------------------------------------------------------------------------------------
+# ↓ example implementation
+
+def example_integration():
+    texts = ["This contains a special char 你好.", "Such a clean text, wow!", "This is a greek letter: α", "Super funny 😀", "Rainbows are very nice."]
+    for text in texts:
+        print(f"\"{text}\" -> {special_character_classifier(text)}")
+
+example_integration()
+```
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+```python
+import unicodedata
+from typing import Optional, List, Tuple
+
+ATTRIBUTE: str = "text"  # only text attributes
+LABEL: str = "has_special_character"
+ALLOWED_RANGE: List[int] = None  # list of integers that represent Unicode code points
+
+def special_character_classifier(record):
+    text = record[ATTRIBUTE].text
+
+    allowed = ALLOWED_RANGE
+    if not allowed:
+        allowed = default_allowed_values
+    for char in text:
+        if ord(char) not in allowed and unicodedata.category(char) != "Zs":
+            return LABEL
+
+default_allowed_values = set(range(32, 127)).union(  # Basic Latin
+    set(range(160, 255)),  # Latin-1 Supplement
+    set(range(256, 384)),  # Latin Extended-A
+    set(range(384, 592)),  # Latin Extended-B
+    set(range(8192, 8303)),  # General Punctuation
+    set(range(8352, 8399)),  # Currency Symbols
+    set([ord("\t"), ord("\n"), ord("\r")])  # common stop chars
+)
+```
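
For completeness, a hedged sketch of how the refinery-style snippet above could be exercised locally with its definitions in scope; in refinery itself `record[ATTRIBUTE]` is a spaCy document, so the `MockDoc` stand-in below is invented purely for illustration.

```python
# Hypothetical local test harness for the refinery snippet: a record is a dict
# whose attribute value exposes .text, mimicking a spaCy document.
class MockDoc:
    def __init__(self, text: str):
        self.text = text


print(special_character_classifier({"text": MockDoc("Contains 你好.")}))
# -> "has_special_character"
print(special_character_classifier({"text": MockDoc("Such a clean text, wow!")}))
# -> None (no label is returned for clean text)
```
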
Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
+from util.configs import build_classifier_function_config
+from util.enums import State, BricksVariableType, RefineryDataType, SelectionType
+from . import special_character_classifier, INPUT_EXAMPLE
+
+
+def get_config():
+    return build_classifier_function_config(
+        # strapi information
+        function=special_character_classifier,
+        input_example=INPUT_EXAMPLE,
+        issue_id=345,
+        tabler_icon="LanguageKatakana",
+        min_refinery_version="1.7.0",
+        state=State.PUBLIC.value,
+        type="python_function",
+        available_for=["refinery", "common"],
+        part_of_group=[
+            "reference_quality",
+        ], # first entry should be parent directory
+        # bricks integrator information
+        cognition_init_mapping={
+            "@@LABEL@@": "Needs fix",
+        },
+        integrator_inputs={
+            "name": "special_character_classifier",
+            "refineryDataType": RefineryDataType.TEXT.value,
+            "variables": {
+                "ATTRIBUTE": {
+                    "selectionType": SelectionType.CHOICE.value,
+                    "addInfo": [
+                        BricksVariableType.ATTRIBUTE.value,
+                        BricksVariableType.GENERIC_STRING.value
+                    ]
+                },
+                "ALLOWED_RANGE": {
+                    "selectionType": SelectionType.LIST.value,
+                    "optional": "true",
+                    "addInfo": [
+                        BricksVariableType.GENERIC_INT.value
+                    ]
+                },
+                "LABEL": {
+                    "selectionType": SelectionType.CHOICE.value,
+                    "defaultValue": "has_special_character",
+                    "addInfo": [
+                        BricksVariableType.LABEL.value,
+                        BricksVariableType.GENERIC_STRING.value
+                    ]
+                }
+            }
+        }
+    )

classifiers/text_analysis/language_detection/__init__.py

Lines changed: 3 additions & 5 deletions
@@ -17,8 +17,6 @@ def language_detection(request: LanguageDetectionModel):
     """Detects the language of a given text."""
 
     text = request.text
-    try:
-        language = detect(text)
-        return {"language": language}
-    except LangDetectException:
-        return "No language detected."
+    if not text or not text.strip():
+        return {"language": "unknown"}
+    return {"language": detect(text)}

classifiers/text_analysis/language_detection/code_snippet_common.md

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,8 @@ def language_detection(text:str)->str:
     @param text: text to check
     @return: language iso code. Full list here https://github.com/Mimino666/langdetect#languages
     """
+    if not text or not text.strip():
+        return "unknown"
     return detect(text)
 
 # ↑ necessary bricks function

classifiers/text_analysis/language_detection/code_snippet_refinery.md

Lines changed: 3 additions & 1 deletion
@@ -5,5 +5,7 @@ ATTRIBUTE: str = "text" #only text attributes
 
 def language_detection(record):
     text = record[ATTRIBUTE].text  # SpaCy document, hence we need to call .text to get the string
-    return detect(text)  # e.g. "en"
+    if not text or not text.strip():
+        return "unknown"
+    return detect(text)
 ```
