
Commit c4c6ec7

Merge pull request #362 from code-kern-ai/tiktoken-token-counter
Tiktoken token counter
2 parents e060620 + b319c29 commit c4c6ec7

File tree

6 files changed: +101 −4 lines

generators/__init__.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -16,9 +16,7 @@
     domain_parser,
 )
 
-from .reference_chunking import (
-    newline_splitter
-)
+from .reference_chunking import newline_splitter
 
 from .search import (
     bing_news_search,
@@ -47,6 +45,7 @@
     phonetic_soundex,
     reading_time,
     syllable_count,
+    tiktoken_token_counter,
 )
 
 from .text_cleaning import html_cleanser, html_unescape
@@ -100,7 +99,8 @@
     bert_toxicity_detector,
     gpt_grammar_correction,
     gpt_tldr_summarization,
-    newline_splitter
+    tiktoken_token_counter,
+    newline_splitter,
 ]:
     module_name = module.__name__.split(".")[-1]
     model_name = (
```
Lines changed: 1 addition & 0 deletions
Uses OpenAI's tiktoken tokenizer library to count the number of tokens in a string. The tokenizer is used for the GPT models and converts text into integer token IDs. The conversion is reversible and lossless, meaning a tokenized sentence can be converted back into the original text. This brick returns the number of tokens in a given text.
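As a rough illustration of the behavior described above (a minimal sketch, assuming `tiktoken` is installed; the sample sentence is the commit's own input example), the snippet below counts tokens with the `cl100k_base` encoding and checks that decoding restores the original text:

```python
import tiktoken

# Minimal sketch of the encode -> count -> decode round trip described above.
encoding = tiktoken.get_encoding("cl100k_base")

text = "What a beautiful day to count tokens."
tokens = encoding.encode(text)            # list of integer token ids
print(len(tokens))                        # the count this brick returns
print(encoding.decode(tokens) == text)    # True: the conversion is lossless
```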
Lines changed: 20 additions & 0 deletions
```python
from pydantic import BaseModel
import tiktoken

INPUT_EXAMPLE = {
    "text": "What a beautiful day to count tokens."
}


class TiktokenTokenCounterModel(BaseModel):
    text: str

    class Config:
        schema_extra = {"example": INPUT_EXAMPLE}


def tiktoken_token_counter(req: TiktokenTokenCounterModel):
    """Uses the Tiktoken library to count tokens in a string"""
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(req.text)
    return {"token_length": len(tokens)}
```
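A hedged local usage sketch, not part of the commit, assuming the model, `INPUT_EXAMPLE`, and function defined in the snippet above: it builds the request model and calls the brick function directly.

```python
# Illustrative only: construct the request model and call the brick function.
request = TiktokenTokenCounterModel(text=INPUT_EXAMPLE["text"])
result = tiktoken_token_counter(request)
print(result)  # {"token_length": <number of tokens in the example text>}
```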
Lines changed: 23 additions & 0 deletions
```python
import tiktoken

def tiktoken_token_counter(text: str, encoding_name: str = "cl100k_base") -> int:
    """
    @param text: Text you want to count the number of tokens in
    @param encoding_name: Name of the tiktoken encoding to use
    @return: Integer with the token count
    """
    encoding = tiktoken.get_encoding(encoding_name)
    tokens = encoding.encode(text)
    return len(tokens)

# ↑ necessary bricks function
# -----------------------------------------------------------------------------------------
# ↓ example implementation

def example_integration():
    texts = ["This is a short text with few tokens.", "This is a second short text"]
    for text in texts:
        print(f"\"{text}\" -> {tiktoken_token_counter(text)}")

example_integration()
```
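The `encoding_name` parameter corresponds to the `ENCODING_NAME` choices exposed in the config further below. As a small, self-contained comparison (illustrative only, not part of the commit), the three allowed encodings can yield different counts for the same text because each uses its own vocabulary:

```python
import tiktoken

# Illustrative comparison of the encodings allowed in the brick config;
# counts may differ because each encoding has its own vocabulary.
text = "What a beautiful day to count tokens."
for name in ["cl100k_base", "p50k_base", "r50k_base"]:
    encoding = tiktoken.get_encoding(name)
    print(f"{name}: {len(encoding.encode(text))} tokens")
```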
Lines changed: 11 additions & 0 deletions
```python
import tiktoken

ATTRIBUTE: str = "text"  # only text attributes
ENCODING_NAME: str = "cl100k_base"

def tiktoken_token_counter(record):
    encoding = tiktoken.get_encoding(ENCODING_NAME)
    tokens = encoding.encode(record[ATTRIBUTE].text)
    return len(tokens)
```
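The refinery variant reads the configured `ATTRIBUTE` from a record and accesses its `.text` property. Below is a hedged sketch, not part of the commit, of how it could be exercised outside refinery; the mock record shape (a dict whose attribute value exposes `.text`) is an assumption made for illustration, not the actual refinery record API.

```python
from types import SimpleNamespace

import tiktoken

ATTRIBUTE: str = "text"
ENCODING_NAME: str = "cl100k_base"

# Copied from the snippet above so this sketch runs on its own.
def tiktoken_token_counter(record):
    encoding = tiktoken.get_encoding(ENCODING_NAME)
    tokens = encoding.encode(record[ATTRIBUTE].text)
    return len(tokens)

# Mock record: assumed shape, mirroring how the function above
# accesses record[ATTRIBUTE].text.
mock_record = {ATTRIBUTE: SimpleNamespace(text="What a beautiful day to count tokens.")}
print(tiktoken_token_counter(mock_record))
```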
Lines changed: 42 additions & 0 deletions
```python
from util.configs import build_generator_function_config
from util.enums import State, BricksVariableType, RefineryDataType, SelectionType
from . import tiktoken_token_counter, INPUT_EXAMPLE


def get_config():
    return build_generator_function_config(
        # strapi information
        function=tiktoken_token_counter,
        input_example=INPUT_EXAMPLE,
        issue_id=359,
        tabler_icon="SortAscendingNumbers",
        min_refinery_version="2.0.0",
        state=State.PUBLIC.value,
        type="python_function",
        available_for=["refinery", "common"],
        part_of_group=[
            "text_analytics",
        ],  # first entry should be parent directory
        # bricks integrator information
        integrator_inputs={
            "name": "tiktoken_token_counter",
            "refineryDataType": RefineryDataType.TEXT.value,
            "variables": {
                "ATTRIBUTE": {
                    "selectionType": SelectionType.CHOICE.value,
                    "addInfo": [
                        BricksVariableType.ATTRIBUTE.value,
                        BricksVariableType.GENERIC_STRING.value
                    ]
                },
                "ENCODING_NAME": {
                    "selectionType": SelectionType.CHOICE.value,
                    "defaultValue": "cl100k_base",
                    "allowedValues": ["cl100k_base", "p50k_base", "r50k_base"],
                    "addInfo": [
                        BricksVariableType.GENERIC_STRING.value
                    ]
                }
            }
        }
    )
```
