
Commit c4c6ec7

Merge pull request #362 from code-kern-ai/tiktoken-token-counter
Tiktoken token counter
2 parents e060620 + b319c29 commit c4c6ec7

File tree

6 files changed: +101 −4 lines

generators/__init__.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -16,9 +16,7 @@
     domain_parser,
 )
 
-from .reference_chunking import (
-    newline_splitter
-)
+from .reference_chunking import newline_splitter
 
 from .search import (
     bing_news_search,
@@ -47,6 +45,7 @@
     phonetic_soundex,
     reading_time,
     syllable_count,
+    tiktoken_token_counter,
 )
 
 from .text_cleaning import html_cleanser, html_unescape
@@ -100,7 +99,8 @@
     bert_toxicity_detector,
     gpt_grammar_correction,
     gpt_tldr_summarization,
-    newline_splitter
+    tiktoken_token_counter,
+    newline_splitter,
 ]:
     module_name = module.__name__.split(".")[-1]
     model_name = (
```
Lines changed: 1 addition & 0 deletions
Uses OpenAI's tiktoken tokenizer library to count the number of tokens in a string. The tokenizer is used for the GPT models and converts text into integer token IDs. The conversion is reversible and lossless, meaning a tokenized sentence can be converted back into the original text. This brick returns the number of tokens in a given text.
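As a rough illustration of the behavior described above (a minimal sketch, assuming `tiktoken` is installed; the sample sentence is the commit's own input example), the snippet below counts tokens with the `cl100k_base` encoding and checks that decoding restores the original text:

```python
import tiktoken

# Minimal sketch of the encode -> count -> decode round trip described above.
encoding = tiktoken.get_encoding("cl100k_base")

text = "What a beautiful day to count tokens."
tokens = encoding.encode(text)            # list of integer token ids
print(len(tokens))                        # the count this brick returns
print(encoding.decode(tokens) == text)    # True: the conversion is lossless
```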
Lines changed: 20 additions & 0 deletions
```python
from pydantic import BaseModel
import tiktoken

INPUT_EXAMPLE = {
    "text": "What a beautiful day to count tokens."
}


class TiktokenTokenCounterModel(BaseModel):
    text: str

    class Config:
        schema_extra = {"example": INPUT_EXAMPLE}


def tiktoken_token_counter(req: TiktokenTokenCounterModel):
    """Uses the Tiktoken library to count tokens in a string"""
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(req.text)
    return {"token_length": len(tokens)}
```
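A hedged local usage sketch, not part of the commit, assuming the model, `INPUT_EXAMPLE`, and function defined in the snippet above: it builds the request model and calls the brick function directly.

```python
# Illustrative only: construct the request model and call the brick function.
request = TiktokenTokenCounterModel(text=INPUT_EXAMPLE["text"])
result = tiktoken_token_counter(request)
print(result)  # {"token_length": <number of tokens in the example text>}
```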
Lines changed: 23 additions & 0 deletions
```python
import tiktoken

def tiktoken_token_counter(text: str, encoding_name: str = "cl100k_base") -> int:
    """
    @param text: Text you want to count the number of tokens in
    @param encoding_name: Name of the tiktoken encoding to use
    @return: Integer with the token count
    """
    encoding = tiktoken.get_encoding(encoding_name)
    tokens = encoding.encode(text)
    return len(tokens)

# ↑ necessary bricks function
# -----------------------------------------------------------------------------------------
# ↓ example implementation

def example_integration():
    texts = ["This is a short text with few tokens.", "This is a second short text"]
    for text in texts:
        print(f"\"{text}\" -> {tiktoken_token_counter(text)}")

example_integration()
```
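The `encoding_name` parameter corresponds to the `ENCODING_NAME` choices exposed in the config further below. As a small, self-contained comparison (illustrative only, not part of the commit), the three allowed encodings can yield different counts for the same text because each uses its own vocabulary:

```python
import tiktoken

# Illustrative comparison of the encodings allowed in the brick config;
# counts may differ because each encoding has its own vocabulary.
text = "What a beautiful day to count tokens."
for name in ["cl100k_base", "p50k_base", "r50k_base"]:
    encoding = tiktoken.get_encoding(name)
    print(f"{name}: {len(encoding.encode(text))} tokens")
```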
Lines changed: 11 additions & 0 deletions
```python
import tiktoken

ATTRIBUTE: str = "text"  # only text attributes
ENCODING_NAME: str = "cl100k_base"

def tiktoken_token_counter(record):
    encoding = tiktoken.get_encoding(ENCODING_NAME)
    tokens = encoding.encode(record[ATTRIBUTE].text)
    return len(tokens)
```
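The refinery variant reads the configured `ATTRIBUTE` from a record and accesses its `.text` property. Below is a hedged sketch, not part of the commit, of how it could be exercised outside refinery; the mock record shape (a dict whose attribute value exposes `.text`) is an assumption made for illustration, not the actual refinery record API.

```python
from types import SimpleNamespace

import tiktoken

ATTRIBUTE: str = "text"
ENCODING_NAME: str = "cl100k_base"

# Copied from the snippet above so this sketch runs on its own.
def tiktoken_token_counter(record):
    encoding = tiktoken.get_encoding(ENCODING_NAME)
    tokens = encoding.encode(record[ATTRIBUTE].text)
    return len(tokens)

# Mock record: assumed shape, mirroring how the function above
# accesses record[ATTRIBUTE].text.
mock_record = {ATTRIBUTE: SimpleNamespace(text="What a beautiful day to count tokens.")}
print(tiktoken_token_counter(mock_record))
```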
Lines changed: 42 additions & 0 deletions
```python
from util.configs import build_generator_function_config
from util.enums import State, BricksVariableType, RefineryDataType, SelectionType
from . import tiktoken_token_counter, INPUT_EXAMPLE


def get_config():
    return build_generator_function_config(
        # strapi information
        function=tiktoken_token_counter,
        input_example=INPUT_EXAMPLE,
        issue_id=359,
        tabler_icon="SortAscendingNumbers",
        min_refinery_version="2.0.0",
        state=State.PUBLIC.value,
        type="python_function",
        available_for=["refinery", "common"],
        part_of_group=[
            "text_analytics",
        ],  # first entry should be parent directory
        # bricks integrator information
        integrator_inputs={
            "name": "tiktoken_token_counter",
            "refineryDataType": RefineryDataType.TEXT.value,
            "variables": {
                "ATTRIBUTE": {
                    "selectionType": SelectionType.CHOICE.value,
                    "addInfo": [
                        BricksVariableType.ATTRIBUTE.value,
                        BricksVariableType.GENERIC_STRING.value
                    ]
                },
                "ENCODING_NAME": {
                    "selectionType": SelectionType.CHOICE.value,
                    "defaultValue": "cl100k_base",
                    "allowedValues": ["cl100k_base", "p50k_base", "r50k_base"],
                    "addInfo": [
                        BricksVariableType.GENERIC_STRING.value
                    ]
                }
            }
        }
    )
```
