From 3e3177e4152c8543f859b99b4ec5238c4e5a1f8c Mon Sep 17 00:00:00 2001 From: questcollector Date: Fri, 4 Jul 2025 16:02:35 +0900 Subject: [PATCH] fix: add disallowed_special on tiktoken encode --- ragas/src/ragas/testset/transforms/base.py | 2 +- ragas/src/ragas/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ragas/src/ragas/testset/transforms/base.py b/ragas/src/ragas/testset/transforms/base.py index 49945e482..537b8e153 100644 --- a/ragas/src/ragas/testset/transforms/base.py +++ b/ragas/src/ragas/testset/transforms/base.py @@ -199,7 +199,7 @@ class LLMBasedExtractor(Extractor, PromptMixin): def split_text_by_token_limit(self, text, max_token_limit): # Tokenize the entire input string - tokens = self.tokenizer.encode(text) + tokens = self.tokenizer.encode(text, disallowed_special=()) # Split tokens into chunks of max_token_limit or less chunks = [] diff --git a/ragas/src/ragas/utils.py b/ragas/src/ragas/utils.py index b24818427..82c244419 100644 --- a/ragas/src/ragas/utils.py +++ b/ragas/src/ragas/utils.py @@ -225,7 +225,7 @@ def camel_to_snake(name): def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int: """Returns the number of tokens in a text string.""" encoding = tiktoken.get_encoding(encoding_name) - num_tokens = len(encoding.encode(string)) + num_tokens = len(encoding.encode(string, disallowed_special=())) return num_tokens