Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.

Commit d792919

Browse files
xfalco and romanrizzi
authored
DEV: Move tokenizers to a gem (#1481)
Also renames the Mixtral tokenizer to Mistral. See gem at github.com/discourse/discourse_ai-tokenizers Co-authored-by: Roman Rizzi <roman@discourse.org>
1 parent 75fb371 commit d792919

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+132
-5670685
lines changed

.gitignore

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,4 @@ evals/log
66
evals/cases
77
config/eval-llms.local.yml
88
# this gets rid of search results from ag, ripgrep, etc
9-
tokenizers/
109
public/ai-share/highlight.min.js

app/models/embedding_definition.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def tokenizer_names
2323
DiscourseAi::Tokenizer::GeminiTokenizer,
2424
DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer,
2525
DiscourseAi::Tokenizer::OpenAiTokenizer,
26-
DiscourseAi::Tokenizer::MixtralTokenizer,
26+
DiscourseAi::Tokenizer::MistralTokenizer,
2727
DiscourseAi::Tokenizer::QwenTokenizer,
2828
].map(&:name)
2929
end
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# frozen_string_literal: true
2+
3+
class RenameMixtralTokenizerToMistralTokenizer < ActiveRecord::Migration[7.2]
4+
def up
5+
execute <<~SQL
6+
UPDATE
7+
llm_models
8+
SET
9+
tokenizer = 'DiscourseAi::Tokenizer::Mistral'
10+
WHERE
11+
tokenizer = 'DiscourseAi::Tokenizer::Mixtral'
12+
SQL
13+
14+
execute <<~SQL
15+
UPDATE
16+
embedding_definitions
17+
SET
18+
tokenizer_class = 'DiscourseAi::Tokenizer::Mistral'
19+
WHERE
20+
tokenizer_class = 'DiscourseAi::Tokenizer::Mixtral'
21+
SQL
22+
end
23+
24+
def down
25+
execute <<~SQL
26+
UPDATE
27+
llm_models
28+
SET
29+
tokenizer = 'DiscourseAi::Tokenizer::Mixtral'
30+
WHERE
31+
tokenizer = 'DiscourseAi::Tokenizer::Mistral'
32+
SQL
33+
34+
execute <<~SQL
35+
UPDATE
36+
embedding_definitions
37+
SET
38+
tokenizer_class = 'DiscourseAi::Tokenizer::Mixtral'
39+
WHERE
40+
tokenizer_class = 'DiscourseAi::Tokenizer::Mistral'
41+
SQL
42+
end
43+
end

lib/automation/llm_triage.rb

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,12 @@ def self.handle(
4040

4141
content = "title: #{post.topic.title}\n#{post.raw}"
4242

43-
content = llm.tokenizer.truncate(content, max_post_tokens) if max_post_tokens.present?
43+
content =
44+
llm.tokenizer.truncate(
45+
content,
46+
max_post_tokens,
47+
strict: SiteSetting.ai_strict_token_counting,
48+
) if max_post_tokens.present?
4449

4550
if post.upload_ids.present?
4651
content = [content]

lib/automation/report_context_generator.rb

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,12 @@ def format_post(post)
9999
buffer << post.created_at.strftime("%Y-%m-%d %H:%M")
100100
buffer << "user: #{post.user&.username}"
101101
buffer << "likes: #{post.like_count}"
102-
excerpt = @tokenizer.truncate(post.raw, @tokens_per_post)
102+
excerpt =
103+
@tokenizer.truncate(
104+
post.raw,
105+
@tokens_per_post,
106+
strict: SiteSetting.ai_strict_token_counting,
107+
)
103108
excerpt = "excerpt: #{excerpt}..." if excerpt.length < post.raw.length
104109
buffer << "#{excerpt}"
105110
{ likes: post.like_count, info: buffer.join("\n") }

lib/completions/dialects/dialect.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ def trim_messages(messages)
147147
system_message[:content] = tokenizer.truncate(
148148
system_message[:content],
149149
max_system_tokens,
150+
strict: SiteSetting.ai_strict_token_counting,
150151
)
151152
end
152153

lib/completions/llm.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ def presets
190190
display_name: "Pixtral Large",
191191
},
192192
],
193-
tokenizer: DiscourseAi::Tokenizer::MixtralTokenizer,
193+
tokenizer: DiscourseAi::Tokenizer::MistralTokenizer,
194194
endpoint: "https://api.mistral.ai/v1/chat/completions",
195195
provider: "mistral",
196196
},

lib/embeddings/strategies/truncation.rb

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,11 @@ def prepare_target_text(target, vdef)
2222
when Post
2323
post_truncation(target, vdef.tokenizer, max_length)
2424
when RagDocumentFragment
25-
vdef.tokenizer.truncate(target.fragment, max_length)
25+
vdef.tokenizer.truncate(
26+
target.fragment,
27+
max_length,
28+
strict: SiteSetting.ai_strict_token_counting,
29+
)
2630
else
2731
raise ArgumentError, "Invalid target type"
2832
end
@@ -36,7 +40,7 @@ def prepare_query_text(text, vdef, asymetric: false)
3640
qtext = asymetric ? "#{vdef.search_prompt} #{text}" : text
3741
max_length = vdef.max_sequence_length - 2
3842

39-
vdef.tokenizer.truncate(qtext, max_length)
43+
vdef.tokenizer.truncate(qtext, max_length, strict: SiteSetting.ai_strict_token_counting)
4044
end
4145

4246
private
@@ -74,7 +78,7 @@ def topic_truncation(topic, tokenizer, max_length)
7478
text << "\n\n"
7579
end
7680

77-
tokenizer.truncate(text, max_length)
81+
tokenizer.truncate(text, max_length, strict: SiteSetting.ai_strict_token_counting)
7882
end
7983

8084
def post_truncation(post, tokenizer, max_length)
@@ -86,7 +90,7 @@ def post_truncation(post, tokenizer, max_length)
8690
text << Nokogiri::HTML5.fragment(post.cooked).text
8791
end
8892

89-
tokenizer.truncate(text, max_length)
93+
tokenizer.truncate(text, max_length, strict: SiteSetting.ai_strict_token_counting)
9094
end
9195
end
9296
end

lib/personas/question_consolidator.rb

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,12 @@ def revised_prompt
4242
truncated_content = content
4343

4444
if current_tokens > allowed_tokens
45-
truncated_content = @llm.tokenizer.truncate(content, allowed_tokens)
45+
truncated_content =
46+
@llm.tokenizer.truncate(
47+
content,
48+
allowed_tokens,
49+
strict: SiteSetting.ai_strict_token_counting,
50+
)
4651
current_tokens = allowed_tokens
4752
end
4853

lib/personas/tool_runner.rb

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,9 @@ def rag_search(query, filenames: nil, limit: 10)
278278
def attach_truncate(mini_racer_context)
279279
mini_racer_context.attach(
280280
"_llm_truncate",
281-
->(text, length) { @llm.tokenizer.truncate(text, length) },
281+
->(text, length) do
282+
@llm.tokenizer.truncate(text, length, strict: SiteSetting.ai_strict_token_counting)
283+
end,
282284
)
283285

284286
mini_racer_context.attach(

0 commit comments

Comments
 (0)