1 change: 0 additions & 1 deletion .gitignore
@@ -6,5 +6,4 @@ evals/log
evals/cases
config/eval-llms.local.yml
# this gets rid of search results from ag, ripgrep, etc
tokenizers/
public/ai-share/highlight.min.js
2 changes: 1 addition & 1 deletion app/models/embedding_definition.rb
@@ -23,7 +23,7 @@ def tokenizer_names
DiscourseAi::Tokenizer::GeminiTokenizer,
DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer,
DiscourseAi::Tokenizer::OpenAiTokenizer,
DiscourseAi::Tokenizer::MixtralTokenizer,
DiscourseAi::Tokenizer::MistralTokenizer,
DiscourseAi::Tokenizer::QwenTokenizer,
].map(&:name)
end
@@ -0,0 +1,43 @@
# frozen_string_literal: true

# Point persisted tokenizer class names at the renamed MistralTokenizer.
class RenameMixtralTokenizerToMistralTokenizer < ActiveRecord::Migration[7.2]
  def up
    execute <<~SQL
      UPDATE
        llm_models
      SET
        tokenizer = 'DiscourseAi::Tokenizer::MistralTokenizer'
      WHERE
        tokenizer = 'DiscourseAi::Tokenizer::MixtralTokenizer'
    SQL

    execute <<~SQL
      UPDATE
        embedding_definitions
      SET
        tokenizer_class = 'DiscourseAi::Tokenizer::MistralTokenizer'
      WHERE
        tokenizer_class = 'DiscourseAi::Tokenizer::MixtralTokenizer'
    SQL
  end

  def down
    execute <<~SQL
      UPDATE
        llm_models
      SET
        tokenizer = 'DiscourseAi::Tokenizer::MixtralTokenizer'
      WHERE
        tokenizer = 'DiscourseAi::Tokenizer::MistralTokenizer'
    SQL

    execute <<~SQL
      UPDATE
        embedding_definitions
      SET
        tokenizer_class = 'DiscourseAi::Tokenizer::MixtralTokenizer'
      WHERE
        tokenizer_class = 'DiscourseAi::Tokenizer::MistralTokenizer'
    SQL
  end
end
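As a quick post-deploy sanity check for this migration, something like the sketch below could confirm that no rows still reference the old class name. This is a hedged sketch, not part of the PR: it assumes a Rails console and that LlmModel and EmbeddingDefinition are the ActiveRecord models backing the two tables updated above.

# Sketch only: confirm the rename reached every row (Rails console).
old_name = "DiscourseAi::Tokenizer::MixtralTokenizer"
stale_llms = LlmModel.where(tokenizer: old_name).count
stale_defs = EmbeddingDefinition.where(tokenizer_class: old_name).count
puts "llm_models still on the old tokenizer: #{stale_llms}"
puts "embedding_definitions still on the old tokenizer: #{stale_defs}"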
7 changes: 6 additions & 1 deletion lib/automation/llm_triage.rb
@@ -40,7 +40,12 @@ def self.handle(

content = "title: #{post.topic.title}\n#{post.raw}"

content = llm.tokenizer.truncate(content, max_post_tokens) if max_post_tokens.present?
content =
llm.tokenizer.truncate(
content,
max_post_tokens,
strict: SiteSetting.ai_strict_token_counting,
) if max_post_tokens.present?

if post.upload_ids.present?
content = [content]
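The same strict: SiteSetting.ai_strict_token_counting argument is threaded through every truncate call in this PR. If that repetition ever becomes a burden, a small helper along the lines of the hypothetical sketch below could centralize it; the module and method names here are invented for illustration and are not part of this change.

# Hypothetical helper (not in this PR): wrap the strict flag once so call
# sites only pass the tokenizer, the text, and the token budget.
module DiscourseAi
  module TokenBudget
    def self.truncate(tokenizer, text, max_tokens)
      tokenizer.truncate(text, max_tokens, strict: SiteSetting.ai_strict_token_counting)
    end
  end
end

# Usage sketch mirroring the llm_triage.rb change above:
# content = DiscourseAi::TokenBudget.truncate(llm.tokenizer, content, max_post_tokens) if max_post_tokens.present?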
7 changes: 6 additions & 1 deletion lib/automation/report_context_generator.rb
@@ -99,7 +99,12 @@ def format_post(post)
buffer << post.created_at.strftime("%Y-%m-%d %H:%M")
buffer << "user: #{post.user&.username}"
buffer << "likes: #{post.like_count}"
excerpt = @tokenizer.truncate(post.raw, @tokens_per_post)
excerpt =
@tokenizer.truncate(
post.raw,
@tokens_per_post,
strict: SiteSetting.ai_strict_token_counting,
)
excerpt = "excerpt: #{excerpt}..." if excerpt.length < post.raw.length
buffer << "#{excerpt}"
{ likes: post.like_count, info: buffer.join("\n") }
1 change: 1 addition & 0 deletions lib/completions/dialects/dialect.rb
@@ -147,6 +147,7 @@ def trim_messages(messages)
system_message[:content] = tokenizer.truncate(
system_message[:content],
max_system_tokens,
strict: SiteSetting.ai_strict_token_counting,
)
end

2 changes: 1 addition & 1 deletion lib/completions/llm.rb
@@ -190,7 +190,7 @@ def presets
display_name: "Pixtral Large",
},
],
tokenizer: DiscourseAi::Tokenizer::MixtralTokenizer,
tokenizer: DiscourseAi::Tokenizer::MistralTokenizer,
endpoint: "https://api.mistral.ai/v1/chat/completions",
provider: "mistral",
},
12 changes: 8 additions & 4 deletions lib/embeddings/strategies/truncation.rb
@@ -22,7 +22,11 @@ def prepare_target_text(target, vdef)
when Post
post_truncation(target, vdef.tokenizer, max_length)
when RagDocumentFragment
vdef.tokenizer.truncate(target.fragment, max_length)
vdef.tokenizer.truncate(
target.fragment,
max_length,
strict: SiteSetting.ai_strict_token_counting,
)
else
raise ArgumentError, "Invalid target type"
end
@@ -36,7 +40,7 @@ def prepare_query_text(text, vdef, asymetric: false)
qtext = asymetric ? "#{vdef.search_prompt} #{text}" : text
max_length = vdef.max_sequence_length - 2

vdef.tokenizer.truncate(qtext, max_length)
vdef.tokenizer.truncate(qtext, max_length, strict: SiteSetting.ai_strict_token_counting)
end

private
@@ -74,7 +78,7 @@ def topic_truncation(topic, tokenizer, max_length)
text << "\n\n"
end

tokenizer.truncate(text, max_length)
tokenizer.truncate(text, max_length, strict: SiteSetting.ai_strict_token_counting)
end

def post_truncation(post, tokenizer, max_length)
@@ -86,7 +90,7 @@ def post_truncation(post, tokenizer, max_length)
text << Nokogiri::HTML5.fragment(post.cooked).text
end

tokenizer.truncate(text, max_length)
tokenizer.truncate(text, max_length, strict: SiteSetting.ai_strict_token_counting)
end
end
end
7 changes: 6 additions & 1 deletion lib/personas/question_consolidator.rb
@@ -42,7 +42,12 @@ def revised_prompt
truncated_content = content

if current_tokens > allowed_tokens
truncated_content = @llm.tokenizer.truncate(content, allowed_tokens)
truncated_content =
@llm.tokenizer.truncate(
content,
allowed_tokens,
strict: SiteSetting.ai_strict_token_counting,
)
current_tokens = allowed_tokens
end

4 changes: 3 additions & 1 deletion lib/personas/tool_runner.rb
@@ -278,7 +278,9 @@ def rag_search(query, filenames: nil, limit: 10)
def attach_truncate(mini_racer_context)
mini_racer_context.attach(
"_llm_truncate",
->(text, length) { @llm.tokenizer.truncate(text, length) },
->(text, length) do
@llm.tokenizer.truncate(text, length, strict: SiteSetting.ai_strict_token_counting)
end,
)

mini_racer_context.attach(
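For context, MiniRacer::Context#attach exposes the Ruby lambda to the embedded JavaScript context under the given name, so tool scripts can call _llm_truncate(text, length) directly. A standalone sketch of that round trip is below; it assumes the mini_racer gem, and the tokenizer stands in for the plugin's real @llm.tokenizer.

# Standalone sketch of the attach/eval mechanism used in tool_runner.rb above.
require "mini_racer"

ctx = MiniRacer::Context.new
ctx.attach(
  "_llm_truncate",
  ->(text, length) do
    # Stand-in for @llm.tokenizer; any DiscourseAi::Tokenizer::* class works here.
    DiscourseAi::Tokenizer::OpenAiTokenizer.truncate(
      text,
      length,
      strict: SiteSetting.ai_strict_token_counting,
    )
  end,
)

# JavaScript side: the attached name is now callable as a global function.
ctx.eval("_llm_truncate('some very long tool output', 32)")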
5 changes: 4 additions & 1 deletion lib/personas/tools/google.rb
@@ -70,7 +70,10 @@ def minimize_field(result, field, llm, max_tokens: 100)
data = result[field]
return "" if data.blank?

llm.tokenizer.truncate(data, max_tokens).squish
llm
.tokenizer
.truncate(data, max_tokens, strict: SiteSetting.ai_strict_token_counting)
.squish
end

def parse_search_json(json_data, escaped_query, llm)
7 changes: 6 additions & 1 deletion lib/personas/tools/setting_context.rb
@@ -99,7 +99,12 @@ def invoke

result.gsub!(/^#{Regexp.escape(Rails.root.to_s)}/, "")

result = llm.tokenizer.truncate(result, MAX_CONTEXT_TOKENS)
result =
llm.tokenizer.truncate(
result,
MAX_CONTEXT_TOKENS,
strict: SiteSetting.ai_strict_token_counting,
)

{ setting_name: setting_name, context: result }
end
2 changes: 1 addition & 1 deletion lib/personas/tools/tool.rb
@@ -255,7 +255,7 @@ def truncate(text, llm:, percent_length: nil, max_length: nil)
target = max_length if target > max_length
end

llm.tokenizer.truncate(text, target)
llm.tokenizer.truncate(text, target, strict: SiteSetting.ai_strict_token_counting)
end

def accepted_options
6 changes: 5 additions & 1 deletion lib/sentiment/post_classification.rb
@@ -161,7 +161,11 @@ def prepare_text(target)
target.raw
end

Tokenizer::BertTokenizer.truncate(content, 512)
DiscourseAi::Tokenizer::BertTokenizer.truncate(
content,
512,
strict: SiteSetting.ai_strict_token_counting,
)
end

def request_with(client, content)
18 changes: 15 additions & 3 deletions lib/summarization/fold_content.rb
@@ -92,7 +92,11 @@ def fold(items, user, &on_partial_blk)
items.each_with_index do |item, idx|
as_text = "(#{item[:id]} #{item[:poster]} said: #{item[:text]} "

if tokenizer.below_limit?(as_text, tokens_left)
if tokenizer.below_limit?(
as_text,
tokens_left,
strict: SiteSetting.ai_strict_token_counting,
)
content_in_window << item
tokens_left -= tokenizer.size(as_text)
else
@@ -151,8 +155,16 @@ def truncate(item)
tokenizer = llm_model.tokenizer_class

item[:text] = [
tokenizer.truncate(split_1, truncation_length),
tokenizer.truncate(split_2.reverse, truncation_length).reverse,
tokenizer.truncate(
split_1,
truncation_length,
strict: SiteSetting.ai_strict_token_counting,
),
tokenizer.truncate(
split_2.reverse,
truncation_length,
strict: SiteSetting.ai_strict_token_counting,
).reverse,
].join(" ")

item
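The truncate(item) change above keeps the head of an item's first half and the tail of its second half, dropping the middle. Pulled out on its own, the pattern looks roughly like the sketch below; the tokenizer choice and the fixed budget are illustrative, not the summarizer's actual values.

# Sketch of the head-and-tail truncation used in fold_content.rb above:
# keep the first `budget` tokens of the front half and the last `budget`
# tokens of the back half.
tokenizer = DiscourseAi::Tokenizer::OpenAiTokenizer
budget = 100
strict = SiteSetting.ai_strict_token_counting

half = text.length / 2
head, tail = text[0...half], text[half..]

truncated =
  [
    tokenizer.truncate(head, budget, strict: strict),
    tokenizer.truncate(tail.reverse, budget, strict: strict).reverse,
  ].join(" ")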
12 changes: 0 additions & 12 deletions lib/tokenizer/all_mpnet_base_v2_tokenizer.rb

This file was deleted.

12 changes: 0 additions & 12 deletions lib/tokenizer/anthropic_tokenizer.rb

This file was deleted.

55 changes: 0 additions & 55 deletions lib/tokenizer/basic_tokenizer.rb

This file was deleted.

12 changes: 0 additions & 12 deletions lib/tokenizer/bert_tokenizer.rb

This file was deleted.

11 changes: 0 additions & 11 deletions lib/tokenizer/bge_large_en_tokenizer.rb

This file was deleted.

11 changes: 0 additions & 11 deletions lib/tokenizer/bge_m3_tokenizer.rb

This file was deleted.

11 changes: 0 additions & 11 deletions lib/tokenizer/gemini_tokenizer.rb

This file was deleted.

12 changes: 0 additions & 12 deletions lib/tokenizer/llama3_tokenizer.rb

This file was deleted.
