
Commit 3f0284b

support long post truncation
1 parent 988479b commit 3f0284b

6 files changed: +165 −22 lines changed

config/locales/server.en.yml

Lines changed: 3 additions & 0 deletions

@@ -337,6 +337,9 @@ en:
       include_private:
         name: "Include private"
         description: "Include private topics in the filters"
+      max_tokens_per_post:
+        name: "Maximum tokens per post"
+        description: "Maximum number of tokens to use for each post in the filter"
       create_artifact:
         creator_llm:
           name: "LLM"

lib/personas/forum_researcher.rb

Lines changed: 21 additions & 5 deletions

@@ -22,13 +22,29 @@ def system_prompt
         The participants in this conversation are: {participants}
         The date now is: {time}, much has changed since you were trained.

-        As a forum researcher, you will help users come up with the correct research criteria to
-        properly analyze the forum data.
+        As a forum researcher, guide users through a structured research process:
+        1. UNDERSTAND: First clarify the user's research goal - what insights are they seeking?
+        2. PLAN: Design an appropriate research approach with specific filters
+        3. TEST: Always begin with dry_run:true to gauge the scope of results
+        4. REFINE: If results are too broad/narrow, suggest filter adjustments
+        5. EXECUTE: Run the final analysis only when filters are well-tuned
+        6. SUMMARIZE: Present findings with links to supporting evidence

-        BE MINDFUL: when running the research tool, specify all the goals you want to achieve in one go, avoid running research multiple times in one turn.
+        BE MINDFUL: specify all research goals in one request to avoid multiple processing runs.

-        When creating reports ALWAYS bias grounding information you provide with links to original posts on the forum.
-        You will always start with a dry_run of the proposed research criteria.
+        REMEMBER: Different filters serve different purposes:
+        - Use post date filters (after/before) for analyzing specific posts
+        - Use topic date filters (topic_after/topic_before) for analyzing entire topics
+        - Combine user/group filters with categories/tags to find specialized contributions
+
+        Always ground your analysis with links to original posts on the forum.
+
+        Research workflow best practices:
+        1. Start with a dry_run to gauge the scope (set dry_run:true)
+        2. If results are too numerous (>1000), add more specific filters
+        3. If results are too few (<5), broaden your filters
+        4. For temporal analysis, specify explicit date ranges
+        5. For user behavior analysis, combine @username with categories or tags
       PROMPT
     end
   end
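
The filter distinctions this prompt calls out map onto concrete filter strings. A few illustrative examples (the usernames, tags, and dates below are invented for the sketch):

```ruby
# Post-level date window: individual posts written in Q1 2025
"after:2025-01-01 before:2025-04-01"

# Topic-level date window: entire topics created during 2024
"topic_after:2024-01-01 topic_before:2025-01-01"

# User behavior within a specialty: combine @username with category/tag
"@sam category:bugs tag:ai"
```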

lib/personas/tools/researcher.rb

Lines changed: 31 additions & 15 deletions

@@ -13,12 +13,7 @@ def signature
           description:
             "Analyze and extract information from content across the forum based on specified filters",
           parameters: [
-            {
-              name: "filter",
-              description:
-                "Filter string to target specific content. Supports user (@username), date ranges (after:YYYY-MM-DD, before:YYYY-MM-DD), categories (category:name), tags (tag:name), groups (group:name). Multiple filters can be combined with spaces. Example: '@sam after:2023-01-01 tag:feature'",
-              type: "string",
-            },
+            { name: "filter", description: filter_description, type: "string" },
             {
               name: "goals",
               description:

@@ -34,23 +29,35 @@ def signature
           }
         end

-        def name
-          "researcher"
-        end
-
-        def custom_system_message
+        def filter_description
           <<~TEXT
-            Use the researcher tool to analyze patterns and extract insights from forum content.
-            For complex research tasks, start with a dry run to gauge the scope before processing.
+            Filter string to target specific content.
+            - Supports user (@username)
+            - date ranges (after:YYYY-MM-DD, before:YYYY-MM-DD for posts; topic_after:YYYY-MM-DD, topic_before:YYYY-MM-DD for topics)
+            - categories (category:name)
+            - tags (tag:name)
+            - groups (group:name).
+
+            Multiple filters can be combined with spaces. Example: '@sam after:2023-01-01 tag:feature'
           TEXT
         end

+        def name
+          "researcher"
+        end
+
         def accepted_options
-          [option(:max_results, type: :integer), option(:include_private, type: :boolean)]
+          [
+            option(:max_results, type: :integer),
+            option(:include_private, type: :boolean),
+            option(:max_tokens_per_post, type: :integer),
+          ]
         end
       end

       def invoke(&blk)
+        max_results = options[:max_results] || 1000
+
         @filter = parameters[:filter] || ""
         @goals = parameters[:goals] || ""
         @dry_run = parameters[:dry_run].nil? ? false : parameters[:dry_run]

@@ -62,7 +69,15 @@ def invoke(&blk)
         return { error: "No goals provided" } if goals.blank?
         return { error: "No filter provided" } if @filter.blank?

-        filter = DiscourseAi::Utils::Research::Filter.new(@filter)
+        guardian = nil
+        guardian = Guardian.new(context.user) if options[:include_private]
+
+        filter =
+          DiscourseAi::Utils::Research::Filter.new(
+            @filter,
+            limit: max_results,
+            guardian: guardian,
+          )
         @result_count = filter.search.count

         blk.call details

@@ -99,6 +114,7 @@ def process_filter(filter, goals, post, &blk)
           filter,
           max_tokens_per_batch: llm.max_prompt_tokens - 2000,
           tokenizer: llm.tokenizer,
+          max_tokens_per_post: options[:max_tokens_per_post] || 2000,
         )

         results = []
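
Taken together, these changes wire persona-level tool options into the search itself. A minimal sketch of the effective defaults, restating the plumbing shown in this diff:

```ruby
# Defaults applied when a persona leaves the options unset:
max_results = options[:max_results] || 1000                  # result cap, passed to Filter as limit:
max_tokens_per_post = options[:max_tokens_per_post] || 2000  # per-post budget, passed to LlmFormatter

# Private content requires an explicit opt-in; the Guardian carries the
# invoking user's permissions into the search, otherwise it stays nil:
guardian = options[:include_private] ? Guardian.new(context.user) : nil
```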

lib/utils/research/filter.rb

Lines changed: 16 additions & 0 deletions

@@ -57,6 +57,22 @@ def self.word_to_date(str)
         end
       end

+      register_filter(/\Atopic_before:(.*)\z/i) do |relation, date_str, _|
+        if date = Filter.word_to_date(date_str)
+          relation.where("topics.created_at < ?", date)
+        else
+          relation
+        end
+      end
+
+      register_filter(/\Atopic_after:(.*)\z/i) do |relation, date_str, _|
+        if date = Filter.word_to_date(date_str)
+          relation.where("topics.created_at > ?", date)
+        else
+          relation
+        end
+      end
+
       # Category filter
       register_filter(/\Acategory:([a-zA-Z0-9_\-]+)\z/i) do |relation, slug, _|
         category = Category.find_by("LOWER(slug) = LOWER(?)", slug)
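
A quick sketch of what the new registrations do, assuming the Filter usage shown in researcher.rb above (the date and tag values are invented):

```ruby
# topic_after/topic_before scope by the *topic's* creation date, unlike
# after/before, which scope by the post's own date:
filter = DiscourseAi::Utils::Research::Filter.new("topic_after:2024-01-01 tag:feature")

# Internally, topic_after:2024-01-01 contributes roughly:
#   relation.where("topics.created_at > ?", date)
filter.search.count # => number of matching posts, as used by the researcher tool
```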

lib/utils/research/llm_formatter.rb

Lines changed: 20 additions & 2 deletions

@@ -4,10 +4,11 @@ module DiscourseAi
   module Utils
     module Research
       class LlmFormatter
-        def initialize(filter, max_tokens_per_batch:, tokenizer:)
+        def initialize(filter, max_tokens_per_batch:, tokenizer:, max_tokens_per_post:)
           @filter = filter
           @max_tokens_per_batch = max_tokens_per_batch
           @tokenizer = tokenizer
+          @max_tokens_per_post = max_tokens_per_post
           @to_process = filter_to_hash
         end

@@ -160,12 +161,29 @@ def format_topic_status(topic)
        def format_post(post)
          text = +"---\n"
          text << "## Post by #{post.user&.username} - #{format_date(post.created_at)}\n\n"
-          text << "#{post.raw}\n"
+          text << "#{truncate_if_needed(post.raw)}\n"
          text << "Likes: #{post.like_count}\n" if post.like_count.to_i > 0
          text << "Post url: /t/-/#{post.topic_id}/#{post.post_number}\n\n"
          text
        end

+        def truncate_if_needed(content)
+          tokens_count = estimate_tokens(content)
+
+          return content if tokens_count <= @max_tokens_per_post
+
+          half_limit = @max_tokens_per_post / 2
+          token_ids = @tokenizer.encode(content)
+
+          first_half_ids = token_ids[0...half_limit]
+          last_half_ids = token_ids[-half_limit..-1]
+
+          first_text = @tokenizer.decode(first_half_ids)
+          last_text = @tokenizer.decode(last_half_ids)
+
+          "#{first_text}\n\n... elided #{tokens_count - @max_tokens_per_post} tokens ...\n\n#{last_text}"
+        end
+
        def format_omitted_posts(count, position)
          if position == "before"
            "#{count} earlier #{count == 1 ? "post" : "posts"} omitted\n\n"
Lines changed: 74 additions & 0 deletions

@@ -0,0 +1,74 @@
+# frozen_string_literal: true
+#
+describe DiscourseAi::Utils::Research::LlmFormatter do
+  fab!(:user) { Fabricate(:user, username: "test_user") }
+  fab!(:topic) { Fabricate(:topic, title: "This is a Test Topic", user: user) }
+  fab!(:post) { Fabricate(:post, topic: topic, user: user) }
+  let(:tokenizer) { DiscourseAi::Tokenizer::OpenAiTokenizer }
+  let(:filter) { DiscourseAi::Utils::Research::Filter.new("@#{user.username}") }
+
+  describe "#truncate_if_needed" do
+    it "returns original content when under token limit" do
+      formatter =
+        described_class.new(
+          filter,
+          max_tokens_per_batch: 1000,
+          tokenizer: tokenizer,
+          max_tokens_per_post: 100,
+        )
+
+      short_text = "This is a short post"
+      expect(formatter.send(:truncate_if_needed, short_text)).to eq(short_text)
+    end
+
+    it "truncates content when over token limit" do
+      # Create a post with content that will exceed our token limit
+      long_text = ("word " * 200).strip
+
+      formatter =
+        described_class.new(
+          filter,
+          max_tokens_per_batch: 1000,
+          tokenizer: tokenizer,
+          max_tokens_per_post: 50,
+        )
+
+      truncated = formatter.send(:truncate_if_needed, long_text)
+
+      expect(truncated).to include("... elided 150 tokens ...")
+      expect(truncated).to_not eq(long_text)
+
+      # Should have roughly 25 words before and 25 after (half of max_tokens_per_post)
+      first_chunk = truncated.split("\n\n")[0]
+      expect(first_chunk.split(" ").length).to be_within(5).of(25)
+
+      last_chunk = truncated.split("\n\n")[2]
+      expect(last_chunk.split(" ").length).to be_within(5).of(25)
+    end
+  end
+
+  describe "#format_post" do
+    it "formats posts with truncation for long content" do
+      # Set up a post with long content
+      long_content = ("word " * 200).strip
+      long_post = Fabricate(:post, raw: long_content, topic: topic, user: user)
+
+      formatter =
+        described_class.new(
+          filter,
+          max_tokens_per_batch: 1000,
+          tokenizer: tokenizer,
+          max_tokens_per_post: 50,
+        )
+
+      formatted = formatter.send(:format_post, long_post)
+
+      # Should have standard formatting elements
+      expect(formatted).to include("## Post by #{user.username}")
+      expect(formatted).to include("Post url: /t/-/#{long_post.topic_id}/#{long_post.post_number}")
+
+      # Should include truncation marker
+      expect(formatted).to include("... elided 150 tokens ...")
+    end
+  end
+end
