Skip to content

Commit 22d4624

Browse files
committed
Implement more features
1 parent 84e9286 commit 22d4624

File tree

4 files changed

+253
-25
lines changed

4 files changed

+253
-25
lines changed

lib/personas/tools/researcher.rb

+71-7
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,7 @@ def signature
2727
},
2828
{
2929
name: "dry_run",
30-
description:
31-
"When true, only count matching items without processing data (default: true)",
30+
description: "When true, only count matching items without processing data",
3231
type: "boolean",
3332
},
3433
],
@@ -51,21 +50,86 @@ def accepted_options
5150
end
5251
end
5352

# Entry point for the researcher tool.
#
# Reads :filter, :goal and :dry_run from the tool parameters. In dry-run
# mode only the matching-result count is returned; otherwise the filtered
# content is batched and sent through the LLM via process_filter.
# An optional block receives progress ticks.
def invoke(&blk)
  @last_filter = parameters[:filter] || ""
  goal = parameters[:goal] || ""
  post = Post.find_by(id: context.post_id)

  # A missing dry_run parameter means "do the full run".
  dry_run = parameters[:dry_run].nil? ? false : parameters[:dry_run]

  filter = DiscourseAi::Utils::Research::Filter.new(@last_filter)
  @result_count = filter.search.count

  if dry_run
    return { dry_run: true, goal: goal, filter: @last_filter, number_of_results: @result_count }
  end

  process_filter(filter, goal, post, &blk)
end
6669

6770
protected
6871

# Smallest context window that makes batched research worthwhile.
MIN_TOKENS_FOR_RESEARCH = 8000

# Streams the filtered content to the LLM in token-bounded batches and
# collects one inference result per batch.
#
# Raises ArgumentError when the configured LLM's context window is below
# MIN_TOKENS_FOR_RESEARCH.
def process_filter(filter, goal, post, &blk)
  if llm.max_prompt_tokens < MIN_TOKENS_FOR_RESEARCH
    raise ArgumentError,
          "LLM max tokens too low for research. Minimum is #{MIN_TOKENS_FOR_RESEARCH}."
  end

  # Reserve headroom (2000 tokens) for the prompt wrapper around each batch.
  formatter =
    DiscourseAi::Utils::Research::LlmFormatter.new(
      filter,
      max_tokens_per_batch: llm.max_prompt_tokens - 2000,
      tokenizer: llm.tokenizer,
    )

  batch_results = []
  formatter.each_chunk { |chunk| batch_results << run_inference(chunk[:text], goal, post, &blk) }

  { dry_run: false, goal: goal, filter: @last_filter, results: batch_results }
end
90+
91+
# Runs one LLM completion over a single batch of formatted content and
# returns the joined streamed output.
#
# Fix: the progress callback is optional — invoke may be called without a
# block (the dry-run path never uses it), so calling blk.call here raised
# NoMethodError on nil. Use safe navigation instead.
def run_inference(chunk_text, goal, post, &blk)
  system_prompt = goal_system_prompt(goal)
  user_prompt = goal_user_prompt(goal, chunk_text)

  prompt =
    DiscourseAi::Completions::Prompt.new(
      system_prompt,
      messages: [{ type: :user, content: user_prompt }],
      post_id: post.id,
      topic_id: post.topic_id,
    )

  results = []
  llm.generate(prompt, user: post.user, feature_name: context.feature_name) do |partial|
    results << partial
  end

  # Emit a minimal progress tick per processed batch.
  blk&.call(".")

  results.join
end
111+
112+
# System prompt framing the LLM as an extraction tool for the given goal.
def goal_system_prompt(goal)
  <<~TEXT
    You are a researcher tool designed to analyze and extract information from forum content.
    Your task is to process the provided content and extract relevant information based on the specified goal.

    Your goal is: #{goal}
  TEXT
end
120+
121+
# User prompt wrapping one content batch in {{{ }}} fences, restating the goal.
def goal_user_prompt(goal, chunk_text)
  <<~TEXT
    Here is the content to analyze:

    {{{
    #{chunk_text}
    }}}

    Your goal is: #{goal}
  TEXT
end
132+
69133
# Interpolation arguments for the tool's localized description.
def description_args
  count = @result_count || 0
  filter = @last_filter || ""
  { count: count, filter: filter }
end

lib/personas/tools/tool.rb

+3-2
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,9 @@ def inject_prompt(prompt:, context:, persona:)
4747
end
4848
end
4949

# NOTE: llm is intentionally writable (accessor, not reader) so tests can
# swap in a stub LLM on an already-constructed tool.
attr_accessor :custom_raw, :parameters, :llm
attr_reader :tool_call_id, :persona_options, :bot_user, :context
5253

5354
def initialize(
5455
parameters,

lib/utils/research/llm_formatter.rb

+142-16
Original file line numberDiff line numberDiff line change
@@ -11,29 +11,71 @@ def initialize(filter, max_tokens_per_batch:, tokenizer:)
1111
@to_process = filter_to_hash
1212
end
1313

# Yields batches of LLM-ready text built from the filtered posts, each
# batch staying within @max_tokens_per_batch. Every yielded hash has the
# shape { text:, post_count:, topic_count: }.
#
# Whole topics are kept together when possible; a topic that is larger
# than an entire batch on its own is split into line-aligned slices.
# Returns nil immediately when there is nothing to process, and clears
# the work queue once everything has been yielded.
#
# Fix: previously, when a topic did not fit into the current batch, the
# batch was flushed and reset but the triggering topic itself was dropped
# (the add-to-result branch never ran for it). Now the flush and the add
# are sequential, so the overflowing topic always seeds the next batch.
def each_chunk
  return nil if @to_process.empty?

  result = { post_count: 0, topic_count: 0, text: +"" }
  estimated_tokens = 0

  @to_process.each do |topic_id, topic_data|
    topic = Topic.find_by(id: topic_id)
    next unless topic

    topic_text, topic_tokens, post_count = format_topic(topic, topic_data[:posts])

    # A topic too large for any single batch: emit it by itself as
    # line-aligned slices, each within the token budget.
    if estimated_tokens == 0 && topic_tokens > @max_tokens_per_batch
      offset = 0
      while offset < topic_text.length
        chunk = +""
        chunk_tokens = 0
        topic_text[offset..].lines.each do |line|
          line_tokens = estimate_tokens(line)
          break if chunk_tokens + line_tokens > @max_tokens_per_batch
          chunk << line
          chunk_tokens += line_tokens
        end
        break if chunk.empty?
        yield(
          {
            text: chunk,
            post_count: post_count, # may overcount when split mid-topic
            topic_count: 1,
          }
        )
        offset += chunk.length
      end

      next
    end

    # Flush the current batch if this topic would push it over the limit.
    if estimated_tokens > 0 && estimated_tokens + topic_tokens > @max_tokens_per_batch
      yield result if result[:text].present?
      result = { post_count: 0, topic_count: 0, text: +"" }
      estimated_tokens = 0
    end

    # Add the topic to the (possibly fresh) batch — including the topic
    # that triggered the flush above.
    result[:text] << topic_text
    result[:post_count] += post_count
    result[:topic_count] += 1
    estimated_tokens += topic_tokens
  end

  yield result if result[:text].present?

  @to_process.clear
end
2971

3072
private
3173

3274
# Builds { topic_id => { posts: [[post_id, post_number], ...] } } from the
# filter's search results, with each topic's posts sorted by post_number.
def filter_to_hash
  grouped =
    @filter
      .search
      .pluck(:topic_id, :id, :post_number)
      .each_with_object({}) do |(topic_id, post_id, post_number), acc|
        acc[topic_id] ||= { posts: [] }
        acc[topic_id][:posts] << [post_id, post_number]
      end

  grouped.each_value { |topic| topic[:posts].sort_by! { |_, post_number| post_number } }
  grouped
end
87+
88+
# Formats one topic (header, selected posts, omitted-post notes) into a
# single text fragment. Returns [text, total_tokens, post_count].
#
# Fix: each fragment was previously built twice — once appended to the
# text and once again just to estimate its tokens (format_post /
# format_omitted_posts called twice per fragment). Each fragment is now
# rendered once. String building uses a mutable buffer with << instead of
# `+=` reallocation.
def format_topic(topic, posts_data)
  text = +""
  total_tokens = 0
  post_count = 0

  # Append a fragment and account for its tokens exactly once.
  append = ->(fragment) do
    text << fragment
    total_tokens += estimate_tokens(fragment)
  end

  append.call(format_topic_header(topic))

  # All post numbers present in the topic, used to detect trailing omissions.
  all_post_numbers = topic.posts.pluck(:post_number).sort

  # posts_data is [[post_id, post_number], ...] sorted by post_number.
  first_post_number = posts_data.first[1]
  last_post_number = posts_data.last[1]

  # Note posts omitted before our selection.
  append.call(format_omitted_posts(first_post_number - 1, "before")) if first_post_number > 1

  posts_data.each do |post_id, _post_number|
    post = Post.find_by(id: post_id)
    next unless post

    append.call(format_post(post))
    post_count += 1
  end

  # Note posts omitted after our selection.
  if last_post_number < all_post_numbers.last
    append.call(format_omitted_posts(all_post_numbers.last - last_post_number, "after"))
  end

  [text, total_tokens, post_count]
end
130+
131+
# Markdown header for a topic: title, optional category and tags,
# creation date, and a relative topic url.
def format_topic_header(topic)
  parts = [+"# #{topic.title}\n"]
  parts << "Category: #{topic.category.name}\n" if topic.category
  parts << "Tags: #{topic.tags.map(&:name).join(", ")}\n" if topic.tags.present?
  parts << "Created: #{format_date(topic.created_at)}\n"
  parts << "Topic url: /t/#{topic.id}\n\n"
  parts.join
end
146+
147+
# One post rendered as a markdown section: author, date, raw body,
# optional like count, and a relative post url.
def format_post(post)
  parts = [+"---\n"]
  parts << "## Post by #{post.user.username} - #{format_date(post.created_at)}\n\n"
  parts << "#{post.raw}\n"
  parts << "Likes: #{post.like_count}\n" if post.like_count.to_i > 0
  parts << "Post url: /t/-/#{post.topic_id}/#{post.post_number}\n\n"
  parts.join
end
155+
156+
# Human-readable note about posts omitted before ("before") or after
# (any other position) the selected range.
def format_omitted_posts(count, position)
  noun = count == 1 ? "post" : "posts"
  where = position == "before" ? "earlier" : "later"
  "#{count} #{where} #{noun} omitted\n\n"
end
163+
164+
# Renders a timestamp as "YYYY-MM-DD HH:MM".
def format_date(date)
  date.strftime("%Y-%m-%d %H:%M")
end
167+
168+
# Token count for a text fragment, per the configured tokenizer.
def estimate_tokens(text)
  tokens = @tokenizer.tokenize(text)
  tokens.size
end
45171
end
46172
end
47173
end

spec/lib/personas/tools/researcher_spec.rb

+37
Original file line numberDiff line numberDiff line change
@@ -64,5 +64,42 @@
6464

6565
expect(researcher.options[:max_results]).to eq(50)
6666
end
67+
68+
it "returns correct results for non-dry-run with filtered posts" do
  # Set up 2 topics, each with 2 posts (one by user, one by admin).
  # Fix: the fixture loop used flat_map and discarded its return value;
  # each expresses the side-effect-only intent.
  topics = Array.new(2) { Fabricate(:topic, category: category, tags: [tag_research]) }
  topics.each do |topic|
    Fabricate(:post, topic: topic, raw: "Relevant content 1", user: user)
    Fabricate(:post, topic: topic, raw: "Relevant content 2", user: admin)
  end

  # Filter to posts by user in research-category
  researcher =
    described_class.new(
      {
        filter: "category:research-category @#{user.username}",
        goal: "find relevant content",
        dry_run: false,
      },
      bot_user: bot_user,
      llm: llm,
      context: DiscourseAi::Personas::BotContext.new(user: user, post: post),
    )

  # More canned responses than batches, so the LLM stub never runs dry.
  responses = 10.times.map { |i| ["Found: Relevant content #{i + 1}"] }
  results = nil

  DiscourseAi::Completions::Llm.with_prepared_responses(responses) do
    researcher.llm = llm_model.to_llm
    results = researcher.invoke(&progress_blk)
  end

  expect(results[:dry_run]).to eq(false)
  expect(results[:goal]).to eq("find relevant content")
  expect(results[:filter]).to eq("category:research-category @#{user.username}")
  expect(results[:results].first).to include("Found: Relevant content 1")
end
67104
end
68105
end

0 commit comments

Comments
 (0)