Skip to content

Commit 6ed6b52

Browse files
committed
wip researcher
1 parent 2a62658 commit 6ed6b52

File tree

3 files changed

+381
-0
lines changed

3 files changed

+381
-0
lines changed

lib/personas/tools/researcher.rb

+123
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
# frozen_string_literal: true
2+
3+
module DiscourseAi
4+
module Personas
5+
module Tools
6+
class Researcher < Tool
7+
attr_reader :last_filter, :result_count
8+
9+
class << self
10+
def signature
11+
{
12+
name: name,
13+
description:
14+
"Analyze and extract information from content across the forum based on specified filters",
15+
parameters: [
16+
{
17+
name: "filter",
18+
description:
19+
"Filter string to target specific content. Supports user (@username), date ranges (after:YYYY-MM-DD, before:YYYY-MM-DD), categories (category:name), tags (tag:name), groups (group:name). Multiple filters can be combined with spaces. Example: '@sam after:2023-01-01 tag:feature'",
20+
type: "string",
21+
},
22+
{
23+
name: "goal",
24+
description:
25+
"The specific information you want to extract or analyze from the filtered content",
26+
type: "string",
27+
},
28+
{
29+
name: "dry_run",
30+
description:
31+
"When true, only count matching items without processing data (default: true)",
32+
type: "boolean",
33+
},
34+
],
35+
}
36+
end
37+
38+
def name
39+
"researcher"
40+
end
41+
42+
def custom_system_message
43+
<<~TEXT
44+
Use the researcher tool to analyze patterns and extract insights from forum content.
45+
For complex research tasks, start with a dry run to gauge the scope before processing.
46+
TEXT
47+
end
48+
49+
def accepted_options
50+
[option(:max_results, type: :integer), option(:include_private, type: :boolean)]
51+
end
52+
end
53+
54+
def invoke
55+
@last_filter = parameters[:filter] || ""
56+
goal = parameters[:goal] || ""
57+
dry_run = parameters[:dry_run].nil? ? true : parameters[:dry_run]
58+
59+
yield(I18n.t("discourse_ai.ai_bot.researching", filter: @last_filter, goal: goal))
60+
61+
# Parse the filter string to extract components
62+
filter_components = parse_filter(@last_filter)
63+
64+
# Determine max results
65+
max_results = calculate_max_results(llm)
66+
67+
# In a real implementation, we would query the database here
68+
# For now, just simulate the behavior
69+
if dry_run
70+
@result_count = simulate_count(filter_components)
71+
{ count: @result_count, filter: @last_filter, goal: goal, dry_run: true }
72+
else
73+
results = perform_research(filter_components, goal, max_results)
74+
@result_count = results[:rows]&.length || 0
75+
results
76+
end
77+
end
78+
79+
protected
80+
81+
def description_args
82+
{ count: @result_count || 0, filter: @last_filter || "" }
83+
end
84+
85+
private
86+
87+
def parse_filter(filter_string)
88+
# This would parse the filter string into components
89+
# For example, extracting username, date ranges, categories, tags, etc.
90+
# Simplified implementation for now
91+
components = {}
92+
components[:raw] = filter_string
93+
components
94+
end
95+
96+
def simulate_count(filter_components)
97+
# In a real implementation, this would query the database to get a count
98+
# For now, return a simulated count
99+
rand(10..100)
100+
end
101+
102+
def perform_research(filter_components, goal, max_results)
103+
# This would perform the actual research based on the filter and goal
104+
# For now, return a simplified result structure
105+
format_results([], %w[content url author date])
106+
end
107+
108+
def calculate_max_results(llm)
109+
max_results = options[:max_results].to_i
110+
return [max_results, 100].min if max_results > 0
111+
112+
if llm.max_prompt_tokens > 30_000
113+
50
114+
elsif llm.max_prompt_tokens > 10_000
115+
30
116+
else
117+
15
118+
end
119+
end
120+
end
121+
end
122+
end
123+
end

lib/utils/research/filter.rb

+108
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
# frozen_string_literal: true
2+
3+
module DiscourseAi
4+
module Utils
5+
module Research
6+
class Filter
7+
attr_reader :raw_filter, :parsed_components, :current_offset, :batch_size
8+
9+
VALID_FILTER_PATTERNS = {
10+
user: /\@(\w+)/,
11+
before: /before:(\d{4}-\d{2}-\d{2})/,
12+
after: /after:(\d{4}-\d{2}-\d{2})/,
13+
category: /category:([a-zA-Z0-9_\-]+)/,
14+
tag: /tag:([a-zA-Z0-9_\-]+)/,
15+
group: /group:([a-zA-Z0-9_\-]+)/,
16+
status: /status:(open|closed|archived|noreplies|single_user)/,
17+
}
18+
19+
DEFAULT_BATCH_SIZE = 20
20+
21+
def initialize(filter_string, batch_size: DEFAULT_BATCH_SIZE)
22+
@raw_filter = filter_string.to_s
23+
@batch_size = batch_size
24+
@current_offset = 0
25+
@parsed_components = parse_filter
26+
end
27+
28+
def parse_filter
29+
components = {
30+
users: [],
31+
categories: [],
32+
tags: [],
33+
groups: [],
34+
date_range: {
35+
},
36+
status: nil,
37+
raw: @raw_filter,
38+
}
39+
40+
# Extract user mentions
41+
@raw_filter
42+
.scan(VALID_FILTER_PATTERNS[:user])
43+
.each { |match| components[:users] << match[0] }
44+
45+
# Extract date ranges
46+
if before_match = @raw_filter.match(VALID_FILTER_PATTERNS[:before])
47+
components[:date_range][:before] = before_match[1]
48+
end
49+
50+
if after_match = @raw_filter.match(VALID_FILTER_PATTERNS[:after])
51+
components[:date_range][:after] = after_match[1]
52+
end
53+
54+
# Extract categories
55+
@raw_filter
56+
.scan(VALID_FILTER_PATTERNS[:category])
57+
.each { |match| components[:categories] << match[0] }
58+
59+
# Extract tags
60+
@raw_filter
61+
.scan(VALID_FILTER_PATTERNS[:tag])
62+
.each { |match| components[:tags] << match[0] }
63+
64+
# Extract groups
65+
@raw_filter
66+
.scan(VALID_FILTER_PATTERNS[:group])
67+
.each { |match| components[:groups] << match[0] }
68+
69+
# Extract status
70+
if status_match = @raw_filter.match(VALID_FILTER_PATTERNS[:status])
71+
components[:status] = status_match[1]
72+
end
73+
74+
components
75+
end
76+
77+
def next_batch
78+
previous_offset = @current_offset
79+
@current_offset += @batch_size
80+
previous_offset
81+
end
82+
83+
def reset_batch
84+
@current_offset = 0
85+
end
86+
87+
def to_query_params
88+
params = {}
89+
params[:username] = parsed_components[:users].first if parsed_components[:users].any?
90+
params[:before] = parsed_components[:date_range][:before] if parsed_components[
91+
:date_range
92+
][
93+
:before
94+
]
95+
params[:after] = parsed_components[:date_range][:after] if parsed_components[:date_range][
96+
:after
97+
]
98+
params[:category] = parsed_components[:categories].first if parsed_components[
99+
:categories
100+
].any?
101+
params[:tags] = parsed_components[:tags].join(",") if parsed_components[:tags].any?
102+
params[:status] = parsed_components[:status] if parsed_components[:status]
103+
params
104+
end
105+
end
106+
end
107+
end
108+
end

lib/utils/research/llm_formatter.rb

+150
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
# frozen_string_literal: true
2+
3+
module DiscourseAi
4+
module Utils
5+
module Research
6+
# Formats research query results for consumption by an LLM, choosing between
# a plain row listing and an aggregate analysis depending on the stated goal.
class LlmFormatter
  attr_reader :processed_count, :total_count, :filter

  # @param filter [#raw_filter, #current_offset] the research filter
  # @param goal [String] what the caller wants extracted from the results
  # @param llm [#max_prompt_tokens] used to size the output token budget
  # @param max_tokens [Integer, nil] explicit budget; derived from llm when nil
  def initialize(filter, goal, llm, max_tokens = nil)
    @filter = filter
    @goal = goal
    @llm = llm
    @max_tokens = max_tokens || calculate_default_max_tokens(llm)
    @processed_count = 0
    @total_count = 0
  end

  # Formats +results+ ({rows: [...], total: Integer}); yields the formatted
  # hash when a block is given and returns it either way.
  #
  # Bug fix: the original yielded unconditionally on the empty and analysis
  # paths (raising LocalJumpError when no block was passed) while checking
  # block_given? only on the standard path, and never advanced
  # @processed_count on the analysis path (corrupting format_progress).
  # All paths now behave uniformly.
  def format_and_yield(results)
    @total_count = results[:total] if results[:total]

    rows = results[:rows]
    formatted =
      if rows.nil? || rows.empty?
        format_empty_result
      else
        @processed_count += rows.length
        analysis_goal? ? format_analysis_result(rows) : format_standard_result(rows)
      end

    yield formatted if block_given?
    formatted
  end

  # Progress snapshot for status reporting.
  def format_progress
    percent =
      @total_count > 0 ? (@processed_count.to_f / @total_count * 100).round(1) : 0
    {
      processed: @processed_count,
      total: @total_count,
      filter: @filter.raw_filter,
      goal: @goal,
      percent_complete: percent,
    }
  end

  private

  # Heuristic: goals mentioning summarization/analysis keywords get the
  # aggregate treatment instead of a row listing.
  def analysis_goal?
    @goal.to_s.downcase.match?(/(summarize|analyze|extract|identify pattern|trend|insight)/)
  end

  def format_empty_result
    {
      message: "No results found for the given filter criteria",
      filter: @filter.raw_filter,
      goal: @goal,
    }
  end

  # Row listing with truncated excerpts; :offset is where this page began.
  def format_standard_result(rows)
    formatted_rows =
      rows.map do |row|
        {
          title: row[:title],
          excerpt: truncate_text(row[:excerpt] || ""),
          url: row[:url],
          author: row[:username],
          date: row[:created_at],
          likes: row[:like_count],
          replies: row[:reply_count],
        }
      end

    {
      goal: @goal,
      filter: @filter.raw_filter,
      count: formatted_rows.length,
      total: @total_count,
      offset: @filter.current_offset - formatted_rows.length,
      rows: formatted_rows,
    }
  end

  # Aggregate summary used when the goal asks for analysis rather than a list.
  def format_analysis_result(rows)
    {
      goal: @goal,
      filter: @filter.raw_filter,
      count: rows.length,
      total: @total_count,
      analysis: {
        sample_size: rows.length,
        data_points: extract_data_points(rows),
        time_range: extract_time_range(rows),
      },
    }
  end

  # Coarse descriptive statistics over the sample. Simplified for now.
  def extract_data_points(rows)
    {
      authors: rows.map { |r| r[:username] }.uniq.count,
      categories: rows.map { |r| r[:category_name] }.uniq.count,
      earliest_post: rows.map { |r| r[:created_at] }.min,
      latest_post: rows.map { |r| r[:created_at] }.max,
      avg_likes: (rows.sum { |r| r[:like_count].to_i } / [rows.length, 1].max.to_f).round(1),
    }
  end

  # Earliest/latest timestamps plus the span in days. span_days is nil when
  # the timestamps do not support subtraction (e.g. string dates) — replaces
  # the original inline `rescue nil` modifier, which is scoped here to an
  # explicit StandardError rescue around just the arithmetic.
  def extract_time_range(rows)
    dates = rows.map { |r| r[:created_at] }.compact
    return nil if dates.empty?

    span_days =
      begin
        ((dates.max - dates.min) / 86_400).to_i
      rescue StandardError
        nil
      end

    { earliest: dates.min, latest: dates.max, span_days: span_days }
  end

  # Caps excerpt length; appends an ellipsis when truncation occurs.
  def truncate_text(text, max_length = 300)
    return text if text.length <= max_length
    text[0...max_length] + "..."
  end

  # Reserve a fraction of the prompt window for results; larger windows can
  # afford a larger share.
  def calculate_default_max_tokens(llm)
    max_prompt_tokens = llm.max_prompt_tokens

    fraction =
      if max_prompt_tokens > 30_000
        0.7
      elsif max_prompt_tokens > 10_000
        0.6
      else
        0.5
      end

    (max_prompt_tokens * fraction).to_i
  end
end
148+
end
149+
end
150+
end

0 commit comments

Comments
 (0)