Skip to content

Commit a10d1e9

Browse files
committed
working on llm formatter
1 parent 5dd52da commit a10d1e9

File tree

1 file changed

+32
-134
lines changed

1 file changed

+32
-134
lines changed

lib/utils/research/llm_formatter.rb

+32-134
Original file line numberDiff line numberDiff line change
@@ -4,145 +4,43 @@ module DiscourseAi
44
module Utils
55
module Research
66
class LlmFormatter
7-
attr_reader :processed_count, :total_count, :filter
8-
9-
def initialize(filter, goal, llm, max_tokens = nil)
7+
def initialize(filter, max_tokens_per_batch:, tokenizer:)
108
@filter = filter
11-
@goal = goal
12-
@llm = llm
13-
@max_tokens = max_tokens || calculate_default_max_tokens(llm)
14-
@processed_count = 0
15-
@total_count = 0
16-
end
17-
18-
def format_and_yield(results, &block)
19-
@total_count = results[:total] if results[:total]
20-
21-
if results[:rows].nil? || results[:rows].empty?
22-
yield format_empty_result
23-
return
24-
end
25-
26-
# For summarization or analysis goals
27-
if analysis_goal?
28-
yield format_analysis_result(results[:rows])
29-
return
30-
end
31-
32-
# For standard listing with potential chunking
33-
formatted_results = format_standard_result(results[:rows])
34-
@processed_count += results[:rows].length
35-
36-
if block_given?
37-
yield formatted_results
38-
else
39-
formatted_results
40-
end
41-
end
42-
43-
def format_progress
44-
{
45-
processed: @processed_count,
46-
total: @total_count,
47-
filter: @filter.raw_filter,
48-
goal: @goal,
49-
percent_complete: @total_count > 0 ? (@processed_count.to_f / @total_count * 100).round(1) : 0
50-
}
9+
@max_tokens_per_batch = max_tokens_per_batch
10+
@tokenizer = tokenizer
11+
@to_process = filter_to_hash
12+
end
13+
14+
def next_batch
15+
# return text that is easily consumed by the LLM containing:
16+
# - topic title, tags, category and creation date
17+
# - info about previous posts in the topic that are omitted (mostly count eg: 7 posts omitted)
18+
# - raw post content
19+
# - author name
20+
# - date
21+
# - info about future posts in the topic that are omitted
22+
#
23+
# always attempt to return entire topics (or multiple) if possible
24+
# return nil if we are done
25+
#
26+
# example_return:
27+
# { post_count: 12, topic_count: 3, text: "..." }
5128
end
5229

5330
private
5431

55-
def analysis_goal?
56-
@goal.to_s.downcase.match?(/(summarize|analyze|extract|identify pattern|trend|insight)/)
57-
end
58-
59-
def format_empty_result
60-
{
61-
message: "No results found for the given filter criteria",
62-
filter: @filter.raw_filter,
63-
goal: @goal
64-
}
65-
end
66-
67-
def format_standard_result(rows)
68-
formatted_rows = rows.map do |row|
69-
{
70-
title: row[:title],
71-
excerpt: truncate_text(row[:excerpt] || ""),
72-
url: row[:url],
73-
author: row[:username],
74-
date: row[:created_at],
75-
likes: row[:like_count],
76-
replies: row[:reply_count]
77-
}
78-
end
79-
80-
{
81-
goal: @goal,
82-
filter: @filter.raw_filter,
83-
count: formatted_rows.length,
84-
total: @total_count,
85-
offset: @filter.current_offset - formatted_rows.length,
86-
rows: formatted_rows
87-
}
88-
end
89-
90-
def format_analysis_result(rows)
91-
# Group by relevant attributes based on goal
92-
data_points = extract_data_points(rows)
93-
94-
{
95-
goal: @goal,
96-
filter: @filter.raw_filter,
97-
count: rows.length,
98-
total: @total_count,
99-
analysis: {
100-
sample_size: rows.length,
101-
data_points: data_points,
102-
time_range: extract_time_range(rows)
103-
}
104-
}
105-
end
106-
107-
def extract_data_points(rows)
108-
# This would extract relevant data based on the goal
109-
# Simplified implementation for now
110-
{
111-
authors: rows.map { |r| r[:username] }.uniq.count,
112-
categories: rows.map { |r| r[:category_name] }.uniq.count,
113-
earliest_post: rows.map { |r| r[:created_at] }.min,
114-
latest_post: rows.map { |r| r[:created_at] }.max,
115-
avg_likes: (rows.sum { |r| r[:like_count].to_i } / [rows.length, 1].max.to_f).round(1)
116-
}
117-
end
118-
119-
def extract_time_range(rows)
120-
dates = rows.map { |r| r[:created_at] }.compact
121-
return nil if dates.empty?
122-
123-
{
124-
earliest: dates.min,
125-
latest: dates.max,
126-
span_days: ((dates.max - dates.min) / 86400).to_i rescue nil
127-
}
128-
end
129-
130-
def truncate_text(text, max_length = 300)
131-
return text if text.length <= max_length
132-
text[0...max_length] + "..."
133-
end
134-
135-
def calculate_default_max_tokens(llm)
136-
# Use a percentage of available tokens for results
137-
max_prompt_tokens = llm.max_prompt_tokens
138-
139-
if max_prompt_tokens > 30_000
140-
max_prompt_tokens * 0.7
141-
elsif max_prompt_tokens > 10_000
142-
max_prompt_tokens * 0.6
143-
else
144-
max_prompt_tokens * 0.5
145-
end.to_i
32+
# Builds the work list for batching: every post matched by the filter,
# grouped by topic and ordered by position within the topic.
#
# Reads the @filter instance variable directly. This commit removed the
# `attr_reader :filter`, so calling a bare `filter` here would raise
# NameError as soon as #initialize invokes this method.
#
# NOTE(review): assumes `@filter.search` returns a relation-like object whose
# `pluck` yields [topic_id, post_id, post_number] triples — confirm the
# column names against the filter's query (a posts relation would normally
# expose `:id`, not `:post_id`).
#
# @return [Hash{Object => Hash}] topic_id => { posts: [[post_id, post_number], ...] },
#   with each topic's posts sorted ascending by post_number
def filter_to_hash
  hash = {}

  @filter
    .search
    .pluck(:topic_id, :post_id, :post_number)
    .each do |topic_id, post_id, post_number|
      hash[topic_id] ||= { posts: [] }
      hash[topic_id][:posts] << [post_id, post_number]
    end

  # Rows arrive in whatever order the query produced; sort in place so each
  # topic's posts are in post_number order.
  hash.each_value { |topic| topic[:posts].sort_by! { |_, post_number| post_number } }
  hash
end
14745
end
14846
end

0 commit comments

Comments
 (0)