@@ -4,145 +4,43 @@ module DiscourseAi
44 module Utils
55 module Research
# Groups the posts matched by a research filter by topic, in preparation for
# handing them to an LLM in batches.
#
# The grouping is computed once, at construction time, and kept in
# @to_process as:
#
#   { topic_id => { posts: [[post_id, post_number], ...] } }
#
# with each topic's posts ordered by ascending post_number.
class LlmFormatter
  # @param filter [#search] object whose `search` result responds to
  #   `pluck(:topic_id, :post_id, :post_number)`
  # @param max_tokens_per_batch stored for later use by batching; presumably
  #   an Integer token budget per batch (not read anywhere in this class yet)
  # @param tokenizer stored for later use by batching (not read anywhere in
  #   this class yet)
  def initialize(filter, max_tokens_per_batch:, tokenizer:)
    @filter = filter
    @max_tokens_per_batch = max_tokens_per_batch
    @tokenizer = tokenizer
    @to_process = filter_to_hash
  end

  # Not implemented yet: currently always returns nil.
  #
  # TODO: intended contract, once implemented.
  #
  # Return text that is easily consumed by the LLM, containing:
  # - topic title, tags, category and creation date
  # - info about previous posts in the topic that are omitted
  #   (mostly a count, e.g. "7 posts omitted")
  # - raw post content
  # - author name
  # - date
  # - info about future posts in the topic that are omitted
  #
  # Always attempt to return entire topics (or multiple topics) if possible.
  # Return nil when there is nothing left to process.
  #
  # Example return value:
  #   { post_count: 12, topic_count: 3, text: "..." }
  def next_batch
    nil
  end

  private

  # Builds the topic_id => { posts: [[post_id, post_number], ...] } lookup
  # from the filter's search results.
  #
  # Reads @filter directly: this class defines no `filter` reader, so a bare
  # `filter` call here would raise NameError.
  #
  # NOTE(review): assumes the relation returned by `search` exposes
  # topic_id, post_id and post_number columns - confirm against the filter.
  #
  # @return [Hash] posts grouped by topic, each group sorted by post_number
  def filter_to_hash
    rows = @filter.search.pluck(:topic_id, :post_id, :post_number)

    grouped =
      rows.each_with_object({}) do |(topic_id, post_id, post_number), topics|
        topics[topic_id] ||= { posts: [] }
        topics[topic_id][:posts] << [post_id, post_number]
      end

    grouped.each_value { |topic| topic[:posts].sort_by! { |_, post_number| post_number } }
    grouped
  end
end
14846 end
0 commit comments