module DiscourseAi
  module Utils
    module Research
      # Formats research-filter results into text batches sized for an LLM's
      # context window.
      class LlmFormatter
        # @param filter [Object] research filter exposing #search, whose result
        #   responds to #pluck(:topic_id, :post_id, :post_number) — project type,
        #   shape assumed from usage; TODO confirm against ResearchFilter
        # @param max_tokens_per_batch [Integer] token budget for each batch
        # @param tokenizer [Object] tokenizer used to measure text in tokens
        def initialize(filter, max_tokens_per_batch:, tokenizer:)
          @filter = filter
          @max_tokens_per_batch = max_tokens_per_batch
          @tokenizer = tokenizer
          # Pre-compute the full work list up front so batching is a pure walk.
          @to_process = filter_to_hash
        end

        # NOTE(review): unimplemented stub — currently returns nil. The author's
        # intended contract is preserved below.
        def next_batch
          # return text that is easily consumed by the LLM containing:
          # - topic title, tags, category and creation date
          # - info about previous posts in the topic that are omitted (mostly count eg: 7 posts omitted)
          # - raw post content
          # - author name
          # - date
          # - info about future posts in the topic that are omitted
          #
          # always attempt to return entire topics (or multiple) if possible
          # return nil if we are done
          #
          # example_return:
          # { post_count: 12, topic_count: 3, text: "..." }
        end

        private

        # Groups the filter's search results by topic:
        #   { topic_id => { posts: [[post_id, post_number], ...] } }
        # with each topic's posts sorted ascending by post_number.
        def filter_to_hash
          hash = {}
          # Bug fix: was a bare `filter` call, but this class defines no reader
          # (the old attr_reader was removed) — only @filter exists, so the bare
          # call would raise NoMethodError at runtime.
          @filter
            .search
            .pluck(:topic_id, :post_id, :post_number)
            .each do |topic_id, post_id, post_number|
              hash[topic_id] ||= { posts: [] }
              hash[topic_id][:posts] << [post_id, post_number]
            end

          hash.each_value { |topic| topic[:posts].sort_by! { |_, post_number| post_number } }
          hash
        end
      end
    end
  end
end