module DiscourseAi
  module Utils
    module Research
      # Formats research-filter results into text batches sized for an LLM's
      # context window.
      class LlmFormatter
        # @param filter [Object] research filter exposing #search, whose result
        #   responds to #pluck(:topic_id, :post_id, :post_number) — project type,
        #   shape assumed from usage; TODO confirm against ResearchFilter
        # @param max_tokens_per_batch [Integer] token budget for each batch
        # @param tokenizer [Object] tokenizer used to measure text in tokens
        def initialize(filter, max_tokens_per_batch:, tokenizer:)
          @filter = filter
          @max_tokens_per_batch = max_tokens_per_batch
          @tokenizer = tokenizer
          # Pre-compute the full work list up front so batching is a pure walk.
          @to_process = filter_to_hash
        end

        # NOTE(review): unimplemented stub — currently returns nil. The author's
        # intended contract is preserved below.
        def next_batch
          # return text that is easily consumed by the LLM containing:
          # - topic title, tags, category and creation date
          # - info about previous posts in the topic that are omitted (mostly count eg: 7 posts omitted)
          # - raw post content
          # - author name
          # - date
          # - info about future posts in the topic that are omitted
          #
          # always attempt to return entire topics (or multiple) if possible
          # return nil if we are done
          #
          # example_return:
          # { post_count: 12, topic_count: 3, text: "..." }
        end

        private

        # Groups the filter's search results by topic:
        #   { topic_id => { posts: [[post_id, post_number], ...] } }
        # with each topic's posts sorted ascending by post_number.
        def filter_to_hash
          hash = {}
          # Bug fix: was a bare `filter` call, but this class defines no reader
          # (the old attr_reader was removed) — only @filter exists, so the bare
          # call would raise NoMethodError at runtime.
          @filter
            .search
            .pluck(:topic_id, :post_id, :post_number)
            .each do |topic_id, post_id, post_number|
              hash[topic_id] ||= { posts: [] }
              hash[topic_id][:posts] << [post_id, post_number]
            end

          hash.each_value { |topic| topic[:posts].sort_by! { |_, post_number| post_number } }
          hash
        end
      end
    end
  end
end