Skip to content

Commit 6ed6b52

Browse files
committed
wip researcher
1 parent 2a62658 commit 6ed6b52

File tree

3 files changed

+381
-0
lines changed

3 files changed

+381
-0
lines changed

lib/personas/tools/researcher.rb

+123
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
# frozen_string_literal: true
2+
3+
module DiscourseAi
4+
module Personas
5+
module Tools
6+
class Researcher < Tool
7+
attr_reader :last_filter, :result_count
8+
9+
class << self
10+
def signature
11+
{
12+
name: name,
13+
description:
14+
"Analyze and extract information from content across the forum based on specified filters",
15+
parameters: [
16+
{
17+
name: "filter",
18+
description:
19+
"Filter string to target specific content. Supports user (@username), date ranges (after:YYYY-MM-DD, before:YYYY-MM-DD), categories (category:name), tags (tag:name), groups (group:name). Multiple filters can be combined with spaces. Example: '@sam after:2023-01-01 tag:feature'",
20+
type: "string",
21+
},
22+
{
23+
name: "goal",
24+
description:
25+
"The specific information you want to extract or analyze from the filtered content",
26+
type: "string",
27+
},
28+
{
29+
name: "dry_run",
30+
description:
31+
"When true, only count matching items without processing data (default: true)",
32+
type: "boolean",
33+
},
34+
],
35+
}
36+
end
37+
38+
def name
39+
"researcher"
40+
end
41+
42+
def custom_system_message
43+
<<~TEXT
44+
Use the researcher tool to analyze patterns and extract insights from forum content.
45+
For complex research tasks, start with a dry run to gauge the scope before processing.
46+
TEXT
47+
end
48+
49+
def accepted_options
50+
[option(:max_results, type: :integer), option(:include_private, type: :boolean)]
51+
end
52+
end
53+
54+
def invoke
55+
@last_filter = parameters[:filter] || ""
56+
goal = parameters[:goal] || ""
57+
dry_run = parameters[:dry_run].nil? ? true : parameters[:dry_run]
58+
59+
yield(I18n.t("discourse_ai.ai_bot.researching", filter: @last_filter, goal: goal))
60+
61+
# Parse the filter string to extract components
62+
filter_components = parse_filter(@last_filter)
63+
64+
# Determine max results
65+
max_results = calculate_max_results(llm)
66+
67+
# In a real implementation, we would query the database here
68+
# For now, just simulate the behavior
69+
if dry_run
70+
@result_count = simulate_count(filter_components)
71+
{ count: @result_count, filter: @last_filter, goal: goal, dry_run: true }
72+
else
73+
results = perform_research(filter_components, goal, max_results)
74+
@result_count = results[:rows]&.length || 0
75+
results
76+
end
77+
end
78+
79+
protected
80+
81+
def description_args
82+
{ count: @result_count || 0, filter: @last_filter || "" }
83+
end
84+
85+
private
86+
87+
def parse_filter(filter_string)
88+
# This would parse the filter string into components
89+
# For example, extracting username, date ranges, categories, tags, etc.
90+
# Simplified implementation for now
91+
components = {}
92+
components[:raw] = filter_string
93+
components
94+
end
95+
96+
def simulate_count(filter_components)
97+
# In a real implementation, this would query the database to get a count
98+
# For now, return a simulated count
99+
rand(10..100)
100+
end
101+
102+
def perform_research(filter_components, goal, max_results)
103+
# This would perform the actual research based on the filter and goal
104+
# For now, return a simplified result structure
105+
format_results([], %w[content url author date])
106+
end
107+
108+
def calculate_max_results(llm)
109+
max_results = options[:max_results].to_i
110+
return [max_results, 100].min if max_results > 0
111+
112+
if llm.max_prompt_tokens > 30_000
113+
50
114+
elsif llm.max_prompt_tokens > 10_000
115+
30
116+
else
117+
15
118+
end
119+
end
120+
end
121+
end
122+
end
123+
end

lib/utils/research/filter.rb

+108
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
# frozen_string_literal: true
2+
3+
module DiscourseAi
4+
module Utils
5+
module Research
6+
class Filter
7+
attr_reader :raw_filter, :parsed_components, :current_offset, :batch_size
8+
9+
VALID_FILTER_PATTERNS = {
10+
user: /\@(\w+)/,
11+
before: /before:(\d{4}-\d{2}-\d{2})/,
12+
after: /after:(\d{4}-\d{2}-\d{2})/,
13+
category: /category:([a-zA-Z0-9_\-]+)/,
14+
tag: /tag:([a-zA-Z0-9_\-]+)/,
15+
group: /group:([a-zA-Z0-9_\-]+)/,
16+
status: /status:(open|closed|archived|noreplies|single_user)/,
17+
}
18+
19+
DEFAULT_BATCH_SIZE = 20
20+
21+
def initialize(filter_string, batch_size: DEFAULT_BATCH_SIZE)
22+
@raw_filter = filter_string.to_s
23+
@batch_size = batch_size
24+
@current_offset = 0
25+
@parsed_components = parse_filter
26+
end
27+
28+
def parse_filter
29+
components = {
30+
users: [],
31+
categories: [],
32+
tags: [],
33+
groups: [],
34+
date_range: {
35+
},
36+
status: nil,
37+
raw: @raw_filter,
38+
}
39+
40+
# Extract user mentions
41+
@raw_filter
42+
.scan(VALID_FILTER_PATTERNS[:user])
43+
.each { |match| components[:users] << match[0] }
44+
45+
# Extract date ranges
46+
if before_match = @raw_filter.match(VALID_FILTER_PATTERNS[:before])
47+
components[:date_range][:before] = before_match[1]
48+
end
49+
50+
if after_match = @raw_filter.match(VALID_FILTER_PATTERNS[:after])
51+
components[:date_range][:after] = after_match[1]
52+
end
53+
54+
# Extract categories
55+
@raw_filter
56+
.scan(VALID_FILTER_PATTERNS[:category])
57+
.each { |match| components[:categories] << match[0] }
58+
59+
# Extract tags
60+
@raw_filter
61+
.scan(VALID_FILTER_PATTERNS[:tag])
62+
.each { |match| components[:tags] << match[0] }
63+
64+
# Extract groups
65+
@raw_filter
66+
.scan(VALID_FILTER_PATTERNS[:group])
67+
.each { |match| components[:groups] << match[0] }
68+
69+
# Extract status
70+
if status_match = @raw_filter.match(VALID_FILTER_PATTERNS[:status])
71+
components[:status] = status_match[1]
72+
end
73+
74+
components
75+
end
76+
77+
def next_batch
78+
previous_offset = @current_offset
79+
@current_offset += @batch_size
80+
previous_offset
81+
end
82+
83+
def reset_batch
84+
@current_offset = 0
85+
end
86+
87+
def to_query_params
88+
params = {}
89+
params[:username] = parsed_components[:users].first if parsed_components[:users].any?
90+
params[:before] = parsed_components[:date_range][:before] if parsed_components[
91+
:date_range
92+
][
93+
:before
94+
]
95+
params[:after] = parsed_components[:date_range][:after] if parsed_components[:date_range][
96+
:after
97+
]
98+
params[:category] = parsed_components[:categories].first if parsed_components[
99+
:categories
100+
].any?
101+
params[:tags] = parsed_components[:tags].join(",") if parsed_components[:tags].any?
102+
params[:status] = parsed_components[:status] if parsed_components[:status]
103+
params
104+
end
105+
end
106+
end
107+
end
108+
end

lib/utils/research/llm_formatter.rb

+150
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
# frozen_string_literal: true
2+
3+
module DiscourseAi
4+
module Utils
5+
module Research
6+
# Formats research query results for consumption by an LLM, choosing between
# a plain row listing and an aggregate analysis depending on the stated goal.
class LlmFormatter
  attr_reader :processed_count, :total_count, :filter

  # @param filter [#raw_filter, #current_offset] the research filter
  # @param goal [String] what the caller wants extracted from the results
  # @param llm [#max_prompt_tokens] used to size the output token budget
  # @param max_tokens [Integer, nil] explicit budget; derived from llm when nil
  def initialize(filter, goal, llm, max_tokens = nil)
    @filter = filter
    @goal = goal
    @llm = llm
    @max_tokens = max_tokens || calculate_default_max_tokens(llm)
    @processed_count = 0
    @total_count = 0
  end

  # Formats +results+ ({rows: [...], total: Integer}); yields the formatted
  # hash when a block is given and returns it either way.
  #
  # Bug fix: the original yielded unconditionally on the empty and analysis
  # paths (raising LocalJumpError when no block was passed) while checking
  # block_given? only on the standard path, and never advanced
  # @processed_count on the analysis path (corrupting format_progress).
  # All paths now behave uniformly.
  def format_and_yield(results)
    @total_count = results[:total] if results[:total]

    rows = results[:rows]
    formatted =
      if rows.nil? || rows.empty?
        format_empty_result
      else
        @processed_count += rows.length
        analysis_goal? ? format_analysis_result(rows) : format_standard_result(rows)
      end

    yield formatted if block_given?
    formatted
  end

  # Progress snapshot for status reporting.
  def format_progress
    percent =
      @total_count > 0 ? (@processed_count.to_f / @total_count * 100).round(1) : 0
    {
      processed: @processed_count,
      total: @total_count,
      filter: @filter.raw_filter,
      goal: @goal,
      percent_complete: percent,
    }
  end

  private

  # Heuristic: goals mentioning summarization/analysis keywords get the
  # aggregate treatment instead of a row listing.
  def analysis_goal?
    @goal.to_s.downcase.match?(/(summarize|analyze|extract|identify pattern|trend|insight)/)
  end

  def format_empty_result
    {
      message: "No results found for the given filter criteria",
      filter: @filter.raw_filter,
      goal: @goal,
    }
  end

  # Row listing with truncated excerpts; :offset is where this page began.
  def format_standard_result(rows)
    formatted_rows =
      rows.map do |row|
        {
          title: row[:title],
          excerpt: truncate_text(row[:excerpt] || ""),
          url: row[:url],
          author: row[:username],
          date: row[:created_at],
          likes: row[:like_count],
          replies: row[:reply_count],
        }
      end

    {
      goal: @goal,
      filter: @filter.raw_filter,
      count: formatted_rows.length,
      total: @total_count,
      offset: @filter.current_offset - formatted_rows.length,
      rows: formatted_rows,
    }
  end

  # Aggregate summary used when the goal asks for analysis rather than a list.
  def format_analysis_result(rows)
    {
      goal: @goal,
      filter: @filter.raw_filter,
      count: rows.length,
      total: @total_count,
      analysis: {
        sample_size: rows.length,
        data_points: extract_data_points(rows),
        time_range: extract_time_range(rows),
      },
    }
  end

  # Coarse descriptive statistics over the sample. Simplified for now.
  def extract_data_points(rows)
    {
      authors: rows.map { |r| r[:username] }.uniq.count,
      categories: rows.map { |r| r[:category_name] }.uniq.count,
      earliest_post: rows.map { |r| r[:created_at] }.min,
      latest_post: rows.map { |r| r[:created_at] }.max,
      avg_likes: (rows.sum { |r| r[:like_count].to_i } / [rows.length, 1].max.to_f).round(1),
    }
  end

  # Earliest/latest timestamps plus the span in days. span_days is nil when
  # the timestamps do not support subtraction (e.g. string dates) — replaces
  # the original inline `rescue nil` modifier, which is scoped here to an
  # explicit StandardError rescue around just the arithmetic.
  def extract_time_range(rows)
    dates = rows.map { |r| r[:created_at] }.compact
    return nil if dates.empty?

    span_days =
      begin
        ((dates.max - dates.min) / 86_400).to_i
      rescue StandardError
        nil
      end

    { earliest: dates.min, latest: dates.max, span_days: span_days }
  end

  # Caps excerpt length; appends an ellipsis when truncation occurs.
  def truncate_text(text, max_length = 300)
    return text if text.length <= max_length
    text[0...max_length] + "..."
  end

  # Reserve a fraction of the prompt window for results; larger windows can
  # afford a larger share.
  def calculate_default_max_tokens(llm)
    max_prompt_tokens = llm.max_prompt_tokens

    fraction =
      if max_prompt_tokens > 30_000
        0.7
      elsif max_prompt_tokens > 10_000
        0.6
      else
        0.5
      end

    (max_prompt_tokens * fraction).to_i
  end
end
148+
end
149+
end
150+
end

0 commit comments

Comments
 (0)