Implement non-dry-run research: batch matching posts and run the goal through the LLM

SamSaffron · SamSaffron · commit 22d4624cf7e6 · 2025-05-09T14:25:36.000+10:00
diff --git a/lib/personas/tools/researcher.rb b/lib/personas/tools/researcher.rb
@@ -27,8 +27,7 @@ def signature
                 },
                 {
                   name: "dry_run",
-                  description:
-                    "When true, only count matching items without processing data (default: true)",
+                  description: "When true, only count matching items without processing data",
                   type: "boolean",
                 },
               ],
@@ -51,21 +50,105 @@ def accepted_options
           end
         end
 
-        def invoke
+        # Runs the research tool. When dry_run is true it only reports how many items match
+        # the filter; otherwise (including when dry_run is not given) the matches are sent
+        # through the LLM in batches. The optional block receives progress text.
+        def invoke(&blk)
           @last_filter = parameters[:filter] || ""
+          post = Post.find_by(id: context.post_id)
           goal = parameters[:goal] || ""
-
-          #dry_run = parameters[:dry_run].nil? ? true : parameters[:dry_run]
-          #yield(I18n.t("discourse_ai.ai_bot.researching", filter: @last_filter, goal: goal))
+          dry_run = parameters[:dry_run].nil? ? false : parameters[:dry_run]
 
           filter = DiscourseAi::Utils::Research::Filter.new(@last_filter)
 
           @result_count = filter.search.count
-          { dry_run: true, goal: goal, filter: @last_filter, number_of_results: @result_count }
+
+          if dry_run
+            { dry_run: true, goal: goal, filter: @last_filter, number_of_results: @result_count }
+          else
+            process_filter(filter, goal, post, &blk)
+          end
         end
 
         protected
 
+        # Smallest prompt window, in tokens, an LLM must offer before research is attempted.
+        MIN_TOKENS_FOR_RESEARCH = 8000
+
+        # Formats everything the filter matches into token-bounded batches, runs the goal
+        # against each batch and collects the per-batch LLM answers.
+        #
+        # Raises ArgumentError when the LLM prompt window is below MIN_TOKENS_FOR_RESEARCH.
+        # Returns a hash with :dry_run (false), :goal, :filter and :results.
+        def process_filter(filter, goal, post, &blk)
+          if llm.max_prompt_tokens < MIN_TOKENS_FOR_RESEARCH
+            raise ArgumentError,
+                  "LLM max tokens too low for research. Minimum is #{MIN_TOKENS_FOR_RESEARCH}."
+          end
+
+          formatter =
+            DiscourseAi::Utils::Research::LlmFormatter.new(
+              filter,
+              # presumably the 2000 tokens held back leave room for the prompts — confirm
+              max_tokens_per_batch: llm.max_prompt_tokens - 2000,
+              tokenizer: llm.tokenizer,
+            )
+
+          results = []
+          formatter.each_chunk do |chunk|
+            results << run_inference(chunk[:text], goal, post, &blk)
+          end
+
+          { dry_run: false, goal: goal, filter: @last_filter, results: results }
+        end
+
+        # Asks the LLM to pursue the goal over one batch of formatted content and returns the
+        # streamed partials joined into one string. Passes "." to the block, when one was
+        # given, after the batch completes.
+        def run_inference(chunk_text, goal, post, &blk)
+          # NOTE(review): post is nil when context.post_id matches no Post, and it is
+          # dereferenced below — confirm callers always supply a valid post
+          prompt =
+            DiscourseAi::Completions::Prompt.new(
+              goal_system_prompt(goal),
+              messages: [{ type: :user, content: goal_user_prompt(goal, chunk_text) }],
+              post_id: post.id,
+              topic_id: post.topic_id,
+            )
+
+          results = []
+          llm.generate(prompt, user: post.user, feature_name: context.feature_name) do |partial|
+            results << partial
+          end
+
+          # invoke can be called without a block, in which case blk is nil
+          blk&.call(".")
+          results.join
+        end
+
+        # System prompt: describes the researcher role and states the goal.
+        def goal_system_prompt(goal)
+          <<~TEXT
+            You are a researcher tool designed to analyze and extract information from forum content.
+            Your task is to process the provided content and extract relevant information based on the specified goal.
+
+            Your goal is: #{goal}
+          TEXT
+        end
+
+        # User prompt: wraps the batch content in {{{ }}} and restates the goal after it.
+        def goal_user_prompt(goal, chunk_text)
+          <<~TEXT
+            Here is the content to analyze:
+
+            {{{
+            #{chunk_text}
+            }}}
+
+            Your goal is: #{goal}
+          TEXT
+        end
+
         def description_args
           { count: @result_count || 0, filter: @last_filter || "" }
         end
diff --git a/lib/personas/tools/tool.rb b/lib/personas/tools/tool.rb
@@ -47,8 +47,9 @@ def inject_prompt(prompt:, context:, persona:)
           end
         end
 
-        attr_accessor :custom_raw, :parameters
-        attr_reader :tool_call_id, :persona_options, :bot_user, :llm, :context
+        # llm is writable (not just readable) so specs can swap in a different LLM
+        attr_accessor :custom_raw, :parameters, :llm
+        attr_reader :tool_call_id, :persona_options, :bot_user, :context
 
         def initialize(
           parameters,
diff --git a/lib/utils/research/llm_formatter.rb b/lib/utils/research/llm_formatter.rb
@@ -11,29 +11,71 @@ def initialize(filter, max_tokens_per_batch:, tokenizer:)
           @to_process = filter_to_hash
         end
 
-        def next_batch
-          # return text that is easily consumed by the LLM containing:
-          # - topic title, tags, category and creation date
-          # - info about previous posts in the topic that are omitted (mostly count eg: 7 posts omitted)
-          # - raw post content
-          # - author name
-          # - date
-          # - info about future posts in the topic that are omitted
-          #
-          # always attempt to return entire topics (or multiple) if possible
-          # return nil if we are done
-          #
-          # example_return:
-          # { post_count: 12, topic_count: 3, text: "..." }
+        # Yields batches of formatted topics, each sized to fit within @max_tokens_per_batch.
+        # Every batch is a hash with :text, :post_count and :topic_count. Matching topics that
+        # no longer exist are skipped; the pending work list is cleared once iteration ends.
+        def each_chunk
+          return nil if @to_process.empty?
+
+          result = { post_count: 0, topic_count: 0, text: +"" }
+          estimated_tokens = 0
+
+          @to_process.each do |topic_id, topic_data|
+            topic = Topic.find_by(id: topic_id)
+            next unless topic
+
+            topic_text, topic_tokens, post_count = format_topic(topic, topic_data[:posts])
+
+            # Flush the pending batch when this topic would overflow it. The topic itself is
+            # still placed below, into the fresh batch, rather than being dropped.
+            if estimated_tokens > 0 && estimated_tokens + topic_tokens > @max_tokens_per_batch
+              yield result if result[:text].present?
+              estimated_tokens = 0
+              result = { post_count: 0, topic_count: 0, text: +"" }
+            end
+
+            # A topic larger than a whole batch is emitted on its own, split on line boundaries.
+            if topic_tokens > @max_tokens_per_batch
+              remaining_lines = topic_text.lines
+              until remaining_lines.empty?
+                chunk = +""
+                chunk_tokens = 0
+                while (line = remaining_lines.first)
+                  line_tokens = estimate_tokens(line)
+                  break if chunk_tokens + line_tokens > @max_tokens_per_batch
+                  chunk << remaining_lines.shift
+                  chunk_tokens += line_tokens
+                end
+                # NOTE(review): if a single line exceeds the limit the split stops here and
+                # the rest of this topic is not emitted
+                break if chunk.empty?
+
+                # post_count is the whole topic's count on every piece, so split topics overcount
+                yield({ text: chunk, post_count: post_count, topic_count: 1 })
+              end
+
+              next
+            end
+
+            result[:text] << topic_text
+            result[:post_count] += post_count
+            result[:topic_count] += 1
+            estimated_tokens += topic_tokens
+          end
+
+          # Emit whatever is left in the final, partially filled batch.
+          yield result if result[:text].present?
+
+          @to_process.clear
         end
 
         private
 
         def filter_to_hash
           hash = {}
-          filter
+          @filter
             .search
-            .pluck(:topic_id, :post_id, :post_number)
+            .pluck(:topic_id, :id, :post_number)
             .each do |topic_id, post_id, post_number|
               hash[topic_id] ||= { posts: [] }
               hash[topic_id][:posts] << [post_id, post_number]
@@ -42,6 +84,88 @@ def filter_to_hash
           hash.each_value { |topic| topic[:posts].sort_by! { |_, post_number| post_number } }
           hash
         end
+
+        # Renders one topic for the LLM: header, omitted-post markers and the selected posts.
+        #
+        # topic      - the Topic being rendered
+        # posts_data - [post_id, post_number] pairs, ordered by post_number
+        #
+        # Returns [text, token_estimate, rendered_post_count]. The estimate is the sum of the
+        # per-section token counts; each section is formatted and tokenized exactly once.
+        def format_topic(topic, posts_data)
+          sections = [format_topic_header(topic)]
+          post_count = 0
+
+          first_post_number = posts_data.first[1]
+          last_post_number = posts_data.last[1]
+          highest_post_number = topic.posts.maximum(:post_number)
+
+          # NOTE(review): both omitted counts assume post numbers are contiguous — confirm
+          if first_post_number > 1
+            sections << format_omitted_posts(first_post_number - 1, "before")
+          end
+
+          posts_data.each do |post_id, _post_number|
+            # posts that can no longer be loaded are left out of the text and the count
+            post = Post.find_by(id: post_id)
+            next unless post
+
+            sections << format_post(post)
+            post_count += 1
+          end
+
+          if highest_post_number && last_post_number < highest_post_number
+            sections << format_omitted_posts(highest_post_number - last_post_number, "after")
+          end
+
+          [sections.join, sections.sum { |section| estimate_tokens(section) }, post_count]
+        end
+
+        # Builds the topic heading: title, optional category and tags, creation time and a
+        # relative topic url, terminated by a blank line.
+        def format_topic_header(topic)
+          lines = ["# #{topic.title}"]
+          lines << "Category: #{topic.category.name}" if topic.category
+          lines << "Tags: #{topic.tags.map(&:name).join(", ")}" if topic.tags.present?
+          lines << "Created: #{format_date(topic.created_at)}"
+          lines << "Topic url: /t/#{topic.id}"
+
+          "#{lines.join("\n")}\n\n"
+        end
+
+        # Renders a single post: author and date heading, raw body, like count (only when
+        # positive) and a relative post url.
+        def format_post(post)
+          # NOTE(review): post.user is presumably nil for posts whose author was removed; safe
+          # navigation keeps one such post from aborting the whole run — confirm
+          author = post.user&.username
+
+          text = +"---\n"
+          text << "## Post by #{author} - #{format_date(post.created_at)}\n\n"
+          text << "#{post.raw}\n"
+          text << "Likes: #{post.like_count}\n" if post.like_count.to_i > 0
+          text << "Post url: /t/-/#{post.topic_id}/#{post.post_number}\n\n"
+          text
+        end
+
+        # Marker telling the LLM how many posts were left out on one side of the selection.
+        # Any position other than "before" is worded as "later".
+        def format_omitted_posts(count, position)
+          direction = position == "before" ? "earlier" : "later"
+          noun = count == 1 ? "post" : "posts"
+
+          "#{count} #{direction} #{noun} omitted\n\n"
+        end
+
+        # Formats a time-like value as "YYYY-MM-DD HH:MM".
+        def format_date(date)
+          date.strftime("%Y-%m-%d %H:%M")
+        end
+
+        # Number of tokens the tokenizer produces for the given text.
+        def estimate_tokens(text)
+          @tokenizer.tokenize(text).length
+        end
       end
     end
   end
diff --git a/spec/lib/personas/tools/researcher_spec.rb b/spec/lib/personas/tools/researcher_spec.rb
@@ -64,5 +64,42 @@
 
       expect(researcher.options[:max_results]).to eq(50)
     end
+
+    it "returns correct results for non-dry-run with filtered posts" do
+      # Two topics in the research category, each holding one post by user and one by admin,
+      # so the @username filter below has posts to exclude
+      topics = Array.new(2) { Fabricate(:topic, category: category, tags: [tag_research]) }
+      topics.each do |topic|
+        Fabricate(:post, topic: topic, raw: "Relevant content 1", user: user)
+        Fabricate(:post, topic: topic, raw: "Relevant content 2", user: admin)
+      end
+
+      # The filter narrows the run to user's posts in research-category
+      filter_query = "category:research-category @#{user.username}"
+      tool_params = { filter: filter_query, goal: "find relevant content", dry_run: false }
+      researcher =
+        described_class.new(
+          tool_params,
+          bot_user: bot_user,
+          llm: llm,
+          context: DiscourseAi::Personas::BotContext.new(user: user, post: post),
+        )
+
+      # More canned replies than batches; presumably each LLM call consumes one — confirm
+      responses = Array.new(10) { |i| ["Found: Relevant content #{i + 1}"] }
+      results = nil
+
+      DiscourseAi::Completions::Llm.with_prepared_responses(responses) do
+        # swap in the LLM built from llm_model while the canned responses are active
+        researcher.llm = llm_model.to_llm
+        results = researcher.invoke(&progress_blk)
+      end
+
+      # The result echoes the request and carries the first canned LLM answer
+      expect(results[:dry_run]).to eq(false)
+      expect(results[:goal]).to eq("find relevant content")
+      expect(results[:filter]).to eq(filter_query)
+      expect(results[:results].first).to include("Found: Relevant content 1")
+    end
   end
 end