
Commit 3f0284b

support long post truncation
1 parent 988479b commit 3f0284b

6 files changed: +165 −22 lines changed

config/locales/server.en.yml

Lines changed: 3 additions & 0 deletions

@@ -337,6 +337,9 @@ en:
       include_private:
         name: "Include private"
         description: "Include private topics in the filters"
+      max_tokens_per_post:
+        name: "Maximum tokens per post"
+        description: "Maximum number of tokens to use for each post in the filter"
       create_artifact:
         creator_llm:
           name: "LLM"

lib/personas/forum_researcher.rb

Lines changed: 21 additions & 5 deletions

@@ -22,13 +22,29 @@ def system_prompt
         The participants in this conversation are: {participants}
         The date now is: {time}, much has changed since you were trained.

-        As a forum researcher, you will help users come up with the correct research criteria to
-        properly analyze the forum data.
+        As a forum researcher, guide users through a structured research process:
+        1. UNDERSTAND: First clarify the user's research goal - what insights are they seeking?
+        2. PLAN: Design an appropriate research approach with specific filters
+        3. TEST: Always begin with dry_run:true to gauge the scope of results
+        4. REFINE: If results are too broad/narrow, suggest filter adjustments
+        5. EXECUTE: Run the final analysis only when filters are well-tuned
+        6. SUMMARIZE: Present findings with links to supporting evidence

-        BE MINDFUL: when running the research tool, specify all the goals you want to achieve in one go, avoid running research multiple times in one turn.
+        BE MINDFUL: specify all research goals in one request to avoid multiple processing runs.

-        When creating reports ALWAYS bias grounding information you provide with links to original posts on the forum.
-        You will always start with a dry_run of the proposed research criteria.
+        REMEMBER: Different filters serve different purposes:
+        - Use post date filters (after/before) for analyzing specific posts
+        - Use topic date filters (topic_after/topic_before) for analyzing entire topics
+        - Combine user/group filters with categories/tags to find specialized contributions
+
+        Always ground your analysis with links to original posts on the forum.
+
+        Research workflow best practices:
+        1. Start with a dry_run to gauge the scope (set dry_run:true)
+        2. If results are too numerous (>1000), add more specific filters
+        3. If results are too few (<5), broaden your filters
+        4. For temporal analysis, specify explicit date ranges
+        5. For user behavior analysis, combine @username with categories or tags
       PROMPT
     end
   end
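
The filter distinctions this prompt calls out map onto concrete filter strings. A few illustrative examples (the usernames, tags, and dates below are invented for the sketch):

```ruby
# Post-level date window: individual posts written in Q1 2025
"after:2025-01-01 before:2025-04-01"

# Topic-level date window: entire topics created during 2024
"topic_after:2024-01-01 topic_before:2025-01-01"

# User behavior within a specialty: combine @username with category/tag
"@sam category:bugs tag:ai"
```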

lib/personas/tools/researcher.rb

Lines changed: 31 additions & 15 deletions

@@ -13,12 +13,7 @@ def signature
           description:
             "Analyze and extract information from content across the forum based on specified filters",
           parameters: [
-            {
-              name: "filter",
-              description:
-                "Filter string to target specific content. Supports user (@username), date ranges (after:YYYY-MM-DD, before:YYYY-MM-DD), categories (category:name), tags (tag:name), groups (group:name). Multiple filters can be combined with spaces. Example: '@sam after:2023-01-01 tag:feature'",
-              type: "string",
-            },
+            { name: "filter", description: filter_description, type: "string" },
             {
               name: "goals",
               description:

@@ -34,23 +29,35 @@ def signature
           }
         end

-        def name
-          "researcher"
-        end
-
-        def custom_system_message
+        def filter_description
           <<~TEXT
-            Use the researcher tool to analyze patterns and extract insights from forum content.
-            For complex research tasks, start with a dry run to gauge the scope before processing.
+            Filter string to target specific content.
+            - Supports user (@username)
+            - date ranges (after:YYYY-MM-DD, before:YYYY-MM-DD for posts; topic_after:YYYY-MM-DD, topic_before:YYYY-MM-DD for topics)
+            - categories (category:name)
+            - tags (tag:name)
+            - groups (group:name).
+
+            Multiple filters can be combined with spaces. Example: '@sam after:2023-01-01 tag:feature'
           TEXT
         end

+        def name
+          "researcher"
+        end
+
         def accepted_options
-          [option(:max_results, type: :integer), option(:include_private, type: :boolean)]
+          [
+            option(:max_results, type: :integer),
+            option(:include_private, type: :boolean),
+            option(:max_tokens_per_post, type: :integer),
+          ]
         end
       end

       def invoke(&blk)
+        max_results = options[:max_results] || 1000
+
         @filter = parameters[:filter] || ""
         @goals = parameters[:goals] || ""
         @dry_run = parameters[:dry_run].nil? ? false : parameters[:dry_run]

@@ -62,7 +69,15 @@ def invoke(&blk)
         return { error: "No goals provided" } if goals.blank?
         return { error: "No filter provided" } if @filter.blank?

-        filter = DiscourseAi::Utils::Research::Filter.new(@filter)
+        guardian = nil
+        guardian = Guardian.new(context.user) if options[:include_private]
+
+        filter =
+          DiscourseAi::Utils::Research::Filter.new(
+            @filter,
+            limit: max_results,
+            guardian: guardian,
+          )
         @result_count = filter.search.count

         blk.call details

@@ -99,6 +114,7 @@ def process_filter(filter, goals, post, &blk)
           filter,
           max_tokens_per_batch: llm.max_prompt_tokens - 2000,
           tokenizer: llm.tokenizer,
+          max_tokens_per_post: options[:max_tokens_per_post] || 2000,
         )

         results = []
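
Taken together, these changes wire persona-level tool options into the search itself. A minimal sketch of the effective defaults, restating the plumbing shown in this diff:

```ruby
# Defaults applied when a persona leaves the options unset:
max_results = options[:max_results] || 1000                  # result cap, passed to Filter as limit:
max_tokens_per_post = options[:max_tokens_per_post] || 2000  # per-post budget, passed to LlmFormatter

# Private content requires an explicit opt-in; the Guardian carries the
# invoking user's permissions into the search, otherwise it stays nil:
guardian = options[:include_private] ? Guardian.new(context.user) : nil
```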

lib/utils/research/filter.rb

Lines changed: 16 additions & 0 deletions

@@ -57,6 +57,22 @@ def self.word_to_date(str)
         end
       end

+      register_filter(/\Atopic_before:(.*)\z/i) do |relation, date_str, _|
+        if date = Filter.word_to_date(date_str)
+          relation.where("topics.created_at < ?", date)
+        else
+          relation
+        end
+      end
+
+      register_filter(/\Atopic_after:(.*)\z/i) do |relation, date_str, _|
+        if date = Filter.word_to_date(date_str)
+          relation.where("topics.created_at > ?", date)
+        else
+          relation
+        end
+      end
+
       # Category filter
       register_filter(/\Acategory:([a-zA-Z0-9_\-]+)\z/i) do |relation, slug, _|
         category = Category.find_by("LOWER(slug) = LOWER(?)", slug)
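
A quick sketch of what the new registrations do, assuming the Filter usage shown in researcher.rb above (the date and tag values are invented):

```ruby
# topic_after/topic_before scope by the *topic's* creation date, unlike
# after/before, which scope by the post's own date:
filter = DiscourseAi::Utils::Research::Filter.new("topic_after:2024-01-01 tag:feature")

# Internally, topic_after:2024-01-01 contributes roughly:
#   relation.where("topics.created_at > ?", date)
filter.search.count # => number of matching posts, as used by the researcher tool
```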

lib/utils/research/llm_formatter.rb

Lines changed: 20 additions & 2 deletions

@@ -4,10 +4,11 @@ module DiscourseAi
   module Utils
     module Research
       class LlmFormatter
-        def initialize(filter, max_tokens_per_batch:, tokenizer:)
+        def initialize(filter, max_tokens_per_batch:, tokenizer:, max_tokens_per_post:)
           @filter = filter
           @max_tokens_per_batch = max_tokens_per_batch
           @tokenizer = tokenizer
+          @max_tokens_per_post = max_tokens_per_post
           @to_process = filter_to_hash
         end

@@ -160,12 +161,29 @@ def format_topic_status(topic)
        def format_post(post)
          text = +"---\n"
          text << "## Post by #{post.user&.username} - #{format_date(post.created_at)}\n\n"
-          text << "#{post.raw}\n"
+          text << "#{truncate_if_needed(post.raw)}\n"
          text << "Likes: #{post.like_count}\n" if post.like_count.to_i > 0
          text << "Post url: /t/-/#{post.topic_id}/#{post.post_number}\n\n"
          text
        end

+        def truncate_if_needed(content)
+          tokens_count = estimate_tokens(content)
+
+          return content if tokens_count <= @max_tokens_per_post
+
+          half_limit = @max_tokens_per_post / 2
+          token_ids = @tokenizer.encode(content)
+
+          first_half_ids = token_ids[0...half_limit]
+          last_half_ids = token_ids[-half_limit..-1]
+
+          first_text = @tokenizer.decode(first_half_ids)
+          last_text = @tokenizer.decode(last_half_ids)
+
+          "#{first_text}\n\n... elided #{tokens_count - @max_tokens_per_post} tokens ...\n\n#{last_text}"
+        end
+
        def format_omitted_posts(count, position)
          if position == "before"
            "#{count} earlier #{count == 1 ? "post" : "posts"} omitted\n\n"
Lines changed: 74 additions & 0 deletions

@@ -0,0 +1,74 @@
+# frozen_string_literal: true
+#
+describe DiscourseAi::Utils::Research::LlmFormatter do
+  fab!(:user) { Fabricate(:user, username: "test_user") }
+  fab!(:topic) { Fabricate(:topic, title: "This is a Test Topic", user: user) }
+  fab!(:post) { Fabricate(:post, topic: topic, user: user) }
+  let(:tokenizer) { DiscourseAi::Tokenizer::OpenAiTokenizer }
+  let(:filter) { DiscourseAi::Utils::Research::Filter.new("@#{user.username}") }
+
+  describe "#truncate_if_needed" do
+    it "returns original content when under token limit" do
+      formatter =
+        described_class.new(
+          filter,
+          max_tokens_per_batch: 1000,
+          tokenizer: tokenizer,
+          max_tokens_per_post: 100,
+        )
+
+      short_text = "This is a short post"
+      expect(formatter.send(:truncate_if_needed, short_text)).to eq(short_text)
+    end
+
+    it "truncates content when over token limit" do
+      # Create a post with content that will exceed our token limit
+      long_text = ("word " * 200).strip
+
+      formatter =
+        described_class.new(
+          filter,
+          max_tokens_per_batch: 1000,
+          tokenizer: tokenizer,
+          max_tokens_per_post: 50,
+        )
+
+      truncated = formatter.send(:truncate_if_needed, long_text)
+
+      expect(truncated).to include("... elided 150 tokens ...")
+      expect(truncated).to_not eq(long_text)
+
+      # Should have roughly 25 words before and 25 after (half of max_tokens_per_post)
+      first_chunk = truncated.split("\n\n")[0]
+      expect(first_chunk.split(" ").length).to be_within(5).of(25)
+
+      last_chunk = truncated.split("\n\n")[2]
+      expect(last_chunk.split(" ").length).to be_within(5).of(25)
+    end
+  end
+
+  describe "#format_post" do
+    it "formats posts with truncation for long content" do
+      # Set up a post with long content
+      long_content = ("word " * 200).strip
+      long_post = Fabricate(:post, raw: long_content, topic: topic, user: user)
+
+      formatter =
+        described_class.new(
+          filter,
+          max_tokens_per_batch: 1000,
+          tokenizer: tokenizer,
+          max_tokens_per_post: 50,
+        )
+
+      formatted = formatter.send(:format_post, long_post)
+
+      # Should have standard formatting elements
+      expect(formatted).to include("## Post by #{user.username}")
+      expect(formatted).to include("Post url: /t/-/#{long_post.topic_id}/#{long_post.post_number}")
+
+      # Should include truncation marker
+      expect(formatted).to include("... elided 150 tokens ...")
+    end
+  end
+end
