@@ -11,29 +11,71 @@ def initialize(filter, max_tokens_per_batch:, tokenizer:)
11
11
@to_process = filter_to_hash
12
12
end
13
13
14
- def next_batch
15
- # return text that is easily consumed by the LLM containing:
16
- # - topic title, tags, category and creation date
17
- # - info about previous posts in the topic that are omitted (mostly count eg: 7 posts omitted)
18
- # - raw post content
19
- # - author name
20
- # - date
21
- # - info about future posts in the topic that are omitted
22
- #
23
- # always attempt to return entire topics (or multiple) if possible
24
- # return nil if we are done
25
- #
26
- # example_return:
27
- # { post_count: 12, topic_count: 3, text: "..." }
14
# Yields the filtered posts to the caller as LLM-ready batches of text.
#
# Whole topics are packed into a batch until the next topic would push the
# batch past @max_tokens_per_batch; the pending batch is then yielded and the
# topic opens a new one. A topic that is larger than the budget on its own is
# yielded as several consecutive chunks instead of being truncated.
#
# @yieldparam batch [Hash] { post_count: Integer, topic_count: Integer, text: String }
# @return [nil] when there is nothing to process
# @return [Enumerator] when called without a block
# @return [Hash] the emptied @to_process once every topic has been yielded
def each_chunk
  return nil if @to_process.empty?
  return enum_for(:each_chunk) unless block_given?

  batch = { post_count: 0, topic_count: 0, text: +"" }
  batch_tokens = 0

  @to_process.each do |topic_id, topic_data|
    topic = Topic.find_by(id: topic_id)
    next unless topic

    topic_text, topic_tokens, post_count = format_topic(topic, topic_data[:posts])

    # The pending batch cannot take this topic: hand it over and start fresh.
    # The topic itself is still handled below instead of being discarded.
    if batch[:topic_count] > 0 && batch_tokens + topic_tokens > @max_tokens_per_batch
      yield batch
      batch = { post_count: 0, topic_count: 0, text: +"" }
      batch_tokens = 0
    end

    if topic_tokens > @max_tokens_per_batch
      # The batch is always empty here (anything pending was flushed above),
      # so the slices can be yielded directly.
      split_text_to_budget(topic_text).each do |slice|
        # Every slice carries the topic's full post_count, so totals summed
        # across the slices of one topic overcount its posts.
        yield({ text: slice, post_count: post_count, topic_count: 1 })
      end
      next
    end

    batch[:text] << topic_text
    batch[:post_count] += post_count
    batch[:topic_count] += 1
    batch_tokens += topic_tokens
  end

  yield batch if batch[:topic_count] > 0

  @to_process.clear
end

# Cuts text into consecutive slices whose summed per-piece token estimates stay
# within @max_tokens_per_batch. Slices break on line boundaries where possible;
# concatenating the slices reproduces the input exactly.
private def split_text_to_budget(text)
  slices = []
  current = +""
  current_tokens = 0

  text.each_line do |line|
    fit_line_to_budget(line).each do |piece|
      piece_tokens = estimate_tokens(piece)

      if !current.empty? && current_tokens + piece_tokens > @max_tokens_per_batch
        slices << current
        current = +""
        current_tokens = 0
      end

      current << piece
      current_tokens += piece_tokens
    end
  end

  slices << current unless current.empty?
  slices
end

# Halves a line until every piece fits the token budget (or is a single
# character), so one very long line can never stall or truncate the output.
private def fit_line_to_budget(line)
  return [line] if line.length <= 1 || estimate_tokens(line) <= @max_tokens_per_batch

  middle = line.length / 2
  fit_line_to_budget(line[0...middle]) + fit_line_to_budget(line[middle..])
end
29
71
30
72
private
31
73
32
74
# Groups the posts matched by @filter under their topic.
#
# @return [Hash] topic_id => { posts: [[post_id, post_number], ...] }, with
#   each topic's posts ordered by ascending post_number
def filter_to_hash
  rows = @filter.search.pluck(:topic_id, :id, :post_number)

  grouped =
    rows.each_with_object({}) do |(topic_id, post_id, post_number), by_topic|
      entry = (by_topic[topic_id] ||= { posts: [] })
      entry[:posts] << [post_id, post_number]
    end

  # Each pair is [post_id, post_number]; order by the post number.
  grouped.each_value { |entry| entry[:posts].sort_by!(&:last) }
  grouped
end
87
+
88
# Renders one topic (header, omitted-post notices and the selected posts) as
# text for the LLM.
#
# @param topic [Topic] the topic being rendered
# @param posts_data [Array<Array>] [post_id, post_number] pairs, ordered by
#   post_number
# @return [Array] [text, total_tokens, post_count] where total_tokens is the
#   sum of the per-fragment token estimates and post_count is the number of
#   posts actually found and rendered
def format_topic(topic, posts_data)
  text = +""
  total_tokens = 0
  post_count = 0

  # Each fragment is rendered and tokenized exactly once, then appended in
  # place instead of rebuilding the whole string.
  append = lambda do |fragment|
    text << fragment
    total_tokens += estimate_tokens(fragment)
  end

  append.call(format_topic_header(topic))

  # Without any selected posts there is nothing to measure omissions against.
  return [text, total_tokens, post_count] if posts_data.empty?

  first_post_number = posts_data.first[1]
  last_post_number = posts_data.last[1]

  # Omitted counts are derived from post numbers, not from counting rows.
  omitted_before = first_post_number - 1
  append.call(format_omitted_posts(omitted_before, "before")) if omitted_before > 0

  posts_data.each do |post_id, _post_number|
    post = Post.find_by(id: post_id)
    next unless post

    append.call(format_post(post))
    post_count += 1
  end

  # nil when the topic currently has no posts; nothing can follow in that case.
  highest_post_number = topic.posts.maximum(:post_number)
  if highest_post_number && last_post_number < highest_post_number
    append.call(format_omitted_posts(highest_post_number - last_post_number, "after"))
  end

  [text, total_tokens, post_count]
end
130
+
131
# Builds the heading block that introduces a topic: title, optional category
# and tags lines, creation date and relative URL, followed by a blank line.
#
# @param topic [Topic]
# @return [String]
def format_topic_header(topic)
  rows = ["# #{topic.title}"]
  rows << "Category: #{topic.category.name}" if topic.category
  rows << "Tags: #{topic.tags.map(&:name).join(", ")}" if topic.tags.present?
  rows << "Created: #{format_date(topic.created_at)}"
  rows << "Topic url: /t/#{topic.id}"

  # Every row ends with a newline; the last one is followed by an empty line.
  rows.join("\n") << "\n\n"
end
146
+
147
# Renders a single post: separator, author/date heading, raw body, an optional
# likes line and the post's relative URL.
#
# @param post [Post]
# @return [String]
def format_post(post)
  # A post whose user record is missing would otherwise raise NoMethodError
  # and abort the whole batch; fall back to a placeholder author instead.
  author = post.user&.username || "unknown"

  text = +"---\n"
  text << "## Post by #{author} - #{format_date(post.created_at)}\n\n"
  text << "#{post.raw}\n"
  # like_count may be nil; to_i treats that as zero so the line is skipped.
  text << "Likes: #{post.like_count}\n" if post.like_count.to_i > 0
  text << "Post url: /t/-/#{post.topic_id}/#{post.post_number}\n\n"
  text
end
155
+
156
# Builds the notice that tells the reader how many posts were left out.
#
# @param count [Integer] number of omitted posts
# @param position [String] "before" produces an "earlier" notice; any other
#   value produces a "later" notice
# @return [String] the notice followed by a blank line
def format_omitted_posts(count, position)
  direction = position == "before" ? "earlier" : "later"
  noun = count == 1 ? "post" : "posts"

  "#{count} #{direction} #{noun} omitted\n\n"
end
163
+
164
# Formats a timestamp as "YYYY-MM-DD HH:MM".
#
# @param date [#strftime]
# @return [String]
def format_date(date)
  pattern = "%Y-%m-%d %H:%M"
  date.strftime(pattern)
end
167
+
168
# Counts the tokens @tokenizer produces for the given text.
#
# @param text [String]
# @return [Integer] length of the tokenizer's output
def estimate_tokens(text)
  tokens = @tokenizer.tokenize(text)
  tokens.length
end
45
171
end
46
172
end
47
173
end
0 commit comments