Skip to content

Commit af4f871

Browse files
authored
FIX: never provide tools with invalid UTF-8 strings (#692)
Prior to this change, truncation could return invalid UTF-8 strings to the caller. This change also allows tools to read up to 30 megabytes, up from the old limit of 4 megabytes.
1 parent e26c598 commit af4f871

File tree

2 files changed

+62
-2
lines changed

2 files changed

+62
-2
lines changed

lib/ai_bot/tools/tool.rb

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,12 @@ module DiscourseAi
44
module AiBot
55
module Tools
66
class Tool
7+
# Why 30 megabytes?
8+
# This general limit is mainly a security feature to avoid tools
9+
# forcing infinite downloads or causing memory exhaustion.
10+
# The limit is somewhat arbitrary and can be increased in future if needed.
11+
MAX_RESPONSE_BODY_LENGTH = 30.megabyte
12+
713
class << self
814
def signature
915
raise NotImplemented
@@ -158,14 +164,24 @@ def send_http_request(url, headers: {}, authenticate_github: false, follow_redir
158164
end
159165
end
160166

161-
def read_response_body(response, max_length: 4.megabyte)
167+
# Reads the body of +response+ chunk by chunk, stops once more than
# +max_length+ bytes have been collected, and returns the result with
# invalid byte sequences scrubbed.
#
# @param response [#read_body] object whose +read_body+ yields body chunks
# @param max_length [Integer, nil] byte limit applied before scrubbing;
#   falls back to MAX_RESPONSE_BODY_LENGTH when nil
# @return [String] the (possibly truncated) body, scrubbed of invalid bytes
def self.read_response_body(response, max_length: nil)
  max_length ||= MAX_RESPONSE_BODY_LENGTH

  body = +""
  response.read_body do |chunk|
    body << chunk
    # Stop pulling chunks as soon as the limit is crossed so an endless or
    # oversized response cannot exhaust memory.
    break if body.bytesize > max_length
  end

  # The limit is expressed in bytes, so cut with byteslice: String#[] counts
  # characters and would let multibyte text through well past the limit.
  # A cut that lands mid-character is repaired by the scrub below.
  body = body.byteslice(0, max_length) if body.bytesize > max_length
  body.scrub
end
182+
183+
# Instance-level convenience wrapper: forwards +response+ and +max_length+
# unchanged to the class-level read_response_body and returns its result.
def read_response_body(response, max_length: nil)
184+
self.class.read_response_body(response, max_length: max_length)
169185
end
170186

171187
def truncate(text, llm:, percent_length: nil, max_length: nil)
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# frozen_string_literal: true

RSpec.describe DiscourseAi::AiBot::Tools::Tool do
  let(:tool_class) { described_class }

  # Byte sequence that is deliberately not valid UTF-8.
  let(:corrupt_string) { "\xC3\x28\xA0\xA1\xE2\x28\xA1\xE2\x82\x28\xF0\x28\x8C\xBC" }

  # Anonymous stand-in for an HTTP response whose body never ends: it keeps
  # yielding the same chunk until the reader breaks out. Built with Class.new
  # so no constant leaks into the global namespace from this example group.
  let(:fake_response_class) do
    Class.new do
      def initialize(chunk)
        @chunk = chunk
      end

      def read_body
        loop { yield @chunk }
      end
    end
  end

  describe ".read_response_body" do
    it "never returns a corrupt string" do
      response = fake_response_class.new(corrupt_string)
      result = tool_class.read_response_body(response, max_length: 100.bytes)

      expect(result.encoding).to eq(Encoding::UTF_8)
      expect(result.valid_encoding?).to eq(true)

      # scrubbing removes 7 chars
      expect(result.length).to eq(93)
    end

    it "returns correctly truncated strings" do
      response = fake_response_class.new("abc")
      result = tool_class.read_response_body(response, max_length: 10.bytes)

      expect(result.encoding).to eq(Encoding::UTF_8)
      expect(result.valid_encoding?).to eq(true)

      expect(result).to eq("abcabcabca")
    end
  end
end

0 commit comments

Comments
 (0)