diff --git a/src/rewritable_units/mod.rs b/src/rewritable_units/mod.rs index f7687a3d..c9b5e2ea 100644 --- a/src/rewritable_units/mod.rs +++ b/src/rewritable_units/mod.rs @@ -143,7 +143,9 @@ mod test_utils { |c: &[u8]| output.push(c), ); - rewriter.write(html).unwrap(); + for ch in html.chunks(15) { + rewriter.write(ch).unwrap(); + } rewriter.end().unwrap(); } diff --git a/src/rewritable_units/text_decoder.rs b/src/rewritable_units/text_decoder.rs index 5c2f159e..4a9b1470 100644 --- a/src/rewritable_units/text_decoder.rs +++ b/src/rewritable_units/text_decoder.rs @@ -2,6 +2,8 @@ use crate::base::SharedEncoding; use crate::rewriter::RewritingError; use encoding_rs::{CoderResult, Decoder, Encoding, UTF_8}; +const DEFAULT_BUFFER_LEN: usize = if cfg!(test) { 13 } else { 1024 }; + pub(crate) struct TextDecoder { encoding: SharedEncoding, pending_text_streaming_decoder: Option, @@ -15,8 +17,9 @@ impl TextDecoder { Self { encoding, pending_text_streaming_decoder: None, - // TODO make adjustable - text_buffer: String::from_utf8(vec![0u8; 1024]).unwrap(), + // this will be later initialized to DEFAULT_BUFFER_LEN, + // because encoding_rs wants a slice + text_buffer: String::new(), } } @@ -52,6 +55,10 @@ impl TextDecoder { } } + if self.pending_text_streaming_decoder.is_none() && self.text_buffer.is_empty() { + // repeat() avoids utf8 check comapred to `String::from_utf8(vec![0; len])` + self.text_buffer = "\0".repeat(DEFAULT_BUFFER_LEN); + } let decoder = self .pending_text_streaming_decoder .get_or_insert_with(|| encoding.new_decoder_without_bom_handling()); @@ -113,7 +120,7 @@ impl TextDecoder { // The slow path buffers 1KB, and even though this shouldn't matter, // it is an observable behavior, and it makes bugs worse for text handlers // that assume they'll get only a single chunk. - if valid_up_to != raw_input.len() && valid_up_to < self.text_buffer.len() { + if valid_up_to != raw_input.len() && valid_up_to < DEFAULT_BUFFER_LEN { return None; } diff --git a/src/rewritable_units/tokens/text_chunk.rs b/src/rewritable_units/tokens/text_chunk.rs index bf721f7a..2799bf26 100644 --- a/src/rewritable_units/tokens/text_chunk.rs +++ b/src/rewritable_units/tokens/text_chunk.rs @@ -412,16 +412,6 @@ mod tests { }; } - macro_rules! skip_eof_chunk { - ($c:ident) => { - if $c.last_in_text_node() { - // This is not always true — a replacement char for an incomplete UTF-8 sequence could be flushed last - assert!($c.as_str().is_empty()); - return; - } - }; - } - #[test] fn parsed() { test!(|_| {}, HTML); @@ -429,15 +419,20 @@ mod tests { #[test] fn with_prepends_and_appends() { + let mut first = true; test!( |c| { - skip_eof_chunk!(c); - c.before("", ContentType::Text); - c.before("
Hey
", ContentType::Html); - c.before("", ContentType::Html); - c.after("", ContentType::Html); - c.after("", ContentType::Html); - c.after("", ContentType::Text); + let is_first = std::mem::replace(&mut first, c.last_in_text_node()); + if is_first { + c.before("", ContentType::Text); + c.before("
Hey
", ContentType::Html); + c.before("", ContentType::Html); + } + if c.last_in_text_node() { + c.after("", ContentType::Html); + c.after("", ContentType::Html); + c.after("", ContentType::Text); + } }, concat!( "<span>
Hey
", @@ -452,17 +447,22 @@ mod tests { #[test] fn removed() { + let mut first = true; test!( |c| { - skip_eof_chunk!(c); + let is_first = std::mem::replace(&mut first, c.last_in_text_node()); assert!(!c.removed()); c.remove(); assert!(c.removed()); - c.before("", ContentType::Html); - c.after("", ContentType::Html); + if is_first { + c.before("", ContentType::Html); + } + if c.last_in_text_node() { + c.after("", ContentType::Html); + } }, "" ); @@ -472,17 +472,20 @@ mod tests { fn replaced_with_text() { test!( |c| { - skip_eof_chunk!(c); - c.before("", ContentType::Html); - c.after("", ContentType::Html); + if c.last_in_text_node() { + c.before("", ContentType::Html); + c.after("", ContentType::Html); - assert!(!c.removed()); + assert!(!c.removed()); - c.replace("
", ContentType::Html); - c.replace("", ContentType::Html); - c.replace("", ContentType::Text); + c.replace("
", ContentType::Html); + c.replace("", ContentType::Html); + c.replace("", ContentType::Text); - assert!(c.removed()); + assert!(c.removed()); + } else { + c.remove(); + } }, "<foo & bar>" ); @@ -492,17 +495,20 @@ mod tests { fn replaced_with_html() { test!( |c| { - skip_eof_chunk!(c); - c.before("", ContentType::Html); - c.after("", ContentType::Html); + if c.last_in_text_node() { + c.before("", ContentType::Html); + c.after("", ContentType::Html); - assert!(!c.removed()); + assert!(!c.removed()); - c.replace("
", ContentType::Html); - c.replace("", ContentType::Html); - c.replace("", ContentType::Html); + c.replace("
", ContentType::Html); + c.replace("", ContentType::Html); + c.replace("", ContentType::Html); - assert!(c.removed()); + assert!(c.removed()); + } else { + c.remove(); + } }, "" );