From 2aefa41d51efff154f8bbd24ba6cfa35521cea87 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Thu, 17 Oct 2024 15:51:35 +0200 Subject: [PATCH 1/2] Add a fast path for ASCII strings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This optimization is based on a few assumptions: - Most strings are ASCII only. - Most strings had their coderange scanned already. If the above is true, then by checking the string coderange, we can use a much more streamlined function to encode ASCII strings. Before: ``` == Encoding twitter.json (466906 bytes) ruby 3.4.0preview2 (2024-10-07 master 32c733f57b) +YJIT +PRISM [arm64-darwin23] Warming up -------------------------------------- json 140.000 i/100ms oj 230.000 i/100ms rapidjson 108.000 i/100ms Calculating ------------------------------------- json 1.464k (± 1.4%) i/s (682.83 μs/i) - 7.420k in 5.067573s oj 2.338k (± 1.5%) i/s (427.64 μs/i) - 11.730k in 5.017336s rapidjson 1.075k (± 1.6%) i/s (930.40 μs/i) - 5.400k in 5.025469s Comparison: json: 1464.5 i/s oj: 2338.4 i/s - 1.60x faster rapidjson: 1074.8 i/s - 1.36x slower ``` After: ``` == Encoding twitter.json (466906 bytes) ruby 3.4.0preview2 (2024-10-07 master 32c733f57b) +YJIT +PRISM [arm64-darwin23] Warming up -------------------------------------- json 189.000 i/100ms oj 228.000 i/100ms rapidjson 108.000 i/100ms Calculating ------------------------------------- json 1.903k (± 1.2%) i/s (525.55 μs/i) - 9.639k in 5.066521s oj 2.306k (± 1.3%) i/s (433.71 μs/i) - 11.628k in 5.044096s rapidjson 1.069k (± 2.4%) i/s (935.38 μs/i) - 5.400k in 5.053794s Comparison: json: 1902.8 i/s oj: 2305.7 i/s - 1.21x faster rapidjson: 1069.1 i/s - 1.78x slower ``` --- ext/json/ext/generator/generator.c | 215 +++++++++++++++++++++++++---- ext/json/ext/generator/generator.h | 1 - 2 files changed, 185 insertions(+), 31 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 0af1592a7..b9946b3cf 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -25,14 +25,13 @@ static ID i_to_s, i_to_json, i_new, i_pack, i_unpack, i_create_id, i_extend; * Everything else (should be UTF-8) is just passed through and * appended to the result. */ -static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ascii_only, bool out_script_safe) +static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_script_safe) { const char *hexdig = "0123456789abcdef"; char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' }; const char *in_utf8_str = RSTRING_PTR(in_string); unsigned long in_utf8_len = RSTRING_LEN(in_string); - bool in_is_ascii_only = rb_enc_str_asciionly_p(in_string); unsigned long beg = 0, pos; @@ -42,46 +41,183 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ bool should_escape; /* UTF-8 decoding */ - if (in_is_ascii_only) { - ch = in_utf8_str[pos]; - ch_len = 1; - } else { - short i; - if ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos]; } /* leading 1 bit is 0b0 */ - else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110 */ - else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110 */ - else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */ - else - rb_raise(rb_path2class("JSON::GeneratorError"), - "source sequence is illegal/malformed utf-8"); - if ((pos+ch_len) > in_utf8_len) - rb_raise(rb_path2class("JSON::GeneratorError"), - "partial character in source, but hit end"); - for (i = 1; i < ch_len; i++) { - if ((in_utf8_str[pos+i] & 0xC0) != 0x80) /* leading 2 bits should be 0b10 */ - rb_raise(rb_path2class("JSON::GeneratorError"), - "source sequence is illegal/malformed utf-8"); - ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F); + short i; + if ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos]; } /* leading 1 bit is 0b0 */ + else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110 */ + else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110 */ + else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */ + else { + rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8"); + } + + for (i = 1; i < ch_len; i++) { + ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F); + } + + /* JSON policy */ + should_escape = + (ch < 0x20) || + (ch == '"') || + (ch == '\\') || + (out_script_safe && (ch == '/')) || + (out_script_safe && (ch == 0x2028)) || + (out_script_safe && (ch == 0x2029)); + + /* JSON encoding */ + if (should_escape) { + if (pos > beg) { + fbuffer_append(out_buffer, &in_utf8_str[beg], pos - beg); + } + + beg = pos + ch_len; + switch (ch) { + case '"': fbuffer_append(out_buffer, "\\\"", 2); break; + case '\\': fbuffer_append(out_buffer, "\\\\", 2); break; + case '/': fbuffer_append(out_buffer, "\\/", 2); break; + case '\b': fbuffer_append(out_buffer, "\\b", 2); break; + case '\f': fbuffer_append(out_buffer, "\\f", 2); break; + case '\n': fbuffer_append(out_buffer, "\\n", 2); break; + case '\r': fbuffer_append(out_buffer, "\\r", 2); break; + case '\t': fbuffer_append(out_buffer, "\\t", 2); break; + default: + if (ch <= 0xFFFF) { + scratch[2] = hexdig[ch >> 12]; + scratch[3] = hexdig[(ch >> 8) & 0xf]; + scratch[4] = hexdig[(ch >> 4) & 0xf]; + scratch[5] = hexdig[ch & 0xf]; + fbuffer_append(out_buffer, scratch, 6); + } else { + uint16_t hi, lo; + ch -= 0x10000; + hi = 0xD800 + (uint16_t)(ch >> 10); + lo = 0xDC00 + (uint16_t)(ch & 0x3FF); + + scratch[2] = hexdig[hi >> 12]; + scratch[3] = hexdig[(hi >> 8) & 0xf]; + scratch[4] = hexdig[(hi >> 4) & 0xf]; + scratch[5] = hexdig[hi & 0xf]; + + scratch[8] = hexdig[lo >> 12]; + scratch[9] = hexdig[(lo >> 8) & 0xf]; + scratch[10] = hexdig[(lo >> 4) & 0xf]; + scratch[11] = hexdig[lo & 0xf]; + + fbuffer_append(out_buffer, scratch, 12); + } } - if (ch > 0x10FFFF) - rb_raise(rb_path2class("JSON::GeneratorError"), - "source sequence is illegal/malformed utf-8"); } + pos += ch_len; + } + + if (beg < in_utf8_len) { + fbuffer_append(out_buffer, &in_utf8_str[beg], in_utf8_len - beg); + } + + RB_GC_GUARD(in_string); +} + +static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, bool out_script_safe) +{ + const char *hexdig = "0123456789abcdef"; + char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' }; + + const char *ptr = RSTRING_PTR(str); + unsigned long len = RSTRING_LEN(str); + + unsigned long beg = 0, pos; + + for (pos = 0; pos < len;) { + unsigned char ch = ptr[pos]; + bool should_escape; + /* JSON policy */ should_escape = (ch < 0x20) || (ch == '"') || (ch == '\\') || - (out_ascii_only && (ch > 0x7F)) || + (out_script_safe && (ch == '/')); + + /* JSON encoding */ + if (should_escape) { + if (pos > beg) { + fbuffer_append(out_buffer, &ptr[beg], pos - beg); + } + + beg = pos + 1; + switch (ch) { + case '"': fbuffer_append(out_buffer, "\\\"", 2); break; + case '\\': fbuffer_append(out_buffer, "\\\\", 2); break; + case '/': fbuffer_append(out_buffer, "\\/", 2); break; + case '\b': fbuffer_append(out_buffer, "\\b", 2); break; + case '\f': fbuffer_append(out_buffer, "\\f", 2); break; + case '\n': fbuffer_append(out_buffer, "\\n", 2); break; + case '\r': fbuffer_append(out_buffer, "\\r", 2); break; + case '\t': fbuffer_append(out_buffer, "\\t", 2); break; + default: + scratch[2] = hexdig[ch >> 12]; + scratch[3] = hexdig[(ch >> 8) & 0xf]; + scratch[4] = hexdig[(ch >> 4) & 0xf]; + scratch[5] = hexdig[ch & 0xf]; + fbuffer_append(out_buffer, scratch, 6); + } + } + + pos++; + } + + if (beg < len) { + fbuffer_append(out_buffer, &ptr[beg], len - beg); + } + + RB_GC_GUARD(str); +} + +static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE in_string, bool out_script_safe) +{ + const char *hexdig = "0123456789abcdef"; + char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' }; + + const char *in_utf8_str = RSTRING_PTR(in_string); + unsigned long in_utf8_len = RSTRING_LEN(in_string); + + unsigned long beg = 0, pos; + + for (pos = 0; pos < in_utf8_len;) { + uint32_t ch; + short ch_len; + bool should_escape; + + /* UTF-8 decoding */ + short i; + if ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos]; } /* leading 1 bit is 0b0 */ + else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110 */ + else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110 */ + else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */ + else { + rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8"); + } + + for (i = 1; i < ch_len; i++) { + ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F); + } + + /* JSON policy */ + should_escape = + (ch < 0x20) || + (ch == '"') || + (ch == '\\') || + (ch > 0x7F) || (out_script_safe && (ch == '/')) || (out_script_safe && (ch == 0x2028)) || (out_script_safe && (ch == 0x2029)); /* JSON encoding */ if (should_escape) { - if (pos > beg) + if (pos > beg) { fbuffer_append(out_buffer, &in_utf8_str[beg], pos - beg); + } + beg = pos + ch_len; switch (ch) { case '"': fbuffer_append(out_buffer, "\\\"", 2); break; @@ -122,8 +258,11 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ pos += ch_len; } - if (beg < in_utf8_len) + + if (beg < in_utf8_len) { fbuffer_append(out_buffer, &in_utf8_str[beg], in_utf8_len - beg); + } + RB_GC_GUARD(in_string); } @@ -570,11 +709,27 @@ static int enc_utf8_compatible_p(int enc_idx) static void generate_json_string(FBuffer *buffer, VALUE Vstate, JSON_Generator_State *state, VALUE obj) { - fbuffer_append_char(buffer, '"'); if (!enc_utf8_compatible_p(RB_ENCODING_GET(obj))) { obj = rb_str_export_to_enc(obj, rb_utf8_encoding()); } - convert_UTF8_to_JSON(buffer, obj, state->ascii_only, state->script_safe); + + fbuffer_append_char(buffer, '"'); + + switch(rb_enc_str_coderange(obj)) { + case ENC_CODERANGE_7BIT: + convert_ASCII_to_JSON(buffer, obj, state->script_safe); + break; + case ENC_CODERANGE_VALID: + if (RB_UNLIKELY(state->ascii_only)) { + convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe); + } else { + convert_UTF8_to_JSON(buffer, obj, state->script_safe); + } + break; + default: + rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8"); + break; + } fbuffer_append_char(buffer, '"'); } diff --git a/ext/json/ext/generator/generator.h b/ext/json/ext/generator/generator.h index 09bf68079..0553277fa 100644 --- a/ext/json/ext/generator/generator.h +++ b/ext/json/ext/generator/generator.h @@ -23,7 +23,6 @@ typedef unsigned char _Bool; #endif #endif -static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ascii_only, bool out_script_safe); static char *fstrndup(const char *ptr, unsigned long len); /* ruby api and some helpers */ From e2a2a33b2d2e82600bcfa1e3461026aa01072900 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Thu, 17 Oct 2024 17:04:08 +0200 Subject: [PATCH 2/2] Use a table to check for escaping needs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This performs noticeably better than the boolean logic. Before: ``` == Encoding twitter.json (466906 bytes) ruby 3.4.0preview2 (2024-10-07 master 32c733f57b) +YJIT +PRISM [arm64-darwin23] Warming up -------------------------------------- json 189.000 i/100ms oj 228.000 i/100ms rapidjson 108.000 i/100ms Calculating ------------------------------------- json 1.903k (± 1.2%) i/s (525.55 μs/i) - 9.639k in 5.066521s oj 2.306k (± 1.3%) i/s (433.71 μs/i) - 11.628k in 5.044096s rapidjson 1.069k (± 2.4%) i/s (935.38 μs/i) - 5.400k in 5.053794s Comparison: json: 1902.8 i/s oj: 2305.7 i/s - 1.21x faster rapidjson: 1069.1 i/s - 1.78x slower ``` After: ``` == Encoding twitter.json (466906 bytes) ruby 3.4.0preview2 (2024-10-07 master 32c733f57b) +YJIT +PRISM [arm64-darwin23] Warming up -------------------------------------- json 224.000 i/100ms oj 230.000 i/100ms rapidjson 107.000 i/100ms Calculating ------------------------------------- json 2.254k (± 1.6%) i/s (443.69 μs/i) - 11.424k in 5.069999s oj 2.318k (± 1.4%) i/s (431.32 μs/i) - 11.730k in 5.060421s rapidjson 1.081k (± 1.9%) i/s (925.05 μs/i) - 5.457k in 5.049738s Comparison: json: 2253.8 i/s oj: 2318.5 i/s - same-ish: difference falls within error rapidjson: 1081.0 i/s - 2.08x slower ``` The escape table is taken directly from Mame's PR. Co-Authored-By: Yusuke Endoh --- ext/json/ext/generator/generator.c | 37 ++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index b9946b3cf..53c4223f0 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -117,7 +117,29 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ RB_GC_GUARD(in_string); } -static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, bool out_script_safe) +static const bool escape_table[256] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* '"' */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, /* '\\' */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +}; + +static const bool script_safe_escape_table[256] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* '"' and '/' */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, /* '\\' */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +}; + +static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, const bool escape_table[256]) { const char *hexdig = "0123456789abcdef"; char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' }; @@ -129,17 +151,8 @@ static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, bool out_scrip for (pos = 0; pos < len;) { unsigned char ch = ptr[pos]; - bool should_escape; - - /* JSON policy */ - should_escape = - (ch < 0x20) || - (ch == '"') || - (ch == '\\') || - (out_script_safe && (ch == '/')); - /* JSON encoding */ - if (should_escape) { + if (escape_table[ch]) { if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } @@ -717,7 +730,7 @@ static void generate_json_string(FBuffer *buffer, VALUE Vstate, JSON_Generator_S switch(rb_enc_str_coderange(obj)) { case ENC_CODERANGE_7BIT: - convert_ASCII_to_JSON(buffer, obj, state->script_safe); + convert_ASCII_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table); break; case ENC_CODERANGE_VALID: if (RB_UNLIKELY(state->ascii_only)) {