Skip to content

Commit 7106b03

Browse files
authored
Merge pull request #620 from casperisfine/optimize-ascii-strings
Add a fast path for ASCII strings
2 parents 92a2d54 + e2a2a33 commit 7106b03

File tree

2 files changed

+198
-31
lines changed

2 files changed

+198
-31
lines changed

ext/json/ext/generator/generator.c

Lines changed: 198 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,13 @@ static ID i_to_s, i_to_json, i_new, i_pack, i_unpack, i_create_id, i_extend;
2525
* Everything else (should be UTF-8) is just passed through and
2626
* appended to the result.
2727
*/
28-
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ascii_only, bool out_script_safe)
28+
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_script_safe)
2929
{
3030
const char *hexdig = "0123456789abcdef";
3131
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
3232

3333
const char *in_utf8_str = RSTRING_PTR(in_string);
3434
unsigned long in_utf8_len = RSTRING_LEN(in_string);
35-
bool in_is_ascii_only = rb_enc_str_asciionly_p(in_string);
3635

3736
unsigned long beg = 0, pos;
3837

@@ -42,46 +41,196 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_
4241
bool should_escape;
4342

4443
/* UTF-8 decoding */
45-
if (in_is_ascii_only) {
46-
ch = in_utf8_str[pos];
47-
ch_len = 1;
48-
} else {
49-
short i;
50-
if ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos]; } /* leading 1 bit is 0b0 */
51-
else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110 */
52-
else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110 */
53-
else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */
54-
else
55-
rb_raise(rb_path2class("JSON::GeneratorError"),
56-
"source sequence is illegal/malformed utf-8");
57-
if ((pos+ch_len) > in_utf8_len)
58-
rb_raise(rb_path2class("JSON::GeneratorError"),
59-
"partial character in source, but hit end");
60-
for (i = 1; i < ch_len; i++) {
61-
if ((in_utf8_str[pos+i] & 0xC0) != 0x80) /* leading 2 bits should be 0b10 */
62-
rb_raise(rb_path2class("JSON::GeneratorError"),
63-
"source sequence is illegal/malformed utf-8");
64-
ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F);
44+
short i;
45+
if ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos]; } /* leading 1 bit is 0b0 */
46+
else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110 */
47+
else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110 */
48+
else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */
49+
else {
50+
rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8");
51+
}
52+
53+
for (i = 1; i < ch_len; i++) {
54+
ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F);
55+
}
56+
57+
/* JSON policy */
58+
should_escape =
59+
(ch < 0x20) ||
60+
(ch == '"') ||
61+
(ch == '\\') ||
62+
(out_script_safe && (ch == '/')) ||
63+
(out_script_safe && (ch == 0x2028)) ||
64+
(out_script_safe && (ch == 0x2029));
65+
66+
/* JSON encoding */
67+
if (should_escape) {
68+
if (pos > beg) {
69+
fbuffer_append(out_buffer, &in_utf8_str[beg], pos - beg);
70+
}
71+
72+
beg = pos + ch_len;
73+
switch (ch) {
74+
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
75+
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
76+
case '/': fbuffer_append(out_buffer, "\\/", 2); break;
77+
case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
78+
case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
79+
case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
80+
case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
81+
case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
82+
default:
83+
if (ch <= 0xFFFF) {
84+
scratch[2] = hexdig[ch >> 12];
85+
scratch[3] = hexdig[(ch >> 8) & 0xf];
86+
scratch[4] = hexdig[(ch >> 4) & 0xf];
87+
scratch[5] = hexdig[ch & 0xf];
88+
fbuffer_append(out_buffer, scratch, 6);
89+
} else {
90+
uint16_t hi, lo;
91+
ch -= 0x10000;
92+
hi = 0xD800 + (uint16_t)(ch >> 10);
93+
lo = 0xDC00 + (uint16_t)(ch & 0x3FF);
94+
95+
scratch[2] = hexdig[hi >> 12];
96+
scratch[3] = hexdig[(hi >> 8) & 0xf];
97+
scratch[4] = hexdig[(hi >> 4) & 0xf];
98+
scratch[5] = hexdig[hi & 0xf];
99+
100+
scratch[8] = hexdig[lo >> 12];
101+
scratch[9] = hexdig[(lo >> 8) & 0xf];
102+
scratch[10] = hexdig[(lo >> 4) & 0xf];
103+
scratch[11] = hexdig[lo & 0xf];
104+
105+
fbuffer_append(out_buffer, scratch, 12);
106+
}
107+
}
108+
}
109+
110+
pos += ch_len;
111+
}
112+
113+
if (beg < in_utf8_len) {
114+
fbuffer_append(out_buffer, &in_utf8_str[beg], in_utf8_len - beg);
115+
}
116+
117+
RB_GC_GUARD(in_string);
118+
}
119+
120+
/*
 * Per-byte escape lookup table, indexed by an unsigned byte value.
 * A non-zero entry means the byte must be written as a JSON escape.
 * Flagged entries: 0x00-0x1F, '"' (0x22), '\\' (0x5C) and every value
 * from 0x80 to 0xFF. All other values in 0x20-0x7F (including '/') are 0.
 * generate_json_string passes this table to convert_ASCII_to_JSON when
 * state->script_safe is false.
 */
static const bool escape_table[256] = {
121+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
122+
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* '"' */
123+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, /* '\\' */
124+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
125+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
126+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
127+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
128+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
129+
};
130+
131+
/*
 * Script-safe variant of the per-byte escape lookup table, indexed by an
 * unsigned byte value. A non-zero entry means the byte must be escaped.
 * Flagged entries: 0x00-0x1F, '"' (0x22), '/' (0x2F), '\\' (0x5C) and
 * every value from 0x80 to 0xFF; the only difference from escape_table
 * is the additional '/' entry.
 * generate_json_string passes this table to convert_ASCII_to_JSON when
 * state->script_safe is true.
 */
static const bool script_safe_escape_table[256] = {
132+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
133+
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* '"' and '/' */
134+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, /* '\\' */
135+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
136+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
137+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
138+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
139+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
140+
};
141+
142+
/*
 * Appends the bytes of str to out_buffer, replacing every byte that is
 * flagged in escape_table with a JSON escape sequence.
 *
 * out_buffer   - destination buffer, written through fbuffer_append.
 * str          - source string; its bytes are read via RSTRING_PTR and
 *                RSTRING_LEN. The caller in this file only passes strings
 *                whose code range is ENC_CODERANGE_7BIT.
 * escape_table - 256-entry lookup; a non-zero entry at index b means byte
 *                b must be escaped.
 *
 * Runs of unflagged bytes are copied in bulk rather than byte by byte.
 */
static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, const bool escape_table[256])
143+
{
144+
const char *hexdig = "0123456789abcdef";
145+
/* Pre-filled with two "\u" prefixes; only the first six bytes are appended by this function. */
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
146+
147+
const char *ptr = RSTRING_PTR(str);
148+
unsigned long len = RSTRING_LEN(str);
149+
150+
/* beg is the start of the pending run of bytes not yet copied to out_buffer. */
unsigned long beg = 0, pos;
151+
152+
for (pos = 0; pos < len;) {
153+
/* Read as unsigned char so the value is a valid index into the 256-entry table. */
unsigned char ch = ptr[pos];
154+
/* JSON encoding */
155+
if (escape_table[ch]) {
156+
/* Flush the unescaped run that precedes this byte. */
if (pos > beg) {
157+
fbuffer_append(out_buffer, &ptr[beg], pos - beg);
158+
}
159+
160+
/* The next pending run starts right after the byte being escaped. */
beg = pos + 1;
161+
switch (ch) {
162+
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
163+
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
164+
case '/': fbuffer_append(out_buffer, "\\/", 2); break;
165+
case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
166+
case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
167+
case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
168+
case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
169+
case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
170+
default:
171+
/*
 * Any other flagged byte is written as a six-character 'u' escape with
 * four hex digits. ch is an unsigned char, so with 8-bit chars the two
 * shifts by 12 and by 8 below always yield 0 and the first two digits
 * are always '0'.
 */
scratch[2] = hexdig[ch >> 12];
172+
scratch[3] = hexdig[(ch >> 8) & 0xf];
173+
scratch[4] = hexdig[(ch >> 4) & 0xf];
174+
scratch[5] = hexdig[ch & 0xf];
175+
fbuffer_append(out_buffer, scratch, 6);
65176
}
66-
if (ch > 0x10FFFF)
67-
rb_raise(rb_path2class("JSON::GeneratorError"),
68-
"source sequence is illegal/malformed utf-8");
177+
}
178+
179+
pos++;
180+
}
181+
182+
/* Flush whatever unescaped bytes remain after the last escape. */
if (beg < len) {
183+
fbuffer_append(out_buffer, &ptr[beg], len - beg);
184+
}
185+
186+
/* NOTE(review): presumably keeps str alive while ptr is still in use - confirm against the Ruby C API docs. */
RB_GC_GUARD(str);
187+
}
188+
189+
static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE in_string, bool out_script_safe)
190+
{
191+
const char *hexdig = "0123456789abcdef";
192+
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
193+
194+
const char *in_utf8_str = RSTRING_PTR(in_string);
195+
unsigned long in_utf8_len = RSTRING_LEN(in_string);
196+
197+
unsigned long beg = 0, pos;
198+
199+
for (pos = 0; pos < in_utf8_len;) {
200+
uint32_t ch;
201+
short ch_len;
202+
bool should_escape;
203+
204+
/* UTF-8 decoding */
205+
short i;
206+
if ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos]; } /* leading 1 bit is 0b0 */
207+
else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110 */
208+
else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110 */
209+
else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */
210+
else {
211+
rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8");
212+
}
213+
214+
for (i = 1; i < ch_len; i++) {
215+
ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F);
69216
}
70217

71218
/* JSON policy */
72219
should_escape =
73220
(ch < 0x20) ||
74221
(ch == '"') ||
75222
(ch == '\\') ||
76-
(out_ascii_only && (ch > 0x7F)) ||
223+
(ch > 0x7F) ||
77224
(out_script_safe && (ch == '/')) ||
78225
(out_script_safe && (ch == 0x2028)) ||
79226
(out_script_safe && (ch == 0x2029));
80227

81228
/* JSON encoding */
82229
if (should_escape) {
83-
if (pos > beg)
230+
if (pos > beg) {
84231
fbuffer_append(out_buffer, &in_utf8_str[beg], pos - beg);
232+
}
233+
85234
beg = pos + ch_len;
86235
switch (ch) {
87236
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
@@ -122,8 +271,11 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_
122271

123272
pos += ch_len;
124273
}
125-
if (beg < in_utf8_len)
274+
275+
if (beg < in_utf8_len) {
126276
fbuffer_append(out_buffer, &in_utf8_str[beg], in_utf8_len - beg);
277+
}
278+
127279
RB_GC_GUARD(in_string);
128280
}
129281

@@ -570,11 +722,27 @@ static int enc_utf8_compatible_p(int enc_idx)
570722

571723
/*
 * Appends obj to buffer as a JSON string literal surrounded by '"'.
 *
 * buffer - destination buffer.
 * Vstate - not referenced in the lines shown here.
 * state  - generator state; its ascii_only and script_safe flags select
 *          the conversion routine and the escape table.
 * obj    - the string to emit. If its encoding is not UTF-8 compatible
 *          (per enc_utf8_compatible_p) it is first replaced by the result
 *          of rb_str_export_to_enc(obj, rb_utf8_encoding()).
 *
 * Raises JSON::GeneratorError when the string's code range is neither
 * ENC_CODERANGE_7BIT nor ENC_CODERANGE_VALID.
 */
static void generate_json_string(FBuffer *buffer, VALUE Vstate, JSON_Generator_State *state, VALUE obj)
572724
{
573-
fbuffer_append_char(buffer, '"');
574725
if (!enc_utf8_compatible_p(RB_ENCODING_GET(obj))) {
575726
obj = rb_str_export_to_enc(obj, rb_utf8_encoding());
576727
}
577-
convert_UTF8_to_JSON(buffer, obj, state->ascii_only, state->script_safe);
728+
729+
/* Opening quote. */
fbuffer_append_char(buffer, '"');
730+
731+
/* Dispatch on the string's code range to pick the conversion routine. */
switch(rb_enc_str_coderange(obj)) {
732+
case ENC_CODERANGE_7BIT:
733+
/* Table-driven byte-wise path; the table depends on script_safe. */
convert_ASCII_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table);
734+
break;
735+
case ENC_CODERANGE_VALID:
736+
/* ascii_only is hinted as the unlikely case via RB_UNLIKELY. */
if (RB_UNLIKELY(state->ascii_only)) {
737+
convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe);
738+
} else {
739+
convert_UTF8_to_JSON(buffer, obj, state->script_safe);
740+
}
741+
break;
742+
default:
743+
/* Any other code range is rejected before a conversion routine is called. */
rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8");
744+
break;
745+
}
578746
/* Closing quote. */
fbuffer_append_char(buffer, '"');
579747
}
580748

ext/json/ext/generator/generator.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ typedef unsigned char _Bool;
2323
#endif
2424
#endif
2525

26-
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ascii_only, bool out_script_safe);
2726
static char *fstrndup(const char *ptr, unsigned long len);
2827

2928
/* ruby api and some helpers */

0 commit comments

Comments
 (0)