Skip to content

Commit 38c7831

Browse files
committed
Port convert_UTF8_to_JSON from C
Also includes updated logic for generate (generate_json_string), based on the current C code. Original code by @byroot. See ruby#620.
1 parent 4d37e9f commit 38c7831

File tree

1 file changed

+95
-11
lines changed

1 file changed

+95
-11
lines changed

java/src/json/ext/StringEncoder.java

Lines changed: 95 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
import java.io.IOException;
2020
import java.io.OutputStream;
21+
import java.nio.charset.StandardCharsets;
2122

2223
/**
2324
* An encoder that reads from the given source and outputs its representation
@@ -46,6 +47,15 @@ final class StringEncoder extends ByteListTranscoder {
4647
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
4748
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4849
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
50+
51+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
52+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
53+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
54+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
55+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
56+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
57+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
58+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4959
};
5060

5161
static final byte[] ASCII_ONLY_ESCAPE_TABLE = {
@@ -97,6 +107,8 @@ final class StringEncoder extends ByteListTranscoder {
97107
//First byte of a 4+ byte code point
98108
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
99109
};
110+
// Pre-encoded ASCII bytes of the six-character escape sequence for U+2028; encode() appends
// these in place of the source bytes when it matches that code point.
private static final byte[] BACKSLASH_U2028 = "\\u2028".getBytes(StandardCharsets.US_ASCII);
111+
// Pre-encoded ASCII bytes of the six-character escape sequence for U+2029; encode() appends
// these in place of the source bytes when it matches that code point.
private static final byte[] BACKSLASH_U2029 = "\\u2029".getBytes(StandardCharsets.US_ASCII);
100112

101113
private final boolean asciiOnly, scriptSafe;
102114

@@ -143,10 +155,12 @@ void generate(ThreadContext context, RubyString object, OutputStream buffer) thr
143155
append('"');
144156
switch (object.scanForCodeRange()) {
145157
case StringSupport.CR_7BIT:
146-
encodeASCII(context, byteList, buffer);
147-
break;
148158
case StringSupport.CR_VALID:
149-
encode(context, byteList, buffer);
159+
if (asciiOnly) {
160+
encodeASCII(byteList, scriptSafe ? SCRIPT_SAFE_ESCAPE_TABLE : ASCII_ONLY_ESCAPE_TABLE);
161+
} else {
162+
encode(byteList, scriptSafe ? SCRIPT_SAFE_ESCAPE_TABLE : ESCAPE_TABLE);
163+
}
150164
break;
151165
default:
152166
throw Utils.buildGeneratorError(context, object, "source sequence is illegal/malformed utf-8").toThrowable();
@@ -178,15 +192,85 @@ static RubyString ensureValidEncoding(ThreadContext context, RubyString str) {
178192
return str;
179193
}
180194

181-
void encode(ThreadContext context, ByteList src, OutputStream out) throws IOException {
182-
while (hasNext()) {
183-
handleChar(readUtf8Char(context));
195+
// C: convert_UTF8_to_JSON
196+
void encode(ByteList src, byte[] escape_table) throws IOException {
197+
byte[] hexdig = HEX;
198+
byte[] scratch = aux;
199+
200+
byte[] ptrBytes = src.unsafeBytes();
201+
int ptr = src.begin();
202+
int len = src.realSize();
203+
204+
int beg = 0;
205+
int pos = 0;
206+
207+
while (pos < len) {
208+
int ch = Byte.toUnsignedInt(ptrBytes[ptr + pos]);
209+
int ch_len = escape_table[ch];
210+
/* JSON encoding */
211+
212+
if (ch_len > 0) {
213+
switch (ch_len) {
214+
case 9: {
215+
beg = pos = flushPos(pos, beg, ptrBytes, ptr, 1);
216+
switch (ch) {
217+
case '"': appendEscape(BACKSLASH_DOUBLEQUOTE); break;
218+
case '\\': appendEscape(BACKSLASH_BACKSLASH); break;
219+
case '/': appendEscape(BACKSLASH_FORWARDSLASH); break;
220+
case '\b': appendEscape(BACKSLASH_B); break;
221+
case '\f': appendEscape(BACKSLASH_F); break;
222+
case '\n': appendEscape(BACKSLASH_N); break;
223+
case '\r': appendEscape(BACKSLASH_R); break;
224+
case '\t': appendEscape(BACKSLASH_T); break;
225+
default: {
226+
scratch[2] = '0';
227+
scratch[3] = '0';
228+
scratch[4] = hexdig[(ch >> 4) & 0xf];
229+
scratch[5] = hexdig[ch & 0xf];
230+
append(scratch, 0, 6);
231+
break;
232+
}
233+
}
234+
break;
235+
}
236+
case 11: {
237+
int b2 = Byte.toUnsignedInt(ptrBytes[ptr + pos + 1]);
238+
if (b2 == 0x80) {
239+
int b3 = Byte.toUnsignedInt(ptrBytes[ptr + pos + 2]);
240+
if (b3 == 0xA8) {
241+
beg = pos = flushPos(pos, beg, ptrBytes, ptr, 3);
242+
append(BACKSLASH_U2028, 0, 6);
243+
break;
244+
} else if (b3 == 0xA9) {
245+
beg = pos = flushPos(pos, beg, ptrBytes, ptr, 3);
246+
append(BACKSLASH_U2029, 0, 6);
247+
break;
248+
}
249+
}
250+
ch_len = 3;
251+
// fallthrough
252+
}
253+
default:
254+
pos += ch_len;
255+
break;
256+
}
257+
} else {
258+
pos++;
259+
}
184260
}
261+
262+
if (beg < len) {
263+
append(ptrBytes, ptr + beg, len - beg);
264+
}
265+
}
266+
267+
private int flushPos(int pos, int beg, byte[] ptrBytes, int ptr, int size) throws IOException {
268+
if (pos > beg) { append(ptrBytes, ptr + beg, pos - beg); }
269+
return pos + size;
185270
}
186271

187272
// C: convert_UTF8_to_ASCII_only_JSON
188-
void encodeASCII(ThreadContext context, ByteList src, OutputStream out) throws IOException {
189-
byte[] escape_table = scriptSafe ? SCRIPT_SAFE_ESCAPE_TABLE : ASCII_ONLY_ESCAPE_TABLE;
273+
void encodeASCII(ByteList src, byte[] escape_table) throws IOException {
190274
byte[] hexdig = HEX;
191275
byte[] scratch = aux;
192276

@@ -198,13 +282,13 @@ void encodeASCII(ThreadContext context, ByteList src, OutputStream out) throws I
198282
int pos = 0;
199283

200284
while (pos < len) {
201-
byte ch = ptrBytes[ptr + pos];
285+
int ch = Byte.toUnsignedInt(ptrBytes[ptr + pos]);
202286
int ch_len = escape_table[ch];
203287

204288
if (ch_len != 0) {
205289
switch (ch_len) {
206290
case 9: {
207-
if (pos > beg) { append(ptrBytes, ptr + beg, pos - beg); } pos += 1; beg = pos; // FLUSH_POS
291+
beg = pos = flushPos(pos, beg, ptrBytes, ptr, 1);
208292
switch (ch) {
209293
case '"': appendEscape(BACKSLASH_DOUBLEQUOTE); break;
210294
case '\\': appendEscape(BACKSLASH_BACKSLASH); break;
@@ -245,7 +329,7 @@ void encodeASCII(ThreadContext context, ByteList src, OutputStream out) throws I
245329
wchar = (wchar << 6) | (ptrBytes[ptr + pos +i] & 0x3F);
246330
}
247331

248-
if (pos > beg) { append(ptrBytes, ptr + beg, pos - beg); } pos += ch_len; beg = pos; // FLUSH_POS
332+
beg = pos = flushPos(pos, beg, ptrBytes, ptr, ch_len);
249333

250334
if (wchar <= 0xFFFF) {
251335
scratch[2] = hexdig[wchar >> 12];

0 commit comments

Comments
 (0)