Skip to content

Further improve parsing errors #802

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 46 additions & 34 deletions ext/json/ext/parser/parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,7 @@ RBIMPL_ATTR_NORETURN()
#endif
static void raise_parse_error(const char *format, JSON_ParserState *state)
{
unsigned char buffer[PARSE_ERROR_FRAGMENT_LEN + 1];
unsigned char buffer[PARSE_ERROR_FRAGMENT_LEN + 3];

const char *cursor = state->cursor;
long column = 0;
Expand All @@ -412,22 +412,34 @@ static void raise_parse_error(const char *format, JSON_ParserState *state)
}
}

const char *ptr = state->cursor;
size_t len = ptr ? strnlen(ptr, PARSE_ERROR_FRAGMENT_LEN) : 0;
const char *ptr = "EOF";
if (state->cursor && state->cursor < state->end) {
ptr = state->cursor;
size_t len = 0;
while (len < PARSE_ERROR_FRAGMENT_LEN) {
char ch = ptr[len];
if (!ch || ch == '\n' || ch == ' ' || ch == '\t' || ch == '\r') {
break;
}
len++;
}

if (len == PARSE_ERROR_FRAGMENT_LEN) {
MEMCPY(buffer, ptr, char, PARSE_ERROR_FRAGMENT_LEN);
if (len) {
buffer[0] = '\'';
MEMCPY(buffer + 1, ptr, char, len);

while (buffer[len - 1] >= 0x80 && buffer[len - 1] < 0xC0) { // Is continuation byte
len--;
}
while (buffer[len] >= 0x80 && buffer[len] < 0xC0) { // Is continuation byte
len--;
}

if (buffer[len - 1] >= 0xC0) { // multibyte character start
len--;
}
if (buffer[len] >= 0xC0) { // multibyte character start
len--;
}

buffer[len] = '\0';
ptr = (const char *)buffer;
buffer[len + 1] = '\'';
buffer[len + 2] = '\0';
ptr = (const char *)buffer;
}
}

VALUE msg = rb_sprintf(format, ptr);
Expand Down Expand Up @@ -473,16 +485,16 @@ static uint32_t unescape_unicode(JSON_ParserState *state, const unsigned char *p
signed char b;
uint32_t result = 0;
b = digit_values[p[0]];
if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at '%s'", state, (char *)p - 2);
if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
result = (result << 4) | (unsigned char)b;
b = digit_values[p[1]];
if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at '%s'", state, (char *)p - 2);
if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
result = (result << 4) | (unsigned char)b;
b = digit_values[p[2]];
if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at '%s'", state, (char *)p - 2);
if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
result = (result << 4) | (unsigned char)b;
b = digit_values[p[3]];
if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at '%s'", state, (char *)p - 2);
if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
result = (result << 4) | (unsigned char)b;
return result;
}
Expand Down Expand Up @@ -532,11 +544,11 @@ json_eat_comments(JSON_ParserState *state)
break;
}
default:
raise_parse_error("unexpected token '%s'", state);
raise_parse_error("unexpected token %s", state);
break;
}
} else {
raise_parse_error("unexpected token '%s'", state);
raise_parse_error("unexpected token %s", state);
}
}

Expand Down Expand Up @@ -655,7 +667,7 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c
break;
case 'u':
if (pe > stringEnd - 5) {
raise_parse_error_at("incomplete unicode character escape sequence at '%s'", state, p);
raise_parse_error_at("incomplete unicode character escape sequence at %s", state, p);
} else {
uint32_t ch = unescape_unicode(state, (unsigned char *) ++pe);
pe += 3;
Expand All @@ -672,7 +684,7 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c
if ((ch & 0xFC00) == 0xD800) {
pe++;
if (pe > stringEnd - 6) {
raise_parse_error_at("incomplete surrogate pair at '%s'", state, p);
raise_parse_error_at("incomplete surrogate pair at %s", state, p);
}
if (pe[0] == '\\' && pe[1] == 'u') {
uint32_t sur = unescape_unicode(state, (unsigned char *) pe + 2);
Expand Down Expand Up @@ -894,15 +906,15 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
return json_push_value(state, config, Qnil);
}

raise_parse_error("unexpected token '%s'", state);
raise_parse_error("unexpected token %s", state);
break;
case 't':
if ((state->end - state->cursor >= 4) && (memcmp(state->cursor, "true", 4) == 0)) {
state->cursor += 4;
return json_push_value(state, config, Qtrue);
}

raise_parse_error("unexpected token '%s'", state);
raise_parse_error("unexpected token %s", state);
break;
case 'f':
// Note: memcmp with a small power of two compile to an integer comparison
Expand All @@ -911,7 +923,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
return json_push_value(state, config, Qfalse);
}

raise_parse_error("unexpected token '%s'", state);
raise_parse_error("unexpected token %s", state);
break;
case 'N':
// Note: memcmp with a small power of two compile to an integer comparison
Expand All @@ -920,15 +932,15 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
return json_push_value(state, config, CNaN);
}

raise_parse_error("unexpected token '%s'", state);
raise_parse_error("unexpected token %s", state);
break;
case 'I':
if (config->allow_nan && (state->end - state->cursor >= 8) && (memcmp(state->cursor, "Infinity", 8) == 0)) {
state->cursor += 8;
return json_push_value(state, config, CInfinity);
}

raise_parse_error("unexpected token '%s'", state);
raise_parse_error("unexpected token %s", state);
break;
case '-':
// Note: memcmp with a small power of two compile to an integer comparison
Expand All @@ -937,7 +949,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
state->cursor += 9;
return json_push_value(state, config, CMinusInfinity);
} else {
raise_parse_error("unexpected token '%s'", state);
raise_parse_error("unexpected token %s", state);
}
}
// Fallthrough
Expand Down Expand Up @@ -1062,7 +1074,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
}

if (*state->cursor != '"') {
raise_parse_error("expected object key, got '%s'", state);
raise_parse_error("expected object key, got %s", state);
}
json_parse_string(state, config, true);

Expand Down Expand Up @@ -1097,13 +1109,13 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
}

if (*state->cursor != '"') {
raise_parse_error("expected object key, got: '%s'", state);
raise_parse_error("expected object key, got: %s", state);
}
json_parse_string(state, config, true);

json_eat_whitespace(state);
if ((state->cursor >= state->end) || (*state->cursor != ':')) {
raise_parse_error("expected ':' after object key, got: '%s'", state);
raise_parse_error("expected ':' after object key, got: %s", state);
}
state->cursor++;

Expand All @@ -1113,24 +1125,24 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
}
}

raise_parse_error("expected ',' or '}' after object value, got: '%s'", state);
raise_parse_error("expected ',' or '}' after object value, got: %s", state);
}
break;
}

default:
raise_parse_error("unexpected character: '%s'", state);
raise_parse_error("unexpected character: %s", state);
break;
}

raise_parse_error("unreacheable: '%s'", state);
raise_parse_error("unreacheable: %s", state);
}

static void json_ensure_eof(JSON_ParserState *state)
{
json_eat_whitespace(state);
if (state->cursor != state->end) {
raise_parse_error("unexpected token at end of stream '%s'", state);
raise_parse_error("unexpected token at end of stream %s", state);
}
}

Expand Down
21 changes: 18 additions & 3 deletions test/json/json_ext_parser_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,20 +14,35 @@ def test_allocate
end

def test_error_messages
ex = assert_raise(ParserError) { parse('Infinity') }
ex = assert_raise(ParserError) { parse('Infinity something') }
unless RUBY_PLATFORM =~ /java/
assert_equal "unexpected token 'Infinity' at line 1 column 1", ex.message
end

ex = assert_raise(ParserError) { parse('-Infinity') }
ex = assert_raise(ParserError) { parse('foo bar') }
unless RUBY_PLATFORM =~ /java/
assert_equal "unexpected token 'foo' at line 1 column 1", ex.message
end

ex = assert_raise(ParserError) { parse('-Infinity something') }
unless RUBY_PLATFORM =~ /java/
assert_equal "unexpected token '-Infinity' at line 1 column 1", ex.message
end

ex = assert_raise(ParserError) { parse('NaN') }
ex = assert_raise(ParserError) { parse('NaN something') }
unless RUBY_PLATFORM =~ /java/
assert_equal "unexpected token 'NaN' at line 1 column 1", ex.message
end

ex = assert_raise(ParserError) { parse('   ') }
unless RUBY_PLATFORM =~ /java/
assert_equal "unexpected end of input at line 1 column 4", ex.message
end

ex = assert_raise(ParserError) { parse('{   ') }
unless RUBY_PLATFORM =~ /java/
assert_equal "expected object key, got EOF at line 1 column 5", ex.message
end
end

if GC.respond_to?(:stress=)
Expand Down
2 changes: 1 addition & 1 deletion test/json/json_parser_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -646,7 +646,7 @@ def test_parse_error_incomplete_hash
JSON.parse('{"input":{"firstName":"Bob","lastName":"Mob","email":"bob@example.com"}')
end
if RUBY_ENGINE == "ruby"
assert_equal %(expected ',' or '}' after object value, got: '' at line 1 column 72), error.message
assert_equal %(expected ',' or '}' after object value, got: EOF at line 1 column 72), error.message
end
end

Expand Down
Loading