From 3a4dc9e1b48b34bf07d79bc2653b8f7dc084a363 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Wed, 6 Nov 2024 12:58:23 +0100 Subject: [PATCH] Implement a fast path for integer parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `rb_cstr2inum` isn't very fast because it handles tons of different scenarios, and also requires a NUL-terminated string which forces us to copy the number into a secondary buffer. But since the parser already computed the length, we can much more cheaply do this with a very simple function as long as the number is small enough to fit into a native type (`long long`). If the number is too long, we can fall back to the `rb_cstr2inum` slow path. Before: ``` == Parsing citm_catalog.json (1727030 bytes) ruby 3.4.0dev (2024-11-06T07:59:09Z precompute-hash-wh.. 7943f98a8a) +YJIT +PRISM [arm64-darwin24] Warming up -------------------------------------- json 40.000 i/100ms oj 35.000 i/100ms Oj::Parser 45.000 i/100ms rapidjson 38.000 i/100ms Calculating ------------------------------------- json 425.941 (± 1.9%) i/s (2.35 ms/i) - 2.160k in 5.072833s oj 349.617 (± 1.7%) i/s (2.86 ms/i) - 1.750k in 5.006953s Oj::Parser 464.767 (± 1.7%) i/s (2.15 ms/i) - 2.340k in 5.036381s rapidjson 382.413 (± 2.4%) i/s (2.61 ms/i) - 1.938k in 5.070757s Comparison: json: 425.9 i/s Oj::Parser: 464.8 i/s - 1.09x faster rapidjson: 382.4 i/s - 1.11x slower oj: 349.6 i/s - 1.22x slower ``` After: ``` == Parsing citm_catalog.json (1727030 bytes) ruby 3.4.0dev (2024-11-06T07:59:09Z precompute-hash-wh.. 
7943f98a8a) +YJIT +PRISM [arm64-darwin24] Warming up -------------------------------------- json 46.000 i/100ms oj 33.000 i/100ms Oj::Parser 45.000 i/100ms rapidjson 39.000 i/100ms Calculating ------------------------------------- json 462.332 (± 3.2%) i/s (2.16 ms/i) - 2.346k in 5.080504s oj 351.140 (± 1.1%) i/s (2.85 ms/i) - 1.782k in 5.075616s Oj::Parser 473.500 (± 1.3%) i/s (2.11 ms/i) - 2.385k in 5.037695s rapidjson 395.052 (± 3.5%) i/s (2.53 ms/i) - 1.989k in 5.042275s Comparison: json: 462.3 i/s Oj::Parser: 473.5 i/s - same-ish: difference falls within error rapidjson: 395.1 i/s - 1.17x slower oj: 351.1 i/s - 1.32x slower ``` --- ext/json/ext/parser/parser.c | 166 ++++++++++++++++++++-------------- ext/json/ext/parser/parser.rl | 34 ++++++- 2 files changed, 126 insertions(+), 74 deletions(-) diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index db9b13a6..a5c918fa 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -1488,20 +1488,42 @@ enum {JSON_integer_en_main = 1}; #line 695 "parser.rl" +#define MAX_FAST_INTEGER_SIZE 18 +static inline VALUE fast_parse_integer(char *p, char *pe) +{ + bool negative = false; + if (*p == '-') { + negative = true; + p++; + } + + long long memo = 0; + while (p < pe) { + memo *= 10; + memo += *p - '0'; + p++; + } + + if (negative) { + memo = -memo; + } + return LL2NUM(memo); +} + static char *JSON_parse_integer(JSON_Parser *json, char *p, char *pe, VALUE *result) { int cs = EVIL; -#line 1497 "parser.c" +#line 1519 "parser.c" { cs = JSON_integer_start; } -#line 702 "parser.rl" +#line 724 "parser.rl" json->memo = p; -#line 1505 "parser.c" +#line 1527 "parser.c" { if ( p == pe ) goto _test_eof; @@ -1542,7 +1564,7 @@ case 3: if ( ++p == pe ) goto _test_eof4; case 4: -#line 1546 "parser.c" +#line 1568 "parser.c" goto st0; st5: if ( ++p == pe ) @@ -1561,14 +1583,18 @@ case 5: _out: {} } -#line 704 "parser.rl" +#line 726 "parser.rl" if (cs >= JSON_integer_first_final) { long len = p - 
json->memo; - fbuffer_clear(&json->fbuffer); - fbuffer_append(&json->fbuffer, json->memo, len); - fbuffer_append_char(&json->fbuffer, '\0'); - *result = rb_cstr2inum(FBUFFER_PTR(&json->fbuffer), 10); + if (RB_LIKELY(len < MAX_FAST_INTEGER_SIZE)) { + *result = fast_parse_integer(json->memo, p); + } else { + fbuffer_clear(&json->fbuffer); + fbuffer_append(&json->fbuffer, json->memo, len); + fbuffer_append_char(&json->fbuffer, '\0'); + *result = rb_cstr2inum(FBUFFER_PTR(&json->fbuffer), 10); + } return p + 1; } else { return NULL; @@ -1576,7 +1602,7 @@ case 5: } -#line 1580 "parser.c" +#line 1606 "parser.c" enum {JSON_float_start = 1}; enum {JSON_float_first_final = 8}; enum {JSON_float_error = 0}; @@ -1584,7 +1610,7 @@ enum {JSON_float_error = 0}; enum {JSON_float_en_main = 1}; -#line 729 "parser.rl" +#line 755 "parser.rl" static char *JSON_parse_float(JSON_Parser *json, char *p, char *pe, VALUE *result) @@ -1592,15 +1618,15 @@ static char *JSON_parse_float(JSON_Parser *json, char *p, char *pe, VALUE *resul int cs = EVIL; -#line 1596 "parser.c" +#line 1622 "parser.c" { cs = JSON_float_start; } -#line 736 "parser.rl" +#line 762 "parser.rl" json->memo = p; -#line 1604 "parser.c" +#line 1630 "parser.c" { if ( p == pe ) goto _test_eof; @@ -1658,14 +1684,14 @@ case 8: goto st0; goto tr9; tr9: -#line 723 "parser.rl" +#line 749 "parser.rl" { p--; {p++; cs = 9; goto _out;} } goto st9; st9: if ( ++p == pe ) goto _test_eof9; case 9: -#line 1669 "parser.c" +#line 1695 "parser.c" goto st0; st5: if ( ++p == pe ) @@ -1726,7 +1752,7 @@ case 7: _out: {} } -#line 738 "parser.rl" +#line 764 "parser.rl" if (cs >= JSON_float_first_final) { VALUE mod = Qnil; @@ -1779,7 +1805,7 @@ case 7: -#line 1783 "parser.c" +#line 1809 "parser.c" enum {JSON_array_start = 1}; enum {JSON_array_first_final = 22}; enum {JSON_array_error = 0}; @@ -1787,7 +1813,7 @@ enum {JSON_array_error = 0}; enum {JSON_array_en_main = 1}; -#line 815 "parser.rl" +#line 841 "parser.rl" static char 
*JSON_parse_array(JSON_Parser *json, char *p, char *pe, VALUE *result, int current_nesting) @@ -1800,14 +1826,14 @@ static char *JSON_parse_array(JSON_Parser *json, char *p, char *pe, VALUE *resul long stack_head = json->stack->head; -#line 1804 "parser.c" +#line 1830 "parser.c" { cs = JSON_array_start; } -#line 827 "parser.rl" +#line 853 "parser.rl" -#line 1811 "parser.c" +#line 1837 "parser.c" { short _widec; if ( p == pe ) @@ -1847,7 +1873,7 @@ case 2: goto st2; goto st0; tr2: -#line 795 "parser.rl" +#line 821 "parser.rl" { VALUE v = Qnil; char *np = JSON_parse_value(json, p, pe, &v, current_nesting); @@ -1862,12 +1888,12 @@ case 2: if ( ++p == pe ) goto _test_eof3; case 3: -#line 1866 "parser.c" +#line 1892 "parser.c" _widec = (*p); if ( 44 <= (*p) && (*p) <= 44 ) { _widec = (short)(128 + ((*p) - -128)); if ( -#line 805 "parser.rl" +#line 831 "parser.rl" json->allow_trailing_comma ) _widec += 256; } switch( _widec ) { @@ -1914,14 +1940,14 @@ case 7: goto st3; goto st7; tr4: -#line 807 "parser.rl" +#line 833 "parser.rl" { p--; {p++; cs = 22; goto _out;} } goto st22; st22: if ( ++p == pe ) goto _test_eof22; case 22: -#line 1925 "parser.c" +#line 1951 "parser.c" goto st0; st8: if ( ++p == pe ) @@ -1989,13 +2015,13 @@ case 13: if ( 10 <= (*p) && (*p) <= 10 ) { _widec = (short)(128 + ((*p) - -128)); if ( -#line 805 "parser.rl" +#line 831 "parser.rl" json->allow_trailing_comma ) _widec += 256; } } else if ( (*p) >= 9 ) { _widec = (short)(128 + ((*p) - -128)); if ( -#line 805 "parser.rl" +#line 831 "parser.rl" json->allow_trailing_comma ) _widec += 256; } } else if ( (*p) > 13 ) { @@ -2003,19 +2029,19 @@ case 13: if ( 47 <= (*p) && (*p) <= 47 ) { _widec = (short)(128 + ((*p) - -128)); if ( -#line 805 "parser.rl" +#line 831 "parser.rl" json->allow_trailing_comma ) _widec += 256; } } else if ( (*p) >= 32 ) { _widec = (short)(128 + ((*p) - -128)); if ( -#line 805 "parser.rl" +#line 831 "parser.rl" json->allow_trailing_comma ) _widec += 256; } } else { _widec = 
(short)(128 + ((*p) - -128)); if ( -#line 805 "parser.rl" +#line 831 "parser.rl" json->allow_trailing_comma ) _widec += 256; } switch( _widec ) { @@ -2054,13 +2080,13 @@ case 14: if ( 47 <= (*p) && (*p) <= 47 ) { _widec = (short)(128 + ((*p) - -128)); if ( -#line 805 "parser.rl" +#line 831 "parser.rl" json->allow_trailing_comma ) _widec += 256; } } else if ( (*p) >= 42 ) { _widec = (short)(128 + ((*p) - -128)); if ( -#line 805 "parser.rl" +#line 831 "parser.rl" json->allow_trailing_comma ) _widec += 256; } switch( _widec ) { @@ -2079,20 +2105,20 @@ case 15: if ( (*p) <= 41 ) { _widec = (short)(128 + ((*p) - -128)); if ( -#line 805 "parser.rl" +#line 831 "parser.rl" json->allow_trailing_comma ) _widec += 256; } } else if ( (*p) > 42 ) { if ( 43 <= (*p) ) { _widec = (short)(128 + ((*p) - -128)); if ( -#line 805 "parser.rl" +#line 831 "parser.rl" json->allow_trailing_comma ) _widec += 256; } } else { _widec = (short)(128 + ((*p) - -128)); if ( -#line 805 "parser.rl" +#line 831 "parser.rl" json->allow_trailing_comma ) _widec += 256; } switch( _widec ) { @@ -2115,13 +2141,13 @@ case 16: if ( 42 <= (*p) && (*p) <= 42 ) { _widec = (short)(128 + ((*p) - -128)); if ( -#line 805 "parser.rl" +#line 831 "parser.rl" json->allow_trailing_comma ) _widec += 256; } } else { _widec = (short)(128 + ((*p) - -128)); if ( -#line 805 "parser.rl" +#line 831 "parser.rl" json->allow_trailing_comma ) _widec += 256; } } else if ( (*p) > 46 ) { @@ -2129,19 +2155,19 @@ case 16: if ( 48 <= (*p) ) { _widec = (short)(128 + ((*p) - -128)); if ( -#line 805 "parser.rl" +#line 831 "parser.rl" json->allow_trailing_comma ) _widec += 256; } } else if ( (*p) >= 47 ) { _widec = (short)(128 + ((*p) - -128)); if ( -#line 805 "parser.rl" +#line 831 "parser.rl" json->allow_trailing_comma ) _widec += 256; } } else { _widec = (short)(128 + ((*p) - -128)); if ( -#line 805 "parser.rl" +#line 831 "parser.rl" json->allow_trailing_comma ) _widec += 256; } switch( _widec ) { @@ -2165,20 +2191,20 @@ case 17: if ( (*p) 
<= 9 ) { _widec = (short)(128 + ((*p) - -128)); if ( -#line 805 "parser.rl" +#line 831 "parser.rl" json->allow_trailing_comma ) _widec += 256; } } else if ( (*p) > 10 ) { if ( 11 <= (*p) ) { _widec = (short)(128 + ((*p) - -128)); if ( -#line 805 "parser.rl" +#line 831 "parser.rl" json->allow_trailing_comma ) _widec += 256; } } else { _widec = (short)(128 + ((*p) - -128)); if ( -#line 805 "parser.rl" +#line 831 "parser.rl" json->allow_trailing_comma ) _widec += 256; } switch( _widec ) { @@ -2250,7 +2276,7 @@ case 21: _out: {} } -#line 828 "parser.rl" +#line 854 "parser.rl" if(cs >= JSON_array_first_final) { long count = json->stack->head - stack_head; @@ -2444,7 +2470,7 @@ static VALUE json_string_unescape(JSON_Parser *json, char *string, char *stringE } -#line 2448 "parser.c" +#line 2474 "parser.c" enum {JSON_string_start = 1}; enum {JSON_string_first_final = 9}; enum {JSON_string_error = 0}; @@ -2452,7 +2478,7 @@ enum {JSON_string_error = 0}; enum {JSON_string_en_main = 1}; -#line 1051 "parser.rl" +#line 1077 "parser.rl" static int @@ -2473,15 +2499,15 @@ static char *JSON_parse_string(JSON_Parser *json, char *p, char *pe, VALUE *resu VALUE match_string; -#line 2477 "parser.c" +#line 2503 "parser.c" { cs = JSON_string_start; } -#line 1071 "parser.rl" +#line 1097 "parser.rl" json->memo = p; -#line 2485 "parser.c" +#line 2511 "parser.c" { if ( p == pe ) goto _test_eof; @@ -2506,14 +2532,14 @@ case 2: goto st0; goto st2; tr2: -#line 1033 "parser.rl" +#line 1059 "parser.rl" { *result = json_string_fastpath(json, json->memo + 1, p, json->parsing_name, json->parsing_name || json-> freeze, json->parsing_name && json->symbolize_names); {p = (( p + 1))-1;} p--; {p++; cs = 9; goto _out;} } -#line 1026 "parser.rl" +#line 1052 "parser.rl" { *result = json_string_unescape(json, json->memo + 1, p, json->parsing_name, json->parsing_name || json-> freeze, json->parsing_name && json->symbolize_names); {p = (( p + 1))-1;} @@ -2522,7 +2548,7 @@ case 2: } goto st9; tr6: -#line 1026 
"parser.rl" +#line 1052 "parser.rl" { *result = json_string_unescape(json, json->memo + 1, p, json->parsing_name, json->parsing_name || json-> freeze, json->parsing_name && json->symbolize_names); {p = (( p + 1))-1;} @@ -2534,7 +2560,7 @@ case 2: if ( ++p == pe ) goto _test_eof9; case 9: -#line 2538 "parser.c" +#line 2564 "parser.c" goto st0; st3: if ( ++p == pe ) @@ -2622,7 +2648,7 @@ case 8: _out: {} } -#line 1073 "parser.rl" +#line 1099 "parser.rl" if (json->create_additions && RTEST(match_string = json->match_string)) { VALUE klass; @@ -2775,7 +2801,7 @@ static VALUE cParser_initialize(int argc, VALUE *argv, VALUE self) } -#line 2779 "parser.c" +#line 2805 "parser.c" enum {JSON_start = 1}; enum {JSON_first_final = 10}; enum {JSON_error = 0}; @@ -2783,7 +2809,7 @@ enum {JSON_error = 0}; enum {JSON_en_main = 1}; -#line 1239 "parser.rl" +#line 1265 "parser.rl" /* @@ -2812,16 +2838,16 @@ static VALUE cParser_parse(VALUE self) json->stack = &stack; -#line 2816 "parser.c" +#line 2842 "parser.c" { cs = JSON_start; } -#line 1267 "parser.rl" +#line 1293 "parser.rl" p = json->source; pe = p + json->len; -#line 2825 "parser.c" +#line 2851 "parser.c" { if ( p == pe ) goto _test_eof; @@ -2855,7 +2881,7 @@ case 1: cs = 0; goto _out; tr2: -#line 1231 "parser.rl" +#line 1257 "parser.rl" { char *np = JSON_parse_value(json, p, pe, &result, 0); if (np == NULL) { p--; {p++; cs = 10; goto _out;} } else {p = (( np))-1;} @@ -2865,7 +2891,7 @@ cs = 0; if ( ++p == pe ) goto _test_eof10; case 10: -#line 2869 "parser.c" +#line 2895 "parser.c" switch( (*p) ) { case 13: goto st10; case 32: goto st10; @@ -2954,7 +2980,7 @@ case 9: _out: {} } -#line 1270 "parser.rl" +#line 1296 "parser.rl" if (json->stack_handle) { rvalue_stack_eagerly_release(json->stack_handle); @@ -2990,16 +3016,16 @@ static VALUE cParser_m_parse(VALUE klass, VALUE source, VALUE opts) json->stack = &stack; -#line 2994 "parser.c" +#line 3020 "parser.c" { cs = JSON_start; } -#line 1305 "parser.rl" +#line 1331 "parser.rl" p 
= json->source; pe = p + json->len; -#line 3003 "parser.c" +#line 3029 "parser.c" { if ( p == pe ) goto _test_eof; @@ -3033,7 +3059,7 @@ case 1: cs = 0; goto _out; tr2: -#line 1231 "parser.rl" +#line 1257 "parser.rl" { char *np = JSON_parse_value(json, p, pe, &result, 0); if (np == NULL) { p--; {p++; cs = 10; goto _out;} } else {p = (( np))-1;} @@ -3043,7 +3069,7 @@ cs = 0; if ( ++p == pe ) goto _test_eof10; case 10: -#line 3047 "parser.c" +#line 3073 "parser.c" switch( (*p) ) { case 13: goto st10; case 32: goto st10; @@ -3132,7 +3158,7 @@ case 9: _out: {} } -#line 1308 "parser.rl" +#line 1334 "parser.rl" if (json->stack_handle) { rvalue_stack_eagerly_release(json->stack_handle); diff --git a/ext/json/ext/parser/parser.rl b/ext/json/ext/parser/parser.rl index 06f23fa8..9620b196 100644 --- a/ext/json/ext/parser/parser.rl +++ b/ext/json/ext/parser/parser.rl @@ -694,6 +694,28 @@ static char *JSON_parse_value(JSON_Parser *json, char *p, char *pe, VALUE *resul main := '-'? ('0' | [1-9][0-9]*) (^[0-9]? 
@exit); }%% +#define MAX_FAST_INTEGER_SIZE 18 +static inline VALUE fast_parse_integer(char *p, char *pe) +{ + bool negative = false; + if (*p == '-') { + negative = true; + p++; + } + + long long memo = 0; + while (p < pe) { + memo *= 10; + memo += *p - '0'; + p++; + } + + if (negative) { + memo = -memo; + } + return LL2NUM(memo); +} + static char *JSON_parse_integer(JSON_Parser *json, char *p, char *pe, VALUE *result) { int cs = EVIL; @@ -704,10 +726,14 @@ static char *JSON_parse_integer(JSON_Parser *json, char *p, char *pe, VALUE *res if (cs >= JSON_integer_first_final) { long len = p - json->memo; - fbuffer_clear(&json->fbuffer); - fbuffer_append(&json->fbuffer, json->memo, len); - fbuffer_append_char(&json->fbuffer, '\0'); - *result = rb_cstr2inum(FBUFFER_PTR(&json->fbuffer), 10); + if (RB_LIKELY(len < MAX_FAST_INTEGER_SIZE)) { + *result = fast_parse_integer(json->memo, p); + } else { + fbuffer_clear(&json->fbuffer); + fbuffer_append(&json->fbuffer, json->memo, len); + fbuffer_append_char(&json->fbuffer, '\0'); + *result = rb_cstr2inum(FBUFFER_PTR(&json->fbuffer), 10); + } return p + 1; } else { return NULL;