From e64b984ddc514a0c7d7f5db3f43590e62285c722 Mon Sep 17 00:00:00 2001 From: Roger Light Date: Thu, 10 Aug 2023 09:26:19 +0100 Subject: [PATCH] Optimise string parsing when there are no escape characters This resulted in approximately 40% reduction in instructions used in parse_string() when parsing a 4MB JSON file, as measured in callgrind. --- cJSON.c | 124 +++++++++++++++++++++++-------------------- tests/parse_string.c | 1 + 2 files changed, 67 insertions(+), 58 deletions(-) diff --git a/cJSON.c b/cJSON.c index 54f63f4..dc1934d 100644 --- a/cJSON.c +++ b/cJSON.c @@ -776,6 +776,8 @@ static cJSON_bool parse_string(cJSON * const item, parse_buffer * const input_bu const unsigned char *input_end = buffer_at_offset(input_buffer) + 1; unsigned char *output_pointer = NULL; unsigned char *output = NULL; + size_t skipped_bytes = 0; + size_t allocation_length = 0; /* not a string */ if (buffer_at_offset(input_buffer)[0] != '\"') @@ -785,8 +787,6 @@ static cJSON_bool parse_string(cJSON * const item, parse_buffer * const input_bu { /* calculate approximate size of the output (overestimate) */ - size_t allocation_length = 0; - size_t skipped_bytes = 0; while (((size_t)(input_end - input_buffer->content) < input_buffer->length) && (*input_end != '\"')) { /* is escape sequence */ @@ -816,65 +816,73 @@ static cJSON_bool parse_string(cJSON * const item, parse_buffer * const input_bu } } - output_pointer = output; - /* loop through the string literal */ - while (input_pointer < input_end) + /* If there are no escaped characters, we can use memcpy */ + if(skipped_bytes == 0) { - if (*input_pointer != '\\') - { - *output_pointer++ = *input_pointer++; - } - /* escape sequence */ - else - { - unsigned char sequence_length = 2; - if ((input_end - input_pointer) < 1) - { - goto fail; - } - - switch (input_pointer[1]) - { - case 'b': - *output_pointer++ = '\b'; - break; - case 'f': - *output_pointer++ = '\f'; - break; - case 'n': - *output_pointer++ = '\n'; - break; - case 'r': - 
*output_pointer++ = '\r'; - break; - case 't': - *output_pointer++ = '\t'; - break; - case '\"': - case '\\': - case '/': - *output_pointer++ = input_pointer[1]; - break; - - /* UTF-16 literal */ - case 'u': - sequence_length = utf16_literal_to_utf8(input_pointer, input_end, &output_pointer); - if (sequence_length == 0) - { - /* failed to convert UTF16-literal to UTF-8 */ - goto fail; - } - break; - - default: - goto fail; - } - input_pointer += sequence_length; - } + memcpy(output, input_pointer, allocation_length); + output[allocation_length-1] = '\0'; } + else + { + /* else loop through the string literal */ + output_pointer = output; + while (input_pointer < input_end) + { + if (*input_pointer != '\\') + { + *output_pointer++ = *input_pointer++; + } + /* escape sequence */ + else + { + unsigned char sequence_length = 2; + if ((input_end - input_pointer) < 1) + { + goto fail; + } - /* zero terminate the output */ - *output_pointer = '\0'; + switch (input_pointer[1]) + { + case 'b': + *output_pointer++ = '\b'; + break; + case 'f': + *output_pointer++ = '\f'; + break; + case 'n': + *output_pointer++ = '\n'; + break; + case 'r': + *output_pointer++ = '\r'; + break; + case 't': + *output_pointer++ = '\t'; + break; + case '\"': + case '\\': + case '/': + *output_pointer++ = input_pointer[1]; + break; + + /* UTF-16 literal */ + case 'u': + sequence_length = utf16_literal_to_utf8(input_pointer, input_end, &output_pointer); + if (sequence_length == 0) + { + /* failed to convert UTF16-literal to UTF-8 */ + goto fail; + } + break; + + default: + goto fail; + } + input_pointer += sequence_length; + } + } + /* zero terminate the output */ + *output_pointer = '\0'; + } item->type = cJSON_String; item->valuestring = (char*)output; diff --git a/tests/parse_string.c b/tests/parse_string.c index ce1c138..2435c26 100644 --- a/tests/parse_string.c +++ b/tests/parse_string.c @@ -73,6 +73,7 @@ static void assert_not_parse_string(const char * const string) static void 
parse_string_should_parse_strings(void) { assert_parse_string("\"\"", ""); + assert_parse_string("\"Simple string\"", "Simple string"); assert_parse_string( "\" !\\\"#$%&'()*+,-./\\/0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\\\]^_'abcdefghijklmnopqrstuvwxyz{|}~\"", " !\"#$%&'()*+,-.//0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_'abcdefghijklmnopqrstuvwxyz{|}~");