Optimise string parsing when there are no escape characters

This resulted in an approximately 40% reduction in the instructions executed in
parse_string() when parsing a 4MB JSON file, as measured with callgrind.
This commit is contained in:
Roger Light 2023-08-10 09:26:19 +01:00
parent bb27ffa152
commit e64b984ddc
2 changed files with 67 additions and 58 deletions

124
cJSON.c
View File

@ -776,6 +776,8 @@ static cJSON_bool parse_string(cJSON * const item, parse_buffer * const input_bu
const unsigned char *input_end = buffer_at_offset(input_buffer) + 1; const unsigned char *input_end = buffer_at_offset(input_buffer) + 1;
unsigned char *output_pointer = NULL; unsigned char *output_pointer = NULL;
unsigned char *output = NULL; unsigned char *output = NULL;
size_t skipped_bytes = 0;
size_t allocation_length = 0;
/* not a string */ /* not a string */
if (buffer_at_offset(input_buffer)[0] != '\"') if (buffer_at_offset(input_buffer)[0] != '\"')
@ -785,8 +787,6 @@ static cJSON_bool parse_string(cJSON * const item, parse_buffer * const input_bu
{ {
/* calculate approximate size of the output (overestimate) */ /* calculate approximate size of the output (overestimate) */
size_t allocation_length = 0;
size_t skipped_bytes = 0;
while (((size_t)(input_end - input_buffer->content) < input_buffer->length) && (*input_end != '\"')) while (((size_t)(input_end - input_buffer->content) < input_buffer->length) && (*input_end != '\"'))
{ {
/* is escape sequence */ /* is escape sequence */
@ -816,65 +816,73 @@ static cJSON_bool parse_string(cJSON * const item, parse_buffer * const input_bu
} }
} }
output_pointer = output; /* If there are no escaped characters, we can use memcpy */
/* loop through the string literal */ if(skipped_bytes == 0)
while (input_pointer < input_end)
{ {
if (*input_pointer != '\\') memcpy(output, input_pointer, allocation_length);
{ output[allocation_length-1] = '\0';
*output_pointer++ = *input_pointer++;
}
/* escape sequence */
else
{
unsigned char sequence_length = 2;
if ((input_end - input_pointer) < 1)
{
goto fail;
}
switch (input_pointer[1])
{
case 'b':
*output_pointer++ = '\b';
break;
case 'f':
*output_pointer++ = '\f';
break;
case 'n':
*output_pointer++ = '\n';
break;
case 'r':
*output_pointer++ = '\r';
break;
case 't':
*output_pointer++ = '\t';
break;
case '\"':
case '\\':
case '/':
*output_pointer++ = input_pointer[1];
break;
/* UTF-16 literal */
case 'u':
sequence_length = utf16_literal_to_utf8(input_pointer, input_end, &output_pointer);
if (sequence_length == 0)
{
/* failed to convert UTF16-literal to UTF-8 */
goto fail;
}
break;
default:
goto fail;
}
input_pointer += sequence_length;
}
} }
else
{
/* else loop through the string literal */
output_pointer = output;
while (input_pointer < input_end)
{
if (*input_pointer != '\\')
{
*output_pointer++ = *input_pointer++;
}
/* escape sequence */
else
{
unsigned char sequence_length = 2;
if ((input_end - input_pointer) < 1)
{
goto fail;
}
/* zero terminate the output */ switch (input_pointer[1])
*output_pointer = '\0'; {
case 'b':
*output_pointer++ = '\b';
break;
case 'f':
*output_pointer++ = '\f';
break;
case 'n':
*output_pointer++ = '\n';
break;
case 'r':
*output_pointer++ = '\r';
break;
case 't':
*output_pointer++ = '\t';
break;
case '\"':
case '\\':
case '/':
*output_pointer++ = input_pointer[1];
break;
/* UTF-16 literal */
case 'u':
sequence_length = utf16_literal_to_utf8(input_pointer, input_end, &output_pointer);
if (sequence_length == 0)
{
/* failed to convert UTF16-literal to UTF-8 */
goto fail;
}
break;
default:
goto fail;
}
input_pointer += sequence_length;
}
}
/* zero terminate the output */
*output_pointer = '\0';
}
item->type = cJSON_String; item->type = cJSON_String;
item->valuestring = (char*)output; item->valuestring = (char*)output;

View File

@ -73,6 +73,7 @@ static void assert_not_parse_string(const char * const string)
static void parse_string_should_parse_strings(void) static void parse_string_should_parse_strings(void)
{ {
assert_parse_string("\"\"", ""); assert_parse_string("\"\"", "");
assert_parse_string("\"Simple string\"", "Simple string");
assert_parse_string( assert_parse_string(
"\" !\\\"#$%&'()*+,-./\\/0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\\\]^_'abcdefghijklmnopqrstuvwxyz{|}~\"", "\" !\\\"#$%&'()*+,-./\\/0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\\\]^_'abcdefghijklmnopqrstuvwxyz{|}~\"",
" !\"#$%&'()*+,-.//0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_'abcdefghijklmnopqrstuvwxyz{|}~"); " !\"#$%&'()*+,-.//0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_'abcdefghijklmnopqrstuvwxyz{|}~");