Optimise string parsing when there are no escape characters

This results in an approximately 40% reduction in the number of instructions executed in parse_string() when parsing a 4MB JSON file, as measured with callgrind.
2023-08-10 21:13:26 +03:00 · 2023-08-10 09:26:19 +01:00
parent bb27ffa152
commit e64b984ddc
2 changed files with 67 additions and 58 deletions
--- a/cJSON.c
+++ b/cJSON.c
@@ -776,6 +776,8 @@ static cJSON_bool parse_string(cJSON * const item, parse_buffer * const input_bu
    const unsigned char *input_end = buffer_at_offset(input_buffer) + 1;
    unsigned char *output_pointer = NULL;
    unsigned char *output = NULL;
+    size_t skipped_bytes = 0;
+    size_t allocation_length = 0;

    /* not a string */
    if (buffer_at_offset(input_buffer)[0] != '\"')
@@ -785,8 +787,6 @@ static cJSON_bool parse_string(cJSON * const item, parse_buffer * const input_bu

    {
        /* calculate approximate size of the output (overestimate) */
-        size_t allocation_length = 0;
-        size_t skipped_bytes = 0;
        while (((size_t)(input_end - input_buffer->content) < input_buffer->length) && (*input_end != '\"'))
        {
            /* is escape sequence */
@@ -816,65 +816,73 @@ static cJSON_bool parse_string(cJSON * const item, parse_buffer * const input_bu
        }
    }

-    output_pointer = output;
-    /* loop through the string literal */
-    while (input_pointer < input_end)
+    /* If there are no escape sequences, we can copy the string with a single memcpy */
+    if(skipped_bytes == 0)
    {
-        if (*input_pointer != '\\')
-        {
-            *output_pointer++ = *input_pointer++;
-        }
-        /* escape sequence */
-        else
-        {
-            unsigned char sequence_length = 2;
-            if ((input_end - input_pointer) < 1)
-            {
-                goto fail;
-            }
-
-            switch (input_pointer[1])
-            {
-                case 'b':
-                    *output_pointer++ = '\b';
-                    break;
-                case 'f':
-                    *output_pointer++ = '\f';
-                    break;
-                case 'n':
-                    *output_pointer++ = '\n';
-                    break;
-                case 'r':
-                    *output_pointer++ = '\r';
-                    break;
-                case 't':
-                    *output_pointer++ = '\t';
-                    break;
-                case '\"':
-                case '\\':
-                case '/':
-                    *output_pointer++ = input_pointer[1];
-                    break;
-
-                /* UTF-16 literal */
-                case 'u':
-                    sequence_length = utf16_literal_to_utf8(input_pointer, input_end, &output_pointer);
-                    if (sequence_length == 0)
-                    {
-                        /* failed to convert UTF16-literal to UTF-8 */
-                        goto fail;
-                    }
-                    break;
-
-                default:
-                    goto fail;
-            }
-            input_pointer += sequence_length;
-        }
+        memcpy(output, input_pointer, allocation_length);
+        output[allocation_length-1] = '\0';
    }
+    else
+    {
+        /* else loop through the string literal */
+        output_pointer = output;
+        while (input_pointer < input_end)
+        {
+            if (*input_pointer != '\\')
+            {
+                *output_pointer++ = *input_pointer++;
+            }
+            /* escape sequence */
+            else
+            {
+                unsigned char sequence_length = 2;
+                if ((input_end - input_pointer) < 1)
+                {
+                    goto fail;
+                }

-    /* zero terminate the output */
-    *output_pointer = '\0';
+                switch (input_pointer[1])
+                {
+                    case 'b':
+                        *output_pointer++ = '\b';
+                        break;
+                    case 'f':
+                        *output_pointer++ = '\f';
+                        break;
+                    case 'n':
+                        *output_pointer++ = '\n';
+                        break;
+                    case 'r':
+                        *output_pointer++ = '\r';
+                        break;
+                    case 't':
+                        *output_pointer++ = '\t';
+                        break;
+                    case '\"':
+                    case '\\':
+                    case '/':
+                        *output_pointer++ = input_pointer[1];
+                        break;
+
+                    /* UTF-16 literal */
+                    case 'u':
+                        sequence_length = utf16_literal_to_utf8(input_pointer, input_end, &output_pointer);
+                        if (sequence_length == 0)
+                        {
+                            /* failed to convert UTF16-literal to UTF-8 */
+                            goto fail;
+                        }
+                        break;
+
+                    default:
+                        goto fail;
+                }
+                input_pointer += sequence_length;
+            }
+        }
+        /* zero terminate the output */
+        *output_pointer = '\0';
+    }

    item->type = cJSON_String;
    item->valuestring = (char*)output;
--- a/tests/parse_string.c
+++ b/tests/parse_string.c
@@ -73,6 +73,7 @@ static void assert_not_parse_string(const char * const string)
 static void parse_string_should_parse_strings(void)
 {
    assert_parse_string("\"\"", "");
+    assert_parse_string("\"Simple string\"", "Simple string");
    assert_parse_string(
        "\" !\\\"#$%&'()*+,-./\\/0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\\\]^_'abcdefghijklmnopqrstuvwxyz{|}~\"",
        " !\"#$%&'()*+,-.//0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_'abcdefghijklmnopqrstuvwxyz{|}~");