From ace5047782c0eb1f428d6566c9e6ed263ff9fa28 Mon Sep 17 00:00:00 2001 From: Max Bruckner Date: Thu, 16 Feb 2017 18:55:27 +0100 Subject: [PATCH] parse_string: reduce mental burden when reading the code This restructures parse_string in a way, that you need to keep less state in your head to understand the code. This is achieved by: * only changing the input pointer (current position) at a few places (not all throughout) * splitting out the UTF16 handling into a separate function * renaming the variables so you know what they do without additional context --- cJSON.c | 345 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 201 insertions(+), 144 deletions(-) diff --git a/cJSON.c b/cJSON.c index 89ea669..d6918fa 100644 --- a/cJSON.c +++ b/cJSON.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "cJSON.h" /* define our own boolean type */ @@ -403,201 +404,257 @@ static unsigned parse_hex4(const unsigned char *str) return h; } -/* first bytes of UTF8 encoding for a given length in bytes */ -static const unsigned char firstByteMark[5] = +/* converts a UTF-16 literal to UTF-8 + * A literal can be one or two sequences of the form \uXXXX */ +static uint8_t utf16_literal_to_utf8(const unsigned char * const input_pointer, const unsigned char * const input_end, unsigned char **output_pointer, const unsigned char **error_pointer) { - 0x00, /* should never happen */ - 0x00, /* 0xxxxxxx */ - 0xC0, /* 110xxxxx */ - 0xE0, /* 1110xxxx */ - 0xF0 /* 11110xxx */ -}; - -/* Parse the input text into an unescaped cstring, and populate item. */ -static const unsigned char *parse_string(cJSON *item, const unsigned char *str, const unsigned char **ep) -{ - const unsigned char *ptr = str + 1; - const unsigned char *end_ptr = str + 1; - unsigned char *ptr2 = NULL; - unsigned char *out = NULL; - size_t len = 0; - unsigned uc = 0; - unsigned uc2 = 0; - - /* not a string! */ - if (*str != '\"') + /* first bytes of UTF8 encoding for a given length in bytes */ + static const unsigned char firstByteMark[5] = { - *ep = str; + 0x00, /* should never happen */ + 0x00, /* 0xxxxxxx */ + 0xC0, /* 110xxxxx */ + 0xE0, /* 1110xxxx */ + 0xF0 /* 11110xxx */ + }; + + long unsigned int codepoint = 0; + unsigned int first_code = 0; + const unsigned char *first_sequence = input_pointer; + uint8_t utf8_length = 0; + uint8_t sequence_length = 0; + + /* get the first utf16 sequence */ + first_code = parse_hex4(first_sequence + 2); + if ((input_end - first_sequence) < 6) + { + /* input ends unexpectedly */ + *error_pointer = first_sequence; goto fail; } - while ((*end_ptr != '\"') && *end_ptr) + /* check that the code is valid */ + if (((first_code >= 0xDC00) && (first_code <= 0xDFFF)) || (first_code == 0)) { - if (*end_ptr++ == '\\') + *error_pointer = first_sequence; + goto fail; + } + + /* UTF16 surrogate pair */ + if ((first_code >= 0xD800) && (first_code <= 0xDBFF)) + { + const unsigned char *second_sequence = first_sequence + 6; + unsigned int second_code = 0; + sequence_length = 12; /* \uXXXX\uXXXX */ + + if ((input_end - second_sequence) < 6) { - if (*end_ptr == '\0') - { - /* prevent buffer overflow when last input character is a backslash */ - goto fail; - } - /* Skip escaped quotes. */ - end_ptr++; + /* input ends unexpectedly */ + *error_pointer = first_sequence; + goto fail; } - len++; + + if ((second_sequence[0] != '\\') || (second_sequence[1] != 'u')) + { + /* missing second half of the surrogate pair */ + *error_pointer = first_sequence; + goto fail; + } + + /* get the second utf16 sequence */ + second_code = parse_hex4(second_sequence + 2); + /* check that the code is valid */ + if ((second_code < 0xDC00) || (second_code > 0xDFFF)) + { + /* invalid second half of the surrogate pair */ + *error_pointer = first_sequence; + goto fail; + } + + + /* calculate the unicode codepoint from the surrogate pair */ + codepoint = 0x10000 + (((first_code & 0x3FF) << 10) | (second_code & 0x3FF)); + } + else + { + sequence_length = 6; /* \uXXXX */ + codepoint = first_code; } - /* This is at most how long we need for the string, roughly. */ - out = (unsigned char*)cJSON_malloc(len + 1); - if (!out) + /* encode as UTF-8 + * takes at maximum 4 bytes to encode: + * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ + if (codepoint < 0x80) { + /* normal ascii, encoding 0xxxxxxx */ + utf8_length = 1; + } + else if (codepoint < 0x800) + { + /* two bytes, encoding 110xxxxx 10xxxxxx */ + utf8_length = 2; + } + else if (codepoint < 0x10000) + { + /* three bytes, encoding 1110xxxx 10xxxxxx 10xxxxxx */ + utf8_length = 3; + } + else if (codepoint <= 0x10FFFF) + { + /* four bytes, encoding 1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx */ + utf8_length = 4; + } + else + { + /* invalid unicode codepoint */ + *error_pointer = first_sequence; goto fail; } - ptr2 = out; - /* loop through the string literal */ - while (ptr < end_ptr) + /* encode as utf8 */ + switch (utf8_length) { - if (*ptr != '\\') + case 4: + /* 10xxxxxx */ + (*output_pointer)[3] = (unsigned char)((codepoint | 0x80) & 0xBF); + codepoint >>= 6; + case 3: + /* 10xxxxxx */ + (*output_pointer)[2] = (unsigned char)((codepoint | 0x80) & 0xBF); + codepoint >>= 6; + case 2: + (*output_pointer)[1] = (unsigned char)((codepoint | 0x80) & 0xBF); + codepoint >>= 6; + case 1: + /* depending on the length in bytes this determines the + encoding of the first UTF8 byte */ + (*output_pointer)[0] = (unsigned char)((codepoint | firstByteMark[utf8_length]) & 0xFF); + break; + default: + *error_pointer = first_sequence; + goto fail; + } + *output_pointer += utf8_length; + + return sequence_length; + +fail: + return 0; +} + +/* Parse the input text into an unescaped cinput, and populate item. */ +static const unsigned char *parse_string(cJSON *item, const unsigned char *input, const unsigned char **error_pointer) +{ + const unsigned char *input_pointer = input + 1; + const unsigned char *input_end = input + 1; + unsigned char *output_pointer = NULL; + unsigned char *output = NULL; + + /* not a string */ + if (*input != '\"') + { + *error_pointer = input; + goto fail; + } + + { + /* calculate approximate size of the output (overestimate) */ + size_t allocation_length = 0; + size_t skipped_bytes = 0; + while ((*input_end != '\"') && (*input_end != '\0')) { - *ptr2++ = *ptr++; + /* is escape sequence */ + if (input_end[0] == '\\') + { + if (input_end[1] == '\0') + { + /* prevent buffer overflow when last input character is a backslash */ + goto fail; + } + skipped_bytes++; + input_end++; + } + input_end++; + } + if (*input_end == '\0') + { + goto fail; /* string ended unexpectedly */ + } + + /* This is at most how much we need for the output */ + allocation_length = (size_t) (input_end - input) - skipped_bytes; + output = (unsigned char*)cJSON_malloc(allocation_length + sizeof('\0')); + if (output == NULL) + { + goto fail; /* allocation failure */ + } + } + + output_pointer = output; + /* loop through the string literal */ + while (input_pointer < input_end) + { + if (*input_pointer != '\\') + { + *output_pointer++ = *input_pointer++; } /* escape sequence */ else { - ptr++; - switch (*ptr) + uint8_t sequence_length = 2; + switch (input_pointer[1]) { case 'b': - *ptr2++ = '\b'; + *output_pointer++ = '\b'; break; case 'f': - *ptr2++ = '\f'; + *output_pointer++ = '\f'; break; case 'n': - *ptr2++ = '\n'; + *output_pointer++ = '\n'; break; case 'r': - *ptr2++ = '\r'; + *output_pointer++ = '\r'; break; case 't': - *ptr2++ = '\t'; + *output_pointer++ = '\t'; break; case '\"': case '\\': case '/': - *ptr2++ = *ptr; + *output_pointer++ = input_pointer[1]; break; + + /* UTF-16 literal */ case 'u': - /* transcode utf16 to utf8. See RFC2781 and RFC3629. */ - uc = parse_hex4(ptr + 1); /* get the unicode char. */ - ptr += 4; - if (ptr >= end_ptr) + sequence_length = utf16_literal_to_utf8(input_pointer, input_end, &output_pointer, error_pointer); + if (sequence_length == 0) { - /* invalid */ - *ep = str; + /* failed to convert UTF16-literal to UTF-8 */ goto fail; } - /* check for invalid. */ - if (((uc >= 0xDC00) && (uc <= 0xDFFF)) || (uc == 0)) - { - *ep = str; - goto fail; - } - - /* UTF16 surrogate pairs. */ - if ((uc >= 0xD800) && (uc<=0xDBFF)) - { - if ((ptr + 6) > end_ptr) - { - /* invalid */ - *ep = str; - goto fail; - } - if ((ptr[1] != '\\') || (ptr[2] != 'u')) - { - /* missing second-half of surrogate. */ - *ep = str; - goto fail; - } - uc2 = parse_hex4(ptr + 3); - ptr += 6; /* \uXXXX */ - if ((uc2 < 0xDC00) || (uc2 > 0xDFFF)) - { - /* invalid second-half of surrogate. */ - *ep = str; - goto fail; - } - /* calculate unicode codepoint from the surrogate pair */ - uc = 0x10000 + (((uc & 0x3FF) << 10) | (uc2 & 0x3FF)); - } - - /* encode as UTF8 - * takes at maximum 4 bytes to encode: - * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ - len = 4; - if (uc < 0x80) - { - /* normal ascii, encoding 0xxxxxxx */ - len = 1; - } - else if (uc < 0x800) - { - /* two bytes, encoding 110xxxxx 10xxxxxx */ - len = 2; - } - else if (uc < 0x10000) - { - /* three bytes, encoding 1110xxxx 10xxxxxx 10xxxxxx */ - len = 3; - } - ptr2 += len; - - switch (len) { - case 4: - /* 10xxxxxx */ - *--ptr2 = (unsigned char)((uc | 0x80) & 0xBF); - uc >>= 6; - case 3: - /* 10xxxxxx */ - *--ptr2 = (unsigned char)((uc | 0x80) & 0xBF); - uc >>= 6; - case 2: - /* 10xxxxxx */ - *--ptr2 = (unsigned char)((uc | 0x80) & 0xBF); - uc >>= 6; - case 1: - /* depending on the length in bytes this determines the - * encoding ofthe first UTF8 byte */ - *--ptr2 = (unsigned char)((uc | firstByteMark[len]) & 0xFF); - break; - default: - *ep = str; - goto fail; - } - ptr2 += len; break; + default: - *ep = str; + *error_pointer = input_pointer; goto fail; } - ptr++; + input_pointer += sequence_length; } } - *ptr2 = '\0'; - if (*ptr == '\"') - { - ptr++; - } + + /* zero terminate the output */ + *output_pointer = '\0'; item->type = cJSON_String; - item->valuestring = (char*)out; + item->valuestring = (char*)output; - return ptr; + return input_end + 1; fail: - if (out != NULL) + if (output != NULL) { - cJSON_free(out); + cJSON_free(output); } return NULL;