Update to cJSON! We now support UTF-16 surrogate pairs :)

git-svn-id: http://svn.code.sf.net/p/cjson/code@41 e3330c51-1366-4df0-8b21-3ccf24e3d50e
2023-08-10 21:13:26 +03:00 · 2011-10-10 15:22:34 +00:00
parent 9061b7a7e7
commit 0d268cfef7
1 changed file with 17 additions and 5 deletions
--- a/cJSON.c
+++ b/cJSON.c
@@ -142,7 +142,7 @@ static char *print_number(cJSON *item)
 static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
 static const char *parse_string(cJSON *item,const char *str)
 {
-	const char *ptr=str+1;char *ptr2;char *out;int len=0;unsigned uc;
+	const char *ptr=str+1;char *ptr2;char *out;int len=0;unsigned uc,uc2;
 	if (*str!='\"') {ep=str;return 0;}	/* not a string! */
 	
 	while (*ptr!='\"' && *ptr && ++len) if (*ptr++ == '\\') ptr++;	/* Skip escaped quotes. */
@@ -164,16 +164,28 @@ static const char *parse_string(cJSON *item,const char *str)
 				case 'n': *ptr2++='\n';	break;
 				case 'r': *ptr2++='\r';	break;
 				case 't': *ptr2++='\t';	break;
-				case 'u':	 /* transcode utf16 to utf8. DOES NOT SUPPORT SURROGATE PAIRS CORRECTLY. */
-					sscanf(ptr+1,"%4x",&uc);	/* get the unicode char. */
-					len=3;if (uc<0x80) len=1;else if (uc<0x800) len=2;ptr2+=len;
+				case 'u':	 /* transcode utf16 to utf8. */
+					sscanf(ptr+1,"%4x",&uc);ptr+=4;	/* get the unicode char. */
+
+					if ((uc>=0xDC00 && uc<=0xDFFF) || uc==0)	break;	// check for invalid.
+
+					if (uc>=0xD800 && uc<=0xDBFF)	// UTF16 surrogate pairs.
+					{
+						if (ptr[1]!='\\' || ptr[2]!='u')	break;	// missing second-half of surrogate.
+						sscanf(ptr+3,"%4x",&uc2);ptr+=6;
+						if (uc2<0xDC00 || uc2>0xDFFF)		break;	// invalid second-half of surrogate.
+						uc=0x10000 | ((uc&0x3FF)<<10) | (uc2&0x3FF);
+					}
+
+					len=4;if (uc<0x80) len=1;else if (uc<0x800) len=2;else if (uc<0x10000) len=3; ptr2+=len;
 					
 					switch (len) {
+						case 4: *--ptr2 =((uc | 0x80) & 0xBF); uc >>= 6;
 						case 3: *--ptr2 =((uc | 0x80) & 0xBF); uc >>= 6;
 						case 2: *--ptr2 =((uc | 0x80) & 0xBF); uc >>= 6;
 						case 1: *--ptr2 =(uc | firstByteMark[len]);
 					}
-					ptr2+=len;ptr+=4;
+					ptr2+=len;
 					break;
 				default:  *ptr2++=*ptr; break;
 			}