x.json2: proper string encoding + minor fixes (#9026)

2023-08-10 21:13:21 +03:00 · 2021-03-01 17:22:36 +08:00 · 2021-03-01 17:22:36 +08:00 · ee879f3e41
commit ee879f3e41
parent 506041a15b
4 changed files with 129 additions and 8 deletions
--- a/vlib/x/json2/encoder.v
+++ b/vlib/x/json2/encoder.v
@ -6,7 +6,7 @@ module json2
 import strings

 fn write_value(v Any, i int, len int, mut wr strings.Builder) {
-	str := v.str()
+	str := v.json_str()
 	if v is string {
 		wr.write_string('"$str"')
 	} else {
@ -51,11 +51,21 @@ pub fn (flds []Any) str() string {
 	return res
 }

-// str returns the string representation of the `Any` type.
+// str returns the string representation of the `Any` type. A string value is
+// returned as-is, without any escaping; every other value yields the same output
+// as `json_str`. Use the `json_str` method if you need the JSON-escaped form of a string.
 pub fn (f Any) str() string {
+	if f is string {
+		return f
+	} else {
+		return f.json_str()
+	}
+}
+
+// json_str returns the JSON string representation of the `Any` type.
+pub fn (f Any) json_str() string {
 	match f {
 		string {
-			return f
+			return json_string(f)
 		}
 		int {
 			return f.str()
@ -85,3 +95,76 @@ pub fn (f Any) str() string {
 		}
 	}
 }
+
+// char_len_list is a modified version of builtin.utf8_str_len
+// that returns an array of character lengths in bytes. (e.g "t✔" => [1,3])
+fn char_len_list(s string) []int {
+	// l holds the byte length of the character currently being measured.
+	mut l := 1
+	mut ls := []int{}
+	for i := 0; i < s.len; i++ {
+		c := s[i]
+		// A set high bit means this byte is not plain ASCII.
+		if (c & (1 << 7)) != 0 {
+			// Every further set bit, scanning down from bit 6, adds one byte to
+			// the length and advances `i` by one byte.
+			for t := byte(1 << 6); (c & t) != 0; t >>= 1 {
+				l++
+				i++
+			}
+		}
+		ls << l
+		// reset the length for the next character
+		l = 1
+	}
+	return ls
+}
+
+const escaped_chars = [r'\b', r'\f', r'\n', r'\r', r'\t']
+
+// json_string returns the JSON spec-compliant version of the string.
+// Single-byte characters found in `important_escapable_chars` are replaced by the
+// entry at the same index in `escaped_chars`; `"`, `/` and `\` are prefixed with a
+// backslash; any other single byte is copied unchanged. Multi-byte characters are
+// written as `\u` followed by their hex code when that code is exactly 4 digits
+// long, otherwise they are replaced by a single space.
+[manualfree]
+fn json_string(s string) string {
+	// not the best implementation but will revisit it soon
+	char_lens := char_len_list(s)
+	mut sb := strings.new_builder(s.len)
+	// i is the byte offset in `s` of the character being processed.
+	mut i := 0
+	defer {
+		unsafe {
+			char_lens.free()
+			// freeing string builder on defer after
+			// returning .str() still isn't working :(
+			// sb.free()
+		}
+	}
+	for char_len in char_lens {
+		if char_len == 1 {
+			chr := s[i]
+			if chr in json2.important_escapable_chars {
+				// look up the index of chr to pick the matching escape sequence
+				for j := 0 ; j < json2.important_escapable_chars.len; j++ {
+					if chr == json2.important_escapable_chars[j] {
+						sb.write_string(escaped_chars[j])
+						break
+					}
+				}
+			} else if chr == `"` || chr == `/` || chr == `\\` {
+				// these characters are escaped by prefixing them with a backslash
+				sb.write_string('\\' + chr.ascii_str())
+			} else {
+				sb.write_b(chr)
+			}
+		} else {
+			// multi-byte character: take its bytes and convert the code point to hex
+			slice := s[i .. i + char_len]
+			hex_code := slice.utf32_code().hex()
+			// NOTE(review): only codes that are exactly 4 hex digits long are emitted;
+			// if `hex()` does not zero-pad, shorter codes also end up in the space
+			// branch below - confirm whether that is intended.
+			if hex_code.len == 4 {
+				sb.write_string('\\u$hex_code')
+			} else {
+				// TODO: still figuring out what
+				// to do with more than 4 chars
+				sb.write_b(` `)
+			}
+			unsafe {
+				slice.free()
+				hex_code.free()
+			}
+		}
+		i += char_len
+	}
+	str := sb.str()
+	unsafe { sb.free() }
+	return str
+}
--- a/vlib/x/json2/encoder_test.v
+++ b/vlib/x/json2/encoder_test.v
@ -0,0 +1,21 @@
+import x.json2
+
+// Decodes a JSON string made only of escape sequences and checks that
+// json_str() turns the decoded value back into the same escaped text
+// (without the surrounding quotes).
+fn test_json_string_characters() {
+	text := json2.raw_decode(r'"\n\r\b\f\t\\\"\/"') or { '' }
+	assert text.json_str() == '\\n\\r\\b\\f\\t\\\\\\"\\/'
+}
+
+// Checks that a multi-byte character inside ASCII text is written as a
+// `\u` escape while the surrounding ASCII characters are kept as they are.
+fn test_json_string() {
+	text := json2.Any('te✔st')
+	assert text.json_str() == r'te\u2714st'
+}
+
+// Pins the current behavior for an emoji: json_str() is expected to
+// return a single space instead of an escape sequence.
+fn test_json_string_emoji() {
+	text := json2.Any('🐈')
+	assert text.json_str() == r' '
+}
+
+// Checks that every character of a string consisting only of multi-byte
+// characters is written as its own `\u` escape.
+fn test_json_string_non_ascii() {
+	text := json2.Any('ひらがな')
+	assert text.json_str() == r'\u3072\u3089\u304c\u306a'
+}
--- a/vlib/x/json2/scanner.v
+++ b/vlib/x/json2/scanner.v
@ -41,12 +41,12 @@ const (
 	// list of characters commonly used in JSON.
 	char_list                 = [`{`, `}`, `[`, `]`, `,`, `:`]
 	// list of newlines to check when moving to a new position.
-	newlines                  = [`\r`, `\n`, byte(9), `\t`]
+	newlines                  = [`\r`, `\n`, `\t`]
 	// list of escapable that needs to be escaped inside a JSON string.
 	// double quotes and forward slashes are excluded intentionally since
 	// they have their own separate checks for it in order to pass the
 	// JSON test suite (https://github.com/nst/JSONTestSuite/).
-	important_escapable_chars = [byte(9), 10, 0, `\b`, `\f`, `\n`, `\r`, `\t`]
+	important_escapable_chars = [`\b`, `\f`, `\n`, `\r`, `\t`]
 	// list of valid unicode escapes aside from \u{4-hex digits}
 	valid_unicode_escapes     = [`b`, `f`, `n`, `r`, `t`, `\\`, `"`, `/`]
 	// used for transforming escapes into valid unicode (eg. n => \n)
@ -129,7 +129,7 @@ fn (mut s Scanner) text_scan() Token {
 		} else if (s.pos - 1 >= 0 && s.text[s.pos - 1] != `\\`)
 			&& ch in json2.important_escapable_chars {
 			return s.error('character must be escaped with a backslash')
-		} else if s.pos == s.text.len - 1 && ch == `\\` {
+		} else if (s.pos == s.text.len - 1 && ch == `\\`) || ch == byte(0) {
 			return s.error('invalid backslash escape')
 		} else if s.pos + 1 < s.text.len && ch == `\\` {
 			peek := s.text[s.pos + 1]
@ -154,8 +154,15 @@ fn (mut s Scanner) text_scan() Token {
 					if codepoint.len != 4 {
 						return s.error('unicode escape must have 4 hex digits')
 					}
-					chrs << byte(strconv.parse_uint(codepoint.bytestr(), 16, 32))
-					unsafe { codepoint.free() }
+					val := u32(strconv.parse_uint(codepoint.bytestr(), 16, 32))
+					converted := utf32_to_str(val)
+					converted_bytes := converted.bytes()
+					chrs << converted_bytes
+					unsafe {
+						converted.free()
+						converted_bytes.free()
+						codepoint.free()
+					}
 					continue
 				} else {
 					return s.error('incomplete unicode escape')
--- a/vlib/x/json2/scanner_test.v
+++ b/vlib/x/json2/scanner_test.v
@ -20,6 +20,16 @@ fn test_str_valid_unicode_escape() {
 	assert tok.lit.bytestr() == 'H'
 }

+// Scans a string token containing the `\u2714` escape and checks that the
+// resulting literal is the 3-byte sequence for '✔'.
+fn test_str_valid_unicode_escape_2() {
+	mut sc := Scanner{
+		text: r'"\u2714"'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .str_
+	assert tok.lit.len == 3
+	assert tok.lit.bytestr() == '✔'
+}
+
 fn test_str_invalid_escape() {
 	mut sc := Scanner{
 		text: r'"\z"'.bytes()