v/vlib/builtin/utf8.v

// Copyright (c) 2019-2021 Alexander Medvednikov. All rights reserved.
// Use of this source code is governed by an MIT license
// that can be found in the LICENSE file.
module builtin

// utf8_char_len returns the byte length of the UTF-8 sequence introduced by
// the lead byte `b`: 2 for 0xC0..0xDF, 3 for 0xE0..0xEF, 4 for 0xF0..0xFF,
// and 1 for everything else (ASCII as well as stray continuation bytes).
pub fn utf8_char_len(b byte) int {
	// 0xe5000000 is a packed table of sixteen 2-bit entries, indexed by the
	// high nibble of `b`; `offset` is twice that nibble, i.e. the bit
	// position of the entry. Each entry stores "length - 1".
	offset := (b >> 3) & 0x1e
	return ((0xe5000000 >> offset) & 3) + 1
}

// utf32_to_str converts the UTF-32 value (Unicode code point) `code` to a
// string holding its UTF-8 encoding.
// It returns an empty string when `code` is not encodable (see
// utf32_to_str_no_malloc for the accepted range).
pub fn utf32_to_str(code u32) string {
	unsafe {
		// 4 bytes is the longest encoding written, plus 1 for the 0 terminator
		buffer := malloc(5)
		res := utf32_to_str_no_malloc(code, buffer)
		if res.len == 0 {
			// The code point was rejected: the result is the '' literal and
			// the buffer was never written to, so release it instead of
			// leaking it.
			free(buffer)
		}
		return res
	}
}

// utf32_to_str_no_malloc encodes the code point `code` as UTF-8 into `buf`
// and returns a string built over that buffer with `tos`.
// `buf` must point to at least 5 writable bytes (up to 4 encoded bytes plus
// a 0 terminator are stored).
// For `code` values above 0x10FFFF nothing is written and '' is returned.
[unsafe]
pub fn utf32_to_str_no_malloc(code u32, buf voidptr) string {
	mut res := ''
	unsafe {
		mut buffer := &byte(buf)
		// NB: the range checks are done on the unsigned value. Converting to
		// a signed int first would make codes >= 0x80000000 negative, and
		// they would then be wrongly emitted as a single byte.
		if code <= 0x7F {
			// 1 byte: 0xxxxxxx
			buffer[0] = byte(code)
			buffer[1] = 0
			res = tos(buffer, 1)
		} else if code <= 0x7FF {
			// 2 bytes: 110xxxxx 10xxxxxx
			buffer[0] = 192 | byte(code >> 6) // 0xC0 - 110xxxxx
			buffer[1] = 128 | byte(code & 63) // 0x80 - 0x3F - 10xxxxxx
			buffer[2] = 0
			res = tos(buffer, 2)
		} else if code <= 0xFFFF {
			// 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
			buffer[0] = 224 | byte(code >> 12) // 0xE0 - 1110xxxx
			buffer[1] = 128 | (byte(code >> 6) & 63) // 0x80 - 0x3F - 10xxxxxx
			buffer[2] = 128 | byte(code & 63) // 0x80 - 0x3F - 10xxxxxx
			buffer[3] = 0
			res = tos(buffer, 3)
		} else if code <= 0x10FFFF {
			// 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			buffer[0] = 240 | byte(code >> 18) // 0xF0 - 11110xxx
			buffer[1] = 128 | (byte(code >> 12) & 63) // 0x80 - 0x3F - 10xxxxxx
			buffer[2] = 128 | (byte(code >> 6) & 63) // 0x80 - 0x3F - 10xxxxxx
			buffer[3] = 128 | byte(code & 63) // 0x80 - 0x3F - 10xxxxxx
			buffer[4] = 0
			res = tos(buffer, 4)
		}
	}
	res.is_lit = 1 // let autofree know this string doesn't have to be freed
	return res
}

// utf32_code decodes the single UTF-8 encoded character stored in `_rune`
// and returns its code point (UTF-32 value).
// The sequence length is taken from `_rune.len`, not from the lead byte, so
// the receiver must hold exactly one encoded character.
// It returns 0 for an empty string and for input longer than 4 bytes, which
// cannot be a single UTF-8 sequence.
pub fn (_rune string) utf32_code() int {
	// Longer input would also make `6 - _rune.len` below zero or negative,
	// i.e. an invalid shift amount.
	if _rune.len == 0 || _rune.len > 4 {
		return 0
	}
	// save ASC symbol as is
	if _rune.len == 1 {
		return int(_rune[0])
	}
	mut b := byte(int(_rune[0]))
	// TODO should be
	// res := int( rune[0] << rune.len)
	// Shifting the lead byte left by the sequence length inside a byte
	// drops its length marker bits (110, 1110 or 11110).
	b = b << _rune.len
	mut res := int(b)
	// The first shift only has to complete the 6 bit step that the byte
	// shift above already started; every following step is a full 6 bits.
	mut shift := 6 - _rune.len
	for i := 1; i < _rune.len; i++ {
		c := int(_rune[i])
		res = res << shift
		res |= c & 63 // 0x3f - payload bits of a continuation byte
		shift = 6
	}
	return res
}

// utf8_len returns how many zero bits precede the most significant set bit
// of `c`, looking only at the 7 upper bit positions (so the result is in
// the range 0..7, and both 0 and 1 yield 7).
// NOTE(review): the name suggests it is used to derive a UTF-8 sequence
// length from a (presumably inverted) first byte - confirm with the callers.
fn utf8_len(c byte) int {
	mut zeros := 0
	// scan from bit 7 down to bit 1, stopping at the first set bit
	for mask := byte(0x80); mask > 1 && (c & mask) == 0; mask >>= 1 {
		zeros++
	}
	return zeros
}

// utf8_str_len returns the length of `s` in code points.
// Every byte at which the scan stops counts as one code point; when that
// byte has its top bit set, one following byte is skipped for each further
// consecutive 1 bit after the top one (the UTF-8 continuation byte count).
fn utf8_str_len(s string) int {
	mut count := 0
	mut pos := 0
	for pos < s.len {
		lead := unsafe { s.str[pos] }
		pos++
		count++
		if lead < 0x80 {
			// plain ASCII byte, nothing to skip
			continue
		}
		mut mask := byte(0x40)
		for (lead & mask) != 0 {
			pos++
			mask >>= 1
		}
	}
	return count
}

// utf8_str_visible_length returns the length of `s` for formatting purposes,
// i.e. the number of terminal columns its "characters" are expected to take:
// combining diacritical marks count as 0, the wide (CJK, Hangul, emoji)
// ranges listed below count as 2, everything else counts as 1.
// Counting stops at an incomplete trailing UTF-8 sequence.
// This is simplified implementation. if you need specification compliant width,
// use utf8.east_asian.display_width.
pub fn utf8_str_visible_length(s string) int {
	mut l := 0
	mut ul := 1
	for i := 0; i < s.len; i += ul {
		// ul = byte length of the current sequence, derived from the number
		// of leading 1 bits of its first byte
		ul = 1
		c := unsafe { s.str[i] }
		if (c & (1 << 7)) != 0 {
			for t := byte(1 << 6); (c & t) != 0; t >>= 1 {
				ul++
			}
		}
		if i + ul > s.len { // incomplete UTF-8 sequence
			return l
		}
		l++
		// recognize combining characters and wide characters
		// (`r` holds the raw UTF-8 bytes of the sequence packed big-endian,
		// so all the range bounds below are UTF-8 byte patterns, not code points)
		match ul {
			2 {
				r := u64((u16(c) << 8) | unsafe { s.str[i + 1] })
				if r >= 0xcc80 && r < 0xcdb0 {
					// diacritical marks
					l--
				}
			}
			3 {
				r := u64((u32(c) << 16) | unsafe { (u32(s.str[i + 1]) << 8) | s.str[i + 2] })
				// diacritical marks extended
				// diacritical marks supplement
				// diacritical marks for symbols
				if (r >= 0xe1aab0 && r <= 0xe1ac7f)
					|| (r >= 0xe1b780 && r <= 0xe1b87f)
					|| (r >= 0xe28390 && r <= 0xe2847f)
					|| (r >= 0xefb8a0 && r <= 0xefb8af) {
					// diacritical marks
					l--
				}
				// Hangul
				// CJK Unified Ideographs
				// Hangul
				// CJK
				else if (r >= 0xe18480 && r <= 0xe1859f)
					|| (r >= 0xe2ba80 && r <= 0xe2bf95)
					|| (r >= 0xe38080 && r <= 0xe4b77f)
					|| (r >= 0xe4b880 && r <= 0xea807f)
					|| (r >= 0xeaa5a0 && r <= 0xeaa79f)
					|| (r >= 0xeab080 && r <= 0xed9eaf)
					|| (r >= 0xefa480 && r <= 0xefac7f)
					|| (r >= 0xefb8b8 && r <= 0xefb9af) {
					// half marks
					l++
				}
			}
			4 {
				r := u64((u32(c) << 24) | unsafe {
					(u32(s.str[i + 1]) << 16) | (u32(s.str[i + 2]) << 8) | s.str[i + 3]
				})
				// Enclosed Ideographic Supplement
				// Emoji
				// CJK Unified Ideographs Extension B-G
				// NB: the first lower bound is 0xf09f8880 (U+1F200); with the
				// digits transposed (0x0f9f8880) every 4 byte sequence below
				// U+1F28F would be treated as wide.
				if (r >= 0xf09f8880 && r <= 0xf09f8a8f)
					|| (r >= 0xf09f8c80 && r <= 0xf09f9c90)
					|| (r >= 0xf09fa490 && r <= 0xf09fa7af)
					|| (r >= 0xf0a08080 && r <= 0xf180807f) {
					l++
				}
			}
			else {}
		}
	}
	return l
}
all: update copyright to 2019-2021 (#8029) 2021-01-18 15:20:06 +03:00			`// Copyright (c) 2019-2021 Alexander Medvednikov. All rights reserved.`
all: add copyright 2019-06-23 05:21:30 +03:00			`// Use of this source code is governed by an MIT license`
			`// that can be found in the LICENSE file.`
V 0.0.12 open-source release 2019-06-22 21:20:28 +03:00			`module builtin`

more C warnings fixed 2019-09-15 15:36:05 +03:00			`pub fn utf8_char_len(b byte) int {`
builtin: correct error underline for unicode wide chars (#9010) 2021-03-01 02:18:02 +03:00			`return ((0xe5000000 >> ((b >> 3) & 0x1e)) & 3) + 1`
V 0.0.12 open-source release 2019-06-22 21:20:28 +03:00			`}`

			`// Convert utf32 to utf8`
			`// utf32 == Codepoint`
builtin: public/private functions, remove lots of duplicate functionality (string.eq, compare_strings, etc) 2019-06-27 14:14:59 +03:00			`pub fn utf32_to_str(code u32) string {`
checker: check `unsafe` V function calls (#8752) 2021-02-14 21:31:42 +03:00			`unsafe {`
			`mut buffer := malloc(5)`
			`return utf32_to_str_no_malloc(code, buffer)`
			`}`
V 0.0.12 open-source release 2019-06-22 21:20:28 +03:00			`}`

checker: check `unsafe` V function calls (#8752) 2021-02-14 21:31:42 +03:00			`[unsafe]`
builtin: public/private functions, remove lots of duplicate functionality (string.eq, compare_strings, etc) 2019-06-27 14:14:59 +03:00			`pub fn utf32_to_str_no_malloc(code u32, buf voidptr) string {`
run vfmt on vlib/builtin 2019-12-19 23:52:45 +03:00			`icode := int(code) // Prevents doing casts everywhere`
autofree: builtin fixes 2020-11-22 00:33:31 +03:00			`mut res := ''`
v/checker: Warn about pointer indexing outside unsafe {} (#5918) 2020-07-22 20:28:53 +03:00			`unsafe {`
ci: fix some of `v test-cleancode` 2 2021-04-05 20:55:03 +03:00			`mut buffer := &byte(buf)`
builtin: correct error underline for unicode wide chars (#9010) 2021-03-01 02:18:02 +03:00			`if icode <= 127 {`
			`// 0x7F`
v/checker: Warn about pointer indexing outside unsafe {} (#5918) 2020-07-22 20:28:53 +03:00			`buffer[0] = byte(icode)`
builtin: add 0 terminators for strings returned by .to_lower, .to_upper, utf32_to_str_no_malloc/2 2021-03-14 19:21:45 +03:00			`buffer[1] = 0`
autofree: builtin fixes 2020-11-22 00:33:31 +03:00			`res = tos(buffer, 1)`
builtin: correct error underline for unicode wide chars (#9010) 2021-03-01 02:18:02 +03:00			`} else if icode <= 2047 {`
			`// 0x7FF`
			`buffer[0] = 192 \| byte(icode >> 6) // 0xC0 - 110xxxxx`
			`buffer[1] = 128 \| byte(icode & 63) // 0x80 - 0x3F - 10xxxxxx`
builtin: add 0 terminators for strings returned by .to_lower, .to_upper, utf32_to_str_no_malloc/2 2021-03-14 19:21:45 +03:00			`buffer[2] = 0`
autofree: builtin fixes 2020-11-22 00:33:31 +03:00			`res = tos(buffer, 2)`
builtin: correct error underline for unicode wide chars (#9010) 2021-03-01 02:18:02 +03:00			`} else if icode <= 65535 {`
			`// 0xFFFF`
			`buffer[0] = 224 \| byte(icode >> 12) // 0xE0 - 1110xxxx`
			`buffer[1] = 128 \| (byte(icode >> 6) & 63) // 0x80 - 0x3F - 10xxxxxx`
			`buffer[2] = 128 \| byte(icode & 63) // 0x80 - 0x3F - 10xxxxxx`
builtin: add 0 terminators for strings returned by .to_lower, .to_upper, utf32_to_str_no_malloc/2 2021-03-14 19:21:45 +03:00			`buffer[3] = 0`
autofree: builtin fixes 2020-11-22 00:33:31 +03:00			`res = tos(buffer, 3)`
v/checker: Warn about pointer indexing outside unsafe {} (#5918) 2020-07-22 20:28:53 +03:00			`}`
builtin: correct error underline for unicode wide chars (#9010) 2021-03-01 02:18:02 +03:00			`// 0x10FFFF`
			`else if icode <= 1114111 {`
			`buffer[0] = 240 \| byte(icode >> 18) // 0xF0 - 11110xxx`
			`buffer[1] = 128 \| (byte(icode >> 12) & 63) // 0x80 - 0x3F - 10xxxxxx`
			`buffer[2] = 128 \| (byte(icode >> 6) & 63) // 0x80 - 0x3F - 10xxxxxx`
			`buffer[3] = 128 \| byte(icode & 63) // 0x80 - 0x3F - 10xxxxxx`
builtin: add 0 terminators for strings returned by .to_lower, .to_upper, utf32_to_str_no_malloc/2 2021-03-14 19:21:45 +03:00			`buffer[4] = 0`
autofree: builtin fixes 2020-11-22 00:33:31 +03:00			`res = tos(buffer, 4)`
v/checker: Warn about pointer indexing outside unsafe {} (#5918) 2020-07-22 20:28:53 +03:00			`}`
compiler & builtin: bitshifts CAO fix and C code removal in utf8 utf8: removed bitshift cao in code utf8: added pub on functions utf8: byteptr to voidptr utf8: converts voidptr to byteptr 2019-06-27 03:33:49 +03:00			`}`
autofree: builtin fixes 2020-11-22 00:33:31 +03:00			`res.is_lit = 1 // let autofree know this string doesn't have to be freed`
			`return res`
V 0.0.12 open-source release 2019-06-22 21:20:28 +03:00			`}`

			`// Convert utf8 to utf32`
builtin: public/private functions, remove lots of duplicate functionality (string.eq, compare_strings, etc) 2019-06-27 14:14:59 +03:00			`pub fn (_rune string) utf32_code() int {`
V 0.0.12 open-source release 2019-06-22 21:20:28 +03:00			`if _rune.len == 0 {`
			`return 0`
			`}`
			`// save ASC symbol as is`
			`if _rune.len == 1 {`
			`return int(_rune[0])`
			`}`
compiler & builtin: bitshifts CAO fix and C code removal in utf8 utf8: removed bitshift cao in code utf8: added pub on functions utf8: byteptr to voidptr utf8: converts voidptr to byteptr 2019-06-27 03:33:49 +03:00			`mut b := byte(int(_rune[0]))`
V 0.0.12 open-source release 2019-06-22 21:20:28 +03:00			`// TODO should be`
			`// res := int( rune[0] << rune.len)`
builtin: correct error underline for unicode wide chars (#9010) 2021-03-01 02:18:02 +03:00			`b = b << _rune.len`
compiler & builtin: bitshifts CAO fix and C code removal in utf8 utf8: removed bitshift cao in code utf8: added pub on functions utf8: byteptr to voidptr utf8: converts voidptr to byteptr 2019-06-27 03:33:49 +03:00			`mut res := int(b)`
V 0.0.12 open-source release 2019-06-22 21:20:28 +03:00			`mut shift := 6 - _rune.len`
			`for i := 1; i < _rune.len; i++ {`
			`c := int(_rune[i])`
builtin: correct error underline for unicode wide chars (#9010) 2021-03-01 02:18:02 +03:00			`res = res << shift`
parser: skip $if windows etc blocks on a different os 2019-11-13 21:14:37 +03:00			`res \|= c & 63 // 0x3f`
V 0.0.12 open-source release 2019-06-22 21:20:28 +03:00			`shift = 6`
			`}`
			`return res`
			`}`

builtin: getchar for unicode characters 2019-09-15 04:41:24 +03:00			`// Calculate length to read from the first byte`
			`fn utf8_len(c byte) int {`
run vfmt on vlib/builtin 2019-12-19 23:52:45 +03:00			`mut b := 0`
			`mut x := c`
checker: do not allow extra () in if conditions 2020-03-27 16:57:19 +03:00			`if (x & 240) != 0 {`
run vfmt on vlib/builtin 2019-12-19 23:52:45 +03:00			`// 0xF0`
			`x >>= 4`
builtin: correct error underline for unicode wide chars (#9010) 2021-03-01 02:18:02 +03:00			`} else {`
run vfmt on vlib/builtin 2019-12-19 23:52:45 +03:00			`b += 4`
			`}`
checker: do not allow extra () in if conditions 2020-03-27 16:57:19 +03:00			`if (x & 12) != 0 {`
run vfmt on vlib/builtin 2019-12-19 23:52:45 +03:00			`// 0x0C`
			`x >>= 2`
builtin: correct error underline for unicode wide chars (#9010) 2021-03-01 02:18:02 +03:00			`} else {`
run vfmt on vlib/builtin 2019-12-19 23:52:45 +03:00			`b += 2`
			`}`
checker: do not allow extra () in if conditions 2020-03-27 16:57:19 +03:00			`if (x & 2) == 0 {`
run vfmt on vlib/builtin 2019-12-19 23:52:45 +03:00			`// 0x02`
			`b++`
			`}`
			`return b`
builtin: getchar for unicode characters 2019-09-15 04:41:24 +03:00			`}`

utf: fix string length calculation for combining characters 2020-05-04 14:21:11 +03:00			`// Calculate string length for in number of codepoints`
cgen: string interpolation field width support for large integers and utf8 strings 2020-05-02 01:43:59 +03:00			`fn utf8_str_len(s string) int {`
			`mut l := 0`
			`for i := 0; i < s.len; i++ {`
			`l++`
builtin: correct error underline for unicode wide chars (#9010) 2021-03-01 02:18:02 +03:00			`c := unsafe { s.str[i] }`
cgen: string interpolation field width support for large integers and utf8 strings 2020-05-02 01:43:59 +03:00			`if (c & (1 << 7)) != 0 {`
			`for t := byte(1 << 6); (c & t) != 0; t >>= 1 {`
			`i++`
			`}`
			`}`
			`}`
			`return l`
			`}`

utf: fix string length calculation for combining characters 2020-05-04 14:21:11 +03:00			`// Calculate string length for formatting, i.e. number of "characters"`
builtin: correct error underline for unicode wide chars (#9010) 2021-03-01 02:18:02 +03:00			`// This is simplified implementation. if you need specification compliant width,`
			`// use utf8.east_asian.display_width.`
			`pub fn utf8_str_visible_length(s string) int {`
utf: fix string length calculation for combining characters 2020-05-04 14:21:11 +03:00			`mut l := 0`
			`mut ul := 1`
builtin: correct error underline for unicode wide chars (#9010) 2021-03-01 02:18:02 +03:00			`for i := 0; i < s.len; i += ul {`
utf: fix string length calculation for combining characters 2020-05-04 14:21:11 +03:00			`ul = 1`
builtin: correct error underline for unicode wide chars (#9010) 2021-03-01 02:18:02 +03:00			`c := unsafe { s.str[i] }`
utf: fix string length calculation for combining characters 2020-05-04 14:21:11 +03:00			`if (c & (1 << 7)) != 0 {`
			`for t := byte(1 << 6); (c & t) != 0; t >>= 1 {`
			`ul++`
			`}`
			`}`
			`if i + ul > s.len { // incomplete UTF-8 sequence`
			`return l`
			`}`
			`l++`
builtin: correct error underline for unicode wide chars (#9010) 2021-03-01 02:18:02 +03:00			`// recognize combining characters and wide characters`
			`match ul {`
			`2 {`
			`r := u64((u16(c) << 8) \| unsafe { s.str[i + 1] })`
			`if r >= 0xcc80 && r < 0xcdb0 {`
			`// diacritical marks`
			`l--`
			`}`
			`}`
			`3 {`
			`r := u64((u32(c) << 16) \| unsafe { (u32(s.str[i + 1]) << 8) \| s.str[i + 2] })`
			`// diacritical marks extended`
			`// diacritical marks supplement`
			`// diacritical marks for symbols`
			`if (r >= 0xe1aab0 && r <= 0xe1ac7f)`
			`\|\| (r >= 0xe1b780 && r <= 0xe1b87f)`
			`\|\| (r >= 0xe28390 && r <= 0xe2847f)`
			`\|\| (r >= 0xefb8a0 && r <= 0xefb8af) {`
			`// diacritical marks`
			`l--`
			`}`
			`// Hangru`
			`// CJK Unified Ideographics`
			`// Hangru`
			`// CJK`
			`else if (r >= 0xe18480 && r <= 0xe1859f)`
			`\|\| (r >= 0xe2ba80 && r <= 0xe2bf95)`
			`\|\| (r >= 0xe38080 && r <= 0xe4b77f)`
			`\|\| (r >= 0xe4b880 && r <= 0xea807f)`
			`\|\| (r >= 0xeaa5a0 && r <= 0xeaa79f)`
			`\|\| (r >= 0xeab080 && r <= 0xed9eaf)`
			`\|\| (r >= 0xefa480 && r <= 0xefac7f)`
			`\|\| (r >= 0xefb8b8 && r <= 0xefb9af) {`
			`// half marks`
			`l++`
			`}`
utf: fix string length calculation for combining characters 2020-05-04 14:21:11 +03:00			`}`
builtin: correct error underline for unicode wide chars (#9010) 2021-03-01 02:18:02 +03:00			`4 {`
			`r := u64((u32(c) << 24) \| unsafe {`
			`(u32(s.str[i + 1]) << 16) \| (u32(s.str[i + 2]) << 8) \| s.str[i + 3]`
			`})`
			`// Enclosed Ideographic Supplement`
			`// Emoji`
			`// CJK Unified Ideographs Extension B-G`
			`if (r >= 0x0f9f8880 && r <= 0xf09f8a8f)`
			`\|\| (r >= 0xf09f8c80 && r <= 0xf09f9c90)`
			`\|\| (r >= 0xf09fa490 && r <= 0xf09fa7af)`
builtin: fix for utf8_str_visible_length for CJK Unified Ideographs Extension G (#9739) 2021-04-15 09:20:53 +03:00			`\|\| (r >= 0xf0a08080 && r <= 0xf180807f) {`
builtin: correct error underline for unicode wide chars (#9010) 2021-03-01 02:18:02 +03:00			`l++`
			`}`
utf: fix string length calculation for combining characters 2020-05-04 14:21:11 +03:00			`}`
builtin: correct error underline for unicode wide chars (#9010) 2021-03-01 02:18:02 +03:00			`else {}`
utf: fix string length calculation for combining characters 2020-05-04 14:21:11 +03:00			`}`
			`}`
			`return l`
			`}`