From ded6c3806167d30449bc4059e39f135a75e52c76 Mon Sep 17 00:00:00 2001
From: Delyan Angelov <delian66@gmail.com>
Date: Fri, 7 Jul 2023 02:40:11 +0300
Subject: [PATCH] vlib: add a new module `builtin.wchar`, to ease dealing with
 C APIs that accept `wchar_t*` (#18794)

---
 vlib/builtin/cfns.c.v           |   2 +-
 vlib/builtin/utf8.c.v           |  14 +++-
 vlib/builtin/utf8_test.v        |   3 +-
 vlib/builtin/wchar/wchar.c.v    | 116 ++++++++++++++++++++++++++++++++
 vlib/builtin/wchar/wchar_test.v |  36 ++++++++++
 5 files changed, 167 insertions(+), 4 deletions(-)
 create mode 100644 vlib/builtin/wchar/wchar.c.v
 create mode 100644 vlib/builtin/wchar/wchar_test.v

diff --git a/vlib/builtin/cfns.c.v b/vlib/builtin/cfns.c.v
index 0979f06112..597d3f8e8f 100644
--- a/vlib/builtin/cfns.c.v
+++ b/vlib/builtin/cfns.c.v
@@ -295,7 +295,7 @@ fn C.SymCleanup(hProcess voidptr)
 
 fn C.MultiByteToWideChar(codePage u32, dwFlags u32, lpMultiMyteStr &char, cbMultiByte int, lpWideCharStr &u16, cchWideChar int) int
 
-fn C.wcslen(str &u16) int
+fn C.wcslen(str voidptr) usize
 
 fn C.WideCharToMultiByte(codePage u32, dwFlags u32, lpWideCharStr &u16, cchWideChar int, lpMultiByteStr &char, cbMultiByte int, lpDefaultChar &char, lpUsedDefaultChar &int) int
 
diff --git a/vlib/builtin/utf8.c.v b/vlib/builtin/utf8.c.v
index de366ed388..30f6d8056c 100644
--- a/vlib/builtin/utf8.c.v
+++ b/vlib/builtin/utf8.c.v
@@ -10,6 +10,8 @@ const cp_utf8 = 65001
 // The returned pointer of .to_wide(), has a type of &u16, and is suitable
 // for passing to Windows APIs that expect LPWSTR or wchar_t* parameters.
 // See also MultiByteToWideChar ( https://learn.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-multibytetowidechar )
+// See also builtin.wchar.from_string/1, for a version that produces a
+// platform dependent L"" C style wchar_t* wide string.
 pub fn (_str string) to_wide() &u16 {
 	$if windows {
 		unsafe {
@@ -29,19 +31,25 @@ pub fn (_str string) to_wide() &u16 {
 			for i, r in srunes {
 				result[i] = u16(r)
 			}
+			result[srunes.len] = 0
 			return result
 		}
 	}
 }
 
 // string_from_wide creates a V string, encoded in UTF-8, given a windows
-// style string encoded in UTF-16.
+// style string encoded in UTF-16. Note that this function first searches
+// for the string terminator 0 character, and is thus slower, while more
+// convenient compared to string_from_wide2/2 (you have to know the length
+// in advance to use string_from_wide2/2).
+// See also builtin.wchar.to_string/1, for a version that eases working with
+// the platform dependent &wchar_t L"" strings.
 [manualfree; unsafe]
 pub fn string_from_wide(_wstr &u16) string {
 	$if windows {
 		unsafe {
 			wstr_len := C.wcslen(_wstr)
-			return string_from_wide2(_wstr, wstr_len)
+			return string_from_wide2(_wstr, int(wstr_len))
 		}
 	} $else {
 		mut i := 0
@@ -56,6 +64,8 @@ pub fn string_from_wide(_wstr &u16) string {
 // style string, encoded in UTF-16. It is more efficient, compared to
 // string_from_wide, but it requires you to know the input string length,
 // and to pass it as the second argument.
+// See also builtin.wchar.to_string2/2, for a version that eases working
+// with the platform dependent &wchar_t L"" strings.
 [manualfree; unsafe]
 pub fn string_from_wide2(_wstr &u16, len int) string {
 	$if windows {
diff --git a/vlib/builtin/utf8_test.v b/vlib/builtin/utf8_test.v
index 72321251f9..275ea14b3a 100644
--- a/vlib/builtin/utf8_test.v
+++ b/vlib/builtin/utf8_test.v
@@ -72,6 +72,7 @@ fn test_string_from_wide2() {
 
 fn test_reverse_cyrillic_with_string_from_wide() {
 	s := 'Проба'
-	z := unsafe { string_from_wide(s.to_wide()) }
+	ws := s.to_wide()
+	z := unsafe { string_from_wide(ws) }
 	assert z == s
 }
diff --git a/vlib/builtin/wchar/wchar.c.v b/vlib/builtin/wchar/wchar.c.v
new file mode 100644
index 0000000000..a04c320b3f
--- /dev/null
+++ b/vlib/builtin/wchar/wchar.c.v
@@ -0,0 +1,116 @@
+module wchar
+
+import strings
+
+#include <wchar.h>
+
+[typedef]
+struct C.wchar_t {}
+
+// Character is a type that eases working with the platform dependent C.wchar_t type.
+// Note: the size of C.wchar_t varies between platforms; it is 2 bytes on windows,
+// and usually 4 bytes elsewhere.
+pub type Character = C.wchar_t
+
+// zero is the Character made from rune 0; in C L"" strings it marks the string end (terminator).
+pub const zero = from_rune(0)
+
+// str returns the string representation of the rune that corresponds to the given Character.
+pub fn (a Character) str() string {
+	return a.to_rune().str()
+}
+
+// == is an equality operator, to ease comparing Characters. It compares their values, cast to u64.
+// TODO: the default == operator, that V generates, does not work for C.wchar_t .
+[inline]
+pub fn (a Character) == (b Character) bool {
+	return u64(a) == u64(b)
+}
+
+// to_rune creates a V rune from a Character, by converting its value (never reading past a 2 byte wchar_t).
+[inline]
+pub fn (c Character) to_rune() rune {
+	return unsafe { rune(c) }
+}
+
+// from_rune creates a Character from a V rune, by reading the leading sizeof(Character) bytes of `r`.
+[inline]
+pub fn from_rune(r rune) Character {
+	return unsafe { *(&Character(&r)) }
+}
+
+// length_in_characters counts the Characters of the given wchar_t* wide C style L"" string, up to its 0 terminator.
+// Example: assert unsafe { wchar.length_in_characters(wchar.from_string('abc')) } == 3
+// See also `length_in_bytes` .
+[unsafe]
+pub fn length_in_characters(p voidptr) int {
+	wide := &Character(p)
+	mut count := 0
+	for unsafe { wide[count] != wchar.zero } {
+		count++
+	}
+	return count
+}
+
+// length_in_bytes returns the size in bytes of the given wchar_t* wide C style L"" string,
+// i.e. its length in characters, multiplied by the platform specific size of wchar_t.
+// The terminating 0 Character is not counted. Since the size of wchar_t differs between
+// platforms, the same text will have different byte lengths on them, for example
+// unsafe { wchar.length_in_bytes(wchar.from_string('abc')) } is 12 on unix, but 6 on windows.
+[unsafe]
+pub fn length_in_bytes(p voidptr) int {
+	return int(sizeof(Character)) * unsafe { length_in_characters(p) }
+}
+
+// to_string converts a 0 terminated wchar_t* wide C style L"" string to a V string,
+// encoded in UTF-8. The length of the input is determined by scanning for the
+// terminating 0 Character, so the input must be properly terminated.
+// Note: the size of wchar_t is platform-dependent; it is *2 bytes* on windows,
+// and *4 bytes* on most other platforms.
+// Unless you are interfacing with a C library that specifically uses `wchar_t`,
+// consider using `string_from_wide` instead, which always assumes that the input
+// data is in an UTF-16 encoding, no matter what the platform is.
+// See also `to_string2`, which takes the length as an argument, instead of scanning for it.
+[unsafe]
+pub fn to_string(p voidptr) string {
+	// measure first, then delegate the actual conversion to to_string2
+	wide_len := unsafe { length_in_characters(p) }
+	return unsafe { to_string2(p, wide_len) }
+}
+
+// to_string2 converts the first `len` Characters of a `C.wchar_t*` wide C style L"" string,
+// to a V string, encoded in UTF-8. Each Character is converted to a rune on its own.
+// Note: the size of `C.wchar_t` is platform-dependent; it is *2 bytes* on windows,
+// and *4 bytes* on most other platforms.
+// Unless you are interfacing with a C library that specifically uses wchar_t, consider using
+// string_from_wide2 instead, which always assumes UTF-16 encoded input, on every platform.
+[manualfree; unsafe]
+pub fn to_string2(p voidptr, len int) string {
+	wide := &Character(p)
+	mut out := strings.new_builder(len)
+	defer {
+		unsafe { out.free() }
+	}
+	// append every Character to the builder, as a rune
+	for idx in 0 .. len {
+		out.write_rune(unsafe { rune(wide[idx]) })
+	}
+	converted := out.str()
+	return converted
+}
+
+// from_string converts the V string `s` (in UTF-8 encoding), into a newly allocated buffer
+// of platform specific C.wchar_t values. The buffer holds one Character per rune of `s`
+// (the runes are converted 1 by 1 with from_rune), followed by a terminating wchar.zero.
+[manualfree]
+pub fn from_string(s string) &Character {
+	srunes := s.runes()
+	unsafe {
+		mut result := &Character(vcalloc_noscan((srunes.len + 1) * int(sizeof(Character))))
+		for i, r in srunes {
+			result[i] = from_rune(r)
+		}
+		result[srunes.len] = wchar.zero
+		return result
+	}
+}
diff --git a/vlib/builtin/wchar/wchar_test.v b/vlib/builtin/wchar/wchar_test.v
new file mode 100644
index 0000000000..ff84229fe8
--- /dev/null
+++ b/vlib/builtin/wchar/wchar_test.v
@@ -0,0 +1,36 @@
+import builtin.wchar
+
+const wide_serial_number_unix = [u16(67), 0, 76, 0, 52, 0, 54, 0, 73, 0, 49, 0, 65, 0, 48, 0, 48,
+	0, 54, 0, 52, 0, 57, 0, 0, 0, 0]
+
+const wide_serial_number_windows = wide_serial_number_unix.map(u8(it))
+
+const swide_serial_number = 'CL46I1A00649'
+
+fn test_from_to_rune() {
+	// every rune of the sample must survive a rune -> Character -> rune round trip
+	for original in swide_serial_number.runes() {
+		assert wchar.from_rune(original).to_rune() == original
+	}
+	assert wchar.from_rune(0).to_rune() == 0
+}
+
+fn test_to_string() {
+	mut wide_ptr := voidptr(wide_serial_number_unix.data)
+	$if windows {
+		wide_ptr = wide_serial_number_windows.data
+	}
+	assert swide_serial_number.len == unsafe { wchar.length_in_characters(wide_ptr) }
+	s := unsafe { wchar.to_string(wide_ptr) }
+	dump(s)
+	assert s == swide_serial_number
+}
+
+fn test_from_string() {
+	wide := wchar.from_string(swide_serial_number)
+	for idx, expected in [`C`, `L`, `4`] {
+		assert unsafe { wide[idx] } == wchar.from_rune(expected)
+	}
+	assert unsafe { wide[11] } == wchar.from_rune(`9`)
+	assert unsafe { wide[12] } == wchar.zero
+}