From ded6c3806167d30449bc4059e39f135a75e52c76 Mon Sep 17 00:00:00 2001 From: Delyan Angelov Date: Fri, 7 Jul 2023 02:40:11 +0300 Subject: [PATCH] vlib: add a new module `builtin.wchar`, to ease dealing with C APIs that accept `wchar_t*` (#18794) --- vlib/builtin/cfns.c.v | 2 +- vlib/builtin/utf8.c.v | 14 +++- vlib/builtin/utf8_test.v | 3 +- vlib/builtin/wchar/wchar.c.v | 116 ++++++++++++++++++++++++++++++++ vlib/builtin/wchar/wchar_test.v | 36 ++++++++++ 5 files changed, 167 insertions(+), 4 deletions(-) create mode 100644 vlib/builtin/wchar/wchar.c.v create mode 100644 vlib/builtin/wchar/wchar_test.v diff --git a/vlib/builtin/cfns.c.v b/vlib/builtin/cfns.c.v index 0979f06112..597d3f8e8f 100644 --- a/vlib/builtin/cfns.c.v +++ b/vlib/builtin/cfns.c.v @@ -295,7 +295,7 @@ fn C.SymCleanup(hProcess voidptr) fn C.MultiByteToWideChar(codePage u32, dwFlags u32, lpMultiMyteStr &char, cbMultiByte int, lpWideCharStr &u16, cchWideChar int) int -fn C.wcslen(str &u16) int +fn C.wcslen(str voidptr) usize fn C.WideCharToMultiByte(codePage u32, dwFlags u32, lpWideCharStr &u16, cchWideChar int, lpMultiByteStr &char, cbMultiByte int, lpDefaultChar &char, lpUsedDefaultChar &int) int diff --git a/vlib/builtin/utf8.c.v b/vlib/builtin/utf8.c.v index de366ed388..30f6d8056c 100644 --- a/vlib/builtin/utf8.c.v +++ b/vlib/builtin/utf8.c.v @@ -10,6 +10,8 @@ const cp_utf8 = 65001 // The returned pointer of .to_wide(), has a type of &u16, and is suitable // for passing to Windows APIs that expect LPWSTR or wchar_t* parameters. // See also MultiByteToWideChar ( https://learn.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-multibytetowidechar ) +// See also builtin.wchar.from_string/1, for a version, that produces a +// platform dependent L"" C style wchar_t* wide string. 
pub fn (_str string) to_wide() &u16 { $if windows { unsafe { @@ -29,19 +31,25 @@ pub fn (_str string) to_wide() &u16 { for i, r in srunes { result[i] = u16(r) } + result[srunes.len] = 0 return result } } } // string_from_wide creates a V string, encoded in UTF-8, given a windows -// style string encoded in UTF-16. +// style string encoded in UTF-16. Note that this function first searches +// for the string terminator 0 character, and is thus slower, while more +// convenient compared to string_from_wide2/2 (you have to know the length +// in advance to use string_from_wide2/2). +// See also builtin.wchar.to_string/1, for a version that eases working with +// the platform dependent &wchar_t L"" strings. [manualfree; unsafe] pub fn string_from_wide(_wstr &u16) string { $if windows { unsafe { wstr_len := C.wcslen(_wstr) - return string_from_wide2(_wstr, wstr_len) + return string_from_wide2(_wstr, int(wstr_len)) } } $else { mut i := 0 @@ -56,6 +64,8 @@ pub fn string_from_wide(_wstr &u16) string { // style string, encoded in UTF-16. It is more efficient, compared to // string_from_wide, but it requires you to know the input string length, // and to pass it as the second argument. +// See also builtin.wchar.to_string2/2, for a version that eases working +// with the platform dependent &wchar_t L"" strings. 
[manualfree; unsafe] pub fn string_from_wide2(_wstr &u16, len int) string { $if windows { diff --git a/vlib/builtin/utf8_test.v b/vlib/builtin/utf8_test.v index 72321251f9..275ea14b3a 100644 --- a/vlib/builtin/utf8_test.v +++ b/vlib/builtin/utf8_test.v @@ -72,6 +72,7 @@ fn test_string_from_wide2() { fn test_reverse_cyrillic_with_string_from_wide() { s := 'Проба' - z := unsafe { string_from_wide(s.to_wide()) } + ws := s.to_wide() + z := unsafe { string_from_wide(ws) } assert z == s } diff --git a/vlib/builtin/wchar/wchar.c.v b/vlib/builtin/wchar/wchar.c.v new file mode 100644 index 0000000000..a04c320b3f --- /dev/null +++ b/vlib/builtin/wchar/wchar.c.v @@ -0,0 +1,116 @@ +module wchar + +import strings + +#include <wchar.h> + +[typedef] +struct C.wchar_t {} + +// Character is a type, that eases working with the platform dependent C.wchar_t type. +// Note: the size of C.wchar_t varies between platforms, it is 2 bytes on windows, +// and usually 4 bytes elsewhere. +pub type Character = C.wchar_t + +// zero is a Character, that in C L"" strings represents the string end character (terminator). +pub const zero = from_rune(0) + +// return a string representation of the given Character +pub fn (a Character) str() string { + return a.to_rune().str() +} + +// == is an equality operator, to ease comparing Characters +// TODO: the default == operator, that V generates, does not work for C.wchar_t . +[inline] +pub fn (a Character) == (b Character) bool { + return u64(a) == u64(b) +} + +// to_rune creates a V rune, given a Character +[inline] +pub fn (c Character) to_rune() rune { + return unsafe { *(&rune(&c)) } +} + +// from_rune creates a Character, given a V rune +[inline] +pub fn from_rune(r rune) Character { + return unsafe { *(&Character(&r)) } +} + +// length_in_characters returns the length of the given wchar_t* wide C style L"" string. +// Example: assert unsafe { wchar.length_in_characters(wchar.from_string('abc')) } == 3 +// See also `length_in_bytes` . 
+[unsafe] +pub fn length_in_characters(p voidptr) int { + mut len := 0 + pc := &Character(p) + for unsafe { pc[len] != wchar.zero } { + len++ + } + return len +} + +// length_in_bytes returns the length of the given wchar_t* wide C style L"" string in bytes. +// Note that the size of wchar_t is different on the different platforms, thus the length in +// bytes for the same data converted from UTF-8 to a &Character buffer, will be different as well. +// i.e. unsafe { wchar.length_in_bytes(wchar.from_string('abc')) } will be 12 on unix, but +// 6 on windows. +[unsafe] +pub fn length_in_bytes(p voidptr) int { + return unsafe { length_in_characters(p) } * int(sizeof(Character)) +} + +// to_string creates a V string, encoded in UTF-8, given a wchar_t* +// wide C style L"" string. It relies on the string having a 0 terminator at its end, +// to determine the string's length. +// Note, that the size of wchar_t is platform-dependent, and is *2 bytes* on windows, +// while it is *4 bytes* on most everything else. +// Unless you are interfacing with a C library, that does specifically use `wchar_t`, +// consider using `string_from_wide` instead, which will always assume that the input +// data is in an UTF-16 encoding, no matter what the platform is. +[unsafe] +pub fn to_string(p voidptr) string { + unsafe { + len := length_in_characters(p) + return to_string2(p, len) + } +} + +// to_string2 creates a V string, encoded in UTF-8, given a `C.wchar_t*` +// wide C style L"" string. Note, that the size of `C.wchar_t` is platform-dependent, +// and is *2 bytes* on windows, while *4* on most everything else. +// Unless you are interfacing with a C library, that does specifically use wchar_t, +// consider using string_from_wide2 instead, which will always assume that the input +// data is in an UTF-16 encoding, no matter what the platform is. 
+[manualfree; unsafe] +pub fn to_string2(p voidptr, len int) string { + pc := &Character(p) + mut sb := strings.new_builder(len) + defer { + unsafe { sb.free() } + } + for i := 0; i < len; i++ { + u := unsafe { rune(pc[i]) } + sb.write_rune(u) + } + res := sb.str() + return res +} + +// from_string converts the V string (in UTF-8 encoding), into a newly allocated +// platform specific buffer of C.wchar_t . +// The conversion is done by processing each rune of the input string 1 by 1. +[manualfree] +pub fn from_string(s string) &Character { + srunes := s.runes() + unsafe { + mut result := &Character(vcalloc_noscan((srunes.len + 1) * int(sizeof(Character)))) + for i, r in srunes { + result[i] = from_rune(r) + } + result[srunes.len] = wchar.zero + return result + } +} diff --git a/vlib/builtin/wchar/wchar_test.v b/vlib/builtin/wchar/wchar_test.v new file mode 100644 index 0000000000..ff84229fe8 --- /dev/null +++ b/vlib/builtin/wchar/wchar_test.v @@ -0,0 +1,36 @@ +import builtin.wchar + +const wide_serial_number_unix = [u16(67), 0, 76, 0, 52, 0, 54, 0, 73, 0, 49, 0, 65, 0, 48, 0, 48, + 0, 54, 0, 52, 0, 57, 0, 0, 0, 0] + +const wide_serial_number_windows = wide_serial_number_unix.map(u8(it)) + +const swide_serial_number = 'CL46I1A00649' + +fn test_from_to_rune() { + for r in swide_serial_number.runes() { + c := wchar.from_rune(r) + assert c.to_rune() == r + } + assert wchar.from_rune(0).to_rune() == 0 +} + +fn test_to_string() { + mut p := voidptr(wide_serial_number_unix.data) + $if windows { + p = wide_serial_number_windows.data + } + assert unsafe { wchar.length_in_characters(p) } == swide_serial_number.len + s := unsafe { wchar.to_string(p) } + dump(s) + assert s == swide_serial_number +} + +fn test_from_string() { + x := wchar.from_string(swide_serial_number) + assert unsafe { x[0] } == wchar.from_rune(`C`) + assert unsafe { x[1] } == wchar.from_rune(`L`) + assert unsafe { x[2] } == wchar.from_rune(`4`) + assert unsafe { x[11] } == wchar.from_rune(`9`) + assert 
unsafe { x[12] } == wchar.zero +}