diff --git a/vlib/encoding/utf8/utf8_tables.v b/vlib/encoding/utf8/utf8_tables.v index eeef3dd560..658ee29507 100644 --- a/vlib/encoding/utf8/utf8_tables.v +++ b/vlib/encoding/utf8/utf8_tables.v @@ -1132,6 +1132,20 @@ const letter_table = RangeTable{ latin_offset: 6 } +const white_space_table = RangeTable{ + r16: [ + Range16{0x0009, 0x000d, 1}, + Range16{0x0020, 0x0085, 101}, + Range16{0x00a0, 0x1680, 5600}, + Range16{0x2000, 0x200a, 1}, + Range16{0x2028, 0x2029, 1}, + Range16{0x202f, 0x205f, 48}, + Range16{0x3000, 0x3000, 1}, + ] + r32: [] + latin_offset: 2 +} + struct RangeTable { pub: r16 []Range16 diff --git a/vlib/encoding/utf8/utf8_util.v b/vlib/encoding/utf8/utf8_util.v index 85853dc71c..c9a0f35df8 100644 --- a/vlib/encoding/utf8/utf8_util.v +++ b/vlib/encoding/utf8/utf8_util.v @@ -160,6 +160,24 @@ pub fn is_letter(r rune) bool { return is_excluding_latin(letter_table, r) } +// is_space returns true if the rune is a character in Unicode category Z with the White_Space property, or is one of the following characters: +// ``` +// `\t`, `\n`, `\v`, `\f`, `\r`, ` `, 0x85 (NEL), 0xA0 (NBSP) +// ``` +pub fn is_space(r rune) bool { + if r <= max_latin_1 { + match r { + `\t`, `\n`, `\v`, `\f`, `\r`, ` `, 0x85, 0xA0 { + return true + } + else { + return false + } + } + } + return is_excluding_latin(white_space_table, r) +} + // is_uchar_punct return true if the input unicode is a western unicode punctuation pub fn is_uchar_punct(uchar int) bool { return find_punct_in_table(uchar, utf8.unicode_punct_western) != 0 diff --git a/vlib/encoding/utf8/utf8_util_test.v b/vlib/encoding/utf8/utf8_util_test.v index 74900415bc..99c7fcda65 100644 --- a/vlib/encoding/utf8/utf8_util_test.v +++ b/vlib/encoding/utf8/utf8_util_test.v @@ -91,3 +91,17 @@ fn test_is_letter() { assert utf8.is_letter(`ȶ`) == true assert utf8.is_letter(`ȹ`) == true } + +fn test_is_space() { + for ra in `a` .. `z` { + assert utf8.is_space(ra) == false + } + + for ra in `A` .. 
`Z` { + assert utf8.is_space(ra) == false + } + + assert utf8.is_space(`\u202f`) == true + assert utf8.is_space(`\u2009`) == true + assert utf8.is_space(`\u00A0`) == true +}