From 8e1c27d1291fc2e37405642eafeabbde35d08713 Mon Sep 17 00:00:00 2001
From: penguindark <57967770+penguindark@users.noreply.github.com>
Date: Thu, 12 Dec 2019 19:08:44 +0100
Subject: [PATCH] utf8: punctuation

---
 vlib/encoding/utf8/utf8_util.v      | 786 +++++++++++++++++++++++++++-
 vlib/encoding/utf8/utf8_util_test.v |  21 +
 2 files changed, 792 insertions(+), 15 deletions(-)

diff --git a/vlib/encoding/utf8/utf8_util.v b/vlib/encoding/utf8/utf8_util.v
index 26927b5024..f21fa6b36b 100644
--- a/vlib/encoding/utf8/utf8_util.v
+++ b/vlib/encoding/utf8/utf8_util.v
@@ -11,6 +11,11 @@
 **********************************************************************/
 module utf8
 
+/**********************************************************************
+*
+* Utility functions
+*
+**********************************************************************/
 
 // len return the leght as number of unicode chars from a string
 pub fn len(s string) int {
@@ -33,6 +38,54 @@ pub fn u_len(s ustring) int {
 	return len(s.s)
 }
 
+// get_uchar decode the utf-8 sequence that starts at string[index] and return
+// its unicode char as an int; return 0 if index is out of the string or if
+// the sequence is truncated by the end of the string
+pub fn get_uchar(s string, index int) int {
+	if index < 0 || index >= s.len {
+		return 0
+	}
+	ch_len := utf8util_char_len(s.str[index])
+	if ch_len == 1 {
+		// 1 byte utf-8: the code point is the byte at index, not the byte at 0
+		return int(s.str[index])
+	}
+	if index + ch_len > s.len {
+		return 0 // truncated sequence, never read past the end of the string
+	}
+	mut lword := 0 // the bytes of the sequence packed in one word, lead byte first
+	for i := 0; i < ch_len; i++ {
+		lword = (lword << 8) | int(s.str[index + i])
+	}
+	mut res := 0
+	// 2 byte utf-8
+	// byte format: 110xxxxx 10xxxxxx
+	//
+	if ch_len == 2 {
+		res = ((lword & 0x1f00) >> 2) | (lword & 0x3f)
+	}
+	// 3 byte utf-8
+	// byte format: 1110xxxx 10xxxxxx 10xxxxxx
+	//
+	else if ch_len == 3 {
+		res = ((lword & 0x0f0000) >> 4) | ((lword & 0x3f00) >> 2) | (lword & 0x3f)
+	}
+	// 4 byte utf-8
+	// byte format: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+	//
+	else if ch_len == 4 {
+		res = ((lword & 0x07000000) >> 6) | ((lword & 0x003f0000) >> 4) |
+			((lword & 0x00003f00) >> 2) | (lword & 0x0000003f)
+	}
+	return res
+}
+
+
+/**********************************************************************
+*
+* Conversion functions
+*
+**********************************************************************/
 
 // to_upper return an uppercase string from a string
 pub fn to_upper(s string) string {
@@ -59,16 +112,52 @@ pub fn u_to_lower(s ustring) ustring {
 
 /**********************************************************************
 *
-* Private functions
+* Punctuation functions
+*
+* The "western" functions search a small table, which is quicker than
+* the global unicode table search. **Use only for western chars**.
 *
 **********************************************************************/
 
-// utf8util_char_len calculate the lenght in bytes of a utf8 rune
+//
+// Western
+//
+
+// is_punct return true if the string[index] byte is the start of a unicode western punctuation
+pub fn is_punct( s string , index int) bool {
+	return is_uchar_punct(get_uchar(s, index))
+}
+
+// is_uchar_punct return true if the input unicode is a western unicode punctuation
+pub fn is_uchar_punct( uchar int ) bool {
+	return find_punct_in_table(uchar, unicode_punct_western ) >= 0
+}
+
+//
+// Global
+//
+
+// is_global_punct return true if the string[index] byte is the start of a global unicode punctuation
+pub fn is_global_punct( s string , index int) bool {
+	return is_uchar_global_punct(get_uchar(s, index))
+}
+
+// is_uchar_global_punct return true if the input unicode is a global unicode punctuation
+pub fn is_uchar_global_punct( uchar int ) bool {
+	return find_punct_in_table( uchar , unicode_punct ) >= 0
+}
+
+
+/**********************************************************************
+*
+* Private functions
+*
+**********************************************************************/
+// utf8util_char_len calculate the length in bytes of a utf8 char
 fn utf8util_char_len(b byte) int {
 	return (( 0xe5000000 >> (( b >> 3 ) & 0x1e )) & 3 ) + 1
 }
 
-
 //
 // if upper_flag == true then make low ==> upper conversion
 // if upper_flag == false then make upper ==> low conversion
@@ -76,7 +165,6 @@ fn utf8util_char_len(b byte) int {
 // up_low make the dirt job
 fn up_low(s string, upper_flag bool) string {
 	mut _index := 0
-	mut old_index := 0
 	mut str_res := malloc(s.len + 1)
 
 	for {
@@ -98,7 +186,7 @@ fn up_low(s string, upper_flag bool) string {
 
 			//C.printf(" #%d (%x) ", _index, lword)
 
-			mut res := int(0)
+			mut res := 0
 
 			// 2 byte utf-8
 			// byte format: 110xxxxx 10xxxxxx
@@ -131,7 +219,7 @@ fn up_low(s string, upper_flag bool) string {
 				}
 				//C.printf("\n")
 			}else{
-				tab_char := u16(unicode_con_table_up_to_low[ch_index])
+				tab_char := unicode_con_table_up_to_low[ch_index]
 				//C.printf("tab_char: %04x ",tab_char)
 
 				if ch_len == 2 {
@@ -176,7 +264,6 @@ fn up_low(s string, upper_flag bool) string {
 			}
 		}
 
-		old_index = _index
 		_index += ch_len
 
 		// we are done, exit the loop
@@ -199,13 +286,13 @@ fn find_char_in_table( in_code u16, upper_flag bool) int {
 	// We will use a simple binary search
 	//
 
-	mut first_index := int(0) // first index of our utf8 char range
-	mut last_index := int(unicode_con_table_up_to_low.len >> 1) // last+1 index of our utf8 char range
-	mut index := int(0)
+	mut first_index := 0 // first index of our utf8 char range
+	mut last_index := (unicode_con_table_up_to_low.len >> 1) // last+1 index of our utf8 char range
+	mut index := 0
 	mut x := u16(0)
 
-	mut offset:=int(0) // up to low
-	mut i_step:=int(1) // up to low
+	mut offset:=0 // up to low
+	mut i_step:=1 // up to low
 	if upper_flag==true {
 		offset=1 // low to up
 		i_step=0 // low to up
@@ -220,11 +307,10 @@ fn find_char_in_table( in_code u16, upper_flag bool) int {
 
 		if x == in_code {
 			//C.printf(" Found!\n")
-			return int( (index<<1) + i_step)
+			return ( (index<<1) + i_step)
 		}
 		else if x>in_code {
 			last_index=index
-
 		}else {
 			first_index=index
 		}
@@ -234,7 +320,40 @@ fn find_char_in_table( in_code u16, upper_flag bool) int {
 		}
 	}
 	//C.printf("not found.\n")
-	return int(0)
+	return 0
+}
+
+// find_punct_in_table return the index of in_code in in_table, -1 if not found
+// in_table must be sorted in ascending order, an empty table is allowed
+fn find_punct_in_table( in_code int , in_table []int ) int {
+	//
+	// We will use a simple binary search
+	//
+	// the search range is the half-open interval [first_index, last_index),
+	// every index of the table, 0 included, can be reached
+	//
+
+	mut first_index := 0
+	mut last_index := in_table.len
+
+	for first_index < last_index {
+		index := (first_index + last_index) >> 1
+		x := in_table[ index ]
+		//C.printf("(%d..%d) index:%d base[%08x]==>[%08x]\n",first_index,last_index,index,in_code,x)
+
+		if x == in_code {
+			return index
+		}
+		else if x > in_code {
+			last_index = index
+		} else {
+			// index is already checked, restart after it
+			first_index = index + 1
+		}
+	}
+	//C.printf("not found.\n")
+	// 0 is a valid index, the not found value must be out of the table
+	return -1
 }
 
 
@@ -927,3 +1046,640 @@ u16(0x0041), 0x0061, //LATIN CAPITAL LETTER A LATIN SMALL LETTER A
 0xFF3A, 0xFF5A, //FULLWIDTH LATIN CAPITAL LETTER Z FULLWIDTH LATIN SMALL LETTER Z
 ]
 )
+
+/*****************************************************************************
+*
+* Unicode punctuation chars
+*
+* source: http://www.unicode.org/faq/punctuation_symbols.html
+*
+*****************************************************************************/
+const(
+
+// Western punctuation mark
+// Character Name Browser Image
+unicode_punct_western=[
+0x0021, // EXCLAMATION MARK !
+0x0022, // QUOTATION MARK "
+0x0027, // APOSTROPHE '
+0x002A, // ASTERISK *
+0x002C, // COMMA ,
+0x002E, // FULL STOP .
+0x002F, // SOLIDUS /
+0x003A, // COLON :
+0x003B, // SEMICOLON ;
+0x003F, // QUESTION MARK ?
+0x00A1, // INVERTED EXCLAMATION MARK ¡ +0x00A7, // SECTION SIGN § +0x00B6, // PILCROW SIGN ¶ +0x00B7, // MIDDLE DOT · +0x00BF, // INVERTED QUESTION MARK ¿ +0x037E, // GREEK QUESTION MARK ; +0x0387, // GREEK ANO TELEIA · +0x055A, // ARMENIAN APOSTROPHE ՚ +0x055B, // ARMENIAN EMPHASIS MARK ՛ +0x055C, // ARMENIAN EXCLAMATION MARK ՜ +0x055D, // ARMENIAN COMMA ՝ +0x055E, // ARMENIAN QUESTION MARK ՞ +0x055F, // ARMENIAN ABBREVIATION MARK ՟ +0x0589, // ARMENIAN FULL STOP ։ +0x05C0, // HEBREW PUNCTUATION PASEQ ׀ +0x05C3, // HEBREW PUNCTUATION SOF PASUQ ׃ +0x05C6, // HEBREW PUNCTUATION NUN HAFUKHA ׆ +0x05F3, // HEBREW PUNCTUATION GERESH ׳ +0x05F4, // HEBREW PUNCTUATION GERSHAYIM ״ +] + +// Unicode Characters in the 'Punctuation, Other' Category +// Character Name Browser Image +unicode_punct=[ +0x0021, // EXCLAMATION MARK ! +0x0022, // QUOTATION MARK " +0x0023, // NUMBER SIGN # +0x0025, // PERCENT SIGN % +0x0026, // AMPERSAND & +0x0027, // APOSTROPHE ' +0x002A, // ASTERISK * +0x002C, // COMMA , +0x002E, // FULL STOP . +0x002F, // SOLIDUS / +0x003A, // COLON : +0x003B, // SEMICOLON ; +0x003F, // QUESTION MARK ? 
+0x0040, // COMMERCIAL AT @ +0x005C, // REVERSE SOLIDUS \ +0x00A1, // INVERTED EXCLAMATION MARK ¡ +0x00A7, // SECTION SIGN § +0x00B6, // PILCROW SIGN ¶ +0x00B7, // MIDDLE DOT · +0x00BF, // INVERTED QUESTION MARK ¿ +0x037E, // GREEK QUESTION MARK ; +0x0387, // GREEK ANO TELEIA · +0x055A, // ARMENIAN APOSTROPHE ՚ +0x055B, // ARMENIAN EMPHASIS MARK ՛ +0x055C, // ARMENIAN EXCLAMATION MARK ՜ +0x055D, // ARMENIAN COMMA ՝ +0x055E, // ARMENIAN QUESTION MARK ՞ +0x055F, // ARMENIAN ABBREVIATION MARK ՟ +0x0589, // ARMENIAN FULL STOP ։ +0x05C0, // HEBREW PUNCTUATION PASEQ ׀ +0x05C3, // HEBREW PUNCTUATION SOF PASUQ ׃ +0x05C6, // HEBREW PUNCTUATION NUN HAFUKHA ׆ +0x05F3, // HEBREW PUNCTUATION GERESH ׳ +0x05F4, // HEBREW PUNCTUATION GERSHAYIM ״ +0x0609, // ARABIC-INDIC PER MILLE SIGN ؉ +0x060A, // ARABIC-INDIC PER TEN THOUSAND SIGN ؊ +0x060C, // ARABIC COMMA ، +0x060D, // ARABIC DATE SEPARATOR ؍ +0x061B, // ARABIC SEMICOLON ؛ +0x061E, // ARABIC TRIPLE DOT PUNCTUATION MARK ؞ +0x061F, // ARABIC QUESTION MARK ؟ +0x066A, // ARABIC PERCENT SIGN ٪ +0x066B, // ARABIC DECIMAL SEPARATOR ٫ +0x066C, // ARABIC THOUSANDS SEPARATOR ٬ +0x066D, // ARABIC FIVE POINTED STAR ٭ +0x06D4, // ARABIC FULL STOP ۔ +0x0700, // SYRIAC END OF PARAGRAPH ܀ +0x0701, // SYRIAC SUPRALINEAR FULL STOP ܁ +0x0702, // SYRIAC SUBLINEAR FULL STOP ܂ +0x0703, // SYRIAC SUPRALINEAR COLON ܃ +0x0704, // SYRIAC SUBLINEAR COLON ܄ +0x0705, // SYRIAC HORIZONTAL COLON ܅ +0x0706, // SYRIAC COLON SKEWED LEFT ܆ +0x0707, // SYRIAC COLON SKEWED RIGHT ܇ +0x0708, // SYRIAC SUPRALINEAR COLON SKEWED LEFT ܈ +0x0709, // SYRIAC SUBLINEAR COLON SKEWED RIGHT ܉ +0x070A, // SYRIAC CONTRACTION ܊ +0x070B, // SYRIAC HARKLEAN OBELUS ܋ +0x070C, // SYRIAC HARKLEAN METOBELUS ܌ +0x070D, // SYRIAC HARKLEAN ASTERISCUS ܍ +0x07F7, // NKO SYMBOL GBAKURUNEN ߷ +0x07F8, // NKO COMMA ߸ +0x07F9, // NKO EXCLAMATION MARK ߹ +0x0830, // SAMARITAN PUNCTUATION NEQUDAA ࠰ +0x0831, // SAMARITAN PUNCTUATION AFSAAQ ࠱ +0x0832, // SAMARITAN PUNCTUATION ANGED ࠲ +0x0833, // 
SAMARITAN PUNCTUATION BAU ࠳ +0x0834, // SAMARITAN PUNCTUATION ATMAAU ࠴ +0x0835, // SAMARITAN PUNCTUATION SHIYYAALAA ࠵ +0x0836, // SAMARITAN ABBREVIATION MARK ࠶ +0x0837, // SAMARITAN PUNCTUATION MELODIC QITSA ࠷ +0x0838, // SAMARITAN PUNCTUATION ZIQAA ࠸ +0x0839, // SAMARITAN PUNCTUATION QITSA ࠹ +0x083A, // SAMARITAN PUNCTUATION ZAEF ࠺ +0x083B, // SAMARITAN PUNCTUATION TURU ࠻ +0x083C, // SAMARITAN PUNCTUATION ARKAANU ࠼ +0x083D, // SAMARITAN PUNCTUATION SOF MASHFAAT ࠽ +0x083E, // SAMARITAN PUNCTUATION ANNAAU ࠾ +0x085E, // MANDAIC PUNCTUATION ࡞ +0x0964, // DEVANAGARI DANDA । +0x0965, // DEVANAGARI DOUBLE DANDA ॥ +0x0970, // DEVANAGARI ABBREVIATION SIGN ॰ +0x09FD, // BENGALI ABBREVIATION SIGN ৽ +0x0A76, // GURMUKHI ABBREVIATION SIGN ੶ +0x0AF0, // GUJARATI ABBREVIATION SIGN ૰ +0x0C77, // TELUGU SIGN SIDDHAM ౷ +0x0C84, // KANNADA SIGN SIDDHAM ಄ +0x0DF4, // SINHALA PUNCTUATION KUNDDALIYA ෴ +0x0E4F, // THAI CHARACTER FONGMAN ๏ +0x0E5A, // THAI CHARACTER ANGKHANKHU ๚ +0x0E5B, // THAI CHARACTER KHOMUT ๛ +0x0F04, // TIBETAN MARK INITIAL YIG MGO MDUN MA ༄ +0x0F05, // TIBETAN MARK CLOSING YIG MGO SGAB MA ༅ +0x0F06, // TIBETAN MARK CARET YIG MGO PHUR SHAD MA ༆ +0x0F07, // TIBETAN MARK YIG MGO TSHEG SHAD MA ༇ +0x0F08, // TIBETAN MARK SBRUL SHAD ༈ +0x0F09, // TIBETAN MARK BSKUR YIG MGO ༉ +0x0F0A, // TIBETAN MARK BKA- SHOG YIG MGO ༊ +0x0F0B, // TIBETAN MARK INTERSYLLABIC TSHEG ་ +0x0F0C, // TIBETAN MARK DELIMITER TSHEG BSTAR ༌ +0x0F0D, // TIBETAN MARK SHAD ། +0x0F0E, // TIBETAN MARK NYIS SHAD ༎ +0x0F0F, // TIBETAN MARK TSHEG SHAD ༏ +0x0F10, // TIBETAN MARK NYIS TSHEG SHAD ༐ +0x0F11, // TIBETAN MARK RIN CHEN SPUNGS SHAD ༑ +0x0F12, // TIBETAN MARK RGYA GRAM SHAD ༒ +0x0F14, // TIBETAN MARK GTER TSHEG ༔ +0x0F85, // TIBETAN MARK PALUTA ྅ +0x0FD0, // TIBETAN MARK BSKA- SHOG GI MGO RGYAN ࿐ +0x0FD1, // TIBETAN MARK MNYAM YIG GI MGO RGYAN ࿑ +0x0FD2, // TIBETAN MARK NYIS TSHEG ࿒ +0x0FD3, // TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA ࿓ +0x0FD4, // TIBETAN MARK CLOSING BRDA RNYING YIG MGO 
SGAB MA ࿔ +0x0FD9, // TIBETAN MARK LEADING MCHAN RTAGS ࿙ +0x0FDA, // TIBETAN MARK TRAILING MCHAN RTAGS ࿚ +0x104A, // MYANMAR SIGN LITTLE SECTION ၊ +0x104B, // MYANMAR SIGN SECTION ။ +0x104C, // MYANMAR SYMBOL LOCATIVE ၌ +0x104D, // MYANMAR SYMBOL COMPLETED ၍ +0x104E, // MYANMAR SYMBOL AFOREMENTIONED ၎ +0x104F, // MYANMAR SYMBOL GENITIVE ၏ +0x10FB, // GEORGIAN PARAGRAPH SEPARATOR ჻ +0x1360, // ETHIOPIC SECTION MARK ፠ +0x1361, // ETHIOPIC WORDSPACE ፡ +0x1362, // ETHIOPIC FULL STOP ። +0x1363, // ETHIOPIC COMMA ፣ +0x1364, // ETHIOPIC SEMICOLON ፤ +0x1365, // ETHIOPIC COLON ፥ +0x1366, // ETHIOPIC PREFACE COLON ፦ +0x1367, // ETHIOPIC QUESTION MARK ፧ +0x1368, // ETHIOPIC PARAGRAPH SEPARATOR ፨ +0x166E, // CANADIAN SYLLABICS FULL STOP ᙮ +0x16EB, // RUNIC SINGLE PUNCTUATION ᛫ +0x16EC, // RUNIC MULTIPLE PUNCTUATION ᛬ +0x16ED, // RUNIC CROSS PUNCTUATION ᛭ +0x1735, // PHILIPPINE SINGLE PUNCTUATION ᜵ +0x1736, // PHILIPPINE DOUBLE PUNCTUATION ᜶ +0x17D4, // KHMER SIGN KHAN ។ +0x17D5, // KHMER SIGN BARIYOOSAN ៕ +0x17D6, // KHMER SIGN CAMNUC PII KUUH ៖ +0x17D8, // KHMER SIGN BEYYAL ៘ +0x17D9, // KHMER SIGN PHNAEK MUAN ៙ +0x17DA, // KHMER SIGN KOOMUUT ៚ +0x1800, // MONGOLIAN BIRGA ᠀ +0x1801, // MONGOLIAN ELLIPSIS ᠁ +0x1802, // MONGOLIAN COMMA ᠂ +0x1803, // MONGOLIAN FULL STOP ᠃ +0x1804, // MONGOLIAN COLON ᠄ +0x1805, // MONGOLIAN FOUR DOTS ᠅ +0x1807, // MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER ᠇ +0x1808, // MONGOLIAN MANCHU COMMA ᠈ +0x1809, // MONGOLIAN MANCHU FULL STOP ᠉ +0x180A, // MONGOLIAN NIRUGU ᠊ +0x1944, // LIMBU EXCLAMATION MARK ᥄ +0x1945, // LIMBU QUESTION MARK ᥅ +0x1A1E, // BUGINESE PALLAWA ᨞ +0x1A1F, // BUGINESE END OF SECTION ᨟ +0x1AA0, // TAI THAM SIGN WIANG ᪠ +0x1AA1, // TAI THAM SIGN WIANGWAAK ᪡ +0x1AA2, // TAI THAM SIGN SAWAN ᪢ +0x1AA3, // TAI THAM SIGN KEOW ᪣ +0x1AA4, // TAI THAM SIGN HOY ᪤ +0x1AA5, // TAI THAM SIGN DOKMAI ᪥ +0x1AA6, // TAI THAM SIGN REVERSED ROTATED RANA ᪦ +0x1AA8, // TAI THAM SIGN KAAN ᪨ +0x1AA9, // TAI THAM SIGN KAANKUU ᪩ +0x1AAA, // TAI THAM SIGN 
SATKAAN ᪪ +0x1AAB, // TAI THAM SIGN SATKAANKUU ᪫ +0x1AAC, // TAI THAM SIGN HANG ᪬ +0x1AAD, // TAI THAM SIGN CAANG ᪭ +0x1B5A, // BALINESE PANTI ᭚ +0x1B5B, // BALINESE PAMADA ᭛ +0x1B5C, // BALINESE WINDU ᭜ +0x1B5D, // BALINESE CARIK PAMUNGKAH ᭝ +0x1B5E, // BALINESE CARIK SIKI ᭞ +0x1B5F, // BALINESE CARIK PAREREN ᭟ +0x1B60, // BALINESE PAMENENG ᭠ +0x1BFC, // BATAK SYMBOL BINDU NA METEK ᯼ +0x1BFD, // BATAK SYMBOL BINDU PINARBORAS ᯽ +0x1BFE, // BATAK SYMBOL BINDU JUDUL ᯾ +0x1BFF, // BATAK SYMBOL BINDU PANGOLAT ᯿ +0x1C3B, // LEPCHA PUNCTUATION TA-ROL ᰻ +0x1C3C, // LEPCHA PUNCTUATION NYET THYOOM TA-ROL ᰼ +0x1C3D, // LEPCHA PUNCTUATION CER-WA ᰽ +0x1C3E, // LEPCHA PUNCTUATION TSHOOK CER-WA ᰾ +0x1C3F, // LEPCHA PUNCTUATION TSHOOK ᰿ +0x1C7E, // OL CHIKI PUNCTUATION MUCAAD ᱾ +0x1C7F, // OL CHIKI PUNCTUATION DOUBLE MUCAAD ᱿ +0x1CC0, // SUNDANESE PUNCTUATION BINDU SURYA ᳀ +0x1CC1, // SUNDANESE PUNCTUATION BINDU PANGLONG ᳁ +0x1CC2, // SUNDANESE PUNCTUATION BINDU PURNAMA ᳂ +0x1CC3, // SUNDANESE PUNCTUATION BINDU CAKRA ᳃ +0x1CC4, // SUNDANESE PUNCTUATION BINDU LEU SATANGA ᳄ +0x1CC5, // SUNDANESE PUNCTUATION BINDU KA SATANGA ᳅ +0x1CC6, // SUNDANESE PUNCTUATION BINDU DA SATANGA ᳆ +0x1CC7, // SUNDANESE PUNCTUATION BINDU BA SATANGA ᳇ +0x1CD3, // VEDIC SIGN NIHSHVASA ᳓ +0x2016, // DOUBLE VERTICAL LINE ‖ +0x2017, // DOUBLE LOW LINE ‗ +0x2020, // DAGGER † +0x2021, // DOUBLE DAGGER ‡ +0x2022, // BULLET • +0x2023, // TRIANGULAR BULLET ‣ +0x2024, // ONE DOT LEADER ․ +0x2025, // TWO DOT LEADER ‥ +0x2026, // HORIZONTAL ELLIPSIS … +0x2027, // HYPHENATION POINT ‧ +0x2030, // PER MILLE SIGN ‰ +0x2031, // PER TEN THOUSAND SIGN ‱ +0x2032, // PRIME ′ +0x2033, // DOUBLE PRIME ″ +0x2034, // TRIPLE PRIME ‴ +0x2035, // REVERSED PRIME ‵ +0x2036, // REVERSED DOUBLE PRIME ‶ +0x2037, // REVERSED TRIPLE PRIME ‷ +0x2038, // CARET ‸ +0x203B, // REFERENCE MARK ※ +0x203C, // DOUBLE EXCLAMATION MARK ‼ +0x203D, // INTERROBANG ‽ +0x203E, // OVERLINE ‾ +0x2041, // CARET INSERTION POINT ⁁ +0x2042, // ASTERISM ⁂ 
+0x2043, // HYPHEN BULLET ⁃ +0x2047, // DOUBLE QUESTION MARK ⁇ +0x2048, // QUESTION EXCLAMATION MARK ⁈ +0x2049, // EXCLAMATION QUESTION MARK ⁉ +0x204A, // TIRONIAN SIGN ET ⁊ +0x204B, // REVERSED PILCROW SIGN ⁋ +0x204C, // BLACK LEFTWARDS BULLET ⁌ +0x204D, // BLACK RIGHTWARDS BULLET ⁍ +0x204E, // LOW ASTERISK ⁎ +0x204F, // REVERSED SEMICOLON ⁏ +0x2050, // CLOSE UP ⁐ +0x2051, // TWO ASTERISKS ALIGNED VERTICALLY ⁑ +0x2053, // SWUNG DASH ⁓ +0x2055, // FLOWER PUNCTUATION MARK ⁕ +0x2056, // THREE DOT PUNCTUATION ⁖ +0x2057, // QUADRUPLE PRIME ⁗ +0x2058, // FOUR DOT PUNCTUATION ⁘ +0x2059, // FIVE DOT PUNCTUATION ⁙ +0x205A, // TWO DOT PUNCTUATION ⁚ +0x205B, // FOUR DOT MARK ⁛ +0x205C, // DOTTED CROSS ⁜ +0x205D, // TRICOLON ⁝ +0x205E, // VERTICAL FOUR DOTS ⁞ +0x2CF9, // COPTIC OLD NUBIAN FULL STOP ⳹ +0x2CFA, // COPTIC OLD NUBIAN DIRECT QUESTION MARK ⳺ +0x2CFB, // COPTIC OLD NUBIAN INDIRECT QUESTION MARK ⳻ +0x2CFC, // COPTIC OLD NUBIAN VERSE DIVIDER ⳼ +0x2CFE, // COPTIC FULL STOP ⳾ +0x2CFF, // COPTIC MORPHOLOGICAL DIVIDER ⳿ +0x2D70, // TIFINAGH SEPARATOR MARK ⵰ +0x2E00, // RIGHT ANGLE SUBSTITUTION MARKER ⸀ +0x2E01, // RIGHT ANGLE DOTTED SUBSTITUTION MARKER ⸁ +0x2E06, // RAISED INTERPOLATION MARKER ⸆ +0x2E07, // RAISED DOTTED INTERPOLATION MARKER ⸇ +0x2E08, // DOTTED TRANSPOSITION MARKER ⸈ +0x2E0B, // RAISED SQUARE ⸋ +0x2E0E, // EDITORIAL CORONIS ⸎ +0x2E0F, // PARAGRAPHOS ⸏ +0x2E10, // FORKED PARAGRAPHOS ⸐ +0x2E11, // REVERSED FORKED PARAGRAPHOS ⸑ +0x2E12, // HYPODIASTOLE ⸒ +0x2E13, // DOTTED OBELOS ⸓ +0x2E14, // DOWNWARDS ANCORA ⸔ +0x2E15, // UPWARDS ANCORA ⸕ +0x2E16, // DOTTED RIGHT-POINTING ANGLE ⸖ +0x2E18, // INVERTED INTERROBANG ⸘ +0x2E19, // PALM BRANCH ⸙ +0x2E1B, // TILDE WITH RING ABOVE ⸛ +0x2E1E, // TILDE WITH DOT ABOVE ⸞ +0x2E1F, // TILDE WITH DOT BELOW ⸟ +0x2E2A, // TWO DOTS OVER ONE DOT PUNCTUATION ⸪ +0x2E2B, // ONE DOT OVER TWO DOTS PUNCTUATION ⸫ +0x2E2C, // SQUARED FOUR DOT PUNCTUATION ⸬ +0x2E2D, // FIVE DOT MARK ⸭ +0x2E2E, // REVERSED QUESTION MARK ⸮ +0x2E30, // 
RING POINT ⸰ +0x2E31, // WORD SEPARATOR MIDDLE DOT ⸱ +0x2E32, // TURNED COMMA ⸲ +0x2E33, // RAISED DOT ⸳ +0x2E34, // RAISED COMMA ⸴ +0x2E35, // TURNED SEMICOLON ⸵ +0x2E36, // DAGGER WITH LEFT GUARD ⸶ +0x2E37, // DAGGER WITH RIGHT GUARD ⸷ +0x2E38, // TURNED DAGGER ⸸ +0x2E39, // TOP HALF SECTION SIGN ⸹ +0x2E3C, // STENOGRAPHIC FULL STOP ⸼ +0x2E3D, // VERTICAL SIX DOTS ⸽ +0x2E3E, // WIGGLY VERTICAL LINE ⸾ +0x2E3F, // CAPITULUM ⸿ +0x2E41, // REVERSED COMMA ⹁ +0x2E43, // DASH WITH LEFT UPTURN ⹃ +0x2E44, // DOUBLE SUSPENSION MARK ⹄ +0x2E45, // INVERTED LOW KAVYKA ⹅ +0x2E46, // INVERTED LOW KAVYKA WITH KAVYKA ABOVE ⹆ +0x2E47, // LOW KAVYKA ⹇ +0x2E48, // LOW KAVYKA WITH DOT ⹈ +0x2E49, // DOUBLE STACKED COMMA ⹉ +0x2E4A, // DOTTED SOLIDUS ⹊ +0x2E4B, // TRIPLE DAGGER ⹋ +0x2E4C, // MEDIEVAL COMMA ⹌ +0x2E4D, // PARAGRAPHUS MARK ⹍ +0x2E4E, // PUNCTUS ELEVATUS MARK ⹎ +0x2E4F, // CORNISH VERSE DIVIDER ⹏ +0x3001, // IDEOGRAPHIC COMMA 、 +0x3002, // IDEOGRAPHIC FULL STOP 。 +0x3003, // DITTO MARK 〃 +0x303D, // PART ALTERNATION MARK 〽 +0x30FB, // KATAKANA MIDDLE DOT ・ +0xA4FE, // LISU PUNCTUATION COMMA ꓾ +0xA4FF, // LISU PUNCTUATION FULL STOP ꓿ +0xA60D, // VAI COMMA ꘍ +0xA60E, // VAI FULL STOP ꘎ +0xA60F, // VAI QUESTION MARK ꘏ +0xA673, // SLAVONIC ASTERISK ꙳ +0xA67E, // CYRILLIC KAVYKA ꙾ +0xA6F2, // BAMUM NJAEMLI ꛲ +0xA6F3, // BAMUM FULL STOP ꛳ +0xA6F4, // BAMUM COLON ꛴ +0xA6F5, // BAMUM COMMA ꛵ +0xA6F6, // BAMUM SEMICOLON ꛶ +0xA6F7, // BAMUM QUESTION MARK ꛷ +0xA874, // PHAGS-PA SINGLE HEAD MARK ꡴ +0xA875, // PHAGS-PA DOUBLE HEAD MARK ꡵ +0xA876, // PHAGS-PA MARK SHAD ꡶ +0xA877, // PHAGS-PA MARK DOUBLE SHAD ꡷ +0xA8CE, // SAURASHTRA DANDA ꣎ +0xA8CF, // SAURASHTRA DOUBLE DANDA ꣏ +0xA8F8, // DEVANAGARI SIGN PUSHPIKA ꣸ +0xA8F9, // DEVANAGARI GAP FILLER ꣹ +0xA8FA, // DEVANAGARI CARET ꣺ +0xA8FC, // DEVANAGARI SIGN SIDDHAM ꣼ +0xA92E, // KAYAH LI SIGN CWI ꤮ +0xA92F, // KAYAH LI SIGN SHYA ꤯ +0xA95F, // REJANG SECTION MARK ꥟ +0xA9C1, // JAVANESE LEFT RERENGGAN ꧁ +0xA9C2, // JAVANESE RIGHT 
RERENGGAN ꧂ +0xA9C3, // JAVANESE PADA ANDAP ꧃ +0xA9C4, // JAVANESE PADA MADYA ꧄ +0xA9C5, // JAVANESE PADA LUHUR ꧅ +0xA9C6, // JAVANESE PADA WINDU ꧆ +0xA9C7, // JAVANESE PADA PANGKAT ꧇ +0xA9C8, // JAVANESE PADA LINGSA ꧈ +0xA9C9, // JAVANESE PADA LUNGSI ꧉ +0xA9CA, // JAVANESE PADA ADEG ꧊ +0xA9CB, // JAVANESE PADA ADEG ADEG ꧋ +0xA9CC, // JAVANESE PADA PISELEH ꧌ +0xA9CD, // JAVANESE TURNED PADA PISELEH ꧍ +0xA9DE, // JAVANESE PADA TIRTA TUMETES ꧞ +0xA9DF, // JAVANESE PADA ISEN-ISEN ꧟ +0xAA5C, // CHAM PUNCTUATION SPIRAL ꩜ +0xAA5D, // CHAM PUNCTUATION DANDA ꩝ +0xAA5E, // CHAM PUNCTUATION DOUBLE DANDA ꩞ +0xAA5F, // CHAM PUNCTUATION TRIPLE DANDA ꩟ +0xAADE, // TAI VIET SYMBOL HO HOI ꫞ +0xAADF, // TAI VIET SYMBOL KOI KOI ꫟ +0xAAF0, // MEETEI MAYEK CHEIKHAN ꫰ +0xAAF1, // MEETEI MAYEK AHANG KHUDAM ꫱ +0xABEB, // MEETEI MAYEK CHEIKHEI ꯫ +0xFE10, // PRESENTATION FORM FOR VERTICAL COMMA ︐ +0xFE11, // PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA ︑ +0xFE12, // PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP ︒ +0xFE13, // PRESENTATION FORM FOR VERTICAL COLON ︓ +0xFE14, // PRESENTATION FORM FOR VERTICAL SEMICOLON ︔ +0xFE15, // PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK ︕ +0xFE16, // PRESENTATION FORM FOR VERTICAL QUESTION MARK ︖ +0xFE19, // PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS ︙ +0xFE30, // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER ︰ +0xFE45, // SESAME DOT ﹅ +0xFE46, // WHITE SESAME DOT ﹆ +0xFE49, // DASHED OVERLINE ﹉ +0xFE4A, // CENTRELINE OVERLINE ﹊ +0xFE4B, // WAVY OVERLINE ﹋ +0xFE4C, // DOUBLE WAVY OVERLINE ﹌ +0xFE50, // SMALL COMMA ﹐ +0xFE51, // SMALL IDEOGRAPHIC COMMA ﹑ +0xFE52, // SMALL FULL STOP ﹒ +0xFE54, // SMALL SEMICOLON ﹔ +0xFE55, // SMALL COLON ﹕ +0xFE56, // SMALL QUESTION MARK ﹖ +0xFE57, // SMALL EXCLAMATION MARK ﹗ +0xFE5F, // SMALL NUMBER SIGN ﹟ +0xFE60, // SMALL AMPERSAND ﹠ +0xFE61, // SMALL ASTERISK ﹡ +0xFE68, // SMALL REVERSE SOLIDUS ﹨ +0xFE6A, // SMALL PERCENT SIGN ﹪ +0xFE6B, // SMALL COMMERCIAL AT ﹫ +0xFF01, // FULLWIDTH EXCLAMATION 
MARK ! +0xFF02, // FULLWIDTH QUOTATION MARK " +0xFF03, // FULLWIDTH NUMBER SIGN # +0xFF05, // FULLWIDTH PERCENT SIGN % +0xFF06, // FULLWIDTH AMPERSAND & +0xFF07, // FULLWIDTH APOSTROPHE ' +0xFF0A, // FULLWIDTH ASTERISK * +0xFF0C, // FULLWIDTH COMMA , +0xFF0E, // FULLWIDTH FULL STOP . +0xFF0F, // FULLWIDTH SOLIDUS / +0xFF1A, // FULLWIDTH COLON : +0xFF1B, // FULLWIDTH SEMICOLON ; +0xFF1F, // FULLWIDTH QUESTION MARK ? +0xFF20, // FULLWIDTH COMMERCIAL AT @ +0xFF3C, // FULLWIDTH REVERSE SOLIDUS \ +0xFF61, // HALFWIDTH IDEOGRAPHIC FULL STOP 。 +0xFF64, // HALFWIDTH IDEOGRAPHIC COMMA 、 +0xFF65, // HALFWIDTH KATAKANA MIDDLE DOT ・ +0x10100, // AEGEAN WORD SEPARATOR LINE 𐄀 +0x10101, // AEGEAN WORD SEPARATOR DOT 𐄁 +0x10102, // AEGEAN CHECK MARK 𐄂 +0x1039F, // UGARITIC WORD DIVIDER 𐎟 +0x103D0, // OLD PERSIAN WORD DIVIDER 𐏐 +0x1056F, // CAUCASIAN ALBANIAN CITATION MARK 𐕯 +0x10857, // IMPERIAL ARAMAIC SECTION SIGN 𐡗 +0x1091F, // PHOENICIAN WORD SEPARATOR 𐤟 +0x1093F, // LYDIAN TRIANGULAR MARK 𐤿 +0x10A50, // KHAROSHTHI PUNCTUATION DOT 𐩐 +0x10A51, // KHAROSHTHI PUNCTUATION SMALL CIRCLE 𐩑 +0x10A52, // KHAROSHTHI PUNCTUATION CIRCLE 𐩒 +0x10A53, // KHAROSHTHI PUNCTUATION CRESCENT BAR 𐩓 +0x10A54, // KHAROSHTHI PUNCTUATION MANGALAM 𐩔 +0x10A55, // KHAROSHTHI PUNCTUATION LOTUS 𐩕 +0x10A56, // KHAROSHTHI PUNCTUATION DANDA 𐩖 +0x10A57, // KHAROSHTHI PUNCTUATION DOUBLE DANDA 𐩗 +0x10A58, // KHAROSHTHI PUNCTUATION LINES 𐩘 +0x10A7F, // OLD SOUTH ARABIAN NUMERIC INDICATOR 𐩿 +0x10AF0, // MANICHAEAN PUNCTUATION STAR 𐫰 +0x10AF1, // MANICHAEAN PUNCTUATION FLEURON 𐫱 +0x10AF2, // MANICHAEAN PUNCTUATION DOUBLE DOT WITHIN DOT 𐫲 +0x10AF3, // MANICHAEAN PUNCTUATION DOT WITHIN DOT 𐫳 +0x10AF4, // MANICHAEAN PUNCTUATION DOT 𐫴 +0x10AF5, // MANICHAEAN PUNCTUATION TWO DOTS 𐫵 +0x10AF6, // MANICHAEAN PUNCTUATION LINE FILLER 𐫶 +0x10B39, // AVESTAN ABBREVIATION MARK 𐬹 +0x10B3A, // TINY TWO DOTS OVER ONE DOT PUNCTUATION 𐬺 +0x10B3B, // SMALL TWO DOTS OVER ONE DOT PUNCTUATION 𐬻 +0x10B3C, // LARGE TWO DOTS OVER ONE DOT 
PUNCTUATION 𐬼 +0x10B3D, // LARGE ONE DOT OVER TWO DOTS PUNCTUATION 𐬽 +0x10B3E, // LARGE TWO RINGS OVER ONE RING PUNCTUATION 𐬾 +0x10B3F, // LARGE ONE RING OVER TWO RINGS PUNCTUATION 𐬿 +0x10B99, // PSALTER PAHLAVI SECTION MARK 𐮙 +0x10B9A, // PSALTER PAHLAVI TURNED SECTION MARK 𐮚 +0x10B9B, // PSALTER PAHLAVI FOUR DOTS WITH CROSS 𐮛 +0x10B9C, // PSALTER PAHLAVI FOUR DOTS WITH DOT 𐮜 +0x10F55, // SOGDIAN PUNCTUATION TWO VERTICAL BARS 𐽕 +0x10F56, // SOGDIAN PUNCTUATION TWO VERTICAL BARS WITH DOTS 𐽖 +0x10F57, // SOGDIAN PUNCTUATION CIRCLE WITH DOT 𐽗 +0x10F58, // SOGDIAN PUNCTUATION TWO CIRCLES WITH DOTS 𐽘 +0x10F59, // SOGDIAN PUNCTUATION HALF CIRCLE WITH DOT 𐽙 +0x11047, // BRAHMI DANDA 𑁇 +0x11048, // BRAHMI DOUBLE DANDA 𑁈 +0x11049, // BRAHMI PUNCTUATION DOT 𑁉 +0x1104A, // BRAHMI PUNCTUATION DOUBLE DOT 𑁊 +0x1104B, // BRAHMI PUNCTUATION LINE 𑁋 +0x1104C, // BRAHMI PUNCTUATION CRESCENT BAR 𑁌 +0x1104D, // BRAHMI PUNCTUATION LOTUS 𑁍 +0x110BB, // KAITHI ABBREVIATION SIGN 𑂻 +0x110BC, // KAITHI ENUMERATION SIGN 𑂼 +0x110BE, // KAITHI SECTION MARK 𑂾 +0x110BF, // KAITHI DOUBLE SECTION MARK 𑂿 +0x110C0, // KAITHI DANDA 𑃀 +0x110C1, // KAITHI DOUBLE DANDA 𑃁 +0x11140, // CHAKMA SECTION MARK 𑅀 +0x11141, // CHAKMA DANDA 𑅁 +0x11142, // CHAKMA DOUBLE DANDA 𑅂 +0x11143, // CHAKMA QUESTION MARK 𑅃 +0x11174, // MAHAJANI ABBREVIATION SIGN 𑅴 +0x11175, // MAHAJANI SECTION MARK 𑅵 +0x111C5, // SHARADA DANDA 𑇅 +0x111C6, // SHARADA DOUBLE DANDA 𑇆 +0x111C7, // SHARADA ABBREVIATION SIGN 𑇇 +0x111C8, // SHARADA SEPARATOR 𑇈 +0x111CD, // SHARADA SUTRA MARK 𑇍 +0x111DB, // SHARADA SIGN SIDDHAM 𑇛 +0x111DD, // SHARADA CONTINUATION SIGN 𑇝 +0x111DE, // SHARADA SECTION MARK-1 𑇞 +0x111DF, // SHARADA SECTION MARK-2 𑇟 +0x11238, // KHOJKI DANDA 𑈸 +0x11239, // KHOJKI DOUBLE DANDA 𑈹 +0x1123A, // KHOJKI WORD SEPARATOR 𑈺 +0x1123B, // KHOJKI SECTION MARK 𑈻 +0x1123C, // KHOJKI DOUBLE SECTION MARK 𑈼 +0x1123D, // KHOJKI ABBREVIATION SIGN 𑈽 +0x112A9, // MULTANI SECTION MARK 𑊩 +0x1144B, // NEWA DANDA 𑑋 +0x1144C, // NEWA DOUBLE DANDA 
𑑌 +0x1144D, // NEWA COMMA 𑑍 +0x1144E, // NEWA GAP FILLER 𑑎 +0x1144F, // NEWA ABBREVIATION SIGN 𑑏 +0x1145B, // NEWA PLACEHOLDER MARK 𑑛 +0x1145D, // NEWA INSERTION SIGN 𑑝 +0x114C6, // TIRHUTA ABBREVIATION SIGN 𑓆 +0x115C1, // SIDDHAM SIGN SIDDHAM 𑗁 +0x115C2, // SIDDHAM DANDA 𑗂 +0x115C3, // SIDDHAM DOUBLE DANDA 𑗃 +0x115C4, // SIDDHAM SEPARATOR DOT 𑗄 +0x115C5, // SIDDHAM SEPARATOR BAR 𑗅 +0x115C6, // SIDDHAM REPETITION MARK-1 𑗆 +0x115C7, // SIDDHAM REPETITION MARK-2 𑗇 +0x115C8, // SIDDHAM REPETITION MARK-3 𑗈 +0x115C9, // SIDDHAM END OF TEXT MARK 𑗉 +0x115CA, // SIDDHAM SECTION MARK WITH TRIDENT AND U-SHAPED ORNAMENTS 𑗊 +0x115CB, // SIDDHAM SECTION MARK WITH TRIDENT AND DOTTED CRESCENTS 𑗋 +0x115CC, // SIDDHAM SECTION MARK WITH RAYS AND DOTTED CRESCENTS 𑗌 +0x115CD, // SIDDHAM SECTION MARK WITH RAYS AND DOTTED DOUBLE CRESCENTS 𑗍 +0x115CE, // SIDDHAM SECTION MARK WITH RAYS AND DOTTED TRIPLE CRESCENTS 𑗎 +0x115CF, // SIDDHAM SECTION MARK DOUBLE RING 𑗏 +0x115D0, // SIDDHAM SECTION MARK DOUBLE RING WITH RAYS 𑗐 +0x115D1, // SIDDHAM SECTION MARK WITH DOUBLE CRESCENTS 𑗑 +0x115D2, // SIDDHAM SECTION MARK WITH TRIPLE CRESCENTS 𑗒 +0x115D3, // SIDDHAM SECTION MARK WITH QUADRUPLE CRESCENTS 𑗓 +0x115D4, // SIDDHAM SECTION MARK WITH SEPTUPLE CRESCENTS 𑗔 +0x115D5, // SIDDHAM SECTION MARK WITH CIRCLES AND RAYS 𑗕 +0x115D6, // SIDDHAM SECTION MARK WITH CIRCLES AND TWO ENCLOSURES 𑗖 +0x115D7, // SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES 𑗗 +0x11641, // MODI DANDA 𑙁 +0x11642, // MODI DOUBLE DANDA 𑙂 +0x11643, // MODI ABBREVIATION SIGN 𑙃 +0x11660, // MONGOLIAN BIRGA WITH ORNAMENT 𑙠 +0x11661, // MONGOLIAN ROTATED BIRGA 𑙡 +0x11662, // MONGOLIAN DOUBLE BIRGA WITH ORNAMENT 𑙢 +0x11663, // MONGOLIAN TRIPLE BIRGA WITH ORNAMENT 𑙣 +0x11664, // MONGOLIAN BIRGA WITH DOUBLE ORNAMENT 𑙤 +0x11665, // MONGOLIAN ROTATED BIRGA WITH ORNAMENT 𑙥 +0x11666, // MONGOLIAN ROTATED BIRGA WITH DOUBLE ORNAMENT 𑙦 +0x11667, // MONGOLIAN INVERTED BIRGA 𑙧 +0x11668, // MONGOLIAN INVERTED BIRGA WITH DOUBLE ORNAMENT 𑙨 
+0x11669, // MONGOLIAN SWIRL BIRGA 𑙩 +0x1166A, // MONGOLIAN SWIRL BIRGA WITH ORNAMENT 𑙪 +0x1166B, // MONGOLIAN SWIRL BIRGA WITH DOUBLE ORNAMENT 𑙫 +0x1166C, // MONGOLIAN TURNED SWIRL BIRGA WITH DOUBLE ORNAMENT 𑙬 +0x1173C, // AHOM SIGN SMALL SECTION 𑜼 +0x1173D, // AHOM SIGN SECTION 𑜽 +0x1173E, // AHOM SIGN RULAI 𑜾 +0x1183B, // DOGRA ABBREVIATION SIGN 𑠻 +0x119E2, // NANDINAGARI SIGN SIDDHAM 𑧢 +0x11A3F, // ZANABAZAR SQUARE INITIAL HEAD MARK 𑨿 +0x11A40, // ZANABAZAR SQUARE CLOSING HEAD MARK 𑩀 +0x11A41, // ZANABAZAR SQUARE MARK TSHEG 𑩁 +0x11A42, // ZANABAZAR SQUARE MARK SHAD 𑩂 +0x11A43, // ZANABAZAR SQUARE MARK DOUBLE SHAD 𑩃 +0x11A44, // ZANABAZAR SQUARE MARK LONG TSHEG 𑩄 +0x11A45, // ZANABAZAR SQUARE INITIAL DOUBLE-LINED HEAD MARK 𑩅 +0x11A46, // ZANABAZAR SQUARE CLOSING DOUBLE-LINED HEAD MARK 𑩆 +0x11A9A, // SOYOMBO MARK TSHEG 𑪚 +0x11A9B, // SOYOMBO MARK SHAD 𑪛 +0x11A9C, // SOYOMBO MARK DOUBLE SHAD 𑪜 +0x11A9E, // SOYOMBO HEAD MARK WITH MOON AND SUN AND TRIPLE FLAME 𑪞 +0x11A9F, // SOYOMBO HEAD MARK WITH MOON AND SUN AND FLAME 𑪟 +0x11AA0, // SOYOMBO HEAD MARK WITH MOON AND SUN 𑪠 +0x11AA1, // SOYOMBO TERMINAL MARK-1 𑪡 +0x11AA2, // SOYOMBO TERMINAL MARK-2 𑪢 +0x11C41, // BHAIKSUKI DANDA 𑱁 +0x11C42, // BHAIKSUKI DOUBLE DANDA 𑱂 +0x11C43, // BHAIKSUKI WORD SEPARATOR 𑱃 +0x11C44, // BHAIKSUKI GAP FILLER-1 𑱄 +0x11C45, // BHAIKSUKI GAP FILLER-2 𑱅 +0x11C70, // MARCHEN HEAD MARK 𑱰 +0x11C71, // MARCHEN MARK SHAD 𑱱 +0x11EF7, // MAKASAR PASSIMBANG 𑻷 +0x11EF8, // MAKASAR END OF SECTION 𑻸 +0x11FFF, // TAMIL PUNCTUATION END OF TEXT 𑿿 +0x12470, // CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER 𒑰 +0x12471, // CUNEIFORM PUNCTUATION SIGN VERTICAL COLON 𒑱 +0x12472, // CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON 𒑲 +0x12473, // CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON 𒑳 +0x12474, // CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON 𒑴 +0x16A6E, // MRO DANDA 𖩮 +0x16A6F, // MRO DOUBLE DANDA 𖩯 +0x16AF5, // BASSA VAH FULL STOP 𖫵 +0x16B37, // PAHAWH HMONG SIGN VOS THOM 𖬷 +0x16B38, // PAHAWH HMONG 
SIGN VOS TSHAB CEEB 𖬸 +0x16B39, // PAHAWH HMONG SIGN CIM CHEEM 𖬹 +0x16B3A, // PAHAWH HMONG SIGN VOS THIAB 𖬺 +0x16B3B, // PAHAWH HMONG SIGN VOS FEEM 𖬻 +0x16B44, // PAHAWH HMONG SIGN XAUS 𖭄 +0x16E97, // MEDEFAIDRIN COMMA 𖺗 +0x16E98, // MEDEFAIDRIN FULL STOP 𖺘 +0x16E99, // MEDEFAIDRIN SYMBOL AIVA 𖺙 +0x16E9A, // MEDEFAIDRIN EXCLAMATION OH 𖺚 +0x16FE2, // OLD CHINESE HOOK MARK 𖿢 +0x1BC9F, // DUPLOYAN PUNCTUATION CHINOOK FULL STOP 𛲟 +0x1DA87, // SIGNWRITING COMMA 𝪇 +0x1DA88, // SIGNWRITING FULL STOP 𝪈 +0x1DA89, // SIGNWRITING SEMICOLON 𝪉 +0x1DA8A, // SIGNWRITING COLON 𝪊 +0x1DA8B, // SIGNWRITING PARENTHESIS 𝪋 +0x1E95E, // ADLAM INITIAL EXCLAMATION MARK 𞥞 +0x1E95F, // ADLAM INITIAL QUESTION MARK +] +) diff --git a/vlib/encoding/utf8/utf8_util_test.v b/vlib/encoding/utf8/utf8_util_test.v index e2d4826ad8..208fdc8674 100644 --- a/vlib/encoding/utf8/utf8_util_test.v +++ b/vlib/encoding/utf8/utf8_util_test.v @@ -25,4 +25,25 @@ fn test_utf8_util() { // test u_len function assert utf8.u_len(src1)==15 //29 assert utf8.u_len("pippo".ustring())==5 + + // western punctuation + a := '.abc?abcòàè.' + assert utf8.is_punct(a,0)==true + assert utf8.is_punct('b',0)==false + assert utf8.is_uchar_punct(0x002E)==true + assert utf8.is_punct(a,4)==true // ? + assert utf8.is_punct(a,14)==true // last . + assert utf8.is_punct(a,12)==false // è + println("OK western") + + // global punctuation + b := '.ĂĂa. ÔÔ TESTO Æ€' + assert utf8.is_global_punct(b,0)==true + assert utf8.is_global_punct('.',0)==true + assert utf8.is_uchar_punct(0x002E)==true + assert utf8.is_global_punct(b,6)==true // . + assert utf8.is_global_punct(b,1)==false // a + + // test utility functions + assert utf8.get_uchar(b,0)==0x002E }