Anklang 0.3.0-460-gc4ef46ba
ASE — Anklang Sound Engine (C++)

« « « Anklang Documentation
Loading...
Searching...
No Matches
unicode.cc
Go to the documentation of this file.
1 // This Source Code Form is licensed MPL-2.0: http://mozilla.org/MPL/2.0
2#include "unicode.hh"
3#include "blob.hh"
4#include "platform.hh"
5#include "websocket.hh"
6
7#include <glib.h>
8
9namespace Ase {
10
11/* https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf
12 * Table 3-6. UTF-8 Bit Distribution
13 * | Scalar Value | First Byte | Second Byte | Third Byte | Fourth Byte
14 * | 00000000 0xxxxxxx | 0xxxxxxx | | |
15 * | 00000yyy yyxxxxxx | 110yyyyy | 10xxxxxx | |
16 * | zzzzyyyy yyxxxxxx | 1110zzzz | 10yyyyyy | 10xxxxxx |
17 * | 000uuuuu zzzzyyyy yyxxxxxx | 11110uuu | 10uuzzzz | 10yyyyyy | 10xxxxxx
18 *
19 * Table 3-7. Well-Formed UTF-8 Byte Sequences
20 * | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte
21 * | U+0000..U+007F | 00..7F | | |
22 * | U+0080..U+07FF | C2..DF | 80..BF | |
23 * | U+0800..U+0FFF | E0 | A0..BF | 80..BF |
24 * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF |
25 * | U+D000..U+D7FF | ED | 80..9F | 80..BF |
26 * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF |
27 * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF
28 * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF
29 * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF
30 */
31
/** Decode a single character from the NUL terminated byte string `str`.
 * Returns the number of bytes consumed, always 1..4. A following byte is only read
 * after the previous one was verified to be a continuation byte, so decoding never
 * runs past the terminating NUL.
 * The template argument CODEPOINT selects how strict decoding is:
 * - 0: Only determine the sequence length, `unicode` is never accessed (may be NULL).
 * - 1: Lenient; bytes that do not form a sequence are stored as Latin-1 code points.
 * - 2: Strict; UTF-16 surrogates, overlong forms and values above U+10FFFF are rejected.
 *      A rejected byte >= 0x80 is consumed alone and stored as escape code point
 *      0xef80 - 0x80 + byte, i.e. within the private use range 0xef80..0xefff.
 * - 3: Like 2, but sequences that encode U+EF80..U+EFFF are rejected as well, so any
 *      escape code point in the result stems from an escaped byte.
 */
template<int CODEPOINT> static inline size_t // returns length of unicode char
utf8character (const char *str, uint32_t *unicode)
{
  /* https://en.wikipedia.org/wiki/UTF-8
     : 0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
     :  000  001  002  003  004  005  006  007  010  011  012  013  014  015  016  017
     :    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15
     :  0x0  0x1  0x2  0x3  0x4  0x5  0x6  0x7  0x8  0x9  0xA  0xB  0xC  0xD  0xE  0xF
   */
  const uint8_t c = str[0];
  // optimized for one-byte sequences
  if (CODEPOINT <= 1 && __builtin_expect (c < 0xc0, true))
    {
      if (CODEPOINT)
        *unicode = c;                   // valid if c <= 0x7F
      return 1;                         // treat as Latin-1 otherwise
    }
  // multi-byte sequences, dispatch on the 5 high bits of the lead byte
  switch (c & 0xF8)
    {
      uint8_t d, e, f;
    case 0xC0: case 0xC8: case 0xD0: case 0xD8: // 2-byte sequence
      d = str[1];
      if (__builtin_expect ((d & 0xC0) != 0x80, false))
        goto one_byte;
      if (CODEPOINT) {
        *unicode = ((c & 0x1f) << 6) + (d & 0x3f);
        if (CODEPOINT >= 2 && *unicode < 0x80)
          goto one_byte;                // overlong form (lead byte C0 or C1)
      }
      return 2;                         // valid
    case 0xE0: case 0xE8:               // 3-byte sequence
      d = str[1];
      if (__builtin_expect ((d & 0xC0) != 0x80, false))
        goto one_byte;
      e = str[2];
      if (__builtin_expect ((e & 0xC0) != 0x80, false))
        goto one_byte;
      if (CODEPOINT) {
        *unicode = ((c & 0x0f) << 12) + ((d & 0x3f) << 6) + (e & 0x3f);
        if (CODEPOINT >= 2 && *unicode < 0x800)
          goto one_byte;                // overlong form (E0 followed by 80..9F)
        if (CODEPOINT >= 2 && *unicode >= 0xd800 && *unicode <= 0xdfff)
          goto one_byte;                // UTF-16 surrogates are invalid in UTF-8
        if (CODEPOINT >= 3 && *unicode >= 0xef80 && *unicode <= 0xefff)
          goto one_byte;                // MirBSD OPTU-8/16 private use
      }
      return 3;                         // valid
    case 0xF0:                          // 4-byte sequence
      d = str[1];
      if (__builtin_expect ((d & 0xC0) != 0x80, false))
        goto one_byte;
      e = str[2];
      if (__builtin_expect ((e & 0xC0) != 0x80, false))
        goto one_byte;
      f = str[3];
      if (__builtin_expect ((f & 0xC0) != 0x80, false))
        goto one_byte;
      if (CODEPOINT) {
        *unicode = ((c & 0x07) << 18) + ((d & 0x3f) << 12) + ((e & 0x3f) << 6) + (f & 0x3f);
        // Values below 0x10000 are overlong forms (this also covers surrogates and the
        // escape range), values above 0x10FFFF lie outside of the Unicode code space.
        if (CODEPOINT >= 2 && (*unicode < 0x10000 || *unicode > 0x10ffff))
          goto one_byte;
      }
      return 4;                         // valid
    default: one_byte:
      if (CODEPOINT >= 2 && c >= 0x80)
        *unicode = 0xef80 - 0x80 + c;   // escape byte as private use code point
      else if (CODEPOINT)
        *unicode = c;                   // treat as Latin-1 otherwise
      return 1;
    }
}
109
112encodefs (const std::string &fschars)
113{
114 const char *fstr = fschars.c_str();
115 std::string utf8str;
116 utf8str.reserve (fschars.size());
117 while (*fstr) {
118 uint32_t unicode;
119 const int w = utf8character<3> (fstr, &unicode);
120 if (unicode >= 0xef80 && unicode <= 0xefff)
121 utf8str += utf8encode (&unicode, 1);
122 else
123 utf8str += std::string_view (fstr, w);
124 fstr += w;
125 }
126 return utf8str;
127}
128
131decodefs (const std::string &utf8str)
132{
133 const char *ustr = utf8str.c_str();
134 std::string fschars;
135 fschars.reserve (utf8str.size());
136 while (*ustr) {
137 uint32_t unicode;
138 const int w = utf8character<1> (ustr, &unicode);
139 if (unicode >= 0xef80 && unicode <= 0xefff)
140 fschars += char (unicode - (0xef80 - 0x80));
141 else
142 fschars += std::string_view (ustr, w);
143 ustr += w;
144 }
145 return fschars;
146}
147
150displayfs (const std::string &utf8str)
151{
152 const char *ustr = utf8str.c_str();
153 std::string display;
154 display.reserve (utf8str.size());
155 while (*ustr) {
156 uint32_t unicode;
157 const int w = utf8character<2> (ustr, &unicode);
158 if (unicode >= 0xef80 && unicode <= 0xefff) {
159 unicode = unicode - (0xef80 - 0x80); // leaves 0x80..0xff
160 display += utf8encode (&unicode, 1);
161 } else
162 display += std::string_view (ustr, w);
163 ustr += w;
164 }
165 return display;
166}
167
169static inline size_t
170utf8codepoint (const char *str, uint32_t *unicode)
171{
172 return utf8character<1> (str, unicode);
173}
174
176static inline size_t
177utf8skip (const char *str)
178{
179 return utf8character<0> (str, NULL);
180}
181
183size_t
184utf8len (const char *str)
185{
186 size_t l;
187 for (l = 0; __builtin_expect (*str != 0, true); l++)
188 str += utf8skip (str);
189 return l;
190}
191
193size_t
195{
196 const char *c = str.data(), *e = c + str.size();
197 size_t l = 0;
198 while (c < e)
199 {
200 c += utf8skip (c);
201 l += 1;
202 }
203 return l;
204}
205
208utf8decode (const std::string &utf8str)
209{
210 std::vector<uint32_t> codepoints;
211 codepoints.resize (utf8str.size());
212 size_t l = utf8_to_unicode (utf8str.c_str(), &codepoints[0]);
213 codepoints.resize (l);
214 return codepoints;
215}
216
220size_t
221utf8_to_unicode (const char *str, uint32_t *codepoints)
222{
223 // assuming sizeof codepoints[] >= sizeof str[]
224 size_t l;
225 for (l = 0; __builtin_expect (*str != 0, true); l++)
226 str += utf8codepoint (str, &codepoints[l]);
227 return l;
228}
229
232size_t
234{
235 const size_t l = codepoints.size();
236 codepoints.reserve (codepoints.size() + str.size());
237 const char *c = str.data(), *const e = c + str.size();
238 while (c < e)
239 {
240 uint32_t codepoint;
241 c += utf8codepoint (c, &codepoint);
242 codepoints.push_back (codepoint);
243 }
244 return codepoints.size() - l;
245}
246
/** Convert `n_codepoints` code points from `codepoints` into a UTF-8 string.
 * Each value is encoded in its shortest form (1 to 4 bytes). Values above 0x10FFFF
 * cannot be encoded and are silently skipped.
 */
std::string
utf8encode (const uint32_t *codepoints, size_t n_codepoints)
{
  std::string str;
  str.reserve (n_codepoints);           // lower bound, exact for pure ASCII
  for (size_t i = 0; i < n_codepoints; i++)
    {
      const uint32_t u = codepoints[i];
      if (__builtin_expect (u <= 0x7F, true))
        {
          str.push_back (static_cast<char> (u));
        }
      else if (u <= 0x7FF)              // 110yyyyy 10xxxxxx
        {
          str.push_back (static_cast<char> (0xC0 + (u >> 6)));
          str.push_back (static_cast<char> (0x80 + (u & 0x3F)));
        }
      else if (u <= 0xFFFF)             // 1110zzzz 10yyyyyy 10xxxxxx
        {
          str.push_back (static_cast<char> (0xE0 + (u >> 12)));
          str.push_back (static_cast<char> (0x80 + ((u >> 6) & 0x3F)));
          str.push_back (static_cast<char> (0x80 + (u & 0x3F)));
        }
      else if (u <= 0x10FFFF)           // 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
        {
          str.push_back (static_cast<char> (0xF0 + (u >> 18)));
          str.push_back (static_cast<char> (0x80 + ((u >> 12) & 0x3F)));
          str.push_back (static_cast<char> (0x80 + ((u >> 6) & 0x3F)));
          str.push_back (static_cast<char> (0x80 + (u & 0x3F)));
        }
      // else: outside of the Unicode code space, nothing is emitted
    }
  return str;
}
284
288{
289 return utf8encode (codepoints.data(), codepoints.size());
290}
291
/** Check whether `c` may start an NCName, i.e. an XML NameStartChar other than ':'.
 * ASCII letters are tested by explicit ranges: std::isalpha() is only defined for
 * values representable as unsigned char (or EOF) and depends on the current locale,
 * whereas `c` may be any code point.
 */
static bool
codepoint_is_namestartchar (uint32_t c)
{
  const bool ok =
    (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_' ||
    (c >= 0xC0 && c <= 0xD6) || (c >= 0xD8 && c <= 0xF6) ||
    (c >= 0xF8 && c <= 0x2FF) || (c >= 0x370 && c <= 0x37D) || (c >= 0x37F && c <= 0x1FFF) ||
    (c >= 0x200C && c <= 0x200D) || (c >= 0x2070 && c <= 0x218F) || (c >= 0x2C00 && c <= 0x2FEF) ||
    (c >= 0x3001 && c <= 0xD7FF) || (c >= 0xF900 && c <= 0xFDCF) || (c >= 0xFDF0 && c <= 0xFFFD) ||
    (c >= 0x10000 && c <= 0xEFFFF);
  return ok;
}
307
311static bool
312codepoint_is_ncname (uint32_t c)
313{
314 const bool ok =
315 codepoint_is_namestartchar (c) ||
316 c == '-' || c == '.' || (c >= '0' && c <= '9') ||
317 c == 0xB7 || (c >= 0x0300 && c <= 0x036F) || (c >= 0x203F && c <= 0x2040);
318 return ok;
319}
320
324bool
326{
328 utf8_to_unicode (input, tmp);
329 for (auto c : tmp)
330 if (!codepoint_is_ncname (c))
331 return false;
332 return true;
333}
334
338String
339string_to_ncname (const String &input, uint32_t substitute)
340{
341 std::vector<uint32_t> ucstring;
342 utf8_to_unicode (input, ucstring);
343 for (auto it = ucstring.begin(); it != ucstring.end(); )
344 if (!codepoint_is_ncname (*it)) {
345 if (substitute)
346 *it++ = substitute;
347 else
348 it = ucstring.erase (it);
349 } else
350 ++it;
351 if (!ucstring.empty() && !codepoint_is_namestartchar (ucstring[0]))
352 ucstring.insert (ucstring.begin(), '_');
353 return utf8encode (ucstring);
354}
355
356} // Ase
357
358// == Testing ==
359#include "testing.hh"
360#include "internal.hh"
361
362namespace { // Anon
363using namespace Ase;
364
365TEST_INTEGRITY (unicode_displayfs_tests);
366static void
367unicode_displayfs_tests()
368{
369 // ASCII is fully preserved in encodefs, decodefs, displayfs
370 const char *const asciistr = "\001\t09AZaz|~\177";
373 TASSERT (decodefs (encodefs (asciistr)) == asciistr);
374 TASSERT (displayfs (encodefs (asciistr)) == asciistr);
376 TASSERT (displayfs (asciistr) == asciistr);
378 TASSERT (encodefs (asciistr) == asciistr);
379 TASSERT (decodefs (asciistr) == asciistr);
380 // non UTF-8 bytes need conversions
381 const char *const lowbytes = "\x80\x87\x88\x8f\x90\x97\x98\x9f\xa0\xa7\xa8\xaf\xb0\xb7\xb8\xbf";
382 const char *const low2utf8 = "\u0080\u0087\u0088\u008f\u0090\u0097\u0098\u009f\u00a0\u00a7\u00a8\u00af\u00b0\u00b7\u00b8\u00bf";
384 TASSERT (decodefs (encodefs (lowbytes)) == lowbytes);
385 TASSERT (displayfs (encodefs (lowbytes)) == low2utf8);
387 TASSERT (displayfs (lowbytes) == low2utf8);
389 TASSERT (encodefs (lowbytes) != lowbytes);
390 // non UTF-8 sequences need conversions
391 const char *const highbytes = "\xc0 \xc7 \xc8 \xcf \xd0 \xd7 \xd8 \xdf \xe0 \xe7 \xe8 \xef \xf0 \xf7 \xf8 \xff";
392 const char *const high2utf8 = "\u00c0 \u00c7 \u00c8 \u00cf \u00d0 \u00d7 \u00d8 \u00df \u00e0 \u00e7 \u00e8 \u00ef \u00f0 \u00f7 \u00f8 \u00ff";
394 TASSERT (decodefs (encodefs (highbytes)) == highbytes);
396 TASSERT (displayfs (highbytes) == high2utf8);
397 TASSERT (displayfs (encodefs (highbytes)) == high2utf8);
399 TASSERT (encodefs (highbytes) != highbytes);
401 // UTF-8 characters are fully preserved in encodefs, decodefs, displayfs
402 const char *const utf8str = "äöüßÄÖÜïÿ";
404 TASSERT (decodefs (encodefs (utf8str)) == utf8str);
406 TASSERT (displayfs (utf8str) == utf8str);
407 TASSERT (displayfs (encodefs (utf8str)) == utf8str);
409 TASSERT (encodefs (utf8str) == utf8str);
410 TASSERT (decodefs (utf8str) == utf8str);
411 const char *const lat1str = "\xe4\xf6\xfc\xdf\xc4\xd6\xdc\xef\xff";
413 TASSERT (displayfs (encodefs (lat1str)) == utf8str);
414 TASSERT (displayfs (lat1str) == utf8str);
415 // Preserve filenames containing UTF-8 encoded surrogates and private use codes
416 const char *const srg8str = "\xed\xb2\x80\xed\xb3\xbf\xee\xbf\xa4\xee\xbf\xbf";
417 TASSERT (encodefs (srg8str) != srg8str);
419 const std::string srg8enc = encodefs (srg8str);
420 TASSERT (decodefs (srg8enc) == srg8str);
421 TASSERT (displayfs (srg8enc) != srg8enc);
423 TASSERT (displayfs (srg8enc) != displayfs (srg8str));
424}
425
426TEST_INTEGRITY (unicode_tests);
427static void
428unicode_tests()
429{
430 Blob b = Blob::from_file ("/etc/mailcap");
431 const std::string str = b.string();
432 size_t ase_utf8len, glib_utf8len;
433 ase_utf8len = utf8len (str.c_str());
434 glib_utf8len = g_utf8_strlen (str.c_str(), -1);
435 TCMP (ase_utf8len, ==, glib_utf8len);
436 std::vector<uint32_t> codepoints;
437 size_t nc = 0, cc = 0, pc = 0;
438 for (size_t i = 0; i <= unicode_last_codepoint; i++)
439 {
442 nc += unicode_is_noncharacter (i);
443 cc += unicode_is_control_code (i);
444 pc += unicode_is_private (i);
445 if (i && unicode_is_assigned (i))
446 codepoints.push_back (i);
447 }
448 TASSERT (nc == 66);
449 TASSERT (cc == 65);
450 TASSERT (pc == 6400 + 65534 + 65534);
451 std::string big = utf8encode (codepoints);
452 ase_utf8len = utf8len (big.c_str());
453 glib_utf8len = g_utf8_strlen (big.c_str(), -1);
454 TCMP (ase_utf8len, ==, glib_utf8len);
455 TCMP (ase_utf8len, ==, codepoints.size());
456 if (true)
457 {
459 const size_t tmp_result = utf8_to_unicode (big, tmp);
460 TASSERT (tmp_result == tmp.size() && codepoints.size() == tmp_result);
461 for (size_t i = 0; i < codepoints.size(); ++i)
462 TASSERT (tmp[i] == codepoints[i]);
463 }
464 TCMP (false, ==, string_is_ncname ("0abc@def^foo"));
465 TCMP ("_0abcdeffoo", ==, string_to_ncname ("0abc@def^foo"));
466 TCMP ("abc_def_foo", ==, string_to_ncname ("abc@def^foo", '_'));
467 TCMP (true, ==, string_is_ncname ("_0abc_def_foo"));
468}
469
470} // Anon
T c_str(T... args)
Binary large object storage container.
Definition blob.hh:12
static Blob from_file(const String &filename)
Create Blob by loading from filename.
Definition blob.cc:196
String string()
Copy Blob data into a zero terminated string.
Definition blob.cc:117
static bool utf8_validate(const std::string &utf8string)
Validate UTF-8 string with websocketpp::utf8_validator.
Definition websocket.cc:617
T data(T... args)
#define TEST_INTEGRITY(FUNC)
Register func as an integrity test.
Definition internal.hh:77
typedef char
The Anklang C++ API namespace.
Definition api.hh:9
bool string_is_ncname(const String &input)
Definition unicode.cc:325
std::string utf8encode(const uint32_t *codepoints, size_t n_codepoints)
Convert codepoints into an UTF-8 string, using the shortest possible encoding.
Definition unicode.cc:249
constexpr bool unicode_is_private(uint32_t u)
Return whether u is in one of the 3 private use areas of Unicode.
Definition unicode.hh:82
size_t utf8len(const char *str)
Count valid UTF-8 sequences, invalid sequences are counted as Latin-1 characters.
Definition unicode.cc:184
constexpr bool unicode_is_noncharacter(uint32_t u)
Return whether u is one of the 66 Unicode noncharacters.
Definition unicode.hh:59
size_t utf8_to_unicode(const char *str, uint32_t *codepoints)
Definition unicode.cc:221
constexpr bool unicode_is_character(uint32_t u)
Return whether u is not one of the 66 Unicode noncharacters.
Definition unicode.hh:67
std::string decodefs(const std::string &utf8str)
Decode UTF-8 string back into file system path representation, extracting surrogate code points as by...
Definition unicode.cc:131
constexpr bool unicode_is_valid(uint32_t u)
Return whether u is an allowed Unicode codepoint within 0x10FFFF and not part of a UTF-16 surrogate p...
Definition unicode.hh:51
std::string displayfs(const std::string &utf8str)
Convert UTF-8 encoded file system path into human readable display format, the conversion is lossy bu...
Definition unicode.cc:150
constexpr bool unicode_is_assigned(uint32_t u)
Return whether u matches any of the assigned Unicode planes.
Definition unicode.hh:33
constexpr bool unicode_is_control_code(uint32_t u)
Return whether u is one of the 65 Unicode control codes.
Definition unicode.hh:74
std::string encodefs(const std::string &fschars)
Encode a file system path consisting of bytes into UTF-8, using surrogate code points to store non UT...
Definition unicode.cc:112
String string_to_ncname(const String &input, uint32_t substitute)
Definition unicode.cc:339
std::vector< uint32_t > utf8decode(const std::string &utf8str)
Convert valid UTF-8 sequences to Unicode codepoints, invalid sequences are treated as Latin-1 charact...
Definition unicode.cc:208
T push_back(T... args)
T reserve(T... args)
T size(T... args)
typedef uint8_t
#define TASSERT(cond)
Unconditional test assertion, enters breakpoint if not fulfilled.
Definition testing.hh:24
#define TCMP(a, cmp, b)
Compare a and b according to operator cmp, verbose on failure.
Definition testing.hh:23