39template<
int CODEPOINT>
static inline size_t
40utf8character (
const char *str, uint32_t *unicode)
50 if (CODEPOINT <= 1 && __builtin_expect (c < 0xc0,
true))
60 case 0xC0:
case 0xC8:
case 0xD0:
case 0xD8:
62 if (__builtin_expect ((d & 0xC0) != 0x80,
false))
65 *unicode = ((c & 0x1f) << 6) + (d & 0x3f);
70 if (__builtin_expect ((d & 0xC0) != 0x80,
false))
73 if (__builtin_expect ((e & 0xC0) != 0x80,
false))
76 *unicode = ((c & 0x0f) << 12) + ((d & 0x3f) << 6) + (e & 0x3f);
77 if (CODEPOINT >= 2 && *unicode >= 0xd800 && *unicode <= 0xdfff)
79 if (CODEPOINT >= 3 && *unicode >= 0xef80 && *unicode <= 0xefff)
85 if (__builtin_expect ((d & 0xC0) != 0x80,
false))
88 if (__builtin_expect ((e & 0xC0) != 0x80,
false))
91 if (__builtin_expect ((f & 0xC0) != 0x80,
false))
94 *unicode = ((c & 0x07) << 18) + ((d & 0x3f) << 12) + ((e & 0x3f) << 6) + (f & 0x3f);
95 if (CODEPOINT >= 2 && *unicode >= 0xd800 && *unicode <= 0xdfff)
97 if (CODEPOINT >= 3 && *unicode >= 0xef80 && *unicode <= 0xefff)
102 if (CODEPOINT >= 2 && c >= 0x80)
103 *unicode = 0xef80 - 0x80 + c;
114 const char *fstr = fschars.
c_str();
119 const int w = utf8character<3> (fstr, &unicode);
120 if (unicode >= 0xef80 && unicode <= 0xefff)
133 const char *ustr = utf8str.
c_str();
138 const int w = utf8character<1> (ustr, &unicode);
139 if (unicode >= 0xef80 && unicode <= 0xefff)
140 fschars +=
char (unicode - (0xef80 - 0x80));
152 const char *ustr = utf8str.
c_str();
157 const int w = utf8character<2> (ustr, &unicode);
158 if (unicode >= 0xef80 && unicode <= 0xefff) {
159 unicode = unicode - (0xef80 - 0x80);
170utf8codepoint (
const char *str, uint32_t *unicode)
172 return utf8character<1> (str, unicode);
177utf8skip (
const char *str)
179 return utf8character<0> (str, NULL);
187 for (l = 0; __builtin_expect (*str != 0,
true); l++)
188 str += utf8skip (str);
196 const char *c = str.
data(), *e = c + str.
size();
211 codepoints.resize (utf8str.
size());
213 codepoints.resize (l);
225 for (l = 0; __builtin_expect (*str != 0,
true); l++)
226 str += utf8codepoint (str, &codepoints[l]);
235 const size_t l = codepoints.size();
236 codepoints.reserve (codepoints.size() + str.
size());
237 const char *c = str.
data(), *
const e = c + str.
size();
241 c += utf8codepoint (c, &codepoint);
242 codepoints.push_back (codepoint);
244 return codepoints.size() - l;
253 for (
size_t i = 0; i < n_codepoints; i++)
256 if (__builtin_expect (u <= 0x7F,
true))
263 case 0x00000080 ... 0x000007FF:
267 case 0x00000800 ... 0x0000FFFF:
269 str.
push_back (0x80 + ((u >> 6) & 0x3F));
272 case 0x00010000 ... 0x0010FFFF:
274 str.
push_back (0x80 + ((u >> 12) & 0x3F));
275 str.
push_back (0x80 + ((u >> 6) & 0x3F));
289 return utf8encode (codepoints.data(), codepoints.size());
296codepoint_is_namestartchar (uint32_t c)
299 std::isalpha (c) || c ==
'_' ||
300 (c >= 0xC0 && c <= 0xD6) || (c >= 0xD8 && c <= 0xF6) ||
301 (c >= 0xF8 && c <= 0x2FF) || (c >= 0x370 && c <= 0x37D) || (c >= 0x37F && c <= 0x1FFF) ||
302 (c >= 0x200C && c <= 0x200D) || (c >= 0x2070 && c <= 0x218F) || (c >= 0x2C00 && c <= 0x2FEF) ||
303 (c >= 0x3001 && c <= 0xD7FF) || (c >= 0xF900 && c <= 0xFDCF) || (c >= 0xFDF0 && c <= 0xFFFD) ||
304 (c >= 0x10000 && c <= 0xEFFFF);
312codepoint_is_ncname (uint32_t c)
315 codepoint_is_namestartchar (c) ||
316 c ==
'-' || c ==
'.' || (c >=
'0' && c <=
'9') ||
317 c == 0xB7 || (c >= 0x0300 && c <= 0x036F) || (c >= 0x203F && c <= 0x2040);
330 if (!codepoint_is_ncname (c))
343 for (
auto it = ucstring.begin(); it != ucstring.end(); )
344 if (!codepoint_is_ncname (*it)) {
348 it = ucstring.erase (it);
351 if (!ucstring.empty() && !codepoint_is_namestartchar (ucstring[0]))
352 ucstring.insert (ucstring.begin(),
'_');
367unicode_displayfs_tests()
370 const char *
const asciistr =
"\001\t09AZaz|~\177";
381 const char *
const lowbytes =
"\x80\x87\x88\x8f\x90\x97\x98\x9f\xa0\xa7\xa8\xaf\xb0\xb7\xb8\xbf";
382 const char *
const low2utf8 =
"\u0080\u0087\u0088\u008f\u0090\u0097\u0098\u009f\u00a0\u00a7\u00a8\u00af\u00b0\u00b7\u00b8\u00bf";
391 const char *
const highbytes =
"\xc0 \xc7 \xc8 \xcf \xd0 \xd7 \xd8 \xdf \xe0 \xe7 \xe8 \xef \xf0 \xf7 \xf8 \xff";
392 const char *
const high2utf8 =
"\u00c0 \u00c7 \u00c8 \u00cf \u00d0 \u00d7 \u00d8 \u00df \u00e0 \u00e7 \u00e8 \u00ef \u00f0 \u00f7 \u00f8 \u00ff";
402 const char *
const utf8str =
"äöüßÄÖÜïÿ";
411 const char *
const lat1str =
"\xe4\xf6\xfc\xdf\xc4\xd6\xdc\xef\xff";
416 const char *
const srg8str =
"\xed\xb2\x80\xed\xb3\xbf\xee\xbf\xa4\xee\xbf\xbf";
432 size_t ase_utf8len, glib_utf8len;
434 glib_utf8len = g_utf8_strlen (str.
c_str(), -1);
435 TCMP (ase_utf8len, ==, glib_utf8len);
437 size_t nc = 0, cc = 0, pc = 0;
438 for (
size_t i = 0; i <= unicode_last_codepoint; i++)
446 codepoints.push_back (i);
450 TASSERT (pc == 6400 + 65534 + 65534);
453 glib_utf8len = g_utf8_strlen (big.
c_str(), -1);
454 TCMP (ase_utf8len, ==, glib_utf8len);
455 TCMP (ase_utf8len, ==, codepoints.size());
460 TASSERT (tmp_result == tmp.size() && codepoints.size() == tmp_result);
461 for (
size_t i = 0; i < codepoints.size(); ++i)
462 TASSERT (tmp[i] == codepoints[i]);
Binary large object storage container.
static Blob from_file(const String &filename)
Create Blob by loading from filename.
String string()
Copy Blob data into a zero terminated string.
static bool utf8_validate(const std::string &utf8string)
Validate UTF-8 string with websocketpp::utf8_validator.
#define TEST_INTEGRITY(FUNC)
Register func as an integrity test.
The Anklang C++ API namespace.
bool string_is_ncname(const String &input)
std::string utf8encode(const uint32_t *codepoints, size_t n_codepoints)
Convert codepoints into a UTF-8 string, using the shortest possible encoding.
constexpr bool unicode_is_private(uint32_t u)
Return whether u is in one of the 3 private use areas of Unicode.
size_t utf8len(const char *str)
Count valid UTF-8 sequences, invalid sequences are counted as Latin-1 characters.
constexpr bool unicode_is_noncharacter(uint32_t u)
Return whether u is one of the 66 Unicode noncharacters.
size_t utf8_to_unicode(const char *str, uint32_t *codepoints)
constexpr bool unicode_is_character(uint32_t u)
Return whether u is not one of the 66 Unicode noncharacters.
std::string decodefs(const std::string &utf8str)
Decode UTF-8 string back into file system path representation, extracting surrogate code points as by...
constexpr bool unicode_is_valid(uint32_t u)
Return whether u is an allowed Unicode codepoint within 0x10FFFF and not part of a UTF-16 surrogate p...
std::string displayfs(const std::string &utf8str)
Convert UTF-8 encoded file system path into human readable display format, the conversion is lossy bu...
constexpr bool unicode_is_assigned(uint32_t u)
Return whether u matches any of the assigned Unicode planes.
constexpr bool unicode_is_control_code(uint32_t u)
Return whether u is one of the 65 Unicode control codes.
std::string encodefs(const std::string &fschars)
Encode a file system path consisting of bytes into UTF-8, using surrogate code points to store non UT...
String string_to_ncname(const String &input, uint32_t substitute)
std::vector< uint32_t > utf8decode(const std::string &utf8str)
Convert valid UTF-8 sequences to Unicode codepoints, invalid sequences are treated as Latin-1 charact...
#define TASSERT(cond)
Unconditional test assertion, enters breakpoint if not fulfilled.
#define TCMP(a, cmp, b)
Compare a and b according to operator cmp, verbose on failure.