39template<
int CODEPOINT> 
static inline size_t  
   40utf8character (
const char *str, uint32_t *unicode)
 
   50  if (CODEPOINT <= 1 && __builtin_expect (c < 0xc0, 
true))
 
   60    case 0xC0: 
case 0xC8: 
case 0xD0: 
case 0xD8:         
 
   62      if (__builtin_expect ((d & 0xC0) != 0x80, 
false))
 
   65        *unicode = ((c & 0x1f) << 6) + (d & 0x3f);
 
   70      if (__builtin_expect ((d & 0xC0) != 0x80, 
false))
 
   73      if (__builtin_expect ((e & 0xC0) != 0x80, 
false))
 
   76        *unicode = ((c & 0x0f) << 12) + ((d & 0x3f) << 6) + (e & 0x3f);
 
   77        if (CODEPOINT >= 2 && *unicode >= 0xd800 && *unicode <= 0xdfff)
 
   79        if (CODEPOINT >= 3 && *unicode >= 0xef80 && *unicode <= 0xefff)
 
   85      if (__builtin_expect ((d & 0xC0) != 0x80, 
false))
 
   88      if (__builtin_expect ((e & 0xC0) != 0x80, 
false))
 
   91      if (__builtin_expect ((f & 0xC0) != 0x80, 
false))
 
   94        *unicode = ((c & 0x07) << 18) + ((d & 0x3f) << 12) + ((e & 0x3f) << 6) + (f & 0x3f);
 
   95        if (CODEPOINT >= 2 && *unicode >= 0xd800 && *unicode <= 0xdfff)
 
   97        if (CODEPOINT >= 3 && *unicode >= 0xef80 && *unicode <= 0xefff)
 
  102      if (CODEPOINT >= 2 && c >= 0x80)
 
  103        *unicode = 0xef80 - 0x80 + c;                   
 
  114  const char *fstr = fschars.
c_str();
 
  119    const int w = utf8character<3> (fstr, &unicode);
 
  120    if (unicode >= 0xef80 && unicode <= 0xefff)
 
 
  133  const char *ustr = utf8str.
c_str();
 
  138    const int w = utf8character<1> (ustr, &unicode);
 
  139    if (unicode >= 0xef80 && unicode <= 0xefff)
 
  140      fschars += 
char (unicode - (0xef80 - 0x80));
 
 
  152  const char *ustr = utf8str.
c_str();
 
  157    const int w = utf8character<2> (ustr, &unicode);
 
  158    if (unicode >= 0xef80 && unicode <= 0xefff) {
 
  159      unicode = unicode - (0xef80 - 0x80); 
 
 
  170utf8codepoint (
const char *str, uint32_t *unicode)
 
  172  return utf8character<1> (str, unicode);
 
  177utf8skip (
const char *str)
 
  179  return utf8character<0> (str, NULL);
 
  187  for (l = 0; __builtin_expect (*str != 0, 
true); l++)
 
  188    str += utf8skip (str);
 
 
  196  const char *c = str.
data(), *e = c + str.
size();
 
 
  211  codepoints.resize (utf8str.
size());
 
  213  codepoints.resize (l);
 
 
  225  for (l = 0; __builtin_expect (*str != 0, 
true); l++)
 
  226    str += utf8codepoint (str, &codepoints[l]);
 
 
  235  const size_t l = codepoints.size();
 
  236  codepoints.reserve (codepoints.size() + str.
size());
 
  237  const char *c = str.
data(), *
const e = c + str.
size();
 
  241      c += utf8codepoint (c, &codepoint);
 
  242      codepoints.push_back (codepoint);
 
  244  return codepoints.size() - l;
 
 
  253  for (
size_t i = 0; i < n_codepoints; i++)
 
  256      if (__builtin_expect (u <= 0x7F, 
true))
 
  263        case 0x00000080 ... 0x000007FF:
 
  267        case 0x00000800 ... 0x0000FFFF:
 
  269          str.
push_back (0x80 + ((u >>  6) & 0x3F));
 
  272        case 0x00010000 ... 0x0010FFFF:
 
  274          str.
push_back (0x80 + ((u >> 12) & 0x3F));
 
  275          str.
push_back (0x80 + ((u >>  6) & 0x3F));
 
 
  289  return utf8encode (codepoints.data(), codepoints.size());
 
 
  296codepoint_is_namestartchar (uint32_t c)
 
  299    std::isalpha (c) || c == 
'_' ||
 
  300    (c >= 0xC0 && c <= 0xD6) || (c >= 0xD8 && c <= 0xF6) ||
 
  301    (c >= 0xF8 && c <= 0x2FF) || (c >= 0x370 && c <= 0x37D) || (c >= 0x37F && c <= 0x1FFF) ||
 
  302    (c >= 0x200C && c <= 0x200D) || (c >= 0x2070 && c <= 0x218F) || (c >= 0x2C00 && c <= 0x2FEF) ||
 
  303    (c >= 0x3001 && c <= 0xD7FF) || (c >= 0xF900 && c <= 0xFDCF) || (c >= 0xFDF0 && c <= 0xFFFD) ||
 
  304    (c >= 0x10000 && c <= 0xEFFFF);
 
  312codepoint_is_ncname (uint32_t c)
 
  315    codepoint_is_namestartchar (c) ||
 
  316    c == 
'-' || c == 
'.' || (c >= 
'0' && c <= 
'9') ||
 
  317    c == 0xB7 || (c >= 0x0300 && c <= 0x036F) || (c >= 0x203F && c <= 0x2040);
 
  330    if (!codepoint_is_ncname (c))
 
 
  343  for (
auto it = ucstring.begin(); it != ucstring.end(); )
 
  344    if (!codepoint_is_ncname (*it)) {
 
  348        it = ucstring.erase (it);
 
  351  if (!ucstring.empty() && !codepoint_is_namestartchar (ucstring[0]))
 
  352    ucstring.insert (ucstring.begin(), 
'_');
 
 
  367unicode_displayfs_tests()
 
  370  const char *
const asciistr = 
"\001\t09AZaz|~\177";
 
  381  const char *
const lowbytes = 
"\x80\x87\x88\x8f\x90\x97\x98\x9f\xa0\xa7\xa8\xaf\xb0\xb7\xb8\xbf";
 
  382  const char *
const low2utf8 = 
"\u0080\u0087\u0088\u008f\u0090\u0097\u0098\u009f\u00a0\u00a7\u00a8\u00af\u00b0\u00b7\u00b8\u00bf";
 
  391  const char *
const highbytes = 
"\xc0 \xc7 \xc8 \xcf \xd0 \xd7 \xd8 \xdf \xe0 \xe7 \xe8 \xef \xf0 \xf7 \xf8 \xff";
 
  392  const char *
const high2utf8 = 
"\u00c0 \u00c7 \u00c8 \u00cf \u00d0 \u00d7 \u00d8 \u00df \u00e0 \u00e7 \u00e8 \u00ef \u00f0 \u00f7 \u00f8 \u00ff";
 
  402  const char *
const utf8str = 
"äöüßÄÖÜïÿ";
 
  411  const char *
const lat1str = 
"\xe4\xf6\xfc\xdf\xc4\xd6\xdc\xef\xff";
 
  416  const char *
const srg8str = 
"\xed\xb2\x80\xed\xb3\xbf\xee\xbf\xa4\xee\xbf\xbf";
 
  432  size_t ase_utf8len, glib_utf8len;
 
  434  glib_utf8len = g_utf8_strlen (str.
c_str(), -1);
 
  435  TCMP (ase_utf8len, ==, glib_utf8len);
 
  437  size_t nc = 0, cc = 0, pc = 0;
 
  438  for (
size_t i = 0; i <= unicode_last_codepoint; i++)
 
  446        codepoints.push_back (i);
 
  450  TASSERT (pc == 6400 + 65534 + 65534);
 
  453  glib_utf8len = g_utf8_strlen (big.
c_str(), -1);
 
  454  TCMP (ase_utf8len, ==, glib_utf8len);
 
  455  TCMP (ase_utf8len, ==, codepoints.size());
 
  460      TASSERT (tmp_result == tmp.size() && codepoints.size() == tmp_result);
 
  461      for (
size_t i = 0; i < codepoints.size(); ++i)
 
  462        TASSERT (tmp[i] == codepoints[i]);
 
Binary large object storage container.
 
static Blob from_file(const String &filename)
Create Blob by loading from filename.
 
String string()
Copy Blob data into a zero terminated string.
 
static bool utf8_validate(const std::string &utf8string)
Validate UTF-8 string with websocketpp::utf8_validator.
 
#define TEST_INTEGRITY(FUNC)
Register func as an integrity test.
 
The Anklang C++ API namespace.
 
bool string_is_ncname(const String &input)
 
std::string utf8encode(const uint32_t *codepoints, size_t n_codepoints)
Convert codepoints into a UTF-8 string, using the shortest possible encoding.
 
constexpr bool unicode_is_private(uint32_t u)
Return whether u is in one of the 3 private use areas of Unicode.
 
size_t utf8len(const char *str)
Count valid UTF-8 sequences, invalid sequences are counted as Latin-1 characters.
 
constexpr bool unicode_is_noncharacter(uint32_t u)
Return whether u is one of the 66 Unicode noncharacters.
 
size_t utf8_to_unicode(const char *str, uint32_t *codepoints)
 
constexpr bool unicode_is_character(uint32_t u)
Return whether u is not one of the 66 Unicode noncharacters.
 
std::string decodefs(const std::string &utf8str)
Decode UTF-8 string back into file system path representation, extracting surrogate code points as by...
 
constexpr bool unicode_is_valid(uint32_t u)
Return whether u is an allowed Unicode codepoint within 0x10FFFF and not part of a UTF-16 surrogate p...
 
std::string displayfs(const std::string &utf8str)
Convert UTF-8 encoded file system path into human readable display format, the conversion is lossy bu...
 
constexpr bool unicode_is_assigned(uint32_t u)
Return whether u matches any of the assigned Unicode planes.
 
constexpr bool unicode_is_control_code(uint32_t u)
Return whether u is one of the 65 Unicode control codes.
 
std::string encodefs(const std::string &fschars)
Encode a file system path consisting of bytes into UTF-8, using surrogate code points to store non UT...
 
String string_to_ncname(const String &input, uint32_t substitute)
 
std::vector< uint32_t > utf8decode(const std::string &utf8str)
Convert valid UTF-8 sequences to Unicode codepoints, invalid sequences are treated as Latin-1 charact...
 
#define TASSERT(cond)
Unconditional test assertion, enters breakpoint if not fulfilled.
 
#define TCMP(a, cmp, b)
Compare a and b according to operator cmp, verbose on failure.