2#ifndef __ASE_UNICODE_HH__
3#define __ASE_UNICODE_HH__
16size_t utf8len (
const char *str);
23constexpr uint32_t unicode_last_codepoint = 0x10FFFF;
37 (u >= 0xE000 && u <= 0xFFFF) ||
38 (u >= 0x10000 && u <= 0x14FFF) ||
39 (u >= 0x16000 && u <= 0x18FFF) ||
40 (u >= 0x1B000 && u <= 0x1BFFF) ||
41 (u >= 0x1D000 && u <= 0x1FFFF) ||
42 (u >= 0x20000 && u <= 0x2FFFF) ||
43 (u >= 0xE0000 && u <= 0xE0FFF) ||
44 (u >= 0xF0000 && u <= 0xFFFFF) ||
45 (u >= 0x100000 && u <= 0x10FFFF);
46 return __builtin_expect (assigned,
true);
53 const bool valid = u <= 0x10FFFF && (u & 0x1FF800) != 0xD800;
54 return __builtin_expect (valid,
true);
61 const bool noncharacter = (u >= 0xFDD0 && u <= 0xFDEF) || (u & 0xFFFE) == 0xFFFE;
62 return __builtin_expect (noncharacter,
false);
76 const bool control = ( u <= 0x1F) || (u >= 0x7F && u <= 0x9f);
77 return __builtin_expect (control,
false);
84 const bool priv = (u >= 0xE000 && u <= 0xF8FF) || (u >= 0xF0000 && u <= 0xFFFFD) || (u >= 0x100000 && u <= 0x10FFFD);
85 return __builtin_expect (priv,
false);
The Anklang C++ API namespace.
bool string_is_ncname(const String &input)
std::string utf8encode(const uint32_t *codepoints, size_t n_codepoints)
Convert codepoints into a UTF-8 string, using the shortest possible encoding.
constexpr bool unicode_is_private(uint32_t u)
Return whether u is in one of the 3 private use areas of Unicode.
size_t utf8len(const char *str)
Count valid UTF-8 sequences; invalid sequences are counted as Latin-1 characters.
constexpr bool unicode_is_noncharacter(uint32_t u)
Return whether u is one of the 66 Unicode noncharacters.
size_t utf8_to_unicode(const char *str, uint32_t *codepoints)
constexpr bool unicode_is_character(uint32_t u)
Return whether u is not one of the 66 Unicode noncharacters.
std::string decodefs(const std::string &utf8str)
Decode UTF-8 string back into file system path representation, extracting surrogate code points as by...
constexpr bool unicode_is_valid(uint32_t u)
Return whether u is an allowed Unicode codepoint within 0x10FFFF and not part of a UTF-16 surrogate p...
std::string displayfs(const std::string &utf8str)
Convert UTF-8 encoded file system path into human readable display format, the conversion is lossy bu...
std::string String
Convenience alias for std::string.
constexpr bool unicode_is_assigned(uint32_t u)
Return whether u matches any of the assigned Unicode planes.
constexpr bool unicode_is_control_code(uint32_t u)
Return whether u is one of the 65 Unicode control codes.
std::string encodefs(const std::string &fschars)
Encode a file system path consisting of bytes into UTF-8, using surrogate code points to store non UT...
String string_to_ncname(const String &input, uint32_t substitute)
std::vector< uint32_t > utf8decode(const std::string &utf8str)
Convert valid UTF-8 sequences to Unicode codepoints; invalid sequences are treated as Latin-1 charact...