Anklang-0.3.0.dev956+gd75ac925 anklang-0.3.0.dev956+gd75ac925
ASE — Anklang Sound Engine (C++)

« « « Anklang Documentation
Loading...
Searching...
No Matches
unicode.hh
Go to the documentation of this file.
1 // This Source Code Form is licensed MPL-2.0: http://mozilla.org/MPL/2.0
2#pragma once
3
4#include <ase/cxxaux.hh>
5
6namespace Ase {
7
9std::string utf8encode (const uint32_t *codepoints, size_t n_codepoints);
10String string_to_ncname (const String &input, uint32_t substitute = 0);
11bool string_is_ncname (const String &input);
12size_t utf8_to_unicode (const std::string &str, std::vector<uint32_t> &codepoints);
13size_t utf8_to_unicode (const char *str, uint32_t *codepoints);
14size_t utf8len (const std::string &str);
15size_t utf8len (const char *str);
// Forward declarations of the constexpr predicates implemented in the "Implementations" section below.
/// Return whether u is an allowed Unicode codepoint within 0x10FFFF and not part of a UTF-16 surrogate pair range.
constexpr inline bool unicode_is_valid (uint32_t u);
/// Return whether u matches any of the assigned Unicode planes.
constexpr inline bool unicode_is_assigned (uint32_t u);
/// Return whether u is one of the 66 Unicode noncharacters.
constexpr inline bool unicode_is_noncharacter (uint32_t u);
/// Return whether u is not one of the 66 Unicode noncharacters.
constexpr inline bool unicode_is_character (uint32_t u);
/// Return whether u is one of the 65 Unicode control codes.
constexpr inline bool unicode_is_control_code (uint32_t u);
/// Return whether u is in one of the 3 private use areas of Unicode.
constexpr inline bool unicode_is_private (uint32_t u);

/// Largest codepoint value; unicode_is_valid() uses the same 0x10FFFF upper bound.
constexpr uint32_t unicode_last_codepoint = 0x10FFFF;

/// Encode a file system path consisting of bytes into UTF-8, using surrogate code points.
/// NOTE(review): the generated description is truncated after "to store non UT..."; presumably the
/// surrogates carry bytes that are not valid UTF-8 - confirm in unicode.cc.
std::string encodefs (const std::string &fschars);

/// Decode UTF-8 string back into file system path representation.
/// NOTE(review): the generated description continues "extracting surrogate code points as by..." and
/// is truncated there; presumably this is the inverse of encodefs() - confirm in unicode.cc.
std::string decodefs (const std::string &utf8str);

/// Convert UTF-8 encoded file system path into human readable display format, the conversion is lossy.
std::string displayfs (const std::string &utf8str);
27
28
29// == Implementations ==
/// Return whether u matches any of the assigned Unicode planes.
/// The test is a coarse range check against the block ranges listed below; it does not consult
/// per-codepoint assignment data. Surrogates (0xD800..0xDFFF) and values above 0x10FFFF yield false.
/// @param u  Codepoint value to classify.
/// @return   true if `u` falls into one of the listed ranges.
constexpr inline bool
unicode_is_assigned (uint32_t u)
{
  const bool assigned =
    (/*u >= 0x00 &&*/ u <= 0xD7FF) ||       // BMP - Basic Multilingual Plane (below surrogates at 0xD800)
    (u >= 0xE000 && u <= 0xFFFF) ||         // BMP - Basic Multilingual Plane (above surrogates at 0xDFFF)
    (u >= 0x10000 && u <= 0x14FFF) ||       // SMP - Supplementary Multilingual Plane
    (u >= 0x16000 && u <= 0x18FFF) ||       // SMP - Supplementary Multilingual Plane
    (u >= 0x1B000 && u <= 0x1BFFF) ||       // SMP - Supplementary Multilingual Plane
    (u >= 0x1D000 && u <= 0x1FFFF) ||       // SMP - Supplementary Multilingual Plane
    (u >= 0x20000 && u <= 0x2FFFF) ||       // SIP - Supplementary Ideographic Plane
    (u >= 0xE0000 && u <= 0xE0FFF) ||       // SSP - Supplementary Special-purpose Plane
    (u >= 0xF0000 && u <= 0xFFFFF) ||       // SPUA-A - Supplementary Private Use Area Plane
    (u >= 0x100000 && u <= 0x10FFFF);       // SPUA-B - Supplementary Private Use Area Plane
  // Branch hint only: callers are expected to pass assigned codepoints most of the time.
  return __builtin_expect (assigned, true);
}
47
/// Return whether u is an allowed Unicode codepoint within 0x10FFFF and not part of a UTF-16 surrogate pair range.
/// @param u  Codepoint value to check.
/// @return   true for 0..0x10FFFF excluding 0xD800..0xDFFF.
constexpr inline bool
unicode_is_valid (uint32_t u)
{
  // Mask 0x1FF800 keeps bits 11..20; those bits equal 0xD800 exactly for the surrogate
  // range 0xD800..0xDFFF once u is known to be <= 0x10FFFF.
  const bool valid = u <= 0x10FFFF && (u & 0x1FF800) != 0xD800;
  // Branch hint only: valid input is the expected case.
  return __builtin_expect (valid, true);
}
55
/// Return whether u is one of the 66 Unicode noncharacters.
/// Matches 0xFDD0..0xFDEF plus every value whose low 16 bits are 0xFFFE or 0xFFFF.
/// The low-16-bit test is not limited to 0x10FFFF, so combine with unicode_is_valid()
/// when out-of-range input is possible.
/// @param u  Codepoint value to check.
constexpr inline bool
unicode_is_noncharacter (uint32_t u)
{
  const bool noncharacter = (u >= 0xFDD0 && u <= 0xFDEF) || (u & 0xFFFE) == 0xFFFE;
  // Branch hint only: noncharacters are expected to be rare.
  return __builtin_expect (noncharacter, false);
}
63
65constexpr inline bool
67{
68 return __builtin_expect (!unicode_is_noncharacter (u), true);
69}
70
/// Return whether u is one of the 65 Unicode control codes.
/// Covers 0x00..0x1F (32 codes) and 0x7F..0x9F (33 codes).
/// @param u  Codepoint value to check.
constexpr inline bool
unicode_is_control_code (uint32_t u)
{
  const bool control = (/*u >= 0x00 &&*/ u <= 0x1F) || (u >= 0x7F && u <= 0x9f);
  // Branch hint only: control codes are expected to be rare.
  return __builtin_expect (control, false);
}
78
/// Return whether u is in one of the 3 private use areas of Unicode.
/// Ranges: 0xE000..0xF8FF, 0xF0000..0xFFFFD and 0x100000..0x10FFFD; the two trailing
/// values of planes 15 and 16 (0x..FFFE, 0x..FFFF) are excluded.
/// @param u  Codepoint value to check.
constexpr inline bool
unicode_is_private (uint32_t u)
{
  const bool priv = (u >= 0xE000 && u <= 0xF8FF) || (u >= 0xF0000 && u <= 0xFFFFD) || (u >= 0x100000 && u <= 0x10FFFD);
  // Branch hint only: private-use codepoints are expected to be rare.
  return __builtin_expect (priv, false);
}
86
87} // Ase
88
The Anklang C++ API namespace.
Definition api.hh:8
bool string_is_ncname(const String &input)
Check input to be a NCName, according to the QName EBNF.
Definition unicode.cc:325
std::string utf8encode(const uint32_t *codepoints, size_t n_codepoints)
Convert codepoints into an UTF-8 string, using the shortest possible encoding.
Definition unicode.cc:249
constexpr bool unicode_is_private(uint32_t u)
Return whether u is in one of the 3 private use areas of Unicode.
Definition unicode.hh:81
size_t utf8len(const char *str)
Count valid UTF-8 sequences, invalid sequences are counted as Latin-1 characters.
Definition unicode.cc:184
constexpr bool unicode_is_noncharacter(uint32_t u)
Return whether u is one of the 66 Unicode noncharacters.
Definition unicode.hh:58
size_t utf8_to_unicode(const char *str, uint32_t *codepoints)
Convert valid UTF-8 sequences to Unicode codepoints, invalid sequences are treated as Latin-1 charact...
Definition unicode.cc:221
constexpr bool unicode_is_character(uint32_t u)
Return whether u is not one of the 66 Unicode noncharacters.
Definition unicode.hh:66
std::string decodefs(const std::string &utf8str)
Decode UTF-8 string back into file system path representation, extracting surrogate code points as by...
Definition unicode.cc:131
constexpr bool unicode_is_valid(uint32_t u)
Return whether u is an allowed Unicode codepoint within 0x10FFFF and not part of a UTF-16 surrogate p...
Definition unicode.hh:50
std::string displayfs(const std::string &utf8str)
Convert UTF-8 encoded file system path into human readable display format, the conversion is lossy bu...
Definition unicode.cc:150
std::string String
Convenience alias for std::string.
Definition cxxaux.hh:34
constexpr bool unicode_is_assigned(uint32_t u)
Return whether u matches any of the assigned Unicode planes.
Definition unicode.hh:32
constexpr bool unicode_is_control_code(uint32_t u)
Return whether u is one of the 65 Unicode control codes.
Definition unicode.hh:73
std::string encodefs(const std::string &fschars)
Encode a file system path consisting of bytes into UTF-8, using surrogate code points to store non UT...
Definition unicode.cc:112
String string_to_ncname(const String &input, uint32_t substitute)
Convert input to a NCName, according to the QName EBNF.
Definition unicode.cc:339
std::vector< uint32_t > utf8decode(const std::string &utf8str)
Convert valid UTF-8 sequences to Unicode codepoints, invalid sequences are treated as Latin-1 charact...
Definition unicode.cc:208
typedef uint32_t