Anklang 0.3.0-460-gc4ef46ba
ASE — Anklang Sound Engine (C++)

« « « Anklang Documentation
Loading...
Searching...
No Matches
regex.cc
Go to the documentation of this file.
1 // This Source Code Form is licensed MPL-2.0: http://mozilla.org/MPL/2.0
2#include "regex.hh"
3#include "logging.hh"
4#include "internal.hh"
5#include <cstring>
6
7#define PCRE2_CODE_UNIT_WIDTH 8
8#include <pcre2.h>
9
10namespace Ase {
11
12static pcre2_compile_context*
13pcre2compilecontext ()
14{
15 static pcre2_compile_context *ccontext = [] {
16 pcre2_compile_context *ccontext = pcre2_compile_context_create (nullptr);
17 pcre2_set_compile_extra_options (ccontext, PCRE2_EXTRA_ALT_BSUX); // \u{abcdef} (ECMAScript 6)
18 pcre2_set_bsr (ccontext, PCRE2_BSR_UNICODE);
19 pcre2_set_newline (ccontext, PCRE2_NEWLINE_ANY);
20 return ccontext;
21 // pcre2_compile_context_free (ccontext);
22 } ();
23 return ccontext;
24}
25
26static uint32_t
27flags_to_pcre2_compile_options (Re::Flags flags)
28{
29 uint32_t o =
30 0 // use PCRE2_NO_UTF_CHECK if regex is validated
31 | PCRE2_UTF // UTF-8 Unicode mode
32 | PCRE2_UCP // Unicode properties for \d \s \w
33 | (flags & Re::I ? PCRE2_CASELESS : 0)
34 | (flags & Re::M ? PCRE2_MULTILINE : 0)
35 | (flags & Re::N ? PCRE2_NO_AUTO_CAPTURE : 0)
36 | (flags & Re::S ? PCRE2_CASELESS : 0)
37 | (flags & Re::X ? PCRE2_EXTENDED : 0) // allows #comments\n
38 | (flags & Re::XX ? PCRE2_EXTENDED_MORE : 0)
39 | (flags & Re::J ? PCRE2_DUPNAMES : 0)
40 | (flags & Re::U ? PCRE2_UNGREEDY : 0)
41 | PCRE2_ALT_BSUX // allow \x22 \u4444
42 | PCRE2_NEVER_BACKSLASH_C; // prevent matching point in the middle of UTF-8
43 return o;
44}
45
46struct PcRe2 {
47 pcre2_code *prcode = nullptr;
48 int errorcode = 0;
49 explicit
50 PcRe2 (const std::string &pattern, Re::Flags flags)
51 {
52 pcre2_compile_context *const ccontext = pcre2compilecontext();
53 size_t erroroffset = -1;
54 prcode = pcre2_compile ((const uint8_t*) pattern.c_str(), PCRE2_ZERO_TERMINATED, flags_to_pcre2_compile_options (flags), &errorcode, &erroroffset, ccontext);
55 if (!prcode)
56 log ("Re: failed to compile regex, error=%d: %s", errorcode, pattern);
57 }
58 ~PcRe2()
59 {
60 pcre2_code_free (prcode);
61 }
63 search (const std::string &input)
64 {
65 pcre2_match_data *md = pcre2_match_data_create_from_pattern (prcode, nullptr);
66 const uint32_t MATCH_OPTIONS =
67 0; // PCRE2_ANCHORED PCRE2_ENDANCHORED PCRE2_NOTEMPTY etc
68 const int ret = pcre2_match (prcode, (const uint8_t*) input.c_str(), PCRE2_ZERO_TERMINATED, 0 /*startoffset*/, MATCH_OPTIONS, md, nullptr);
69 ssize_t start = -1;
70 if (ret >= 0) {
71 const uint32_t ovecs = pcre2_get_ovector_count (md);
72 if (ovecs > 0) {
73 const size_t *ovector = pcre2_get_ovector_pointer (md);
74 start = ovector[0];
75 }
76 }
77 pcre2_match_data_free (md); md = nullptr;
78 return start;
79 }
81 grep (const String &input, int group)
82 {
83 pcre2_match_data *md = pcre2_match_data_create_from_pattern (prcode, nullptr);
84 const uint32_t MATCH_OPTIONS =
85 0; // PCRE2_ANCHORED PCRE2_ENDANCHORED PCRE2_NOTEMPTY etc
86 const int ret = pcre2_match (prcode, (const uint8_t*) input.c_str(), PCRE2_ZERO_TERMINATED, 0 /*startoffset*/, MATCH_OPTIONS, md, nullptr);
87 std::string result;
88 if (ret >= 0) {
89 const uint32_t ovecs = pcre2_get_ovector_count (md);
90 if (group < 0)
91 group = uint (-group) < ovecs ? uint (-group) : 0;
92 if (group < ovecs) {
93 const size_t *ovector = pcre2_get_ovector_pointer (md);
94 const size_t start = ovector[group*2], end = ovector[group*2+1];
95 result.assign (&input[0] + start, &input[0] + end);
96 }
97 }
98 pcre2_match_data_free (md); md = nullptr;
99 return result;
100 }
102 findall (const String &input_string)
103 {
105 pcre2_match_data *md = pcre2_match_data_create_from_pattern (prcode, nullptr);
106 const uint32_t MATCH_OPTIONS =
107 0; // PCRE2_ANCHORED PCRE2_ENDANCHORED PCRE2_NOTEMPTY etc
108 const uint8_t *input = (const uint8_t*) input_string.c_str();
109 const size_t input_length = strlen (input_string.c_str());
110 int ret = pcre2_match (prcode, input, input_length, 0 /*startoffset*/, MATCH_OPTIONS, md, nullptr);
111 size_t *ovector = ret <= 0 ? nullptr : pcre2_get_ovector_pointer (md);
112 // guard against patterns such as /(?=.\K)/ that use \K to set match start>end, see pcre2pattern(3)
113 if (ret < 1 || ovector[0] > ovector[1]) {
114 errorcode = ret < 0 ? ret : ret == 0 ? PCRE2_ERROR_NOMEMORY : PCRE2_ERROR_BACKSLASH_K_IN_LOOKAROUND;
115 if (ret != PCRE2_ERROR_NOMATCH)
116 log ("Re: findall matching error, error=%d", errorcode);
117 pcre2_match_data_free (md); md = nullptr;
118 return result;
119 }
120 result.push_back (std::string (input + ovector[0], input + ovector[1]));
121 uint32_t bits = 0;
122 pcre2_pattern_info (prcode, PCRE2_INFO_ALLOPTIONS, &bits);
123 const bool UTF8 = bits & PCRE2_UTF;
124 pcre2_pattern_info (prcode, PCRE2_INFO_NEWLINE, &bits);
125 const bool CRLF_IS_NEWLINE = bits == PCRE2_NEWLINE_ANY || bits == PCRE2_NEWLINE_CRLF || bits == PCRE2_NEWLINE_ANYCRLF;
126 while (ret >= 1)
127 {
128 PCRE2_SIZE start_offset = ovector[1]; // start at end of previous match
129 uint32_t options = 0;
130 if (ovector[0] == ovector[1]) { // previous match was for an empty string
131 if (ovector[0] == input_length)
132 break; // end of input
133 options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
134 } else { // previous match was non-empty
135 // handle \K within a lookbehind assertion at the start, see: https://www.pcre.org/current/doc/html/pcre2demo.html
136 const auto startchar = pcre2_get_startchar (md);
137 if (start_offset <= startchar) {
138 if (startchar >= input_length)
139 break; // end of input
140 start_offset = startchar + 1; // advance one code unit
141 for (; UTF8 && start_offset < input_length; start_offset++)
142 if ((input[start_offset] & 0xc0) != 0x80)
143 break; // complete UTF8 code unit
144 }
145 }
146 // try next match
147 ret = pcre2_match (prcode, input, input_length, start_offset, options, md, nullptr);
148 // advance in case we need to keep searching after empty string match
149 if (ret == PCRE2_ERROR_NOMATCH) {
150 if (options == 0)
151 break; // all matches found
152 ovector[1] = start_offset + 1; // advance one code unit
153 if (CRLF_IS_NEWLINE && // if CRLF is a newline &
154 start_offset < input_length - 1 && // we are at CRLF
155 input[start_offset] == '\r' &&
156 input[start_offset + 1] == '\n')
157 ovector[1] += 1; // skip over CR and LF
158 else if (UTF8) {
159 while (ovector[1] < input_length) {
160 if ((input[ovector[1]] & 0xc0) != 0x80)
161 break; // complete UTF8 code unit
162 ovector[1] += 1;
163 }
164 }
165 continue; // retry
166 }
167 // report match errors
168 if (ret < 1 || ovector[0] > ovector[1]) {
169 // guard against patterns such as /(?=.\K)/ that use \K to set match start>end, see pcre2pattern(3)
170 errorcode = ret < 0 ? ret : ret == 0 ? PCRE2_ERROR_NOMEMORY : PCRE2_ERROR_BACKSLASH_K_IN_LOOKAROUND;
171 log ("Re: findall matching error, error=%d", errorcode);
172 break;
173 }
174 // collect matched substring
175 result.push_back (std::string (input + ovector[0], input + ovector[1]));
176 }
177 pcre2_match_data_free (md); md = nullptr;
178 return result;
179 }
181 sub (const std::string &substitution, const std::string &input, ssize_t maxsubst = SSIZE_MAX)
182 {
183 pcre2_match_data *md = pcre2_match_data_create_from_pattern (prcode, nullptr);
184 pcre2_match_context *mc = pcre2_match_context_create (nullptr);
185 const uint32_t MATCH_OPTIONS =
186 PCRE2_SUBSTITUTE_OVERFLOW_LENGTH |
187 PCRE2_SUBSTITUTE_GLOBAL |
188 0; // PCRE2_ANCHORED PCRE2_ENDANCHORED PCRE2_NOTEMPTY etc
189 struct CalloutData {
190 ssize_t max_substitutions = SSIZE_MAX;
191 } callout_data;
192 callout_data.max_substitutions = maxsubst;
193 auto callout_function = [] (pcre2_substitute_callout_block*, void *callout_data_ptr) -> int {
194 CalloutData &callout_data = *(CalloutData*) callout_data_ptr;
195 return callout_data.max_substitutions-- >= 1 ? 0 : -1;
196 };
197 if (callout_data.max_substitutions < SSIZE_MAX)
198 pcre2_set_substitute_callout (mc, callout_function, &callout_data);
199 std::string result (input.size() + 4096, 0);
200 PCRE2_SIZE outlength = result.size() - 1;
201 int ret = pcre2_substitute (prcode, (const uint8_t*) input.c_str(), PCRE2_ZERO_TERMINATED, 0 /*startoffset*/, MATCH_OPTIONS, md, mc,
202 (const uint8_t*) substitution.c_str(), PCRE2_ZERO_TERMINATED, (uint8_t*) result.data(), &outlength);
203 if (ret == PCRE2_ERROR_NOMEMORY) {
204 result.resize (outlength + 128);
205 ret = pcre2_substitute (prcode, (const uint8_t*) input.c_str(), PCRE2_ZERO_TERMINATED, 0 /*startoffset*/, MATCH_OPTIONS, md, mc,
206 (const uint8_t*) substitution.c_str(), PCRE2_ZERO_TERMINATED, (uint8_t*) result.data(), &outlength);
207 }
208 result.resize (strlen (result.data()));
209 pcre2_match_data_free (md); md = nullptr;
210 pcre2_match_context_free (mc); mc = nullptr;
211 return result;
212 }
213};
214
217Re::search (const String &regex, const String &input, Flags flags)
218{
219 PcRe2 rx (regex, flags);
220 return rx.search (input);
221}
222
224String
225Re::grep (const String &regex, const String &input, int group, Flags flags)
226{
227 PcRe2 rx (regex, flags);
228 return rx.grep (input, group);
229}
230
233Re::findall (const String &regex, const String &input, Flags flags)
234{
235 PcRe2 rx (regex, flags);
236 return rx.findall (input);
237}
238
240String
241Re::sub (const String &regex, const String &subst, const String &input, uint count, Flags flags)
242{
243 PcRe2 rx (regex, flags);
244 return rx.sub (subst, input, count);
245}
246
248String
249Re::sub (const String &regex, const String &subst, const String &input, Flags flags)
250{
251 PcRe2 rx (regex, flags);
252 return rx.sub (subst, input);
253}
254
255} // Ase
256
257#include "testing.hh"
258
259namespace { // Anon
260using namespace Ase;
261
262TEST_INTEGRITY (regex_tests);
263static void
264regex_tests()
265{
266 ssize_t k;
267 k = Re::search ("fail", "abc abc"); TCMP (k, ==, -1);
268 k = Re::search (R"(\bb)", "abc bbc"); TCMP (k, ==, 4);
269 k = Re::search (R"(\d\d?\b)", "a123 b"); TCMP (k, ==, 2);
270 String u, v;
271 StringS ss;
272 u = "a1 b2 c3 d4"; v = Re::grep ("(\\w+) *(\\w+) *(\\w+)", u, -2); TCMP (v, ==, "b2");
273 u = "abc abc abc Abc"; v = Re::sub ("xyz", "ABC", u); TCMP (v, ==, "abc abc abc Abc");
274 u = "abc abc abc Abc"; v = Re::sub ("xyz", "ABC", u, 2); TCMP (v, ==, "abc abc abc Abc");
275 u = "abc abc abc Abc"; v = Re::sub ("abc", "ABC", u); TCMP (v, ==, "ABC ABC ABC Abc");
276 u = "abc abc abc Abc"; v = Re::sub ("abc", "ABC", u, 2); TCMP (v, ==, "ABC ABC abc Abc");
277 u = "abc abc abc Abc"; v = Re::sub ("abc", "ABC", u, 999); TCMP (v, ==, "ABC ABC ABC Abc");
278 u = "abc abc abc Abc"; v = Re::sub ("abc", "ABC", u, 4, Re::I); TCMP (v, ==, "ABC ABC ABC ABC");
279 u = "abc abc abc Abc"; v = Re::sub (R"(\bA)", "-", u); TCMP (v, ==, "abc abc abc -bc");
280 u = "abc abc abc Abc"; v = Re::sub (R"(\ba)", "-", u, 1); TCMP (v, ==, "-bc abc abc Abc");
281 u = "abc abc abc Abc"; v = Re::sub (R"(\bA\b)", "-", u); TCMP (v, ==, "abc abc abc Abc");
282 u = "a 1 0 2 b 3n 4 Z"; v = Re::sub (R"(([a-zA-Z]) ([0-9]+\b))", "$1$2", u); TCMP (v, ==, "a1 0 2 b 3n4 Z");
283 u = "abc 123 abc Abc"; ss = Re::findall (R"(\b\w)", u); TCMP (ss, ==, cstrings_to_vector ("a", "1", "a", "A", nullptr));
284 u = "abc ABC aBc Abc"; ss = Re::findall ("abc", u, Re::I); TCMP (ss, ==, cstrings_to_vector ("abc", "ABC", "aBc", "Abc", nullptr));
285 u = "a0bcd a1BC xa2bc a3cb"; ss = Re::findall ("a\\d(?=bc)", u); TCMP (ss, ==, cstrings_to_vector ("a0", "a2", nullptr));
286}
287
288} // Anon
T assign(T... args)
T c_str(T... args)
Wrapper for std::regex to simplify usage and reduce compilation time.
Definition regex.hh:10
static String sub(const String &regex, const String &subst, const String &input, Flags=DEFAULT)
Substitute regex in input by sbref with backreferences $00…$99 or $&.
Definition regex.cc:249
static String grep(const String &regex, const String &input, int group=0, Flags=DEFAULT)
Find regex in input and return matching string.
Definition regex.cc:225
static StringS findall(const String &regex, const String &input, Flags=DEFAULT)
Find regex in input and return non-overlapping matches.
Definition regex.cc:233
static ssize_t search(const String &regex, const String &input, Flags=DEFAULT)
Find regex in input and return match position >= 0 or return < 0 otherwise.
Definition regex.cc:217
T data(T... args)
#define TEST_INTEGRITY(FUNC)
Register func as an integrity test.
Definition internal.hh:77
#define SSIZE_MAX
log
The Anklang C++ API namespace.
Definition api.hh:9
StringS cstrings_to_vector(const char *s,...)
Construct a StringS from a NULL terminated list of string arguments.
Definition strings.cc:1265
uint32_t uint
Provide 'uint' as convenience type.
Definition cxxaux.hh:18
T resize(T... args)
T size(T... args)
typedef uint32_t
strlen
typedef ssize_t
#define TCMP(a, cmp, b)
Compare a and b according to operator cmp, verbose on failure.
Definition testing.hh:23