zig/lib/libcxx/src/regex.cpp at master

  1//===----------------------------------------------------------------------===//
  2//
  3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4// See https://llvm.org/LICENSE.txt for license information.
  5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6//
  7//===----------------------------------------------------------------------===//
  8
  9#include <algorithm>
 10#include <iterator>
 11#include <regex>
 12
 13_LIBCPP_BEGIN_NAMESPACE_STD
 14
 15static const char* make_error_type_string(regex_constants::error_type ecode) {
 16  switch (ecode) {
 17  case regex_constants::error_collate:
 18    return "The expression contained an invalid collating element name.";
 19  case regex_constants::error_ctype:
 20    return "The expression contained an invalid character class name.";
 21  case regex_constants::error_escape:
 22    return "The expression contained an invalid escaped character, or a "
 23           "trailing escape.";
 24  case regex_constants::error_backref:
 25    return "The expression contained an invalid back reference.";
 26  case regex_constants::error_brack:
 27    return "The expression contained mismatched [ and ].";
 28  case regex_constants::error_paren:
 29    return "The expression contained mismatched ( and ).";
 30  case regex_constants::error_brace:
 31    return "The expression contained mismatched { and }.";
 32  case regex_constants::error_badbrace:
 33    return "The expression contained an invalid range in a {} expression.";
 34  case regex_constants::error_range:
 35    return "The expression contained an invalid character range, "
 36           "such as [b-a] in most encodings.";
 37  case regex_constants::error_space:
 38    return "There was insufficient memory to convert the expression into "
 39           "a finite state machine.";
 40  case regex_constants::error_badrepeat:
 41    return "One of *?+{ was not preceded by a valid regular expression.";
 42  case regex_constants::error_complexity:
 43    return "The complexity of an attempted match against a regular "
 44           "expression exceeded a pre-set level.";
 45  case regex_constants::error_stack:
 46    return "There was insufficient memory to determine whether the regular "
 47           "expression could match the specified character sequence.";
 48  case regex_constants::__re_err_grammar:
 49    return "An invalid regex grammar has been requested.";
 50  case regex_constants::__re_err_empty:
 51    return "An empty regex is not allowed in the POSIX grammar.";
 52  case regex_constants::__re_err_parse:
 53    return "The parser did not consume the entire regular expression.";
 54  default:
 55    break;
 56  }
 57  return "Unknown error type";
 58}
 59
 60regex_error::regex_error(regex_constants::error_type ecode)
 61    : runtime_error(make_error_type_string(ecode)), __code_(ecode) {}
 62
 63regex_error::~regex_error() throw() {}
 64
 65namespace {
 66
 67struct collationnames {
 68  const char* elem_;
 69  char char_;
 70};
 71
 72#if defined(__MVS__) && !defined(__NATIVE_ASCII_F)
 73// EBCDIC IBM-1047
 74// Sorted via the EBCDIC collating sequence
 75const collationnames collatenames[] = {
 76    {"a", 0x81},
 77    {"alert", 0x2f},
 78    {"ampersand", 0x50},
 79    {"apostrophe", 0x7d},
 80    {"asterisk", 0x5c},
 81    {"b", 0x82},
 82    {"backslash", 0xe0},
 83    {"backspace", 0x16},
 84    {"c", 0x83},
 85    {"carriage-return", 0xd},
 86    {"circumflex", 0x5f},
 87    {"circumflex-accent", 0x5f},
 88    {"colon", 0x7a},
 89    {"comma", 0x6b},
 90    {"commercial-at", 0x7c},
 91    {"d", 0x84},
 92    {"dollar-sign", 0x5b},
 93    {"e", 0x85},
 94    {"eight", 0xf8},
 95    {"equals-sign", 0x7e},
 96    {"exclamation-mark", 0x5a},
 97    {"f", 0x86},
 98    {"five", 0xf5},
 99    {"form-feed", 0xc},
100    {"four", 0xf4},
101    {"full-stop", 0x4b},
102    {"g", 0x87},
103    {"grave-accent", 0x79},
104    {"greater-than-sign", 0x6e},
105    {"h", 0x88},
106    {"hyphen", 0x60},
107    {"hyphen-minus", 0x60},
108    {"i", 0x89},
109    {"j", 0x91},
110    {"k", 0x92},
111    {"l", 0x93},
112    {"left-brace", 0xc0},
113    {"left-curly-bracket", 0xc0},
114    {"left-parenthesis", 0x4d},
115    {"left-square-bracket", 0xad},
116    {"less-than-sign", 0x4c},
117    {"low-line", 0x6d},
118    {"m", 0x94},
119    {"n", 0x95},
120    {"newline", 0x15},
121    {"nine", 0xf9},
122    {"number-sign", 0x7b},
123    {"o", 0x96},
124    {"one", 0xf1},
125    {"p", 0x97},
126    {"percent-sign", 0x6c},
127    {"period", 0x4b},
128    {"plus-sign", 0x4e},
129    {"q", 0x98},
130    {"question-mark", 0x6f},
131    {"quotation-mark", 0x7f},
132    {"r", 0x99},
133    {"reverse-solidus", 0xe0},
134    {"right-brace", 0xd0},
135    {"right-curly-bracket", 0xd0},
136    {"right-parenthesis", 0x5d},
137    {"right-square-bracket", 0xbd},
138    {"s", 0xa2},
139    {"semicolon", 0x5e},
140    {"seven", 0xf7},
141    {"six", 0xf6},
142    {"slash", 0x61},
143    {"solidus", 0x61},
144    {"space", 0x40},
145    {"t", 0xa3},
146    {"tab", 0x5},
147    {"three", 0xf3},
148    {"tilde", 0xa1},
149    {"two", 0xf2},
150    {"u", 0xa4},
151    {"underscore", 0x6d},
152    {"v", 0xa5},
153    {"vertical-line", 0x4f},
154    {"vertical-tab", 0xb},
155    {"w", 0xa6},
156    {"x", 0xa7},
157    {"y", 0xa8},
158    {"z", 0xa9},
159    {"zero", 0xf0},
160    {"A", 0xc1},
161    {"B", 0xc2},
162    {"C", 0xc3},
163    {"D", 0xc4},
164    {"E", 0xc5},
165    {"F", 0xc6},
166    {"G", 0xc7},
167    {"H", 0xc8},
168    {"I", 0xc9},
169    {"J", 0xd1},
170    {"K", 0xd2},
171    {"L", 0xd3},
172    {"M", 0xd4},
173    {"N", 0xd5},
174    {"NUL", 0},
175    {"O", 0xd6},
176    {"P", 0xd7},
177    {"Q", 0xd8},
178    {"R", 0xd9},
179    {"S", 0xe2},
180    {"T", 0xe3},
181    {"U", 0xe4},
182    {"V", 0xe5},
183    {"W", 0xe6},
184    {"X", 0xe7},
185    {"Y", 0xe8},
186    {"Z", 0xe9}};
187#else
188// ASCII
189const collationnames collatenames[] = {
190    {"A", 0x41},
191    {"B", 0x42},
192    {"C", 0x43},
193    {"D", 0x44},
194    {"E", 0x45},
195    {"F", 0x46},
196    {"G", 0x47},
197    {"H", 0x48},
198    {"I", 0x49},
199    {"J", 0x4a},
200    {"K", 0x4b},
201    {"L", 0x4c},
202    {"M", 0x4d},
203    {"N", 0x4e},
204    {"NUL", 0x00},
205    {"O", 0x4f},
206    {"P", 0x50},
207    {"Q", 0x51},
208    {"R", 0x52},
209    {"S", 0x53},
210    {"T", 0x54},
211    {"U", 0x55},
212    {"V", 0x56},
213    {"W", 0x57},
214    {"X", 0x58},
215    {"Y", 0x59},
216    {"Z", 0x5a},
217    {"a", 0x61},
218    {"alert", 0x07},
219    {"ampersand", 0x26},
220    {"apostrophe", 0x27},
221    {"asterisk", 0x2a},
222    {"b", 0x62},
223    {"backslash", 0x5c},
224    {"backspace", 0x08},
225    {"c", 0x63},
226    {"carriage-return", 0x0d},
227    {"circumflex", 0x5e},
228    {"circumflex-accent", 0x5e},
229    {"colon", 0x3a},
230    {"comma", 0x2c},
231    {"commercial-at", 0x40},
232    {"d", 0x64},
233    {"dollar-sign", 0x24},
234    {"e", 0x65},
235    {"eight", 0x38},
236    {"equals-sign", 0x3d},
237    {"exclamation-mark", 0x21},
238    {"f", 0x66},
239    {"five", 0x35},
240    {"form-feed", 0x0c},
241    {"four", 0x34},
242    {"full-stop", 0x2e},
243    {"g", 0x67},
244    {"grave-accent", 0x60},
245    {"greater-than-sign", 0x3e},
246    {"h", 0x68},
247    {"hyphen", 0x2d},
248    {"hyphen-minus", 0x2d},
249    {"i", 0x69},
250    {"j", 0x6a},
251    {"k", 0x6b},
252    {"l", 0x6c},
253    {"left-brace", 0x7b},
254    {"left-curly-bracket", 0x7b},
255    {"left-parenthesis", 0x28},
256    {"left-square-bracket", 0x5b},
257    {"less-than-sign", 0x3c},
258    {"low-line", 0x5f},
259    {"m", 0x6d},
260    {"n", 0x6e},
261    {"newline", 0x0a},
262    {"nine", 0x39},
263    {"number-sign", 0x23},
264    {"o", 0x6f},
265    {"one", 0x31},
266    {"p", 0x70},
267    {"percent-sign", 0x25},
268    {"period", 0x2e},
269    {"plus-sign", 0x2b},
270    {"q", 0x71},
271    {"question-mark", 0x3f},
272    {"quotation-mark", 0x22},
273    {"r", 0x72},
274    {"reverse-solidus", 0x5c},
275    {"right-brace", 0x7d},
276    {"right-curly-bracket", 0x7d},
277    {"right-parenthesis", 0x29},
278    {"right-square-bracket", 0x5d},
279    {"s", 0x73},
280    {"semicolon", 0x3b},
281    {"seven", 0x37},
282    {"six", 0x36},
283    {"slash", 0x2f},
284    {"solidus", 0x2f},
285    {"space", 0x20},
286    {"t", 0x74},
287    {"tab", 0x09},
288    {"three", 0x33},
289    {"tilde", 0x7e},
290    {"two", 0x32},
291    {"u", 0x75},
292    {"underscore", 0x5f},
293    {"v", 0x76},
294    {"vertical-line", 0x7c},
295    {"vertical-tab", 0x0b},
296    {"w", 0x77},
297    {"x", 0x78},
298    {"y", 0x79},
299    {"z", 0x7a},
300    {"zero", 0x30}};
301#endif
302
303struct classnames {
304  const char* elem_;
305  regex_traits<char>::char_class_type mask_;
306};
307
308const classnames ClassNames[] = {
309    {"alnum", ctype_base::alnum},
310    {"alpha", ctype_base::alpha},
311    {"blank", ctype_base::blank},
312    {"cntrl", ctype_base::cntrl},
313    {"d", ctype_base::digit},
314    {"digit", ctype_base::digit},
315    {"graph", ctype_base::graph},
316    {"lower", ctype_base::lower},
317    {"print", ctype_base::print},
318    {"punct", ctype_base::punct},
319    {"s", ctype_base::space},
320    {"space", ctype_base::space},
321    {"upper", ctype_base::upper},
322    {"w", regex_traits<char>::__regex_word},
323    {"xdigit", ctype_base::xdigit}};
324
325struct use_strcmp {
326  bool operator()(const collationnames& x, const char* y) const { return strcmp(x.elem_, y) < 0; }
327  bool operator()(const classnames& x, const char* y) const { return strcmp(x.elem_, y) < 0; }
328};
329
330} // namespace
331
332string __get_collation_name(const char* s) {
333  const collationnames* i = std::lower_bound(begin(collatenames), end(collatenames), s, use_strcmp());
334  string r;
335  if (i != end(collatenames) && strcmp(s, i->elem_) == 0)
336    r = char(i->char_);
337  return r;
338}
339
340regex_traits<char>::char_class_type __get_classname(const char* s, bool __icase) {
341  const classnames* i                   = std::lower_bound(begin(ClassNames), end(ClassNames), s, use_strcmp());
342  regex_traits<char>::char_class_type r = 0;
343  if (i != end(ClassNames) && strcmp(s, i->elem_) == 0) {
344    r = i->mask_;
345    if (r == regex_traits<char>::__regex_word)
346      r |= ctype_base::alnum | ctype_base::upper | ctype_base::lower;
347    else if (__icase) {
348      if (r & (ctype_base::lower | ctype_base::upper))
349        r |= ctype_base::alpha;
350    }
351  }
352  return r;
353}
354
355template <>
356void __match_any_but_newline<char>::__exec(__state& __s) const {
357  if (__s.__current_ != __s.__last_) {
358    switch (*__s.__current_) {
359    case '\r':
360    case '\n':
361      __s.__do_   = __state::__reject;
362      __s.__node_ = nullptr;
363      break;
364    default:
365      __s.__do_ = __state::__accept_and_consume;
366      ++__s.__current_;
367      __s.__node_ = this->first();
368      break;
369    }
370  } else {
371    __s.__do_   = __state::__reject;
372    __s.__node_ = nullptr;
373  }
374}
375
376template <>
377void __match_any_but_newline<wchar_t>::__exec(__state& __s) const {
378  if (__s.__current_ != __s.__last_) {
379    switch (*__s.__current_) {
380    case '\r':
381    case '\n':
382    case 0x2028:
383    case 0x2029:
384      __s.__do_   = __state::__reject;
385      __s.__node_ = nullptr;
386      break;
387    default:
388      __s.__do_ = __state::__accept_and_consume;
389      ++__s.__current_;
390      __s.__node_ = this->first();
391      break;
392    }
393  } else {
394    __s.__do_   = __state::__reject;
395    __s.__node_ = nullptr;
396  }
397}
398
399_LIBCPP_END_NAMESPACE_STD