libstdc++
unicode.h
Go to the documentation of this file.
1// Unicode utilities -*- C++ -*-
2
3// Copyright The GNU Toolchain Authors.
4//
5// This file is part of the GNU ISO C++ Library. This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23// <http://www.gnu.org/licenses/>.
24
25/** @file include/bits/unicode.h
26 * This is an internal header file, included by other library headers.
27 * Do not attempt to use it directly. @headername{format}
28 */
29
30#ifndef _GLIBCXX_UNICODE_H
31#define _GLIBCXX_UNICODE_H 1
32
33#if __cplusplus >= 202002L
34#include <array>
35#include <bit> // bit_width
36#include <charconv> // __detail::__from_chars_alnum_to_val_table
37#include <string_view>
38#include <cstdint>
39#include <bits/stl_algo.h>
40#include <bits/stl_iterator.h>
41#include <bits/ranges_base.h> // iterator_t, sentinel_t, input_range, etc.
42#include <bits/ranges_util.h> // view_interface
43
44namespace std _GLIBCXX_VISIBILITY(default)
45{
46_GLIBCXX_BEGIN_NAMESPACE_VERSION
47namespace __unicode
48{
// Test whether __c is a Unicode scalar value, i.e. a code point no greater
// than 0x10FFFF that is not in the surrogate range [0xD800, 0xDFFF].
constexpr bool
__is_scalar_value(char32_t __c)
{
  constexpr char32_t __first_surrogate = 0xD800;
  constexpr char32_t __last_surrogate = 0xDFFF;
  constexpr char32_t __last_code_point = 0x10FFFF;

  // Everything below the surrogates is valid; this is the common case.
  if (__c < __first_surrogate) [[likely]]
    return true;

  const bool __above_surrogates = __c > __last_surrogate;
  const bool __within_unicode = __c <= __last_code_point;
  return __above_surrogates && __within_unicode;
}
57
58 // A code point that can be encoded in a single code unit of type _CharT.
59 template<typename _CharT>
60 constexpr bool
61 __is_single_code_unit(char32_t __c)
62 {
63 if constexpr (__gnu_cxx::__int_traits<_CharT>::__max <= 0xFF)
64 return __c < 0x7F; // ASCII character
65 else
66 return __c < __gnu_cxx::__int_traits<_CharT>::__max
67 && __is_scalar_value(__c);
68 }
69
70 // Based on https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2023/p2728r6.html#add-the-transcoding-iterator-template
71
// Function object whose call operator returns U+FFFD (the Unicode
// REPLACEMENT CHARACTER).
struct _Repl
{
  constexpr char32_t
  operator()() const noexcept
  {
    constexpr char32_t __replacement_character = 0xFFFD;
    return __replacement_character;
  }
};
78
79 struct _Null_sentinel_t
80 {
81 template<input_iterator _It>
82 requires default_initializable<iter_value_t<_It>>
83 && equality_comparable_with<iter_reference_t<_It>, iter_value_t<_It>>
84 friend constexpr auto
85 operator==(_It __it, _Null_sentinel_t)
86 { return *__it == iter_value_t<_It>{}; }
87 };
88
89 template<typename _FromFmt, typename _ToFmt,
90 input_iterator _Iter, sentinel_for<_Iter> _Sent = _Iter,
91 typename _ErrorHandler = _Repl>
92 requires convertible_to<iter_value_t<_Iter>, _FromFmt>
93 class _Utf_iterator
94 {
95 static_assert(forward_iterator<_Iter> || noexcept(_ErrorHandler()()));
96
97 public:
98 using value_type = _ToFmt;
99 using difference_type = iter_difference_t<_Iter>;
100 using reference = value_type;
101 using iterator_concept
102 = std::__detail::__clamp_iter_cat<__iter_category_t<_Iter>,
103 bidirectional_iterator_tag>;
104
105 constexpr _Utf_iterator() = default;
106
107 constexpr
108 _Utf_iterator(_Iter __first, _Iter __it, _Sent __last)
109 requires bidirectional_iterator<_Iter>
110 : _M_first_and_curr{__first, __it}, _M_last(__last)
111 {
112 if (_M_curr() != _M_last)
113 _M_read();
114 else
115 _M_buf = {};
116 }
117
118 constexpr
119 _Utf_iterator(_Iter __it, _Sent __last)
120 requires (!bidirectional_iterator<_Iter>)
121 : _M_first_and_curr{__it}, _M_last(__last)
122 {
123 if (_M_curr() != _M_last)
124 _M_read();
125 else
126 _M_buf = {};
127 }
128
129 template<class _Iter2, class _Sent2>
130 requires convertible_to<_Iter2, _Iter> && convertible_to<_Sent2, _Sent>
131 constexpr
132 _Utf_iterator(const _Utf_iterator<_FromFmt, _ToFmt, _Iter2, _Sent2,
133 _ErrorHandler>& __other)
134 : _M_buf(__other._M_buf), _M_first_and_curr(__other._M_first_and_curr),
135 _M_buf_index(__other._M_buf_index), _M_buf_last(__other._M_buf_last),
136 _M_last(__other._M_last)
137 { }
138
139 [[nodiscard]]
140 constexpr _Iter
141 begin() const requires bidirectional_iterator<_Iter>
142 { return _M_first(); }
143
144 [[nodiscard]]
145 constexpr _Sent
146 end() const { return _M_last; }
147
148 [[nodiscard]]
149 constexpr _Iter
150 base() const requires forward_iterator<_Iter>
151 { return _M_curr(); }
152
153 [[nodiscard]]
154 constexpr iter_difference_t<_Iter>
155 _M_units() const requires forward_iterator<_Iter>
156 { return _M_to_increment; }
157
158 [[nodiscard]]
159 constexpr value_type
160 operator*() const { return _M_buf[_M_buf_index]; }
161
162 constexpr _Utf_iterator&
163 operator++()
164 {
165 if (_M_buf_index + 1 == _M_buf_last && _M_curr() != _M_last)
166 {
167 if constexpr (forward_iterator<_Iter>)
168 std::advance(_M_curr(), _M_to_increment);
169 if (_M_curr() == _M_last)
170 _M_buf_index = 0;
171 else
172 _M_read();
173 }
174 else if (_M_buf_index + 1 < _M_buf_last)
175 ++_M_buf_index;
176 return *this;
177 }
178
179 constexpr _Utf_iterator
180 operator++(int)
181 {
182 auto __tmp = *this;
183 ++*this;
184 return __tmp;
185 }
186
187 constexpr _Utf_iterator&
188 operator--() requires bidirectional_iterator<_Iter>
189 {
190 if (!_M_buf_index && _M_curr() != _M_first())
191 _M_read_reverse();
192 else if (_M_buf_index)
193 --_M_buf_index;
194 return *this;
195 }
196
197 constexpr _Utf_iterator
198 operator--(int)
199 {
200 auto __tmp = *this;
201 --*this;
202 return __tmp;
203 }
204
205 [[nodiscard]]
206 friend constexpr bool
207 operator==(_Utf_iterator __lhs, _Utf_iterator __rhs)
208 requires forward_iterator<_Iter> || requires (_Iter __i) { __i != __i; }
209 {
210 if constexpr (forward_iterator<_Iter>)
211 return __lhs._M_curr() == __rhs._M_curr()
212 && __lhs._M_buf_index == __rhs._M_buf_index;
213 else if (__lhs._M_curr() != __rhs._M_curr())
214 return false;
215 else if (__lhs._M_buf_index == __rhs._M_buf_index
216 && __lhs._M_buf_last == __rhs._M_buf_last)
217 return true;
218 else
219 return __lhs._M_buf_index == __lhs._M_buf_last
220 && __rhs._M_buf_index == __rhs._M_buf_last;
221 }
222
223 [[nodiscard]]
224 friend constexpr bool
225 operator==(_Utf_iterator __lhs, _Sent __rhs)
226 {
227 if constexpr (forward_iterator<_Iter>)
228 return __lhs._M_curr() == __rhs;
229 else
230 return __lhs._M_curr() == __rhs
231 && __lhs._M_buf_index == __lhs._M_buf_last;
232 }
233
234 private:
235 constexpr void
236 _M_read()
237 {
238 if constexpr (sizeof(_FromFmt) == sizeof(uint8_t))
239 _M_read_utf8();
240 else if constexpr (sizeof(_FromFmt) == sizeof(uint16_t))
241 _M_read_utf16();
242 else
243 {
244 static_assert(sizeof(_FromFmt) == sizeof(uint32_t));
245 _M_read_utf32();
246 }
247 }
248
249 constexpr void
250 _M_read_reverse(); // TODO
251
252 template<typename>
253 struct _Guard
254 {
255 _Guard(void*, _Iter&) { }
256 };
257
258 template<typename _It> requires forward_iterator<_It>
259 struct _Guard<_It>
260 {
261 constexpr ~_Guard() { _M_this->_M_curr() = std::move(_M_orig); }
262 _Utf_iterator* _M_this;
263 _It _M_orig;
264 };
265
266 constexpr void
267 _M_read_utf8()
268 {
269 _Guard<_Iter> __g{this, _M_curr()};
270 char32_t __c{};
271 const uint8_t __lo_bound = 0x80, __hi_bound = 0xBF;
272 uint8_t __u = *_M_curr()++;
273 uint8_t __to_incr = 1;
274 auto __incr = [&, this] {
275 ++__to_incr;
276 return ++_M_curr();
277 };
278
279 if (__u <= 0x7F) [[likely]] // 0x00 to 0x7F
280 __c = __u;
281 else if (__u < 0xC2) [[unlikely]]
282 __c = _S_error();
283 else if (_M_curr() == _M_last) [[unlikely]]
284 __c = _S_error();
285 else if (__u <= 0xDF) // 0xC2 to 0xDF
286 {
287 __c = __u & 0x1F;
288 __u = *_M_curr();
289
290 if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
291 __c = _S_error();
292 else
293 {
294 __c = (__c << 6) | (__u & 0x3F);
295 __incr();
296 }
297 }
298 else if (__u <= 0xEF) // 0xE0 to 0xEF
299 {
300 const uint8_t __lo_bound_2 = __u == 0xE0 ? 0xA0 : __lo_bound;
301 const uint8_t __hi_bound_2 = __u == 0xED ? 0x9F : __hi_bound;
302
303 __c = __u & 0x0F;
304 __u = *_M_curr();
305
306 if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
307 __c = _S_error();
308 else if (__incr() == _M_last) [[unlikely]]
309 __c = _S_error();
310 else
311 {
312 __c = (__c << 6) | (__u & 0x3F);
313 __u = *_M_curr();
314
315 if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
316 __c = _S_error();
317 else
318 {
319 __c = (__c << 6) | (__u & 0x3F);
320 __incr();
321 }
322 }
323 }
324 else if (__u <= 0xF4) // 0xF0 to 0xF4
325 {
326 const uint8_t __lo_bound_2 = __u == 0xF0 ? 0x90 : __lo_bound;
327 const uint8_t __hi_bound_2 = __u == 0xF4 ? 0x8F : __hi_bound;
328
329 __c = __u & 0x07;
330 __u = *_M_curr();
331
332 if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
333 __c = _S_error();
334 else if (__incr() == _M_last) [[unlikely]]
335 __c = _S_error();
336 else
337 {
338 __c = (__c << 6) | (__u & 0x3F);
339 __u = *_M_curr();
340
341 if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
342 __c = _S_error();
343 else if (__incr() == _M_last) [[unlikely]]
344 __c = _S_error();
345 else
346 {
347 __c = (__c << 6) | (__u & 0x3F);
348 __u = *_M_curr();
349
350 if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
351 __c = _S_error();
352 else
353 {
354 __c = (__c << 6) | (__u & 0x3F);
355 __incr();
356 }
357 }
358 }
359 }
360 else [[unlikely]]
361 __c = _S_error();
362
363 _M_update(__c, __to_incr);
364 }
365
366 constexpr void
367 _M_read_utf16()
368 {
369 _Guard<_Iter> __g{this, _M_curr()};
370 char32_t __c{};
371 uint16_t __u = *_M_curr()++;
372 uint8_t __to_incr = 1;
373
374 if (__u < 0xD800 || __u > 0xDFFF) [[likely]]
375 __c = __u;
376 else if (__u < 0xDC00 && _M_curr() != _M_last)
377 {
378 uint16_t __u2 = *_M_curr();
379 if (__u2 < 0xDC00 || __u2 > 0xDFFF) [[unlikely]]
380 __c = _S_error();
381 else
382 {
383 ++_M_curr();
384 __to_incr = 2;
385 uint32_t __x = (__u & 0x3F) << 10 | (__u2 & 0x3FF);
386 uint32_t __w = (__u >> 6) & 0x1F;
387 __c = (__w + 1) << 16 | __x;
388 }
389 }
390 else
391 __c = _S_error();
392
393 _M_update(__c, __to_incr);
394 }
395
396 constexpr void
397 _M_read_utf32()
398 {
399 _Guard<_Iter> __g{this, _M_curr()};
400 char32_t __c = *_M_curr()++;
401 if (!__is_scalar_value(__c)) [[unlikely]]
402 __c = _S_error();
403 _M_update(__c, 1);
404 }
405
406 // Encode the code point __c as one or more code units in _M_buf.
407 constexpr void
408 _M_update(char32_t __c, uint8_t __to_incr)
409 {
410 _M_to_increment = __to_incr;
411 _M_buf_index = 0;
412 if constexpr (sizeof(_ToFmt) == sizeof(uint32_t))
413 {
414 _M_buf[0] = __c;
415 _M_buf_last = 1;
416 }
417 else if constexpr (sizeof(_ToFmt) == sizeof(uint16_t))
418 {
419 if (__is_single_code_unit<_ToFmt>(__c))
420 {
421 _M_buf[0] = __c;
422 _M_buf[1] = 0;
423 _M_buf_last = 1;
424 }
425 else
426 {
427 // From http://www.unicode.org/faq/utf_bom.html#utf16-4
428 const char32_t __lead_offset = 0xD800 - (0x10000 >> 10);
429 char16_t __lead = __lead_offset + (__c >> 10);
430 char16_t __trail = 0xDC00 + (__c & 0x3FF);
431 _M_buf[0] = __lead;
432 _M_buf[1] = __trail;
433 _M_buf_last = 2;
434 }
435 }
436 else
437 {
438 static_assert(sizeof(_ToFmt) == 1);
439 int __bits = std::bit_width((uint32_t)__c);
440 if (__bits <= 7) [[likely]]
441 {
442 _M_buf[0] = __c;
443 _M_buf[1] = _M_buf[2] = _M_buf[3] = 0;
444 _M_buf_last = 1;
445 }
446 else if (__bits <= 11)
447 {
448 _M_buf[0] = 0xC0 | (__c >> 6);
449 _M_buf[1] = 0x80 | (__c & 0x3F);
450 _M_buf[2] = _M_buf[3] = 0;
451 _M_buf_last = 2;
452 }
453 else if (__bits <= 16)
454 {
455 _M_buf[0] = 0xE0 | (__c >> 12);
456 _M_buf[1] = 0x80 | ((__c >> 6) & 0x3F);
457 _M_buf[2] = 0x80 | (__c & 0x3F);
458 _M_buf[3] = 0;
459 _M_buf_last = 3;
460 }
461 else
462 {
463 _M_buf[0] = 0xF0 | ((__c >> 18) & 0x07);
464 _M_buf[1] = 0x80 | ((__c >> 12) & 0x3F);
465 _M_buf[2] = 0x80 | ((__c >> 6) & 0x3F);
466 _M_buf[3] = 0x80 | (__c & 0x3F);
467 _M_buf_last = 4;
468 }
469 }
470 }
471
472 constexpr char32_t
473 _S_error()
474 {
475 char32_t __c = _ErrorHandler()();
476 __glibcxx_assert(__is_scalar_value(__c));
477 return __c;
478 }
479
480 constexpr _Iter
481 _M_first() const requires bidirectional_iterator<_Iter>
482 { return _M_first_and_curr._M_first; }
483
484 constexpr _Iter&
485 _M_curr() { return _M_first_and_curr._M_curr; }
486
487 constexpr _Iter
488 _M_curr() const { return _M_first_and_curr._M_curr; }
489
490 array<value_type, 4 / sizeof(_ToFmt)> _M_buf;
491
492 template<typename _It>
493 struct _First_and_curr
494 {
495 _First_and_curr() = default;
496
497 constexpr
498 _First_and_curr(_It __curr) : _M_curr(__curr) { }
499
500 template<convertible_to<_It> _It2>
501 constexpr
502 _First_and_curr(const _First_and_curr<_It2>& __other)
503 : _M_curr(__other._M_curr) { }
504
505 _It _M_curr;
506 };
507
508 template<typename _It> requires bidirectional_iterator<_It>
509 struct _First_and_curr<_It>
510 {
511 _First_and_curr() = default;
512
513 constexpr
514 _First_and_curr(_It __first, _It __curr)
515 : _M_first(__first), _M_curr(__curr) { }
516
517 template<convertible_to<_It> _It2>
518 constexpr
519 _First_and_curr(const _First_and_curr<_It2>& __other)
520 : _M_first(__other._M_first), _M_curr(__other._M_curr) { }
521
522 _It _M_first;
523 _It _M_curr;
524 };
525
526 _First_and_curr<_Iter> _M_first_and_curr;
527
528 uint8_t _M_buf_index = 0;
529 uint8_t _M_buf_last = 0;
530 uint8_t _M_to_increment = 0;
531
532 [[no_unique_address]] _Sent _M_last;
533
534 template<typename _FromFmt2, typename _ToFmt2,
535 input_iterator _Iter2, sentinel_for<_Iter2> _Sent2,
536 typename _ErrHandler>
537 requires convertible_to<iter_value_t<_Iter2>, _FromFmt2>
538 friend class _Utf_iterator;
539 };
540
541 template<typename _ToFormat, ranges::input_range _Range>
542 class _Utf_view
543 : public ranges::view_interface<_Utf_view<_ToFormat, _Range>>
544 {
545 using _Iterator = _Utf_iterator<ranges::range_value_t<_Range>,
546 _ToFormat, ranges::iterator_t<_Range>,
547 ranges::sentinel_t<_Range>>;
548
549 template<typename _Iter, typename _Sent>
550 constexpr auto
551 _M_begin(_Iter __first, _Sent __last)
552 {
553 if constexpr (bidirectional_iterator<_Iter>)
554 return _Iterator(__first, __first, __last);
555 else
556 return _Iterator(__first, __last);
557 }
558
559 template<typename _Iter, typename _Sent>
560 constexpr auto
561 _M_end(_Iter __first, _Sent __last)
562 {
563 if constexpr (!is_same_v<_Iter, _Sent>)
564 return __last;
565 else if constexpr (bidirectional_iterator<_Iter>)
566 return _Iterator(__first, __last, __last);
567 else
568 return _Iterator(__last, __last);
569 }
570
571 _Range _M_base;
572
573 public:
574 constexpr explicit
575 _Utf_view(_Range&& __r) : _M_base(std::forward<_Range>(__r)) { }
576
577 constexpr auto begin()
578 { return _M_begin(ranges::begin(_M_base), ranges::end(_M_base)); }
579
580 constexpr auto end()
581 { return _M_end(ranges::begin(_M_base), ranges::end(_M_base)); }
582
583 constexpr bool empty() const { return ranges::empty(_M_base); }
584 };
585
586#ifdef __cpp_char8_t
587 template<typename _View>
588 using _Utf8_view = _Utf_view<char8_t, _View>;
589#else
590 template<typename _View>
591 using _Utf8_view = _Utf_view<char, _View>;
592#endif
593 template<typename _View>
594 using _Utf16_view = _Utf_view<char16_t, _View>;
595 template<typename _View>
596 using _Utf32_view = _Utf_view<char32_t, _View>;
597
598inline namespace __v16_0_0
599{
600#define _GLIBCXX_GET_UNICODE_DATA 160000
601#include "unicode-data.h"
602#ifdef _GLIBCXX_GET_UNICODE_DATA
603# error "Invalid unicode data"
604#endif
605
606 // The field width of a code point.
607 constexpr int
608 __field_width(char32_t __c) noexcept
609 {
610 if (__c < __width_edges[0]) [[likely]]
611 return 1;
612
613 auto* __p = std::upper_bound(__width_edges, std::end(__width_edges), __c);
614 return (__p - __width_edges) % 2 + 1;
615 }
616
617 // @pre c <= 0x10FFFF
618 constexpr bool
619 __should_escape_category(char32_t __c) noexcept
620 {
621 constexpr uint32_t __mask = 0x01;
622 auto* __end = std::end(__escape_edges);
623 auto* __p = std::lower_bound(__escape_edges, __end,
624 (__c << 1u) + 2);
625 return __p[-1] & __mask;
626 }
627
628
629 // @pre c <= 0x10FFFF
630 constexpr _Gcb_property
631 __grapheme_cluster_break_property(char32_t __c) noexcept
632 {
633 constexpr uint32_t __mask = (1 << __gcb_shift_bits) - 1;
634 auto* __end = std::end(__gcb_edges);
635 auto* __p = std::lower_bound(__gcb_edges, __end,
636 (__c << __gcb_shift_bits) | __mask);
637 return _Gcb_property(__p[-1] & __mask);
638 }
639
640 constexpr bool
641 __is_incb_linker(char32_t __c) noexcept
642 {
643 const auto __end = std::end(__incb_linkers);
644 // Array is small enough that linear search is faster than binary search.
645 return _GLIBCXX_STD_A::find(__incb_linkers, __end, __c) != __end;
646 }
647
648 // @pre c <= 0x10FFFF
649 constexpr _InCB
650 __incb_property(char32_t __c) noexcept
651 {
652 if ((__c << 2) < __incb_edges[0]) [[likely]]
653 return _InCB(0);
654
655 constexpr uint32_t __mask = 0x3;
656 auto* __end = std::end(__incb_edges);
657 auto* __p = std::lower_bound(__incb_edges, __end, (__c << 2) | __mask);
658 return _InCB(__p[-1] & __mask);
659 }
660
661 constexpr bool
662 __is_extended_pictographic(char32_t __c)
663 {
664 if (__c < __xpicto_edges[0]) [[likely]]
665 return 0;
666
667 auto* __p = std::upper_bound(__xpicto_edges, std::end(__xpicto_edges), __c);
668 return (__p - __xpicto_edges) % 2;
669 }
670
671 struct _Grapheme_cluster_iterator_base
672 {
673 char32_t _M_c; // First code point in the cluster.
674 _Gcb_property _M_prop; // GCB property of _M_c.
675 enum class _XPicto : unsigned char { _Init, _Zwj, _Matched, _Failed };
676 _XPicto _M_xpicto_seq_state = _XPicto::_Init;
677 unsigned char _M_RI_count = 0;
678 bool _M_incb_linker_seen = false;
679
680 constexpr void
681 _M_reset(char32_t __c, _Gcb_property __p)
682 {
683 _M_c = __c;
684 _M_prop = __p;
685 _M_xpicto_seq_state = _XPicto::_Init;
686 _M_RI_count = 0;
687 _M_incb_linker_seen = false;
688 }
689
690 constexpr void
691 _M_update_xpicto_seq_state(char32_t __c, _Gcb_property __p)
692 {
693 if (_M_xpicto_seq_state == _XPicto::_Failed)
694 return;
695
696 auto __next_state = _XPicto::_Failed;
697 if (_M_xpicto_seq_state != _XPicto::_Zwj) // i.e. Init or Matched
698 {
699 if (__p == _Gcb_property::_Gcb_ZWJ)
700 {
701 if (_M_xpicto_seq_state == _XPicto::_Matched)
702 __next_state = _XPicto::_Zwj;
703 // We check _M_c here so that we do the lookup at most once,
704 // and only for clusters containing at least one ZWJ.
705 else if (__is_extended_pictographic(_M_c))
706 __next_state = _XPicto::_Zwj;
707 }
708 else if (__p == _Gcb_property::_Gcb_Extend)
709 __next_state = _M_xpicto_seq_state; // no change
710 }
711 else // Zwj
712 {
713 // This assumes that all \p{Extended_Pictographic} emoji have
714 // Grapheme_Cluster_Break=Other.
715 if (__p == _Gcb_property::_Gcb_Other
716 && __is_extended_pictographic(__c))
717 __next_state = _XPicto::_Matched;
718 }
719 _M_xpicto_seq_state = __next_state;
720 }
721
722 constexpr void
723 _M_update_ri_count(_Gcb_property __p)
724 {
725 if (__p == _Gcb_property::_Gcb_Regional_Indicator)
726 ++_M_RI_count;
727 else
728 _M_RI_count = 0;
729 }
730
731 constexpr void
732 _M_update_incb_state(char32_t __c, _Gcb_property)
733 {
734 if (__is_incb_linker(__c))
735 _M_incb_linker_seen = true;
736 }
737 };
738
739 // Split a range into extended grapheme clusters.
740 template<ranges::forward_range _View> requires ranges::view<_View>
741 class _Grapheme_cluster_view
742 : public ranges::view_interface<_Grapheme_cluster_view<_View>>
743 {
744 public:
745
746 constexpr
747 _Grapheme_cluster_view(_View __v)
748 : _M_begin(_Utf32_view<_View>(std::move(__v)).begin())
749 { }
750
751 constexpr auto begin() const { return _M_begin; }
752 constexpr auto end() const { return _M_begin.end(); }
753
754 private:
755 struct _Iterator : private _Grapheme_cluster_iterator_base
756 {
757 private:
758 // Iterator over the underlying code points.
759 using _U32_iterator = ranges::iterator_t<_Utf32_view<_View>>;
760
761 public:
762 // TODO: Change value_type to be subrange<_U32_iterator> instead?
763 // Alternatively, value_type could be _Utf32_view<iterator_t<_View>>.
764 // That would be the whole cluster, not just the first code point.
765 // Would need to store two iterators and find end of current cluster
766 // on increment, so operator* returns value_type(_M_base, _M_next).
767 using value_type = char32_t;
768 using iterator_concept = forward_iterator_tag;
769 using difference_type = ptrdiff_t;
770
771 constexpr
772 _Iterator(_U32_iterator __i)
773 : _M_base(__i)
774 {
775 if (__i != __i.end())
776 {
777 _M_c = *__i;
778 _M_prop = __grapheme_cluster_break_property(_M_c);
779 }
780 }
781
782 // The first code point of the current extended grapheme cluster.
783 constexpr value_type
784 operator*() const
785 { return _M_c; }
786
787 constexpr auto
788 operator->() const
789 { return &_M_c; }
790
791 // Move to the next extended grapheme cluster.
792 constexpr _Iterator&
793 operator++()
794 {
795 const auto __end = _M_base.end();
796 if (_M_base != __end)
797 {
798 auto __p_prev = _M_prop;
799 auto __it = _M_base;
800 while (++__it != __end)
801 {
802 char32_t __c = *__it;
803 auto __p = __grapheme_cluster_break_property(*__it);
804 _M_update_xpicto_seq_state(__c, __p);
805 _M_update_ri_count(__p);
806 _M_update_incb_state(__c, __p);
807 if (_M_is_break(__p_prev, __p, __it))
808 {
809 // Found a grapheme cluster break
810 _M_reset(__c, __p);
811 break;
812 }
813 __p_prev = __p;
814 }
815 _M_base = __it;
816 }
817 return *this;
818 }
819
820 constexpr _Iterator
821 operator++(int)
822 {
823 auto __tmp = *this;
824 ++*this;
825 return __tmp;
826 }
827
828 constexpr bool
829 operator==(const _Iterator& __i) const
830 { return _M_base == __i._M_base; }
831
832 // This supports iter != iter.end()
833 constexpr bool
834 operator==(const ranges::sentinel_t<_View>& __i) const
835 { return _M_base == __i; }
836
837 // Iterator to the start of the current cluster.
838 constexpr auto base() const { return _M_base.base(); }
839
840 // The end of the underlying view (not the end of the current cluster!)
841 constexpr auto end() const { return _M_base.end(); }
842
843 // Field width of the first code point in the cluster.
844 constexpr int
845 width() const noexcept
846 { return __field_width(_M_c); }
847
848 private:
849 _U32_iterator _M_base;
850
851 // Implement the Grapheme Cluster Boundary Rules from Unicode Annex #29
852 // http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
853 // This implements the rules from TR29 revision 43 in Unicode 15.1.0.
854 // Return true if there is a break between code point with property p1
855 // and code point with property p2.
856 constexpr bool
857 _M_is_break(_Gcb_property __p1, _Gcb_property __p2,
858 _U32_iterator __curr) const
859 {
860 using enum _Gcb_property;
861
862 if (__p1 == _Gcb_Control || __p1 == _Gcb_LF)
863 return true; // Break after Control or LF.
864
865 if (__p1 == _Gcb_CR)
866 return __p2 != _Gcb_LF; // Do not break between a CR and LF.
867
868 // Rule GB5
869 if (__p2 == _Gcb_Control || __p2 == _Gcb_CR || __p2 == _Gcb_LF)
870 return true; // Break before Control, CR or LF.
871
872 // Rule GB6
873 if (__p1 == _Gcb_L)
874 switch (__p2)
875 {
876 case _Gcb_L:
877 case _Gcb_V:
878 case _Gcb_LV:
879 case _Gcb_LVT:
880 return false; // Do not break Hangul syllable sequences.
881 default:
882 return true;
883 }
884
885 // Rule GB7
886 if (__p1 == _Gcb_LV || __p1 == _Gcb_V)
887 switch (__p2)
888 {
889 case _Gcb_V:
890 case _Gcb_T:
891 return false; // Do not break Hangul syllable sequences.
892 default:
893 return true;
894 }
895
896 // Rule GB8
897 if (__p1 == _Gcb_LVT || __p1 == _Gcb_T)
898 return __p2 != _Gcb_T; // Do not break Hangul syllable sequences.
899
900 // Rule GB9
901 if (__p2 == _Gcb_Extend || __p2 == _Gcb_ZWJ)
902 return false; // Do not break before extending characters or ZWJ.
903
904 // The following GB9x rules only apply to extended grapheme clusters,
905 // which is what the C++ standard uses (not legacy grapheme clusters).
906
907 // Rule GB9a
908 if (__p2 == _Gcb_SpacingMark)
909 return false; // Do not break before SpacingMarks,
910 // Rule GB9b
911 if (__p1 == _Gcb_Prepend)
912 return false; // or after Prepend characters.
913
914 // Rule GB9c (Unicode 15.1.0)
915 // Do not break within certain combinations with
916 // Indic_Conjunct_Break (InCB)=Linker.
917 if (_M_incb_linker_seen
918 && __incb_property(_M_c) == _InCB::_Consonant
919 && __incb_property(*__curr) == _InCB::_Consonant)
920 {
921 // Match [_M_base, __curr] against regular expression
922 // Consonant ([Extend Linker]* Linker [Extend Linker]* Consonant)+
923 bool __have_linker = false;
924 auto __it = _M_base;
925 while (++__it != __curr)
926 {
927 if (__is_incb_linker(*__it))
928 __have_linker = true;
929 else
930 {
931 auto __incb = __incb_property(*__it);
932 if (__incb == _InCB::_Consonant)
933 __have_linker = false;
934 else if (__incb != _InCB::_Extend)
935 break;
936 }
937 }
938 if (__it == __curr && __have_linker)
939 return false;
940 }
941
942 // Rule GB11
943 // Do not break within emoji modifier sequences
944 // or emoji zwj sequences.
945 if (__p1 == _Gcb_ZWJ && _M_xpicto_seq_state == _XPicto::_Matched)
946 return false;
947
948 // Rules GB12 and GB13
949 // Do not break within emoji flag sequences. That is, do not break
950 // between regional indicator (RI) symbols if there is an odd number
951 // of RI characters before the break point.
952 if (__p1 == _Gcb_property::_Gcb_Regional_Indicator && __p1 == __p2)
953 return (_M_RI_count & 1) == 0;
954
955 // Rule GB999
956 return true; // Otherwise, break everywhere.
957 }
958 };
959
960 _Iterator _M_begin;
961 };
962
963} // namespace __v16_0_0
964
965 // Return the field width of a string.
966 template<typename _CharT>
967 constexpr size_t
968 __field_width(basic_string_view<_CharT> __s)
969 {
970 if (__s.empty()) [[unlikely]]
971 return 0;
972 _Grapheme_cluster_view<basic_string_view<_CharT>> __gc(__s);
973 auto __it = __gc.begin();
974 const auto __end = __gc.end();
975 size_t __n = __it.width();
976 while (++__it != __end)
977 __n += __it.width();
978 return __n;
979 }
980
981 // Truncate a string to at most `__max` field width units, and return the
982 // resulting field width.
983 template<typename _CharT>
984 constexpr size_t
985 __truncate(basic_string_view<_CharT>& __s, size_t __max)
986 {
987 if (__s.empty()) [[unlikely]]
988 return 0;
989
990 _Grapheme_cluster_view<basic_string_view<_CharT>> __gc(__s);
991 auto __it = __gc.begin();
992 const auto __end = __gc.end();
993 size_t __n = __it.width();
994 if (__n > __max)
995 {
996 __s = {};
997 return 0;
998 }
999 while (++__it != __end)
1000 {
1001 size_t __n2 = __n + __it.width();
1002 if (__n2 > __max)
1003 {
1004 __s = basic_string_view<_CharT>(__s.begin(), __it.base());
1005 return __n;
1006 }
1007 __n = __n2;
1008 }
1009 return __n;
1010 }
1011
1012 template<typename _CharT>
1013 consteval bool
1014 __literal_encoding_is_unicode()
1015 {
1016 if constexpr (is_same_v<_CharT, char16_t>)
1017 return true;
1018 else if constexpr (is_same_v<_CharT, char32_t>)
1019 return true;
1020#ifdef __cpp_char8_t
1021 else if constexpr (is_same_v<_CharT, char8_t>)
1022 return true;
1023#endif
1024
1025 const char* __enc = "";
1026
1027#ifdef __GNUC_EXECUTION_CHARSET_NAME
1028 auto __remove_iso10646_prefix = [](const char* __s) {
1029 // GNU iconv allows "ISO-10646/" prefix (case-insensitive).
1030 if (__s[0] == 'I' || __s[0] == 'i')
1031 if (__s[1] == 'S' || __s[1] == 's')
1032 if (__s[2] == 'O' || __s[2] == 'o')
1033 if (string_view(__s + 3).starts_with("-10646/"))
1034 return __s + 10;
1035 return __s;
1036 };
1037
1038 if constexpr (is_same_v<_CharT, char>)
1039 __enc = __remove_iso10646_prefix(__GNUC_EXECUTION_CHARSET_NAME);
1040# if defined _GLIBCXX_USE_WCHAR_T && defined __GNUC_WIDE_EXECUTION_CHARSET_NAME
1041 else
1042 __enc = __remove_iso10646_prefix(__GNUC_WIDE_EXECUTION_CHARSET_NAME);
1043# endif
1044
1045 if ((__enc[0] == 'U' || __enc[0] == 'u')
1046 && (__enc[1] == 'T' || __enc[1] == 't')
1047 && (__enc[2] == 'F' || __enc[2] == 'f'))
1048 {
1049 __enc += 3;
1050 if (__enc[0] == '-')
1051 ++__enc;
1052 if (__enc[0] == '8')
1053 return __enc[1] == '\0' || string_view(__enc + 1) == "//";
1054 else if constexpr (!is_same_v<_CharT, char>)
1055 {
1056 string_view __s(__enc);
1057 if (__s.ends_with("//"))
1058 __s.remove_suffix(2);
1059 if (__s.ends_with("LE") || __s.ends_with("BE"))
1060 __s.remove_suffix(2);
1061 return __s == "16" || __s == "32";
1062 }
1063 }
1064#elif defined __clang_literal_encoding__
1065 if constexpr (is_same_v<_CharT, char>)
1066 __enc = __clang_literal_encoding__;
1067# if defined _GLIBCXX_USE_WCHAR_T && defined __clang_wide_literal_encoding__
1068 else
1069 __enc = __clang_wide_literal_encoding__;
1070# endif
1071 // Clang accepts "-fexec-charset=utf-8" but the macro is still uppercase.
1072 string_view __s(__enc);
1073 if (__s == "UTF-8")
1074 return true;
1075 else if constexpr (!is_same_v<_CharT, char>)
1076 return __s == "UTF-16" || __s == "UTF-32";
1077#endif
1078
1079 return false;
1080 }
1081
1082 consteval bool
1083 __literal_encoding_is_utf8()
1084 { return __literal_encoding_is_unicode<char>(); }
1085
1086 consteval bool
1087 __literal_encoding_is_extended_ascii()
1088 {
1089 return '0' == 0x30 && 'A' == 0x41 && 'Z' == 0x5a
1090 && 'a' == 0x61 && 'z' == 0x7a;
1091 }
1092
1093 // https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching
1094 constexpr bool
1095 __charset_alias_match(string_view __a, string_view __b)
1096 {
1097 // Map alphanumeric chars to their base 64 value, everything else to 127.
1098 auto __map = [](char __c, bool& __num) -> unsigned char {
1099 if (__c == '0') [[unlikely]]
1100 return __num ? 0 : 127;
1101 const auto __v = __detail::__from_chars_alnum_to_val(__c);
1102 __num = __v < 10;
1103 return __v;
1104 };
1105
1106 auto __ptr_a = __a.begin(), __end_a = __a.end();
1107 auto __ptr_b = __b.begin(), __end_b = __b.end();
1108 bool __num_a = false, __num_b = false;
1109
1110 while (true)
1111 {
1112 // Find the value of the next alphanumeric character in each string.
1113 unsigned char __val_a{}, __val_b{};
1114 while (__ptr_a != __end_a
1115 && (__val_a = __map(*__ptr_a, __num_a)) == 127)
1116 ++__ptr_a;
1117 while (__ptr_b != __end_b
1118 && (__val_b = __map(*__ptr_b, __num_b)) == 127)
1119 ++__ptr_b;
1120 // Stop when we reach the end of a string, or get a mismatch.
1121 if (__ptr_a == __end_a)
1122 return __ptr_b == __end_b;
1123 else if (__ptr_b == __end_b)
1124 return false;
1125 else if (__val_a != __val_b)
1126 return false; // Found non-matching characters.
1127 ++__ptr_a;
1128 ++__ptr_b;
1129 }
1130 return true;
1131 }
1132
1133} // namespace __unicode
1134
1135namespace ranges
1136{
1137 template<typename _To, typename _Range>
1138 inline constexpr bool
1139 enable_borrowed_range<std::__unicode::_Utf_view<_To, _Range>>
1140 = enable_borrowed_range<_Range>;
1141
1142 template<typename _Range>
1143 inline constexpr bool
1144 enable_borrowed_range<std::__unicode::_Grapheme_cluster_view<_Range>>
1145 = enable_borrowed_range<_Range>;
1146} // namespace ranges
1147
1148_GLIBCXX_END_NAMESPACE_VERSION
1149} // namespace std
1150#endif // C++20
1151#endif // _GLIBCXX_UNICODE_H
constexpr complex< _Tp > operator*(const complex< _Tp > &__x, const complex< _Tp > &__y)
Return new complex value x times y.
Definition complex:434
constexpr std::remove_reference< _Tp >::type && move(_Tp &&__t) noexcept
Convert a value to an rvalue.
Definition move.h:138
constexpr _Tp && forward(typename std::remove_reference< _Tp >::type &__t) noexcept
Forward an lvalue.
Definition move.h:72
_Tp * end(valarray< _Tp > &__va) noexcept
Return an iterator pointing to one past the last element of the valarray.
Definition valarray:1251
_Tp * begin(valarray< _Tp > &__va) noexcept
Return an iterator pointing to the first element of the valarray.
Definition valarray:1229
ISO C++ entities toplevel namespace is std.
constexpr void advance(_InputIterator &__i, _Distance __n)
A generalization of pointer arithmetic.
__numeric_traits_integer< _Tp > __int_traits
Convenience alias for __numeric_traits<integer-type>.