Skip to content

Commit 2902ada

Browse files
authored
Merge pull request #119 from hanickadot/u8string
add std::u8string (with utf8 code point iterator)
2 parents 7723c4e + d7939ff commit 2902ada

File tree

10 files changed

+4826
-62
lines changed

10 files changed

+4826
-62
lines changed

include/ctre-unicode.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#ifndef CTRE_V2__CTRE_UNICODE__HPP
22
#define CTRE_V2__CTRE_UNICODE__HPP
33

4+
#include "ctre.hpp"
45
#include "unicode-db.hpp"
56

67
#endif

include/ctre/evaluation.hpp

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -37,34 +37,32 @@ template <size_t Limit> constexpr CTRE_FORCE_INLINE bool less_than(size_t i) {
3737
}
3838
}
3939

40+
template <typename ResultIterator, typename Pattern> using return_type = decltype(regex_results(std::declval<ResultIterator>(), find_captures(Pattern{})));
41+
4042
// calling with pattern prepare stack and triplet of iterators
41-
template <typename Iterator, typename EndIterator, typename Pattern>
42-
constexpr inline auto match_re(const Iterator begin, const EndIterator end, Pattern pattern) noexcept {
43-
using return_type = decltype(regex_results(std::declval<Iterator>(), find_captures(pattern)));
44-
return evaluate(begin, begin, end, return_type{}, ctll::list<start_mark, Pattern, assert_end, end_mark, accept>());
43+
template <typename Iterator, typename EndIterator, typename Pattern, typename ResultIterator = Iterator>
44+
constexpr inline auto match_re(const Iterator begin, const EndIterator end, Pattern) noexcept {
45+
return evaluate(begin, begin, end, return_type<ResultIterator, Pattern>{}, ctll::list<start_mark, Pattern, assert_end, end_mark, accept>());
4546
}
4647

47-
template <typename Iterator, typename EndIterator, typename Pattern>
48-
constexpr inline auto starts_with_re(const Iterator begin, const EndIterator end, Pattern pattern) noexcept {
49-
using return_type = decltype(regex_results(std::declval<Iterator>(), find_captures(pattern)));
50-
return evaluate(begin, begin, end, return_type{}, ctll::list<start_mark, Pattern, end_mark, accept>());
48+
template <typename Iterator, typename EndIterator, typename Pattern, typename ResultIterator = Iterator>
49+
constexpr inline auto starts_with_re(const Iterator begin, const EndIterator end, Pattern) noexcept {
50+
return evaluate(begin, begin, end, return_type<ResultIterator, Pattern>{}, ctll::list<start_mark, Pattern, end_mark, accept>());
5151
}
5252

53-
template <typename Iterator, typename EndIterator, typename Pattern>
54-
constexpr inline auto search_re(const Iterator begin, const EndIterator end, Pattern pattern) noexcept {
55-
using return_type = decltype(regex_results(std::declval<Iterator>(), find_captures(pattern)));
56-
53+
template <typename Iterator, typename EndIterator, typename Pattern, typename ResultIterator = Iterator>
54+
constexpr inline auto search_re(const Iterator begin, const EndIterator end, Pattern) noexcept {
5755
constexpr bool fixed = starts_with_anchor(ctll::list<Pattern>{});
5856

5957
auto it = begin;
6058
for (; end != it && !fixed; ++it) {
61-
if (auto out = evaluate(begin, it, end, return_type{}, ctll::list<start_mark, Pattern, end_mark, accept>())) {
59+
if (auto out = evaluate(begin, it, end, return_type<ResultIterator, Pattern>{}, ctll::list<start_mark, Pattern, end_mark, accept>())) {
6260
return out;
6361
}
6462
}
6563

6664
// in case the RE is empty or fixed
67-
return evaluate(begin, it, end, return_type{}, ctll::list<start_mark, Pattern, end_mark, accept>());
65+
return evaluate(begin, it, end, return_type<ResultIterator, Pattern>{}, ctll::list<start_mark, Pattern, end_mark, accept>());
6866
}
6967

7068

@@ -109,7 +107,7 @@ constexpr CTRE_FORCE_INLINE R evaluate(const Iterator, Iterator current, const E
109107

110108
template <typename R, typename Iterator, typename EndIterator, typename CharacterLike, typename... Tail, typename = std::enable_if_t<(MatchesCharacter<CharacterLike>::template value<decltype(*std::declval<Iterator>())>)>>
111109
constexpr CTRE_FORCE_INLINE R evaluate(const Iterator begin, Iterator current, const EndIterator end, R captures, ctll::list<CharacterLike, Tail...>) noexcept {
112-
if (end == current) return not_matched;
110+
if (current == end) return not_matched;
113111
if (!CharacterLike::match_char(*current)) return not_matched;
114112
return evaluate(begin, current+1, end, captures, ctll::list<Tail...>());
115113
}

include/ctre/id.hpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,15 @@ namespace ctre {
77

88
template <auto... Name> struct id {
99
static constexpr auto name = ctll::fixed_string<sizeof...(Name)>{{Name...}};
10-
};
1110

12-
template <auto... Name> constexpr auto operator==(id<Name...>, id<Name...>) noexcept -> std::true_type { return {}; }
11+
friend constexpr auto operator==(id<Name...>, id<Name...>) noexcept -> std::true_type { return {}; }
1312

14-
template <auto... Name1, auto... Name2> constexpr auto operator==(id<Name1...>, id<Name2...>) noexcept -> std::false_type { return {}; }
13+
template <auto... Other> friend constexpr auto operator==(id<Name...>, id<Other...>) noexcept -> std::false_type { return {}; }
1514

16-
template <auto... Name, typename T> constexpr auto operator==(id<Name...>, T) noexcept -> std::false_type { return {}; }
15+
template <typename T> friend constexpr auto operator==(id<Name...>, T) noexcept -> std::false_type { return {}; }
16+
17+
template <typename T> friend constexpr auto operator==(T, id<Name...>) noexcept -> std::false_type { return {}; }
18+
};
1719

1820
}
1921

include/ctre/return_type.hpp

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#define CTRE__RETURN_TYPE__HPP
33

44
#include "id.hpp"
5+
#include "utf8.hpp"
56
#include <type_traits>
67
#include <tuple>
78
#include <string_view>
@@ -56,27 +57,46 @@ template <size_t Id, typename Name = void> struct captured_content {
5657
}
5758

5859
constexpr CTRE_FORCE_INLINE const auto * data() const noexcept {
60+
#if __cpp_char8_t >= 201811
61+
if constexpr (std::is_same_v<Iterator, utf8_iterator>) {
62+
return _begin.ptr;
63+
} else {
64+
return &*_begin;
65+
}
66+
#else
5967
return &*_begin;
68+
#endif
6069
}
6170

6271
constexpr CTRE_FORCE_INLINE auto size() const noexcept {
63-
return static_cast<size_t>(std::distance(_begin, _end));
72+
return static_cast<size_t>(std::distance(begin(), end()));
73+
}
74+
75+
constexpr CTRE_FORCE_INLINE size_t unit_size() const noexcept {
76+
#if __cpp_char8_t >= 201811
77+
if constexpr (std::is_same_v<Iterator, utf8_iterator>) {
78+
return static_cast<size_t>(std::distance(_begin.ptr, _end.ptr));
79+
}
80+
#endif
81+
return static_cast<size_t>(std::distance(begin(), end()));
6482
}
6583

6684
constexpr CTRE_FORCE_INLINE auto to_view() const noexcept {
67-
return std::basic_string_view<char_type>(&*_begin, static_cast<size_t>(std::distance(_begin, _end)));
85+
// TODO make sure we are working with contiguous range
86+
return std::basic_string_view<char_type>(data(), static_cast<size_t>(unit_size()));
6887
}
6988

7089
constexpr CTRE_FORCE_INLINE auto to_string() const noexcept {
71-
return std::basic_string<char_type>(begin(), end());
90+
// TODO make sure we are working with contiguous range
91+
return std::basic_string<char_type>(data(), static_cast<size_t>(unit_size()));
7292
}
7393

7494
constexpr CTRE_FORCE_INLINE auto view() const noexcept {
75-
return std::basic_string_view<char_type>(&*_begin, static_cast<size_t>(std::distance(_begin, _end)));
95+
return to_view();
7696
}
7797

7898
constexpr CTRE_FORCE_INLINE auto str() const noexcept {
79-
return std::basic_string<char_type>(begin(), end());
99+
return to_string();
80100
}
81101

82102
constexpr CTRE_FORCE_INLINE operator std::basic_string_view<char_type>() const noexcept {

include/ctre/utf8.hpp

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
#ifndef CTRE__UTF8__HPP
2+
#define CTRE__UTF8__HPP
3+
4+
#if __cpp_char8_t >= 201811
5+
6+
#include "utility.hpp"
7+
#include <iterator>
8+
9+
namespace ctre {
10+
11+
struct utf8_iterator {
12+
using self_type = utf8_iterator;
13+
using value_type = char8_t;
14+
using reference = char8_t;
15+
using pointer = const char8_t *;
16+
using iterator_category = std::forward_iterator_tag;
17+
using difference_type = int;
18+
19+
struct sentinel {
20+
21+
};
22+
23+
const char8_t * ptr{nullptr};
24+
const char8_t * end{nullptr};
25+
26+
constexpr friend bool operator!=(const utf8_iterator & lhs, sentinel) {
27+
return lhs.ptr < lhs.end;
28+
}
29+
30+
constexpr friend bool operator!=(const utf8_iterator & lhs, const utf8_iterator & rhs) {
31+
return lhs.ptr != rhs.ptr;
32+
}
33+
34+
constexpr friend bool operator==(const utf8_iterator & lhs, sentinel) {
35+
return lhs.ptr >= lhs.end;
36+
}
37+
38+
constexpr utf8_iterator & operator=(const char8_t * rhs) {
39+
ptr = rhs;
40+
return *this;
41+
}
42+
43+
constexpr operator const char8_t *() const noexcept {
44+
return ptr;
45+
}
46+
47+
constexpr utf8_iterator & operator++() noexcept {
48+
// the contant is mapping from first 5 bits of first code unit to length of UTF8 code point -1
49+
// xxxxx -> yy (5 bits to 2 bits)
50+
// 5 bits are 32 combination, and for each I need 2 bits, hence 64 bit constant
51+
// (*ptr >> 3) looks at left 5 bits
52+
// << 1 will multiply it by 2
53+
// & 0b11u selects only needed two bits
54+
// +1 because each iteration is at least one code unit forward
55+
56+
ptr += ((0x3A55000000000000ull >> ((*ptr >> 3) << 1)) & 0b11u) + 1;
57+
return *this;
58+
}
59+
60+
constexpr utf8_iterator operator+(unsigned step) const noexcept {
61+
utf8_iterator result = *this;
62+
while (step > 0) {
63+
++result;
64+
step--;
65+
}
66+
return result;
67+
}
68+
69+
constexpr char32_t operator*() const noexcept {
70+
constexpr uint64_t lengths = 0x3A55000000000000ull;
71+
constexpr char32_t mojibake = 0xFFFDull;
72+
73+
// quickpath
74+
if (!(*ptr & 0b1000'0000u)) CTRE_LIKELY {
75+
return *ptr;
76+
}
77+
78+
// calculate length based on first 5 bits
79+
const unsigned length = (lengths >> ((*ptr >> 3) * 2)) & 0b11u;
80+
81+
// actual length is number + 1 bytes
82+
83+
// length 0 here means it's a bad front unit
84+
if (!length) CTRE_UNLIKELY {
85+
return mojibake;
86+
}
87+
88+
// if part of the utf-8 sequence is past the end
89+
if (((ptr + length) >= end)) CTRE_UNLIKELY {
90+
return mojibake;
91+
}
92+
93+
if ((ptr[1] >> 6) != 0b10) CTRE_UNLIKELY {
94+
return mojibake;
95+
}
96+
97+
const char8_t mask = (0b0010'0000u >> length) - 1;
98+
99+
// length = 1 (2 bytes) mask = 0b0001'1111u
100+
// length = 2 (3 bytes) mask = 0b0000'1111u
101+
// length = 3 (4 bytes) mask = 0b0000'0111u
102+
103+
// remove utf8 front bits, get only significant part
104+
// and add first trailing unit
105+
106+
char32_t result = ((ptr[0] & mask) << 6) | (ptr[1] & 0b0011'1111u);
107+
108+
// add rest of trailing units
109+
if (length == 1) CTRE_LIKELY {
110+
return result;
111+
}
112+
113+
if ((ptr[2] >> 6) != 0b10) CTRE_UNLIKELY {
114+
return mojibake;
115+
}
116+
117+
result = (result << 6) | (ptr[2] & 0b0011'1111u);
118+
119+
if (length == 2) CTRE_LIKELY {
120+
return result;
121+
}
122+
123+
if ((ptr[3] >> 6) != 0b10) CTRE_UNLIKELY {
124+
return mojibake;
125+
}
126+
127+
return (result << 6) | (ptr[3] & 0b0011'1111u);
128+
}
129+
};
130+
131+
struct utf8_range {
132+
std::u8string_view range;
133+
constexpr utf8_range(std::u8string_view r) noexcept: range{r} { }
134+
135+
constexpr auto begin() const noexcept {
136+
return utf8_iterator{range.data(), range.data() + range.size()};
137+
}
138+
constexpr auto end() const noexcept {
139+
return utf8_iterator::sentinel{};
140+
}
141+
};
142+
143+
}
144+
145+
#endif
146+
147+
#endif

include/ctre/utility.hpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,18 @@
11
#ifndef CTRE__UTILITY__HPP
22
#define CTRE__UTILITY__HPP
33

4+
// Branch-prediction hints placed between an `if (...)` condition and its
// block: they expand to [[likely]] / [[unlikely]] when the compiler reports
// support for the attribute and to nothing otherwise.
//
// __has_cpp_attribute itself is not available on every toolchain; using it in
// an #if where it is undefined is a preprocessor error, so its presence is
// tested first (the nested #if is required - `defined(X) && X(...)` would
// still be parsed on compilers lacking it).
#if defined(__has_cpp_attribute)
#if __has_cpp_attribute(likely)
#define CTRE_LIKELY [[likely]]
#endif
#if __has_cpp_attribute(unlikely)
#define CTRE_UNLIKELY [[unlikely]]
#endif
#endif

// Fallbacks: no hint when the attribute (or the feature test) is unavailable.
#ifndef CTRE_LIKELY
#define CTRE_LIKELY
#endif

#ifndef CTRE_UNLIKELY
#define CTRE_UNLIKELY
#endif
15+
416
#ifdef _MSC_VER
517
#define CTRE_FORCE_INLINE __forceinline
618
#define CTRE_FLATTEN

0 commit comments

Comments
 (0)