Skip to content

Commit 9c9d99f

Browse files
committed
Optimizes string matching by allowing memcmp-like functionality (even on UTF-8 sequences)
reference: #147 comparison: https://compiler-explorer.com/z/Tz3KhG
1 parent 9a37e55 commit 9c9d99f

File tree

2 files changed

+104
-4
lines changed

2 files changed

+104
-4
lines changed

include/ctre/evaluation.hpp

+66-4
Original file line numberDiff line numberDiff line change
@@ -115,11 +115,73 @@ template <typename CharT, typename Iterator, typename EndIterator> constexpr CTR
115115
return false;
116116
}
117117

118-
template <auto... String, size_t... Idx, typename Iterator, typename EndIterator> constexpr CTRE_FORCE_INLINE string_match_result<Iterator> evaluate_match_string(Iterator current, [[maybe_unused]] const EndIterator end, std::index_sequence<Idx...>) noexcept {
119-
120-
bool same = (compare_character(String, current, end) && ... && true);
118+
#if __cpp_char8_t >= 201811
// Compares the N UTF-8 code units in `buffer` against the raw code units behind `current`.
// Relies on utf8_iterator exposing its `ptr` and `end` members, so the subject is compared
// unit by unit without decoding code points.
//
// Returns the iterator advanced by min(N, code units remaining) together with a flag that is
// true only when at least N units remain and every one of them equals the matching buffer unit.
template <size_t N, size_t... Idx, typename Iterator, typename EndIterator> constexpr CTRE_FORCE_INLINE string_match_result<Iterator> evaluate_match_utf8_string(Iterator current, [[maybe_unused]] const EndIterator end, char8_t (&buffer)[N], std::index_sequence<Idx...>) noexcept {
	// Units left in the subject: measured up to `end` when it is a real iterator,
	// or up to the iterator's own stored end when `end` is only a sentinel.
	size_t remaining = 0;
	if constexpr (std::is_same_v<::std::remove_const_t<EndIterator>, utf8_iterator::sentinel>) {
		remaining = current.end - current.ptr;
	} else {
		remaining = end.ptr - current.ptr;
	}

	// Never step the iterator past the end of the input.
	const size_t advance = (N < remaining) ? N : remaining;
	const bool long_enough = (remaining >= N);

	// The length check must stay on the left of &&: the fold below indexes ptr[0..N-1]
	// and is only evaluated when that many units are really available.
#if defined(__GNUC__) && !defined(__clang__)
	// gcc complains about parentheses with the xor/or form, so sum the mismatches instead
	const bool matched = long_enough && !static_cast<bool>((((current.ptr[Idx] != buffer[Idx])) + ... + size_t{0}));
#else
	const bool matched = long_enough && !static_cast<bool>((((current.ptr[Idx] ^ buffer[Idx])) | ... | char8_t{0}));
#endif

	return { Iterator{current.ptr + advance, current.end}, matched };
}
#endif
121141

122-
return {current, same};
142+
// Matches the literal character sequence String... against the input starting at `current`.
//
// Three strategies are selected at compile time:
//  * utf8_iterator input (with either a utf8_iterator or a utf8_iterator::sentinel end):
//    the pattern is encoded to UTF-8 code units and compared via evaluate_match_utf8_string.
//  * random-access iterators where begin and end have the same type: the remaining length is
//    checked once and all positions are compared through a single fold expression.
//  * anything else: characters are compared one by one with compare_character.
//
// Returns the (possibly advanced) iterator paired with the match flag. In the two fast paths
// the iterator is advanced by min(pattern length, elements remaining).
template <auto... String, size_t... Idx, typename Iterator, typename EndIterator> constexpr CTRE_FORCE_INLINE string_match_result<Iterator> evaluate_match_string(Iterator current, [[maybe_unused]] const EndIterator end, std::index_sequence<Idx...>) noexcept {
#if __cpp_char8_t >= 201811
	if constexpr (sizeof...(String) && std::is_same_v<::std::remove_const_t<Iterator>, utf8_iterator> && (std::is_same_v<std::remove_const_t<Iterator>, std::remove_const_t<EndIterator>> || std::is_same_v<::std::remove_const_t<EndIterator>, utf8_iterator::sentinel>)) {
		constexpr size_t str_length = (utf8_codepoint_length(String) + ... + 0ULL);
		// encode our String... into its utf8 representation
		// (value-initialized: an uninitialized local is not allowed in a constexpr function before C++20)
		char8_t utf8_sequence[str_length]{};
		char8_t* ptr = utf8_sequence;
		((ptr = utf32_codepoint_to_utf8_codepoint(String, ptr)), ...);
		// run the comparison
		return evaluate_match_utf8_string(current, end, utf8_sequence, std::make_index_sequence<str_length>());
	} else
#endif
	// the remaining branches are identical whether or not char8_t is available
	if constexpr (sizeof...(String) && is_random_accessible(typename std::iterator_traits<Iterator>::iterator_category{}) && std::is_same_v<std::remove_const_t<Iterator>, std::remove_const_t<EndIterator>>) {
		using char_type = ::std::remove_reference_t<::std::remove_cv_t<decltype(*current)>>;
		// number of elements left in the input
		size_t count = end - current;
		// make sure we only "bump" the iterator a safe distance
		size_t bump = ((count < sizeof...(String)) ? count : sizeof...(String));
		// the length check short-circuits, so current[Idx] is only read when enough input remains
#if defined(__GNUC__) && !defined(__clang__)
		// because gcc's pedantic about binary operators and parens: sum the mismatches
		return { current + bump, (count >= sizeof...(String)) && !static_cast<bool>((((current[Idx] != static_cast<char_type>(String))) + ... + size_t{0})) };
#else
		// or together the xor of every position; any non-zero bit means a mismatch
		return { current + bump, (count >= sizeof...(String)) && !static_cast<bool>((((current[Idx] ^ static_cast<char_type>(String))) | ... | char_type{0})) };
#endif
	} else {
		// generic fallback: compare_character is called per character, && stops at the first failure
		bool same = (compare_character(String, current, end) && ... && true);
		return { current, same };
	}
}
124186

125187
template <typename R, typename Iterator, typename EndIterator, auto... String, typename... Tail>

include/ctre/utf8.hpp

+38
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,44 @@
88
#include <iterator>
99

1010
namespace ctre {
11+
// Encodes one UTF-32 code point as UTF-8 code units written to `ptr`.
//
// code: the code point to encode.
// ptr:  destination; must have room for up to 4 code units.
// Returns a pointer one past the last unit written (1 to 4 units).
// Values of 0x200000 and above cannot be encoded in 4 units; a single 0xff unit
// (never a valid UTF-8 start byte) is written for them instead.
constexpr char8_t* utf32_codepoint_to_utf8_codepoint(uint32_t code, char8_t *ptr) {
	// Every store narrows uint32_t to char8_t on purpose; the casts make that explicit
	// so the header stays quiet under conversion warnings.
	if (code < 0x80) { // 0xxxxxxx
		ptr[0] = static_cast<char8_t>(code);
		return ptr + 1;
	} else if (code < 0x800) { // 00000yyy yyxxxxxx
		ptr[0] = static_cast<char8_t>(0b11000000 | (code >> 6)); // 110yyyyy
		ptr[1] = static_cast<char8_t>(0b10000000 | (code & 0x3f)); // 10xxxxxx
		return ptr + 2;
	} else if (code < 0x10000) { // zzzzyyyy yyxxxxxx
		ptr[0] = static_cast<char8_t>(0b11100000 | (code >> 12)); // 1110zzzz
		ptr[1] = static_cast<char8_t>(0b10000000 | ((code >> 6) & 0x3f)); // 10yyyyyy
		ptr[2] = static_cast<char8_t>(0b10000000 | (code & 0x3f)); // 10xxxxxx
		return ptr + 3;
	} else if (code < 0x200000) { // 000uuuuu zzzzyyyy yyxxxxxx
		ptr[0] = static_cast<char8_t>(0b11110000 | (code >> 18)); // 11110uuu
		ptr[1] = static_cast<char8_t>(0b10000000 | ((code >> 12) & 0x3f)); // 10uuzzzz
		ptr[2] = static_cast<char8_t>(0b10000000 | ((code >> 6) & 0x3f)); // 10yyyyyy
		ptr[3] = static_cast<char8_t>(0b10000000 | (code & 0x3f)); // 10xxxxxx
		return ptr + 4;
	} else {
		ptr[0] = static_cast<char8_t>(0xff); // invalid start byte
		return ptr + 1;
	}
}
35+
36+
// Number of UTF-8 code units used to represent `code`.
// The thresholds mirror the ones in utf32_codepoint_to_utf8_codepoint, so the result is
// exactly how many units that encoder writes, including the single unit it emits
// for values of 0x200000 and above.
constexpr uint32_t utf8_codepoint_length(uint32_t code) {
	if (code < 0x80) return 1;      // 0xxxxxxx
	if (code < 0x800) return 2;     // 00000yyy yyxxxxxx
	if (code < 0x10000) return 3;   // zzzzyyyy yyxxxxxx
	if (code < 0x200000) return 4;  // 000uuuuu zzzzyyyy yyxxxxxx
	return 1;                       // not encodable: one marker unit
}
1149

1250
struct utf8_iterator {
1351
using self_type = utf8_iterator;

0 commit comments

Comments
 (0)