Skip to content

Commit f9cdaa8

Browse files
committed
Optimizes string matching by allowing memcmp-like functionality (even on UTF-8 sequences)
reference: #147 comparison: https://compiler-explorer.com/z/Tz3KhG
1 parent 9a37e55 commit f9cdaa8

File tree

2 files changed

+86
-4
lines changed

2 files changed

+86
-4
lines changed

include/ctre/evaluation.hpp

+48-4
Original file line numberDiff line numberDiff line change
@@ -115,11 +115,55 @@ template <typename CharT, typename Iterator, typename EndIterator> constexpr CTR
115115
return false;
116116
}
117117

118-
template <auto... String, size_t... Idx, typename Iterator, typename EndIterator> constexpr CTRE_FORCE_INLINE string_match_result<Iterator> evaluate_match_string(Iterator current, [[maybe_unused]] const EndIterator end, std::index_sequence<Idx...>) noexcept {
119-
120-
bool same = (compare_character(String, current, end) && ... && true);
118+
#if __cpp_char8_t >= 201811
119+
// Compares the N pre-encoded UTF-8 code units in `buffer` against the raw code
// units found at `current`, memcmp style, without decoding code points.
//
// current - utf8_iterator positioned at the start of the candidate match
// end     - either another utf8_iterator or utf8_iterator::sentinel
// buffer  - the UTF-8 encoded pattern (read only)
// Idx...  - 0..N-1, used to unroll the comparison as a fold expression
//
// Returns the iterator advanced by min(remaining, N) code units together with
// a flag that is true only when at least N units remain and all N are equal.
template <size_t N, size_t... Idx, typename Iterator, typename EndIterator> constexpr CTRE_FORCE_INLINE string_match_result<Iterator> evaluate_match_utf8_string(Iterator current, [[maybe_unused]] const EndIterator end, const char8_t (&buffer)[N], std::index_sequence<Idx...>) noexcept {
	// abuse inside knowledge of how utf8_iterator works: its `ptr` and `end`
	// members are read directly instead of going through operator* / operator++
	// with a sentinel end the limit comes from the iterator's own `end` member,
	// otherwise from the `ptr` member of the end iterator
	auto last = current.end;
	if constexpr (!std::is_same_v<::std::remove_const_t<EndIterator>, utf8_iterator::sentinel>) {
		last = end.ptr;
	}
	const size_t count = static_cast<size_t>(last - current.ptr); // std::distance(current.ptr, last)
	// never move the iterator past the end of the input
	const size_t bump = (count < N) ? count : N;
	// using ^ operator vs != because gcc complains about parens
	// the && short-circuits, so no code unit is read unless N of them are available
	const bool same = (count >= N) && !static_cast<bool>(((current.ptr[Idx] ^ buffer[Idx])) | ... | char8_t{0});
	return { Iterator{current.ptr + bump, current.end}, same };
}
132+
#endif
121133

122-
return {current, same};
134+
// Matches the literal sequence String... against the input starting at `current`.
//
// Three strategies are selected at compile time:
//  1. utf8_iterator input (only when char8_t is available): the pattern is
//     encoded to UTF-8 once and compared code unit by code unit through
//     evaluate_match_utf8_string.
//  2. random access iterators where begin and end share a type: the remaining
//     length is checked once and all characters are compared in one unrolled
//     fold expression.
//  3. anything else: character by character through compare_character.
//
// Returns the resulting iterator position together with the match flag. In
// strategies 1 and 2 the position is advanced by min(remaining, pattern length).
template <auto... String, size_t... Idx, typename Iterator, typename EndIterator> constexpr CTRE_FORCE_INLINE string_match_result<Iterator> evaluate_match_string(Iterator current, [[maybe_unused]] const EndIterator end, std::index_sequence<Idx...>) noexcept {
#if __cpp_char8_t >= 201811
	if constexpr (sizeof...(String) && std::is_same_v<::std::remove_const_t<Iterator>, utf8_iterator> && (std::is_same_v<std::remove_const_t<Iterator>, std::remove_const_t<EndIterator>> || std::is_same_v<::std::remove_const_t<EndIterator>, utf8_iterator::sentinel>)) {
		constexpr size_t str_length = (utf8_codepoint_length(String) + ... + 0ULL);
		// encode our String... into its utf8 representation
		char8_t utf8_sequence[str_length];
		char8_t* ptr = utf8_sequence;
		((ptr = utf32_codepoint_to_utf8_codepoint(String, ptr)), ...);
		// run the comparison
		return evaluate_match_utf8_string(current, end, utf8_sequence, std::make_index_sequence<str_length>());
	} else
#endif
	// only the utf8 branch above needs char8_t; the remaining branches are
	// shared by both configurations
	if constexpr (sizeof...(String) && is_random_accessible(typename std::iterator_traits<Iterator>::iterator_category{}) && std::is_same_v<std::remove_const_t<Iterator>, std::remove_const_t<EndIterator>>) {
		// strip the reference first: cv qualifiers sitting behind a reference
		// are not removed by remove_cv_t
		using char_type = ::std::remove_cv_t<::std::remove_reference_t<decltype(*current)>>;
		// check the remaining characters
		const size_t count = static_cast<size_t>(end - current);
		// make sure we only "bump" the iterator a safe distance
		const size_t bump = (count < sizeof...(String)) ? count : sizeof...(String);
		// do math against how many characters we match, avoid as many branches as possible
		// the && short-circuits, so nothing is read unless enough input remains
		return { current + bump, (count >= sizeof...(String)) && !static_cast<bool>(((current[Idx] ^ static_cast<char_type>(String))) | ... | char_type{0}) };
	} else {
		// compare_character stops the && chain at the first mismatch
		bool same = (compare_character(String, current, end) && ... && true);
		return { current, same };
	}
}
124168

125169
template <typename R, typename Iterator, typename EndIterator, auto... String, typename... Tail>

include/ctre/utf8.hpp

+38
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,44 @@
88
#include <iterator>
99

1010
namespace ctre {
11+
// Encodes the UTF-32 value `code` as UTF-8 code units written to `ptr` and
// returns the position one past the last unit written.
//
//   code <  0x80      1 unit   0xxxxxxx
//   code <  0x800     2 units  110yyyyy 10xxxxxx
//   code <  0x10000   3 units  1110zzzz 10yyyyyy 10xxxxxx
//   code <  0x200000  4 units  11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
//   otherwise         1 unit   0xff (invalid start byte)
//
// No further validation is done: every value below 0x200000 is encoded by
// range alone. The destination must have room for the units written;
// utf8_codepoint_length(code) reports the same count.
//
// CharT is deduced from the destination pointer, so callers passing char8_t*
// get exactly the previous signature; any other one-byte character type works too.
template <typename CharT> constexpr CharT* utf32_codepoint_to_utf8_codepoint(uint32_t code, CharT* ptr) noexcept {
	static_assert(sizeof(CharT) == 1, "UTF-8 code units must be written to a one byte character type");
	if (code < 0x80) {
		ptr[0] = static_cast<CharT>(code);
		return ptr + 1;
	} else if (code < 0x800) { // 00000yyy yyxxxxxx
		ptr[0] = static_cast<CharT>(0b11000000 | (code >> 6));           // 110yyyyy
		ptr[1] = static_cast<CharT>(0b10000000 | (code & 0x3f));         // 10xxxxxx
		return ptr + 2;
	} else if (code < 0x10000) { // zzzzyyyy yyxxxxxx
		ptr[0] = static_cast<CharT>(0b11100000 | (code >> 12));          // 1110zzzz
		ptr[1] = static_cast<CharT>(0b10000000 | ((code >> 6) & 0x3f));  // 10yyyyyy
		ptr[2] = static_cast<CharT>(0b10000000 | (code & 0x3f));         // 10xxxxxx
		return ptr + 3;
	} else if (code < 0x200000) { // 000uuuuu zzzzyyyy yyxxxxxx
		ptr[0] = static_cast<CharT>(0b11110000 | (code >> 18));          // 11110uuu
		ptr[1] = static_cast<CharT>(0b10000000 | ((code >> 12) & 0x3f)); // 10uuzzzz
		ptr[2] = static_cast<CharT>(0b10000000 | ((code >> 6) & 0x3f));  // 10yyyyyy
		ptr[3] = static_cast<CharT>(0b10000000 | (code & 0x3f));         // 10xxxxxx
		return ptr + 4;
	} else {
		ptr[0] = static_cast<CharT>(0xff); // invalid start byte
		return ptr + 1;
	}
}
35+
36+
// Reports how many UTF-8 code units the value `code` occupies once encoded.
// Values of 0x200000 and above have no encoding here and count as a single
// unit, matching the one invalid byte utf32_codepoint_to_utf8_codepoint emits.
constexpr uint32_t utf8_codepoint_length(uint32_t code) {
	// walk the thresholds from the largest range down to the smallest
	if (code >= 0x200000) {
		return 1; // not encodable
	}
	if (code >= 0x10000) { // 000uuuuu zzzzyyyy yyxxxxxx
		return 4;
	}
	if (code >= 0x800) { // zzzzyyyy yyxxxxxx
		return 3;
	}
	if (code >= 0x80) { // 00000yyy yyxxxxxx
		return 2;
	}
	return 1; // fits a single code unit
}
1149

1250
struct utf8_iterator {
1351
using self_type = utf8_iterator;

0 commit comments

Comments
 (0)