
Commit ef6e9f5

Handle the punctuation definition mismatch between different Unicode versions.

PiperOrigin-RevId: 707239296
Parent: 31f22e9
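
Some context on the failure mode: whether a character counts as punctuation depends on which Unicode version's property tables are consulted. A FastWordpieceTokenizer model flatbuffer bakes in the punctuation set of the Unicode version it was built with, while the runtime uses its own tables. The sketch below shows that version dependence in isolation; it assumes ICU is installed (u_ispunct comes from ICU's unicode/uchar.h) and is an illustration, not code from this commit.

#include <cstdio>

#include <unicode/uchar.h>  // ICU character classification (u_ispunct)

int main() {
  // U+1B7F is punctuation starting with Unicode 16.0. What this prints
  // depends on the Unicode data of the linked ICU: ICU 76 (Unicode 16.0)
  // reports 1, while ICU 74/75 (Unicode 15.1) report 0.
  std::printf("u_ispunct(U+1B7F) = %d\n",
              static_cast<int>(u_ispunct(0x1B7F)));
  return 0;
}

When the model's and the runtime's Unicode versions disagree, the runtime can detect a word boundary for which the model has no token mapping; the diffs below make that case safe.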

File tree: 5 files changed (+86, -4 lines)

- tensorflow_text/core/kernels/BUILD
- tensorflow_text/core/kernels/fast_wordpiece_tokenizer.cc
- tensorflow_text/core/kernels/fast_wordpiece_tokenizer_test.cc
- tensorflow_text/python/ops/test_data/fast_wordpiece_tokenizer_model_ver_15_1.fb
- tensorflow_text/python/ops/test_data/fast_wordpiece_tokenizer_model_ver_16_0.fb


tensorflow_text/core/kernels/BUILD

Lines changed: 2 additions & 0 deletions

@@ -403,6 +403,8 @@ cc_test(
     srcs = ["fast_wordpiece_tokenizer_test.cc"],
     data = [
         "//tensorflow_text:python/ops/test_data/fast_wordpiece_tokenizer_model.fb",
+        "//tensorflow_text:python/ops/test_data/fast_wordpiece_tokenizer_model_ver_15_1.fb",
+        "//tensorflow_text:python/ops/test_data/fast_wordpiece_tokenizer_model_ver_16_0.fb",
     ],
     deps = [
         ":fast_wordpiece_tokenizer",

tensorflow_text/core/kernels/fast_wordpiece_tokenizer.cc

Lines changed: 6 additions & 4 deletions

@@ -278,14 +278,16 @@ void FastWordpieceTokenizer::TokenizeTextImpl(
                  prev_unicode_char))) {
       // If the current Unicode character is a valid word boundary, collect the
       // remaining tokens stored on a path on the trie.
+      absl::string_view cur_str = absl::string_view(
+          input_substr.data(), cur_pos - input_word_offset_in_text);
       HandleTheRemainingStringOnTriePath<kGetPieces, kGetIds, kGetOffsets>(
-          absl::string_view(input_substr.data(),
-                            cur_pos - input_word_offset_in_text),
-          input_word_offset_in_text, cur_node, original_num_tokens,
+          cur_str, input_word_offset_in_text, cur_node, original_num_tokens,
           cur_offset_in_input_word, output_pieces, output_ids,
           output_start_offsets, output_end_offsets);
       // Skip the whitespace.
-      if (is_white_space) cur_pos = next_pos;
+      // If the remaining tokens are empty, it means we encountered an
+      // unmappable separator, so skip to the next token.
+      if (is_white_space || cur_str.empty()) cur_pos = next_pos;
       // Continue in the outer while loop to process the remaining input.
       continue;
     }
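
The shape of the fix: the runtime's Unicode tables decide what counts as a word boundary, but only pieces present in the model's trie and vocab can be emitted. For a separator that only the runtime recognizes (such as new-version punctuation), cur_str comes back empty, and before this change only whitespace advanced cur_pos. The toy loop below is a sketch of why that matters; all names are hypothetical, single ASCII characters stand in for Unicode codepoints, and '~' plays the role of punctuation the runtime knows but the model does not.

#include <iostream>
#include <set>
#include <string>
#include <vector>

// Toy model of the loop (hypothetical names, not the real tokenizer).
// runtime_punct plays the runtime's Unicode tables; vocab plays the pieces
// baked into the model flatbuffer.
std::vector<std::string> Tokenize(const std::string& input, bool with_fix) {
  const std::set<char> runtime_punct = {'>', '~'};
  const std::set<std::string> vocab = {"abc", ">"};
  std::vector<std::string> pieces;
  size_t cur_pos = 0;
  size_t word_start = 0;
  int guard = 0;  // iteration cap so the buggy variant still terminates
  while (cur_pos < input.size()) {
    if (++guard > 100) {
      pieces.push_back("<stuck>");
      break;
    }
    const char ch = input[cur_pos];
    const bool is_white_space = (ch == ' ');
    if (is_white_space || runtime_punct.count(ch) > 0) {
      // Word boundary: flush whatever word has accumulated (empty when we
      // are standing on the separator itself).
      const std::string cur_str =
          input.substr(word_start, cur_pos - word_start);
      if (!cur_str.empty()) {
        if (vocab.count(cur_str) > 0) pieces.push_back(cur_str);
        word_start = cur_pos;  // rescan so the separator is handled next
        continue;
      }
      const size_t next_pos = cur_pos + 1;
      const std::string sep(1, ch);
      if (!is_white_space && vocab.count(sep) > 0) {
        // The separator itself is a known piece (e.g. '>').
        pieces.push_back(sep);
        cur_pos = next_pos;
        word_start = cur_pos;
        continue;
      }
      // The commit's fix in miniature: whitespace was always skipped, but
      // an unmappable separator (empty result, not whitespace) must be
      // skipped too, or the position never advances.
      if (is_white_space || with_fix) {
        cur_pos = next_pos;
        word_start = cur_pos;
      }
      continue;
    }
    ++cur_pos;
  }
  const std::string tail = input.substr(word_start);
  if (!tail.empty() && vocab.count(tail) > 0) pieces.push_back(tail);
  return pieces;
}

int main() {
  const bool options[] = {false, true};
  for (const bool with_fix : options) {
    std::cout << (with_fix ? "with fix:    " : "without fix: ");
    for (const std::string& piece : Tokenize("abc>abc~abc", with_fix)) {
      std::cout << "[" << piece << "] ";
    }
    std::cout << "\n";
  }
  return 0;
}

Without the fix the toy prints [abc] [>] [abc] [<stuck>]; with it, [abc] [>] [abc] [abc], matching the shape of the 15.1 expectations in the new test below, where U+1B7F yields no token and is skipped.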

tensorflow_text/core/kernels/fast_wordpiece_tokenizer_test.cc

Lines changed: 78 additions & 0 deletions

@@ -58,6 +58,84 @@ TEST(FastWordpieceTokenizerTest, LoadAndTokenize) {
   EXPECT_THAT(output_end_offsets, ElementsAre(3, 5, 6, 9));
 }
 
+TEST(FastWordpieceTokenizerTest, PunctuationVersionMismatch) {
+  // The config_flatbuffer used here is built from the following config:
+  //  * vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f",
+  //             "##ghz", "<unk>"}
+  //  * unk_token = "<unk>"
+  //  * suffix_indicator = "##"
+  //  * max_bytes_per_token = 100
+  //  * end_to_end = True
+
+  // Built with Unicode version 15.1.
+  const char kTestConfigUnicode15_1Path[] =
+      "third_party/tensorflow_text/python/ops/test_data/"
+      "fast_wordpiece_tokenizer_model_ver_15_1.fb";
+
+  // Built with Unicode version 16.0.
+  const char kTestConfigUnicode16_0Path[] =
+      "third_party/tensorflow_text/python/ops/test_data/"
+      "fast_wordpiece_tokenizer_model_ver_16_0.fb";
+
+  // We test the new punctuation symbol \341\255\277 (U+1B7F), which is
+  // available in Unicode 16.0
+  // (https://www.fileformat.info/info/unicode/char/1b7f/index.htm) but not
+  // in 15.1. We also test an existing punctuation symbol ">".
+  std::string input = "abc>abc\341\255\277abc";
+
+  // Read the 15.1 config.
+  {
+    std::string config_flatbuffer;
+    auto status = tensorflow::ReadFileToString(tensorflow::Env::Default(),
+                                               kTestConfigUnicode15_1Path,
+                                               &config_flatbuffer);
+    ASSERT_TRUE(status.ok());
+
+    ASSERT_OK_AND_ASSIGN(auto tokenizer, FastWordpieceTokenizer::Create(
+                                             config_flatbuffer.data()));
+
+    std::vector<std::string> output_tokens;
+    std::vector<int> output_ids;
+    std::vector<int> output_start_offsets;
+    std::vector<int> output_end_offsets;
+    tokenizer.Tokenize(input, &output_tokens, &output_ids,
+                       &output_start_offsets, &output_end_offsets);
+
+    // For 15.1, the flatbuffer does not have \341\255\277 as a punctuation.
+    EXPECT_THAT(output_tokens, ElementsAre("abc", "<unk>", "abc", "abc"));
+    EXPECT_THAT(output_ids, ElementsAre(1, 8, 1, 1));
+    // Note that the new-version punctuation symbol is ignored.
+    EXPECT_THAT(output_start_offsets, ElementsAre(0, 3, 4, 10));
+    EXPECT_THAT(output_end_offsets, ElementsAre(3, 4, 7, 13));
+  }
+
+  // Read the 16.0 config.
+  {
+    std::string config_flatbuffer;
+    auto status = tensorflow::ReadFileToString(tensorflow::Env::Default(),
+                                               kTestConfigUnicode16_0Path,
+                                               &config_flatbuffer);
+    ASSERT_TRUE(status.ok());
+
+    ASSERT_OK_AND_ASSIGN(auto tokenizer, FastWordpieceTokenizer::Create(
+                                             config_flatbuffer.data()));
+
+    std::vector<std::string> output_tokens;
+    std::vector<int> output_ids;
+    std::vector<int> output_start_offsets;
+    std::vector<int> output_end_offsets;
+    tokenizer.Tokenize(input, &output_tokens, &output_ids,
+                       &output_start_offsets, &output_end_offsets);
+
+    // For 16.0, \341\255\277 is treated as a punctuation.
+    EXPECT_THAT(output_tokens,
+                ElementsAre("abc", "<unk>", "abc", "<unk>", "abc"));
+    EXPECT_THAT(output_ids, ElementsAre(1, 8, 1, 8, 1));
+    EXPECT_THAT(output_start_offsets, ElementsAre(0, 3, 4, 7, 10));
+    EXPECT_THAT(output_end_offsets, ElementsAre(3, 4, 7, 10, 13));
+  }
+}
+
 template <typename T>
 std::string ListToString(const std::vector<T>& list) {
   return absl::StrCat("[", absl::StrJoin(list, ", "), "]");
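
A note on the escape used for the input string: \341\255\277 is the octal spelling of bytes 0xE1 0xAD 0xBF, the three-byte UTF-8 encoding of U+1B7F, which is why the expected byte offsets around it jump by three (e.g. from 7 to 10). A standalone decode check, purely illustrative:

#include <cstdint>
#include <cstdio>
#include <string>

int main() {
  // "\341\255\277" is bytes 0xE1 0xAD 0xBF: a three-byte UTF-8 sequence
  // (1110xxxx 10xxxxxx 10xxxxxx).
  const std::string s = "\341\255\277";
  const uint32_t cp = ((static_cast<uint8_t>(s[0]) & 0x0Fu) << 12) |
                      ((static_cast<uint8_t>(s[1]) & 0x3Fu) << 6) |
                      (static_cast<uint8_t>(s[2]) & 0x3Fu);
  std::printf("U+%04X\n", static_cast<unsigned>(cp));  // prints U+1B7F
  return 0;
}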
tensorflow_text/python/ops/test_data/fast_wordpiece_tokenizer_model_ver_15_1.fb

346 KB (binary file not shown)

tensorflow_text/python/ops/test_data/fast_wordpiece_tokenizer_model_ver_16_0.fb

346 KB (binary file not shown)
