@@ -58,6 +58,84 @@ TEST(FastWordpieceTokenizerTest, LoadAndTokenize) {
58
58
EXPECT_THAT (output_end_offsets, ElementsAre (3 , 5 , 6 , 9 ));
59
59
}
60
60
61
+ TEST (FastWordpieceTokenizerTest, PunctuationVersionMismatch) {
62
+ // The config_flatbuffer used here is built from the following config:
63
+ // * vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f",
64
+ // "##ghz", "<unk>"}
65
+ // * unk_token = "<unk>"
66
+ // * suffix_indicator = "##"
67
+ // * max_bytes_per_token = 100
68
+ // * end_to_end = True
69
+
70
+ // Built with unicode ver 15.1
71
+ const char kTestConfigUnicode15_1Path [] =
72
+ " third_party/tensorflow_text/python/ops/test_data/"
73
+ " fast_wordpiece_tokenizer_model_ver_15_1.fb" ;
74
+
75
+ // Built with unicode ver 16.0
76
+ const char kTestConfigUnicode16_0Path [] =
77
+ " third_party/tensorflow_text/python/ops/test_data/"
78
+ " fast_wordpiece_tokenizer_model_ver_16_0.fb" ;
79
+
80
+ // We test the new punctuation symbol: \341\255\277, which was available in
81
+ // Unicode 16: https://www.fileformat.info/info/unicode/char//1b7f/index.htm,
82
+ // but not in 15.1.
83
+ // We also test an existing punctuation symbol ">".
84
+ std::string input = " abc>abc\341\255\277 abc" ;
85
+
86
+ // Read 15.1 config.
87
+ {
88
+ std::string config_flatbuffer;
89
+ auto status = tensorflow::ReadFileToString (tensorflow::Env::Default (),
90
+ kTestConfigUnicode15_1Path ,
91
+ &config_flatbuffer);
92
+ ASSERT_TRUE (status.ok ());
93
+
94
+ ASSERT_OK_AND_ASSIGN (auto tokenizer, FastWordpieceTokenizer::Create (
95
+ config_flatbuffer.data ()));
96
+
97
+ std::vector<std::string> output_tokens;
98
+ std::vector<int > output_ids;
99
+ std::vector<int > output_start_offsets;
100
+ std::vector<int > output_end_offsets;
101
+ tokenizer.Tokenize (input, &output_tokens, &output_ids,
102
+ &output_start_offsets, &output_end_offsets);
103
+
104
+ // For 15.1, the flatbuffer does not have \341\255\277 as a punctuation.
105
+ EXPECT_THAT (output_tokens, ElementsAre (" abc" , " <unk>" , " abc" , " abc" ));
106
+ EXPECT_THAT (output_ids, ElementsAre (1 , 8 , 1 , 1 ));
107
+ // Note that the new-version punctuation symbol is ignored.
108
+ EXPECT_THAT (output_start_offsets, ElementsAre (0 , 3 , 4 , 10 ));
109
+ EXPECT_THAT (output_end_offsets, ElementsAre (3 , 4 , 7 , 13 ));
110
+ }
111
+
112
+ // Read 16.0 config.
113
+ {
114
+ std::string config_flatbuffer;
115
+ auto status = tensorflow::ReadFileToString (tensorflow::Env::Default (),
116
+ kTestConfigUnicode16_0Path ,
117
+ &config_flatbuffer);
118
+ ASSERT_TRUE (status.ok ());
119
+
120
+ ASSERT_OK_AND_ASSIGN (auto tokenizer, FastWordpieceTokenizer::Create (
121
+ config_flatbuffer.data ()));
122
+
123
+ std::vector<std::string> output_tokens;
124
+ std::vector<int > output_ids;
125
+ std::vector<int > output_start_offsets;
126
+ std::vector<int > output_end_offsets;
127
+ tokenizer.Tokenize (input, &output_tokens, &output_ids,
128
+ &output_start_offsets, &output_end_offsets);
129
+
130
+ // For 16.0, \341\255\277 is treated as a punctuation.
131
+ EXPECT_THAT (output_tokens,
132
+ ElementsAre (" abc" , " <unk>" , " abc" , " <unk>" , " abc" ));
133
+ EXPECT_THAT (output_ids, ElementsAre (1 , 8 , 1 , 8 , 1 ));
134
+ EXPECT_THAT (output_start_offsets, ElementsAre (0 , 3 , 4 , 7 , 10 ));
135
+ EXPECT_THAT (output_end_offsets, ElementsAre (3 , 4 , 7 , 10 , 13 ));
136
+ }
137
+ }
138
+
61
139
template <typename T>
62
140
std::string ListToString (const std::vector<T>& list) {
63
141
return absl::StrCat (" [" , absl::StrJoin (list, " , " ), " ]" );
0 commit comments