diff --git a/examples/common.cpp b/examples/common.cpp index 11035736b..c06857799 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -324,22 +324,38 @@ std::vector gpt_tokenize(const gpt_vocab & vocab, const std::stri // find the longest token that forms each word in words: std::vector tokens; + // unknown token + std::vector unknown; + unknown.clear(); for (const auto & word : words) { for (int i = 0; i < (int) word.size(); ){ for (int j = word.size() - 1; j >= i; j--){ auto cand = word.substr(i, j-i+1); auto it = vocab.token_to_id.find(cand); if (it != vocab.token_to_id.end()){ // word.substr(i, j-i+1) in vocab + if (!unknown.empty()){ + unknown.push_back(0); // terminator + std::string unkstr(unknown.begin(), unknown.end()); + fprintf(stderr, "%s: unknown token '%s'\n", __func__, unkstr.data()); + unknown.clear(); + } tokens.push_back(it->second); i = j + 1; break; } else if (j == i){ // word.substr(i, 1) has no matching - fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data()); + auto unk = word.substr(i, 1).data(); + unknown.push_back(*unk); i++; } } } + if (!unknown.empty()){ + unknown.push_back(0); // terminator + std::string unkstr(unknown.begin(), unknown.end()); + fprintf(stderr, "%s: unknown token '%s'\n", __func__, unkstr.data()); + unknown.clear(); + } } return tokens;