Commit 68e1ae97 authored by Jia, committed by Commit Bot

[cros-search-tokenization] Added another tokenization method for latin languages.

The new tokenization method extracts only words delimited by whitespace and
keeps any punctuation that appears within a word.

Bug: 1085976
Change-Id: I61ef69db6653ecb91bb3945bad7f470bfbf00902
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2212293
Commit-Queue: Jia Meng <jiameng@chromium.org>
Reviewed-by: Thanh Nguyen <thanhdng@chromium.org>
Reviewed-by: Xiyuan Xia <xiyuan@chromium.org>
Cr-Commit-Position: refs/heads/master@{#772628}
parent 24d02220
......@@ -9,12 +9,24 @@
#include "base/i18n/break_iterator.h"
#include "base/i18n/case_conversion.h"
#include "base/notreached.h"
#include "base/strings/strcat.h"
#include "base/strings/string_util.h"
#include "chrome/common/string_matching/term_break_iterator.h"
using base::i18n::BreakIterator;
TokenizedString::TokenizedString(const base::string16& text) : text_(text) {
Tokenize();
// Stores a copy of |text| and immediately tokenizes it according to |mode|:
//   - Mode::kCamelCase runs Tokenize().
//   - Mode::kWords runs TokenizeWords().
TokenizedString::TokenizedString(const base::string16& text, Mode mode)
    : text_(text) {
  // Deliberately no |default| label: every enumerator is handled explicitly,
  // so the compiler's unhandled-enumerator switch warning (e.g. -Wswitch) can
  // flag any Mode value added later instead of it silently producing no
  // tokens.
  switch (mode) {
    case Mode::kCamelCase:
      Tokenize();
      break;
    case Mode::kWords:
      TokenizeWords();
      break;
  }
}
TokenizedString::~TokenizedString() = default;
......@@ -41,3 +53,49 @@ void TokenizedString::Tokenize() {
}
}
}
void TokenizedString::TokenizeWords() {
BreakIterator break_iter(text_, BreakIterator::BREAK_WORD);
if (!break_iter.Init()) {
NOTREACHED() << "BreakIterator init failed"
<< ", text=\"" << text_ << "\"";
return;
}
// The token to be generated will be in [start, end) of |text_|.
size_t start = 0;
size_t end = 0;
while (break_iter.Advance()) {
if (break_iter.IsWord()) {
// Update |end| but do not generate a token yet because the next segment
// after Advance may be a non-whitespace char. We may include the next
// char in the token.
end = break_iter.pos();
continue;
}
// If this is not a word, it may be a sequence of whitespace chars or
// another punctuation.
// 1. Whitespace chars only: generate a token from |text_| in the range of
// [start, end). Also reset |start| and |end| for next token.
// 2. A punctuation: do nothing and Advance.
const base::string16 word(break_iter.GetString());
const bool only_whitechars =
base::ContainsOnlyChars(word, base::kWhitespaceUTF16);
if (only_whitechars) {
if (end - start > 1) {
tokens_.emplace_back(
base::i18n::ToLower(text_.substr(start, end - start)));
mappings_.emplace_back(start, end);
}
start = break_iter.pos();
end = start;
}
}
// Generate the last token.
if (end - start > 1) {
tokens_.emplace_back(base::i18n::ToLower(text_.substr(start, end - start)));
mappings_.emplace_back(start, end);
}
}
......@@ -11,15 +11,21 @@
#include "base/strings/string16.h"
#include "ui/gfx/range/range.h"
// TokenizedString takes a string and breaks it down into token words. It
// first breaks using BreakIterator to get all the words. Then it breaks
// the words again at camel case boundaries and alpha/number boundaries.
// TokenizedString takes a string and breaks it down into token words.
class TokenizedString {
public:
// Selects how the constructor splits the input text into tokens.
enum class Mode {
// Break words into tokens at camel case and alpha/num boundaries, e.g.
// "ScratchPad" becomes "scratch" and "pad". This is the constructor's
// default mode.
kCamelCase,
// Break text into tokens at white space only. Camel case is not split and
// punctuation inside a word is kept, e.g. "AutoCAD WS" becomes "autocad" and
// "ws", and "wi-fi" stays a single token.
kWords,
};
// The extracted token strings, one entry per token.
using Tokens = std::vector<base::string16>;
// For each token, the range it occupies in the input text.
using Mappings = std::vector<gfx::Range>;
explicit TokenizedString(const base::string16& text);
explicit TokenizedString(const base::string16& text,
Mode mode = Mode::kCamelCase);
~TokenizedString();
const base::string16& text() const { return text_; }
......@@ -28,6 +34,7 @@ class TokenizedString {
private:
void Tokenize();
void TokenizeWords();
// Input text.
const base::string16 text_;
......
......@@ -29,6 +29,8 @@ TEST(TokenizedStringTest, Empty) {
base::string16 empty;
TokenizedString tokens(empty);
EXPECT_EQ(base::string16(), GetContent(tokens));
TokenizedString token_words(empty, TokenizedString::Mode::kWords);
EXPECT_EQ(base::string16(), GetContent(token_words));
}
TEST(TokenizedStringTest, Basic) {
......@@ -36,45 +38,85 @@ TEST(TokenizedStringTest, Basic) {
base::string16 text(base::UTF8ToUTF16("ScratchPad"));
TokenizedString tokens(text);
EXPECT_EQ(base::UTF8ToUTF16("scratch{0,7} pad{7,10}"), GetContent(tokens));
TokenizedString token_words(text, TokenizedString::Mode::kWords);
EXPECT_EQ(base::UTF8ToUTF16("scratchpad{0,10}"), GetContent(token_words));
}
{
base::string16 text(base::UTF8ToUTF16("Chess2.0"));
TokenizedString tokens(text);
EXPECT_EQ(base::UTF8ToUTF16("chess{0,5} 2.0{5,8}"), GetContent(tokens));
TokenizedString token_words(text, TokenizedString::Mode::kWords);
EXPECT_EQ(base::UTF8ToUTF16("chess2.0{0,8}"), GetContent(token_words));
}
{
base::string16 text(base::UTF8ToUTF16("Cut the rope"));
TokenizedString tokens(text);
EXPECT_EQ(base::UTF8ToUTF16("cut{0,3} the{4,7} rope{8,12}"),
GetContent(tokens));
TokenizedString token_words(text, TokenizedString::Mode::kWords);
EXPECT_EQ(base::UTF8ToUTF16("cut{0,3} the{4,7} rope{8,12}"),
GetContent(token_words));
}
{
base::string16 text(base::UTF8ToUTF16("AutoCAD WS"));
TokenizedString tokens(text);
EXPECT_EQ(base::UTF8ToUTF16("auto{0,4} cad{4,7} ws{8,10}"),
GetContent(tokens));
TokenizedString token_words(text, TokenizedString::Mode::kWords);
EXPECT_EQ(base::UTF8ToUTF16("autocad{0,7} ws{8,10}"),
GetContent(token_words));
}
{
base::string16 text(base::UTF8ToUTF16("Great TweetDeck"));
TokenizedString tokens(text);
EXPECT_EQ(base::UTF8ToUTF16("great{0,5} tweet{6,11} deck{11,15}"),
GetContent(tokens));
TokenizedString token_words(text, TokenizedString::Mode::kWords);
EXPECT_EQ(base::UTF8ToUTF16("great{0,5} tweetdeck{6,15}"),
GetContent(token_words));
}
{
base::string16 text(base::UTF8ToUTF16("Draw-It!"));
TokenizedString tokens(text);
EXPECT_EQ(base::UTF8ToUTF16("draw{0,4} it{5,7}"), GetContent(tokens));
TokenizedString token_words(text, TokenizedString::Mode::kWords);
EXPECT_EQ(base::UTF8ToUTF16("draw-it{0,7}"), GetContent(token_words));
}
{
base::string16 text(base::UTF8ToUTF16("Faxing & Signing"));
TokenizedString tokens(text);
EXPECT_EQ(base::UTF8ToUTF16("faxing{0,6} signing{9,16}"),
GetContent(tokens));
TokenizedString token_words(text, TokenizedString::Mode::kWords);
EXPECT_EQ(base::UTF8ToUTF16("faxing{0,6} signing{9,16}"),
GetContent(token_words));
}
{
base::string16 text(base::UTF8ToUTF16("!@#$%^&*()<<<**>>>"));
TokenizedString tokens(text);
EXPECT_EQ(base::UTF8ToUTF16(""), GetContent(tokens));
TokenizedString token_words(text, TokenizedString::Mode::kWords);
EXPECT_EQ(base::UTF8ToUTF16(""), GetContent(token_words));
}
}
// Covers Mode::kWords on inputs that mix words, punctuation and whitespace.
// Expected strings use the "token{start,end}" form, where the numbers are
// offsets into the input text.
TEST(TokenizedStringTest, TokenizeWords) {
  {
    // Punctuation inside a word ('-', '@', '.') stays in the token, while the
    // stand-alone leading "?!" and the trailing "?!" are in no token.
    base::string16 text(base::UTF8ToUTF16("?! wi-fi abc@gmail.com?!"));
    TokenizedString token_words(text, TokenizedString::Mode::kWords);
    EXPECT_EQ(base::UTF8ToUTF16("wi-fi{3,8} abc@gmail.com{9,22}"),
              GetContent(token_words));
  }
  {
    // Offsets: "Hello?!" occupies [0,7); it is followed by space, tab, space,
    // backspace and three spaces (indices 7-13), so "World" occupies [14,19).
    // The three spaces after "\b" are required for the offsets asserted below.
    base::string16 text(base::UTF8ToUTF16("Hello?! \t \b   World! "));
    TokenizedString token_words(text, TokenizedString::Mode::kWords);
    EXPECT_EQ(base::UTF8ToUTF16("hello{0,5} world{14,19}"),
              GetContent(token_words));
  }
  {
    // Input without any word yields no tokens at all.
    base::string16 text(base::UTF8ToUTF16(" ?|! *&"));
    TokenizedString token_words(text, TokenizedString::Mode::kWords);
    EXPECT_EQ(base::UTF8ToUTF16(""), GetContent(token_words));
  }
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment