Commit 68e1ae97 authored by Jia, committed by Commit Bot

[cros-search-tokenization] Added another tokenization method for Latin languages.

The new tokenization method extracts only words delimited by whitespace and
keeps punctuation that appears within a word.

Bug: 1085976
Change-Id: I61ef69db6653ecb91bb3945bad7f470bfbf00902
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2212293
Commit-Queue: Jia Meng <jiameng@chromium.org>
Reviewed-by: Thanh Nguyen <thanhdng@chromium.org>
Reviewed-by: Xiyuan Xia <xiyuan@chromium.org>
Cr-Commit-Position: refs/heads/master@{#772628}
parent 24d02220
...@@ -9,12 +9,24 @@ ...@@ -9,12 +9,24 @@
#include "base/i18n/break_iterator.h" #include "base/i18n/break_iterator.h"
#include "base/i18n/case_conversion.h" #include "base/i18n/case_conversion.h"
#include "base/notreached.h" #include "base/notreached.h"
#include "base/strings/strcat.h"
#include "base/strings/string_util.h"
#include "chrome/common/string_matching/term_break_iterator.h" #include "chrome/common/string_matching/term_break_iterator.h"
using base::i18n::BreakIterator; using base::i18n::BreakIterator;
TokenizedString::TokenizedString(const base::string16& text) : text_(text) { TokenizedString::TokenizedString(const base::string16& text, Mode mode)
Tokenize(); : text_(text) {
switch (mode) {
case Mode::kCamelCase:
Tokenize();
break;
case Mode::kWords:
TokenizeWords();
break;
default:
break;
}
} }
TokenizedString::~TokenizedString() = default; TokenizedString::~TokenizedString() = default;
...@@ -41,3 +53,49 @@ void TokenizedString::Tokenize() { ...@@ -41,3 +53,49 @@ void TokenizedString::Tokenize() {
} }
} }
} }
void TokenizedString::TokenizeWords() {
BreakIterator break_iter(text_, BreakIterator::BREAK_WORD);
if (!break_iter.Init()) {
NOTREACHED() << "BreakIterator init failed"
<< ", text=\"" << text_ << "\"";
return;
}
// The token to be generated will be in [start, end) of |text_|.
size_t start = 0;
size_t end = 0;
while (break_iter.Advance()) {
if (break_iter.IsWord()) {
// Update |end| but do not generate a token yet because the next segment
// after Advance may be a non-whitespace char. We may include the next
// char in the token.
end = break_iter.pos();
continue;
}
// If this is not a word, it may be a sequence of whitespace chars or
// another punctuation.
// 1. Whitespace chars only: generate a token from |text_| in the range of
// [start, end). Also reset |start| and |end| for next token.
// 2. A punctuation: do nothing and Advance.
const base::string16 word(break_iter.GetString());
const bool only_whitechars =
base::ContainsOnlyChars(word, base::kWhitespaceUTF16);
if (only_whitechars) {
if (end - start > 1) {
tokens_.emplace_back(
base::i18n::ToLower(text_.substr(start, end - start)));
mappings_.emplace_back(start, end);
}
start = break_iter.pos();
end = start;
}
}
// Generate the last token.
if (end - start > 1) {
tokens_.emplace_back(base::i18n::ToLower(text_.substr(start, end - start)));
mappings_.emplace_back(start, end);
}
}
...@@ -11,15 +11,21 @@ ...@@ -11,15 +11,21 @@
#include "base/strings/string16.h" #include "base/strings/string16.h"
#include "ui/gfx/range/range.h" #include "ui/gfx/range/range.h"
// TokenizedString takes a string and breaks it down into token words. It // TokenizedString takes a string and breaks it down into token words.
// first breaks using BreakIterator to get all the words. Then it breaks
// the words again at camel case boundaries and alpha/number boundaries.
class TokenizedString { class TokenizedString {
public: public:
// Selects which tokenization routine the constructor runs on the input text.
enum class Mode {
// Break words into tokens at camel case and alpha/num boundaries. This is
// the constructor's default mode and is handled by Tokenize().
kCamelCase,
// Break text into tokens at white space only. Handled by TokenizeWords().
kWords,
};
typedef std::vector<base::string16> Tokens; typedef std::vector<base::string16> Tokens;
typedef std::vector<gfx::Range> Mappings; typedef std::vector<gfx::Range> Mappings;
explicit TokenizedString(const base::string16& text); explicit TokenizedString(const base::string16& text,
Mode mode = Mode::kCamelCase);
~TokenizedString(); ~TokenizedString();
const base::string16& text() const { return text_; } const base::string16& text() const { return text_; }
...@@ -28,6 +34,7 @@ class TokenizedString { ...@@ -28,6 +34,7 @@ class TokenizedString {
private: private:
void Tokenize(); void Tokenize();
void TokenizeWords();
// Input text. // Input text.
const base::string16 text_; const base::string16 text_;
......
...@@ -29,6 +29,8 @@ TEST(TokenizedStringTest, Empty) { ...@@ -29,6 +29,8 @@ TEST(TokenizedStringTest, Empty) {
base::string16 empty; base::string16 empty;
TokenizedString tokens(empty); TokenizedString tokens(empty);
EXPECT_EQ(base::string16(), GetContent(tokens)); EXPECT_EQ(base::string16(), GetContent(tokens));
TokenizedString token_words(empty, TokenizedString::Mode::kWords);
EXPECT_EQ(base::string16(), GetContent(token_words));
} }
TEST(TokenizedStringTest, Basic) { TEST(TokenizedStringTest, Basic) {
...@@ -36,45 +38,85 @@ TEST(TokenizedStringTest, Basic) { ...@@ -36,45 +38,85 @@ TEST(TokenizedStringTest, Basic) {
base::string16 text(base::UTF8ToUTF16("ScratchPad")); base::string16 text(base::UTF8ToUTF16("ScratchPad"));
TokenizedString tokens(text); TokenizedString tokens(text);
EXPECT_EQ(base::UTF8ToUTF16("scratch{0,7} pad{7,10}"), GetContent(tokens)); EXPECT_EQ(base::UTF8ToUTF16("scratch{0,7} pad{7,10}"), GetContent(tokens));
TokenizedString token_words(text, TokenizedString::Mode::kWords);
EXPECT_EQ(base::UTF8ToUTF16("scratchpad{0,10}"), GetContent(token_words));
} }
{ {
base::string16 text(base::UTF8ToUTF16("Chess2.0")); base::string16 text(base::UTF8ToUTF16("Chess2.0"));
TokenizedString tokens(text); TokenizedString tokens(text);
EXPECT_EQ(base::UTF8ToUTF16("chess{0,5} 2.0{5,8}"), GetContent(tokens)); EXPECT_EQ(base::UTF8ToUTF16("chess{0,5} 2.0{5,8}"), GetContent(tokens));
TokenizedString token_words(text, TokenizedString::Mode::kWords);
EXPECT_EQ(base::UTF8ToUTF16("chess2.0{0,8}"), GetContent(token_words));
} }
{ {
base::string16 text(base::UTF8ToUTF16("Cut the rope")); base::string16 text(base::UTF8ToUTF16("Cut the rope"));
TokenizedString tokens(text); TokenizedString tokens(text);
EXPECT_EQ(base::UTF8ToUTF16("cut{0,3} the{4,7} rope{8,12}"), EXPECT_EQ(base::UTF8ToUTF16("cut{0,3} the{4,7} rope{8,12}"),
GetContent(tokens)); GetContent(tokens));
TokenizedString token_words(text, TokenizedString::Mode::kWords);
EXPECT_EQ(base::UTF8ToUTF16("cut{0,3} the{4,7} rope{8,12}"),
GetContent(token_words));
} }
{ {
base::string16 text(base::UTF8ToUTF16("AutoCAD WS")); base::string16 text(base::UTF8ToUTF16("AutoCAD WS"));
TokenizedString tokens(text); TokenizedString tokens(text);
EXPECT_EQ(base::UTF8ToUTF16("auto{0,4} cad{4,7} ws{8,10}"), EXPECT_EQ(base::UTF8ToUTF16("auto{0,4} cad{4,7} ws{8,10}"),
GetContent(tokens)); GetContent(tokens));
TokenizedString token_words(text, TokenizedString::Mode::kWords);
EXPECT_EQ(base::UTF8ToUTF16("autocad{0,7} ws{8,10}"),
GetContent(token_words));
} }
{ {
base::string16 text(base::UTF8ToUTF16("Great TweetDeck")); base::string16 text(base::UTF8ToUTF16("Great TweetDeck"));
TokenizedString tokens(text); TokenizedString tokens(text);
EXPECT_EQ(base::UTF8ToUTF16("great{0,5} tweet{6,11} deck{11,15}"), EXPECT_EQ(base::UTF8ToUTF16("great{0,5} tweet{6,11} deck{11,15}"),
GetContent(tokens)); GetContent(tokens));
TokenizedString token_words(text, TokenizedString::Mode::kWords);
EXPECT_EQ(base::UTF8ToUTF16("great{0,5} tweetdeck{6,15}"),
GetContent(token_words));
} }
{ {
base::string16 text(base::UTF8ToUTF16("Draw-It!")); base::string16 text(base::UTF8ToUTF16("Draw-It!"));
TokenizedString tokens(text); TokenizedString tokens(text);
EXPECT_EQ(base::UTF8ToUTF16("draw{0,4} it{5,7}"), GetContent(tokens)); EXPECT_EQ(base::UTF8ToUTF16("draw{0,4} it{5,7}"), GetContent(tokens));
TokenizedString token_words(text, TokenizedString::Mode::kWords);
EXPECT_EQ(base::UTF8ToUTF16("draw-it{0,7}"), GetContent(token_words));
} }
{ {
base::string16 text(base::UTF8ToUTF16("Faxing & Signing")); base::string16 text(base::UTF8ToUTF16("Faxing & Signing"));
TokenizedString tokens(text); TokenizedString tokens(text);
EXPECT_EQ(base::UTF8ToUTF16("faxing{0,6} signing{9,16}"), EXPECT_EQ(base::UTF8ToUTF16("faxing{0,6} signing{9,16}"),
GetContent(tokens)); GetContent(tokens));
TokenizedString token_words(text, TokenizedString::Mode::kWords);
EXPECT_EQ(base::UTF8ToUTF16("faxing{0,6} signing{9,16}"),
GetContent(token_words));
} }
{ {
base::string16 text(base::UTF8ToUTF16("!@#$%^&*()<<<**>>>")); base::string16 text(base::UTF8ToUTF16("!@#$%^&*()<<<**>>>"));
TokenizedString tokens(text); TokenizedString tokens(text);
EXPECT_EQ(base::UTF8ToUTF16(""), GetContent(tokens)); EXPECT_EQ(base::UTF8ToUTF16(""), GetContent(tokens));
TokenizedString token_words(text, TokenizedString::Mode::kWords);
EXPECT_EQ(base::UTF8ToUTF16(""), GetContent(token_words));
}
}
TEST(TokenizedStringTest, TokenizeWords) {
{
base::string16 text(base::UTF8ToUTF16("?! wi-fi abc@gmail.com?!"));
TokenizedString token_words(text, TokenizedString::Mode::kWords);
EXPECT_EQ(base::UTF8ToUTF16("wi-fi{3,8} abc@gmail.com{9,22}"),
GetContent(token_words));
}
{
base::string16 text(base::UTF8ToUTF16("Hello?! \t \b World! "));
TokenizedString token_words(text, TokenizedString::Mode::kWords);
EXPECT_EQ(base::UTF8ToUTF16("hello{0,5} world{14,19}"),
GetContent(token_words));
}
{
base::string16 text(base::UTF8ToUTF16(" ?|! *&"));
TokenizedString token_words(text, TokenizedString::Mode::kWords);
EXPECT_EQ(base::UTF8ToUTF16(""), GetContent(token_words));
} }
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment