Commit ffb3d3d9, authored by Thanh Nguyen and committed by Commit Bot

[local-search-service] Adds a normalizer for content extraction

This CL adds a normalizer for content extraction. The normalizer:
1. Removes diacritic marks.
2. Converts text to lower case.
3. Optionally removes hyphens.

Bug: 1091091
Change-Id: I981be8ac512a6bd9e28ab8418d41495e9cad9338
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2228633
Reviewed-by: Jia Meng <jiameng@chromium.org>
Commit-Queue: Thanh Nguyen <thanhdng@chromium.org>
Cr-Commit-Position: refs/heads/master@{#775017}
parent 0ffb28ad
...@@ -1326,6 +1326,8 @@ source_set("chromeos") { ...@@ -1326,6 +1326,8 @@ source_set("chromeos") {
"launcher_search_provider/launcher_search_provider_service.h", "launcher_search_provider/launcher_search_provider_service.h",
"launcher_search_provider/launcher_search_provider_service_factory.cc", "launcher_search_provider/launcher_search_provider_service_factory.cc",
"launcher_search_provider/launcher_search_provider_service_factory.h", "launcher_search_provider/launcher_search_provider_service_factory.h",
"local_search_service/content_extraction_utils.cc",
"local_search_service/content_extraction_utils.h",
"local_search_service/index.cc", "local_search_service/index.cc",
"local_search_service/index.h", "local_search_service/index.h",
"local_search_service/inverted_index.cc", "local_search_service/inverted_index.cc",
...@@ -3036,6 +3038,7 @@ source_set("unit_tests") { ...@@ -3036,6 +3038,7 @@ source_set("unit_tests") {
"kerberos/kerberos_credentials_manager_test.cc", "kerberos/kerberos_credentials_manager_test.cc",
"kerberos/kerberos_ticket_expiry_notification_test.cc", "kerberos/kerberos_ticket_expiry_notification_test.cc",
"lacros/lacros_util_unittest.cc", "lacros/lacros_util_unittest.cc",
"local_search_service/content_extraction_utils_unittest.cc",
"local_search_service/index_unittest.cc", "local_search_service/index_unittest.cc",
"local_search_service/inverted_index_unittest.cc", "local_search_service/inverted_index_unittest.cc",
"local_search_service/local_search_service_unittest.cc", "local_search_service/local_search_service_unittest.cc",
......
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/browser/chromeos/local_search_service/content_extraction_utils.h"
#include <memory>
#include "base/i18n/case_conversion.h"
#include "base/i18n/unicodestring.h"
#include "base/memory/ptr_util.h"
#include "base/strings/string16.h"
#include "base/strings/utf_string_conversions.h"
#include "third_party/icu/source/i18n/unicode/translit.h"
namespace local_search_service {
base::string16 Normalizer(const base::string16& word, bool remove_hyphen) {
// Case folding.
icu::UnicodeString source = icu::UnicodeString::fromUTF8(
base::UTF16ToUTF8(base::i18n::FoldCase(word)));
// Removes diacritic.
UErrorCode status = U_ZERO_ERROR;
UParseError parse_error;
// Adds a rule to remove diacritic from text. Adds a few characters that are
// not handled by ICU (ł > l; ø > o; đ > d).
std::unique_ptr<icu::Transliterator> diacritic_remover =
base::WrapUnique(icu::Transliterator::createFromRules(
UNICODE_STRING_SIMPLE("RemoveDiacritic"),
icu::UnicodeString::fromUTF8("::NFD; ::[:Nonspacing Mark:] Remove; "
"::NFC; ł > l; ø > o; đ > d;"),
UTRANS_FORWARD, parse_error, status));
diacritic_remover->transliterate(source);
// Removes hyphen.
if (remove_hyphen) {
// Hyphen characters list is taken from here: http://jkorpela.fi/dashes.html
// U+002D(-), U+007E(~), U+058A(֊), U+05BE(־), U+1806(᠆), U+2010(‐),
// U+2011(‑), U+2012(‒), U+2013(–), U+2014(—), U+2015(―), U+2053(⁓),
// U+207B(⁻), U+208B(₋), U+2212(−), U+2E3A(⸺ ), U+2E3B(⸻ ), U+301C(〜),
// U+3030(〰), U+30A0(゠), U+FE58(﹘), U+FE63(﹣), U+FF0D(-).
std::unique_ptr<icu::Transliterator> hyphen_remover =
base::WrapUnique(icu::Transliterator::createFromRules(
UNICODE_STRING_SIMPLE("RemoveHyphen"),
icu::UnicodeString::fromUTF8(
"::[-~֊־᠆‐‑‒–—―⁓⁻₋−⸺⸻〜〰゠﹘﹣-] Remove;"),
UTRANS_FORWARD, parse_error, status));
hyphen_remover->transliterate(source);
}
return base::i18n::UnicodeStringToString16(source);
}
} // namespace local_search_service
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CHROME_BROWSER_CHROMEOS_LOCAL_SEARCH_SERVICE_CONTENT_EXTRACTION_UTILS_H_
#define CHROME_BROWSER_CHROMEOS_LOCAL_SEARCH_SERVICE_CONTENT_EXTRACTION_UTILS_H_
#include "base/strings/string16.h"
namespace local_search_service {
// Returns a normalized copy of |word|: the text is case-folded (converted to
// lower case), diacritics are removed and, when |remove_hyphen| is true (the
// default), hyphen-like characters are removed from the text as well.
base::string16 Normalizer(const base::string16& word,
bool remove_hyphen = true);
} // namespace local_search_service
#endif // CHROME_BROWSER_CHROMEOS_LOCAL_SEARCH_SERVICE_CONTENT_EXTRACTION_UTILS_H_
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/browser/chromeos/local_search_service/content_extraction_utils.h"
#include "base/strings/utf_string_conversions.h"
#include "testing/gtest/include/gtest/gtest.h"
namespace local_search_service {
// Covers each normalization step of Normalizer() on its own (diacritics,
// hyphen removal, hyphen preservation, case folding) and then all together.
TEST(ContentExtractionUtilsTest, NormalizerTest) {
  // Diacritics are stripped; |remove_hyphen| is left at its default.
  const base::string16 accented_input =
      base::UTF8ToUTF16("các dấu câu đã được loại bỏ thành công");
  const base::string16 accented_expected =
      base::UTF8ToUTF16("cac dau cau da duoc loai bo thanh cong");
  EXPECT_EQ(Normalizer(accented_input), accented_expected);

  // Both a horizontal bar (U+2015) and plain ASCII hyphens are dropped.
  const base::string16 hyphenated_input =
      base::UTF8ToUTF16(u8"wi\u2015fi----");
  const base::string16 hyphenated_expected = base::UTF8ToUTF16("wifi");
  EXPECT_EQ(Normalizer(hyphenated_input, true), hyphenated_expected);

  // With |remove_hyphen| disabled the hyphen must survive.
  const base::string16 kept_hyphen_input = base::UTF8ToUTF16("wi-fi");
  const base::string16 kept_hyphen_expected = base::UTF8ToUTF16("wi-fi");
  EXPECT_EQ(Normalizer(kept_hyphen_input, false), kept_hyphen_expected);

  // Mixed-case text comes back in lower case.
  const base::string16 mixed_case_input =
      base::UTF8ToUTF16("This Is sOmE WEIRD LooKing text");
  const base::string16 mixed_case_expected =
      base::UTF8ToUTF16("this is some weird looking text");
  EXPECT_EQ(Normalizer(mixed_case_input), mixed_case_expected);

  // Everything at once: upper case, diacritics, the extra "đ" rule and two
  // different dash characters (U+2014, U+058A).
  const base::string16 combined_input = base::UTF8ToUTF16(
      "Đây là MỘT trình duyệt tuyệt vời và mượt\u2014\u058Amà");
  const base::string16 combined_expected =
      base::UTF8ToUTF16("day la mot trinh duyet tuyet voi va muotma");
  EXPECT_EQ(Normalizer(combined_input, true), combined_expected);
}
} // namespace local_search_service
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment