Commit 611c2c3a authored by Bruce Long's avatar Bruce Long Committed by Commit Bot

Spellcheck: Ignore words in script not used by enabled dictionaries

Some languages (Chinese, Japanese, ...) have no Windows or Hunspell
dictionary support, but a user will often have English spellchecking
enabled. In this case, when Windows spellchecking is enabled, all
editable content composed in the unsupported spellcheck
language will be red squiggled as misspelled, since the words aren't
recognized by the Windows spellchecker. This is a pretty ugly
experience and differs from what was seen when there was only Hunspell.

The fix is to check if a word that is flagged as misspelled by the
Windows spellchecker is written in a script associated with any of
the enabled spellchecker dictionaries (either Windows or Hunspell).
If not, ignore the word as a misspelling.

Bug: 1076677
Change-Id: I072a783d84a981e8f5f0316e8957d7ab93979ca8
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2210796
Commit-Queue: Bruce Long <brlong@microsoft.com>
Reviewed-by: default avatarRouslan Solomakhin <rouslan@chromium.org>
Reviewed-by: default avatarGuillaume Jenkins <gujen@google.com>
Cr-Commit-Position: refs/heads/master@{#772226}
parent ec78ed38
......@@ -501,6 +501,15 @@ void SpellCheck::CreateTextCheckingResults(
spellcheck_result.replacements;
SpellCheckResult::Decoration decoration = spellcheck_result.decoration;
#if BUILDFLAG(USE_WIN_HYBRID_SPELLCHECKER)
// Ignore words that are in a script not supported by any of the enabled
// spellcheck languages.
if (spellcheck::UseWinHybridSpellChecker() &&
!IsWordInSupportedScript(misspelled_word)) {
continue;
}
#endif // BUILDFLAG(USE_WIN_HYBRID_SPELLCHECKER)
// Ignore words in custom dictionary.
if (custom_dictionary_.SpellCheckWord(misspelled_word, 0,
misspelled_word.length())) {
......@@ -607,3 +616,10 @@ void SpellCheck::NotifyDictionaryObservers(
observer.OnDictionaryUpdated(words_added);
}
}
bool SpellCheck::IsWordInSupportedScript(const base::string16& word) const {
return std::find_if(languages_.begin(), languages_.end(),
[word](const auto& language) {
return language->IsTextInSameScript(word);
}) != languages_.end();
}
......@@ -192,6 +192,10 @@ class SpellCheck : public base::SupportsWeakPtr<SpellCheck>,
void NotifyDictionaryObservers(
const blink::WebVector<blink::WebString>& words_added);
// Returns whether a word is in the script of one of the enabled spellcheck
// languages.
bool IsWordInSupportedScript(const base::string16& word) const;
#if BUILDFLAG(USE_RENDERER_SPELLCHECKER)
// Posts delayed spellcheck task and clear it if any.
// Takes ownership of |request|.
......
......@@ -148,3 +148,7 @@ bool SpellcheckLanguage::IsEnabled() {
DCHECK(platform_spelling_engine_);
return platform_spelling_engine_->IsEnabled();
}
bool SpellcheckLanguage::IsTextInSameScript(const base::string16& text) const {
return character_attributes_.IsTextInSameScript(text);
}
......@@ -75,6 +75,10 @@ class SpellcheckLanguage {
// Return true if the underlying spellcheck engine is enabled.
bool IsEnabled();
// Returns true if all the characters in a text string are in the script
// associated with this spellcheck language.
bool IsTextInSameScript(const base::string16& text) const;
private:
friend class SpellCheckTest;
friend class FakeSpellCheck;
......
......@@ -68,23 +68,28 @@ void FakeSpellCheck::SetFakeLanguageCounts(size_t language_count,
}
#if BUILDFLAG(USE_WIN_HYBRID_SPELLCHECKER)
void FakeSpellCheck::InitializeRendererSpellCheckForLocale(
const std::string& language) {
base::FilePath hunspell_directory = GetHunspellDirectory();
EXPECT_FALSE(hunspell_directory.empty());
base::FilePath hunspell_file_path =
spellcheck::GetVersionedFileName(language, hunspell_directory);
base::File file(hunspell_file_path,
base::File::FLAG_OPEN | base::File::FLAG_READ);
EXPECT_TRUE(file.IsValid()) << hunspell_file_path << " is not valid"
<< file.ErrorToString(file.GetLastFileError());
// Add the SpellcheckLanguage manually to force the use of Hunspell.
void FakeSpellCheck::InitializeSpellCheckForLocale(const std::string& language,
bool use_hunspell) {
// Non-Hunspell case is passed invalid file to SpellcheckLanguage::Init.
base::File file;
if (use_hunspell) {
base::FilePath hunspell_directory = GetHunspellDirectory();
EXPECT_FALSE(hunspell_directory.empty());
base::FilePath hunspell_file_path =
spellcheck::GetVersionedFileName(language, hunspell_directory);
file.Initialize(hunspell_file_path,
base::File::FLAG_OPEN | base::File::FLAG_READ);
EXPECT_TRUE(file.IsValid()) << hunspell_file_path << " is not valid"
<< file.ErrorToString(file.GetLastFileError());
}
// Add the SpellcheckLanguage manually to the SpellCheck object.
SpellCheck::languages_.push_back(
std::make_unique<SpellcheckLanguage>(embedder_provider_));
SpellCheck::languages_.front()->platform_spelling_engine_ =
SpellCheck::languages_.back()->platform_spelling_engine_ =
std::make_unique<HunspellEngine>(embedder_provider_);
SpellCheck::languages_.front()->Init(std::move(file), language);
SpellCheck::languages_.back()->Init(std::move(file), language);
}
#endif // BUILDFLAG(USE_WIN_HYBRID_SPELLCHECKER)
......
......@@ -54,8 +54,9 @@ class FakeSpellCheck : public SpellCheck {
void SetFakeLanguageCounts(size_t language_count, size_t enabled_count);
#if BUILDFLAG(USE_WIN_HYBRID_SPELLCHECKER)
// Test-only method to initialize Hunspell for the given locale.
void InitializeRendererSpellCheckForLocale(const std::string& language);
// Test-only method to initialize SpellCheck object for the given locale.
void InitializeSpellCheckForLocale(const std::string& language,
bool use_hunspell);
#endif // BUILDFLAG(USE_WIN_HYBRID_SPELLCHECKER)
// Returns the current number of spell check languages.
......
......@@ -27,12 +27,22 @@ struct HybridSpellCheckTestCase {
};
struct CombineSpellCheckResultsTestCase {
std::string browser_locale;
std::string renderer_locale;
const wchar_t* text;
std::vector<SpellCheckResult> browser_results;
bool use_spelling_service;
blink::WebVector<blink::WebTextCheckingResult> expected_results;
};
std::ostream& operator<<(std::ostream& out,
const CombineSpellCheckResultsTestCase& test_case) {
out << "browser_locale=" << test_case.browser_locale
<< ", renderer_locale=" << test_case.renderer_locale << ", text=\""
<< test_case.text
<< "\", use_spelling_service=" << test_case.use_spelling_service;
return out;
}
#endif // BUILDFLAG(USE_WIN_HYBRID_SPELLCHECKER)
class SpellCheckProviderCacheTest : public SpellCheckProviderTest {
......@@ -202,7 +212,8 @@ INSTANTIATE_TEST_SUITE_P(
CombineSpellCheckResultsTest,
testing::Values(
// Browser-only check, no browser results
CombineSpellCheckResultsTestCase{"",
CombineSpellCheckResultsTestCase{"en-US",
"",
L"This has no misspellings",
{},
false,
......@@ -210,6 +221,7 @@ INSTANTIATE_TEST_SUITE_P(
// Browser-only check, no spelling service, browser results
CombineSpellCheckResultsTestCase{
"en-US",
"",
L"Tihs has soem misspellings",
{SpellCheckResult(SpellCheckResult::SPELLING,
......@@ -233,8 +245,56 @@ INSTANTIATE_TEST_SUITE_P(
9,
4)})},
// Browser-only check, no spelling service, browser results, some words
// in character sets with no dictionaries enabled.
CombineSpellCheckResultsTestCase{
"en-US",
"",
L"Tihs has soem \x3053\x3093\x306B\x3061\x306F \x4F60\x597D "
L"\xC548\xB155\xD558\xC138\xC694 "
L"\x0930\x093E\x091C\x0927\x093E\x0928 words in different "
L"character sets "
L"(Japanese, Chinese, Korean, Hindi)",
{SpellCheckResult(SpellCheckResult::SPELLING,
0,
4,
{base::ASCIIToUTF16("foo")}),
SpellCheckResult(SpellCheckResult::SPELLING,
9,
4,
{base::ASCIIToUTF16("foo")}),
SpellCheckResult(SpellCheckResult::SPELLING,
14,
5,
{base::ASCIIToUTF16("foo")}),
SpellCheckResult(SpellCheckResult::SPELLING,
20,
2,
{base::ASCIIToUTF16("foo")}),
SpellCheckResult(SpellCheckResult::SPELLING,
23,
5,
{base::ASCIIToUTF16("foo")}),
SpellCheckResult(SpellCheckResult::SPELLING,
29,
6,
{base::ASCIIToUTF16("foo")})},
false,
blink::WebVector<blink::WebTextCheckingResult>(
{blink::WebTextCheckingResult(
blink::WebTextDecorationType::
kWebTextDecorationTypeSpelling,
0,
4),
blink::WebTextCheckingResult(
blink::WebTextDecorationType::
kWebTextDecorationTypeSpelling,
9,
4)})},
// Browser-only check, spelling service, spelling-only browser results
CombineSpellCheckResultsTestCase{
"en-US",
"",
L"Tihs has soem misspellings",
{SpellCheckResult(SpellCheckResult::SPELLING,
......@@ -261,6 +321,7 @@ INSTANTIATE_TEST_SUITE_P(
// Browser-only check, spelling service, spelling and grammar browser
// results
CombineSpellCheckResultsTestCase{
"en-US",
"",
L"Tihs has soem misspellings",
{SpellCheckResult(SpellCheckResult::SPELLING,
......@@ -285,6 +346,7 @@ INSTANTIATE_TEST_SUITE_P(
// Hybrid check, no browser results
CombineSpellCheckResultsTestCase{"en-US",
"en-US",
L"This has no misspellings",
{},
false,
......@@ -293,6 +355,7 @@ INSTANTIATE_TEST_SUITE_P(
// Hybrid check, no spelling service, browser results that all coincide
// with Hunspell results
CombineSpellCheckResultsTestCase{
"en-US",
"en-US",
L"Tihs has soem misspellings",
{SpellCheckResult(SpellCheckResult::SPELLING,
......@@ -319,6 +382,7 @@ INSTANTIATE_TEST_SUITE_P(
// Hybrid check, no spelling service, browser results that partially
// coincide with Hunspell results
CombineSpellCheckResultsTestCase{
"en-US",
"en-US",
L"Tihs has soem misspellings",
{
......@@ -355,6 +419,7 @@ INSTANTIATE_TEST_SUITE_P(
// Hybrid check, no spelling service, browser results that don't
// coincide with Hunspell results
CombineSpellCheckResultsTestCase{
"en-US",
"en-US",
L"Tihs has soem misspellings",
{SpellCheckResult(SpellCheckResult::SPELLING,
......@@ -369,10 +434,40 @@ INSTANTIATE_TEST_SUITE_P(
blink::WebVector<blink::WebTextCheckingResult>()},
// Hybrid check, no spelling service, browser results with some that
// are skipped by Hunspell because of character set mismatch
// that are in character set that does not have dictionary support (so
// (are considered spelled correctly)
CombineSpellCheckResultsTestCase{
"en-US",
"fr",
L"Tihs mot is misspelled in Russian: "
L"\x043C\x0438\x0440\x0432\x043E\x0439",
{SpellCheckResult(SpellCheckResult::SPELLING,
0,
4,
{base::ASCIIToUTF16("foo")}),
SpellCheckResult(SpellCheckResult::SPELLING,
5,
3,
{base::ASCIIToUTF16("foo")}),
SpellCheckResult(SpellCheckResult::SPELLING,
35,
6,
{base::ASCIIToUTF16("foo")})},
false,
blink::WebVector<blink::WebTextCheckingResult>(
std::vector<blink::WebTextCheckingResult>(
{blink::WebTextCheckingResult(
blink::WebTextDecorationType::
kWebTextDecorationTypeSpelling,
0,
4)}))},
// Hybrid check, no spelling service, text with some words in a
// character set that only has a Hunspell dictionary enabled.
CombineSpellCheckResultsTestCase{
"en-US",
L"Tihs word is misspelled in Russian: "
"ru",
L"Tihs \x0432\x0441\x0435\x0445 is misspelled in Russian: "
L"\x043C\x0438\x0440\x0432\x043E\x0439",
{SpellCheckResult(SpellCheckResult::SPELLING,
0,
......@@ -402,6 +497,7 @@ INSTANTIATE_TEST_SUITE_P(
// Hybrid check, spelling service, spelling and grammar browser results
// that all coincide with Hunspell results
CombineSpellCheckResultsTestCase{
"en-US",
"en-US",
L"Tihs has soem misspellings",
{SpellCheckResult(SpellCheckResult::SPELLING,
......@@ -428,6 +524,7 @@ INSTANTIATE_TEST_SUITE_P(
// but some of the spelling results are correctly spelled in Hunspell
// locales
CombineSpellCheckResultsTestCase{
"en-US",
"en-US",
L"This has soem misspellings",
{SpellCheckResult(SpellCheckResult::SPELLING,
......@@ -461,12 +558,17 @@ TEST_P(CombineSpellCheckResultsTest, ShouldCorrectlyCombineHybridResults) {
/*enabled_features=*/{spellcheck::kWinUseBrowserSpellChecker,
spellcheck::kWinUseHybridSpellChecker},
/*disabled_features=*/{});
bool has_renderer_check = !test_case.renderer_locale.empty();
const bool has_browser_check = !test_case.browser_locale.empty();
const bool has_renderer_check = !test_case.renderer_locale.empty();
if (has_browser_check) {
provider_.spellcheck()->InitializeSpellCheckForLocale(
test_case.browser_locale, /*use_hunspell*/ false);
}
if (has_renderer_check) {
provider_.spellcheck()->InitializeRendererSpellCheckForLocale(
test_case.renderer_locale);
provider_.spellcheck()->SetFakeLanguageCounts(2u, 1u);
provider_.spellcheck()->InitializeSpellCheckForLocale(
test_case.renderer_locale, /*use_hunspell*/ true);
}
if (test_case.use_spelling_service) {
......@@ -478,7 +580,7 @@ TEST_P(CombineSpellCheckResultsTest, ShouldCorrectlyCombineHybridResults) {
FakeTextCheckingResult completion;
SpellCheckProvider::HybridSpellCheckRequestInfo request_info = {
/*used_hunspell=*/has_renderer_check,
/*used_native=*/true, base::TimeTicks::Now()};
/*used_native=*/has_browser_check, base::TimeTicks::Now()};
int check_id = provider_.AddCompletionForTest(
std::make_unique<FakeTextCheckingCompletion>(&completion), request_info);
......
......@@ -35,6 +35,24 @@ void SpellcheckCharAttribute::SetDefaultLanguage(const std::string& language) {
CreateRuleSets(language);
}
bool SpellcheckCharAttribute::IsTextInSameScript(
const base::string16& text) const {
const base::char16* data = text.data();
const size_t length = text.length();
for (size_t index = 0; index < length; /* U16_NEXT post-increments */) {
uint32_t code = 0;
U16_NEXT(data, index, length, code);
UErrorCode error = U_ZERO_ERROR;
UScriptCode script = uscript_getScript(code, &error);
if (U_SUCCESS(error) && (script != USCRIPT_COMMON) &&
(script != USCRIPT_INHERITED)) {
if (script != script_code_)
return false;
}
}
return true;
}
base::string16 SpellcheckCharAttribute::GetRuleSet(
bool allow_contraction) const {
return allow_contraction ?
......
......@@ -40,6 +40,10 @@ class SpellcheckCharAttribute {
// GetRuleSet() returns the rule-sets created in this function.
void SetDefaultLanguage(const std::string& language);
// Returns true if all the characters in a text string are in the script
// associated with the spellcheck language.
bool IsTextInSameScript(const base::string16& text) const;
// Returns a custom rule-set string used by the ICU break iterator. This class
// has two rule-sets, one splits a contraction and the other does not, so we
// can split a concaticated word (e.g. "seven-year-old") into words (e.g.
......
......@@ -505,3 +505,75 @@ TEST(SpellcheckWordIteratorTest, FindSkippableWordsKhmer) {
EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
EXPECT_FALSE(iter.Advance());
}
TEST(SpellcheckCharAttributeTest, IsTextInSameScript) {
struct LanguageWithSampleText {
const char* language;
const wchar_t* sample_text;
};
static const std::vector<LanguageWithSampleText> kLanguagesWithSampleText = {
// Latin
{"fr", L"Libert\x00e9, \x00e9galitt\x00e9, fraternit\x00e9."},
// Greek
{"el", L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5"},
// Cyrillic
{"ru",
L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432\x0443\x0439\x0442"
L"\x0435"},
// Hebrew
{"he", L"\x05e9\x05c1\x05b8\x05dc\x05d5\x05b9\x05dd"},
// Arabic
{"ar",
L"\x0627\x064e\x0644\x0633\x064e\x0651\x0644\x0627\x0645\x064f\x0639"
L"\x064e\x0644\x064e\x064a\x0652\x0643\x064f\x0645\x0652 "},
// Hindi
{"hi", L"\x0930\x093E\x091C\x0927\x093E\x0928"},
// Thai
{"th",
L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04\x0e23\x0e31\x0e1a"},
// Hiragata
{"jp-Hira", L"\x3053\x3093\x306B\x3061\x306F"},
// Katakana
{"jp-Kana", L"\x30b3\x30de\x30fc\x30b9"},
// CJKV ideographs
{"zh-Hani", L"\x4F60\x597D"},
// Hangul Syllables
{"ko", L"\xC548\xB155\xD558\xC138\xC694"},
};
for (const auto& testcase : kLanguagesWithSampleText) {
SpellcheckCharAttribute attribute;
attribute.SetDefaultLanguage(testcase.language);
base::string16 sample_text(base::WideToUTF16(testcase.sample_text));
EXPECT_TRUE(attribute.IsTextInSameScript(sample_text))
<< "Language \"" << testcase.language
<< "\" fails to identify that sample text in same language is in same "
"script.";
// All other scripts in isolatation or mixed with current script should
// return false.
for (const auto& other_script : kLanguagesWithSampleText) {
if (testcase.language == other_script.language)
continue;
base::string16 other_sample_text(
base::WideToUTF16(other_script.sample_text));
EXPECT_FALSE(attribute.IsTextInSameScript(other_sample_text))
<< "Language \"" << testcase.language
<< "\" fails to identify that sample text in language \""
<< other_script.language << "\" is in different script.";
EXPECT_FALSE(
attribute.IsTextInSameScript(sample_text + other_sample_text))
<< "Language \"" << testcase.language
<< "\" fails to identify that sample text in language \""
<< other_script.language
<< "\" is in different script when appended to text in this script.";
EXPECT_FALSE(
attribute.IsTextInSameScript(other_sample_text + sample_text))
<< "Language \"" << testcase.language
<< "\" fails to identify that sample text in language \""
<< other_script.language
<< "\" is in different script when prepended to text in this script.";
}
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment