Commit d704d1f2 authored by Elly Fong-Jones, committed by Commit Bot

tools: add tests for AffReader

This class has been around since the initial commit. This change makes
it not exit(1) on errors, then adds test coverage for most of it.

Bug: None
Change-Id: I5c366415c34abaa6bb90ce4657f2f94c3a45a1ad
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2040211
Reviewed-by: Nico Weber <thakis@chromium.org>
Commit-Queue: Elly Fong-Jones <ellyjones@chromium.org>
Cr-Commit-Position: refs/heads/master@{#739015}
parent a0ecc777
...@@ -4479,6 +4479,20 @@ test("unit_tests") { ...@@ -4479,6 +4479,20 @@ test("unit_tests") {
"../tools/convert_dict/convert_dict_unittest.cc", "../tools/convert_dict/convert_dict_unittest.cc",
] ]
if (!is_mac) {
sources += [ "../tools/convert_dict/aff_reader_unittest.cc" ]
}
data += [
"data/convert_dict/affix-flags.aff",
"data/convert_dict/empty.aff",
"data/convert_dict/indexed-affix.aff",
"data/convert_dict/leading-comment.aff",
"data/convert_dict/other-commands.aff",
"data/convert_dict/prefix-suffix.aff",
"data/convert_dict/rep.aff",
]
if (use_renderer_spellchecker) { if (use_renderer_spellchecker) {
sources += sources +=
[ "../browser/spellchecker/spell_check_host_chrome_impl_unittest.cc" ] [ "../browser/spellchecker/spell_check_host_chrome_impl_unittest.cc" ]
......
# This file defines a group of affixes using the AF keyword.
AF 3
AF Foo
AF Bar
AF Foobar
# This file exercises "continuation classes", whereby an affix can declare that
# it may be followed by another affix. This file declares two affixes:
# "s", which may be applied to any ADJECTIVE (e.g. "wearable" -> "wearables")
# "able", which may be applied to any VERB (e.g. "wear" -> "wearable")
SFX ADJECTIVE Y 1
SFX ADJECTIVE 0 s .
SFX VERB Y 1
SFX VERB 0 able/ADJECTIVE .
# This file has a leading comment, containing the word 'Foobar'.
# This file uses commands the parser doesn't know about at all.
FOOBAR foo bar
# This file describes two rules for an affix class named VERB:
# * Words in affix class VERB can be prefixed with "re"
# * Words in affix class VERB can be suffixed with:
# * "ed" if they do not end in "y"
# * "ied", deleting the trailing y, if they do end in "y"
# Define the affix class we're going to use:
AF 1
AF VERB
# The fields of PFX and SFX are either:
# ?FX <class> <cross> <count>
# or ?FX <class> <remove> <add> <match> [...]
# where the ? may be either P (for prefix) or S (for suffix).
# The first type of line is a header, which describes how many of this type of
# ?FX there are for this class. The second type of line describes how to
# construct a new word using this affix:
# For words in <class>, if they end in <match>, you can remove <remove> from
# the end being considered, then add <add> at that end.
# Where the end being considered is the beginning for PFX and the end for SFX.
# The meaning of <cross> and of the trailing stuff that is allowed in the second
# type of line (the morphology) is obscure to me (ellyjones@).
PFX VERB Y 1
PFX VERB 0 re .
SFX VERB Y 2
SFX VERB 0 ed [^y]
SFX VERB y ied y
# This file exercises the REP syntax for replacements.
REP 3
REP f ph
REP ph f
REP ^alot$ a_lot
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include "base/files/file_util.h" #include "base/files/file_util.h"
#include "base/i18n/icu_string_conversions.h" #include "base/i18n/icu_string_conversions.h"
#include "base/logging.h"
#include "base/strings/string_split.h" #include "base/strings/string_split.h"
#include "base/strings/stringprintf.h" #include "base/strings/stringprintf.h"
#include "base/strings/utf_string_conversions.h" #include "base/strings/utf_string_conversions.h"
...@@ -47,17 +48,6 @@ void CollapseDuplicateSpaces(std::string* str) { ...@@ -47,17 +48,6 @@ void CollapseDuplicateSpaces(std::string* str) {
} }
} }
// Print an error message and terminate execution
void Panic(const char* fmt, ...) {
va_list ap;
printf("ERROR: ");
va_start(ap, fmt);
vprintf(fmt, ap);
va_end(ap);
printf("\n");
exit(1);
}
} // namespace } // namespace
AffReader::AffReader(const base::FilePath& path) AffReader::AffReader(const base::FilePath& path)
...@@ -130,11 +120,13 @@ bool AffReader::Read() { ...@@ -130,11 +120,13 @@ bool AffReader::Read() {
StringBeginsWith(line, "MAP ")) { StringBeginsWith(line, "MAP ")) {
HandleEncodedCommand(line); HandleEncodedCommand(line);
} else if (StringBeginsWith(line, "IGNORE ")) { } else if (StringBeginsWith(line, "IGNORE ")) {
Panic("We don't support the IGNORE command yet. This would change how " LOG(FATAL)
"we would insert things in our lookup table."); << "We don't support the IGNORE command yet. This would change how "
"we would insert things in our lookup table.";
} else if (StringBeginsWith(line, "COMPLEXPREFIXES ")) { } else if (StringBeginsWith(line, "COMPLEXPREFIXES ")) {
Panic("We don't support the COMPLEXPREFIXES command yet. This would " LOG(FATAL)
"mean we have to insert words backwards as well (I think)"); << "We don't support the COMPLEXPREFIXES command yet. This would "
"mean we have to insert words backwards as well (I think)";
} else { } else {
// All other commands get stored in the other commands list. // All other commands get stored in the other commands list.
HandleRawCommand(line); HandleRawCommand(line);
...@@ -261,14 +253,12 @@ void AffReader::AddAffix(std::string* rule) { ...@@ -261,14 +253,12 @@ void AffReader::AddAffix(std::string* rule) {
part.substr(slash_index + 1), " ", part.substr(slash_index + 1), " ",
base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL); base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
if (after_slash.size() == 0) { if (after_slash.size() == 0) {
Panic("Found 0 terms after slash in affix rule '%s', " LOG(FATAL) << "Found 0 terms after slash in affix rule '" << part
"but need at least 2.", << "' but need at least 2.";
part.c_str());
} }
if (after_slash.size() == 1) { if (after_slash.size() == 1) {
printf("WARNING: Found 1 term after slash in affix rule '%s', " LOG(WARNING) << "Found 1 term after slash in affix rule '" << part
"but expected at least 2. Adding '.'.\n", << "', but expected at least 2. Adding '.'.";
part.c_str());
after_slash.push_back("."); after_slash.push_back(".");
} }
// Note that we may get a third term here which is the morphological // Note that we may get a third term here which is the morphological
...@@ -281,10 +271,12 @@ void AffReader::AddAffix(std::string* rule) { ...@@ -281,10 +271,12 @@ void AffReader::AddAffix(std::string* rule) {
after_slash[1].c_str()); after_slash[1].c_str());
} }
// Reencode from here // Re-encode from here
std::string reencoded; std::string reencoded;
if (!EncodingToUTF8(part, &reencoded)) if (!EncodingToUTF8(part, &reencoded)) {
Panic("Cannot encode affix rule part '%s' to utf8.", part.c_str()); LOG(FATAL) << "Cannot encode affix rule part '" << part
<< "' to utf8.";
}
*rule = rule->substr(0, part_start) + reencoded; *rule = rule->substr(0, part_start) + reencoded;
break; break;
...@@ -303,13 +295,15 @@ void AffReader::AddReplacement(std::string* rule) { ...@@ -303,13 +295,15 @@ void AffReader::AddReplacement(std::string* rule) {
CollapseDuplicateSpaces(rule); CollapseDuplicateSpaces(rule);
std::string utf8rule; std::string utf8rule;
if (!EncodingToUTF8(*rule, &utf8rule)) if (!EncodingToUTF8(*rule, &utf8rule)) {
Panic("Cannot encode replacement rule '%s' to utf8.", rule->c_str()); LOG(FATAL) << "Cannot encode replacement rule '" << rule << "' to utf8.";
}
// The first space separates key and value. // The first space separates key and value.
size_t space_index = utf8rule.find(' '); size_t space_index = utf8rule.find(' ');
if (space_index == std::string::npos) if (space_index == std::string::npos) {
Panic("Did not find a space in '%s'.", utf8rule.c_str()); LOG(FATAL) << "Did not find a space in '" << utf8rule << "'.";
}
std::vector<std::string> split; std::vector<std::string> split;
split.push_back(utf8rule.substr(0, space_index)); split.push_back(utf8rule.substr(0, space_index));
...@@ -329,8 +323,9 @@ void AffReader::HandleRawCommand(const std::string& line) { ...@@ -329,8 +323,9 @@ void AffReader::HandleRawCommand(const std::string& line) {
void AffReader::HandleEncodedCommand(const std::string& line) { void AffReader::HandleEncodedCommand(const std::string& line) {
std::string utf8; std::string utf8;
if (!EncodingToUTF8(line, &utf8)) if (!EncodingToUTF8(line, &utf8)) {
Panic("Cannot encode command '%s' to utf8.", line.c_str()); LOG(FATAL) << "Cannot encode command '" << line << "' to utf8.";
}
other_commands_.push_back(utf8); other_commands_.push_back(utf8);
} }
......
...@@ -59,10 +59,10 @@ class AffReader { ...@@ -59,10 +59,10 @@ class AffReader {
std::vector<std::string> GetAffixGroups() const; std::vector<std::string> GetAffixGroups() const;
private: private:
// Command-specific handlers. These are given the string folling the // Command-specific handlers. These are given the string following the
// command. The input rule may be modified arbitrarily by the function. // command. The input rule may be modified arbitrarily by the function.
int AddAffixGroup(std::string* rule); // Returns the new affix group ID. int AddAffixGroup(std::string* rule); // Returns the new affix group ID.
void AddAffix(std::string* rule); // SFX/PFX void AddAffix(std::string* rule); // SFX/PFX
void AddReplacement(std::string* rule); void AddReplacement(std::string* rule);
// void HandleFlag(std::string* rule); // void HandleFlag(std::string* rule);
......
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// When understanding these test cases, the nuspell docs are a useful reference:
// https://github.com/nuspell/nuspell/wiki/Affix-File-Format
#include "testing/gtest/include/gtest/gtest.h"
#include "base/files/file_path.h"
#include "base/path_service.h"
#include "chrome/common/chrome_paths.h"
#include "chrome/tools/convert_dict/aff_reader.h"
// Returns the path of the convert_dict test data file |name|, built as
// <chrome::DIR_TEST_DATA>/convert_dict/<name>.
base::FilePath TestFilePath(const std::string& name) {
  base::FilePath test_data_dir;
  // Report a test failure if the directory lookup fails, instead of silently
  // building a path on top of an empty FilePath.
  EXPECT_TRUE(base::PathService::Get(chrome::DIR_TEST_DATA, &test_data_dir));
  return test_data_dir.AppendASCII("convert_dict").AppendASCII(name);
}
using convert_dict::AffReader;
// Reading empty.aff should be reported as a success.
TEST(AffReaderTest, EmptyFile) {
  const base::FilePath aff_path = TestFilePath("empty.aff");
  AffReader reader(aff_path);
  EXPECT_TRUE(reader.Read());
}
// The comment text of leading-comment.aff should be available through
// comments() after reading; the test looks for the word "Foobar" in it.
TEST(AffReaderTest, LeadingComment) {
  AffReader reader(TestFilePath("leading-comment.aff"));
  EXPECT_TRUE(reader.Read());

  const auto foobar_pos = reader.comments().find("Foobar");
  EXPECT_NE(std::string::npos, foobar_pos);
}
// Checks that the three "AF" entries of affix-flags.aff are reported by
// GetAffixGroups() in file order.
TEST(AffReaderTest, AffixFlags) {
  AffReader reader(TestFilePath("affix-flags.aff"));
  // Use a fatal assertion: if the read failed, the checks below would index
  // into a result that may be empty.
  ASSERT_TRUE(reader.Read());

  // GetAffixGroups() returns its vector by value, so fetch it once.
  const auto groups = reader.GetAffixGroups();
  // Guard the operator[] accesses below against out-of-bounds reads.
  ASSERT_GE(groups.size(), 3u);
  EXPECT_EQ(groups[0], "AF Foo");
  EXPECT_EQ(groups[1], "AF Bar");
  EXPECT_EQ(groups[2], "AF Foobar");
}
// Checks that the PFX header line and the PFX rule line of prefix-suffix.aff
// come back as the first two entries of affix_rules().
TEST(AffReaderTest, PrefixSuffix) {
  AffReader reader(TestFilePath("prefix-suffix.aff"));
  // Use a fatal assertion: if the read failed, the checks below would index
  // into a result that may be empty.
  ASSERT_TRUE(reader.Read());

  const auto& rules = reader.affix_rules();
  // Guard the operator[] accesses below against out-of-bounds reads.
  ASSERT_GE(rules.size(), 2u);
  EXPECT_EQ(rules[0], "PFX VERB Y 1");
  EXPECT_EQ(rules[1], "PFX VERB 0 re .");
}
// Checks how a continuation class after the slash in an affix rule is
// rewritten when indexed-affix.aff is read.
TEST(AffReaderTest, IndexedAffix) {
  AffReader reader(TestFilePath("indexed-affix.aff"));
  // Use a fatal assertion: if the read failed, the check below would index
  // into a result that may be empty.
  ASSERT_TRUE(reader.Read());

  const auto& rules = reader.affix_rules();
  // Guard the operator[] access below against an out-of-bounds read.
  ASSERT_GE(rules.size(), 4u);
  // The class name ("ADJECTIVE" in the input) should have been converted into
  // an index in the AffReader's internal class table.
  EXPECT_EQ(rules[3], "SFX VERB 0 able/1 .");
}
// Checks that the REP lines of rep.aff are exposed through replacements() as
// (pattern, replacement) pairs.
TEST(AffReaderTest, Rep) {
  AffReader reader(TestFilePath("rep.aff"));
  // Use a fatal assertion: if the read failed, the checks below would index
  // into a result that may be empty.
  ASSERT_TRUE(reader.Read());

  const auto& replacements = reader.replacements();
  // Guard the operator[] accesses below against out-of-bounds reads.
  ASSERT_GE(replacements.size(), 3u);
  EXPECT_EQ(replacements[0].first, "f");
  EXPECT_EQ(replacements[0].second, "ph");
  EXPECT_EQ(replacements[2].first, "^alot$");
  // The "_" in the input should have been converted to a space - this is how
  // multi-word suggestions are represented in AFF files.
  EXPECT_EQ(replacements[2].second, "a lot");
}
// Checks that the "FOOBAR foo bar" line of other-commands.aff is returned
// unchanged as the first entry of other_commands().
TEST(AffReaderTest, OtherCommands) {
  AffReader reader(TestFilePath("other-commands.aff"));
  // Use a fatal assertion: if the read failed, the check below would index
  // into a result that may be empty.
  ASSERT_TRUE(reader.Read());

  const auto& commands = reader.other_commands();
  // Guard the operator[] access below against an out-of-bounds read.
  ASSERT_GE(commands.size(), 1u);
  EXPECT_EQ(commands[0], "FOOBAR foo bar");
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment