Commit deea5c39 authored by Matthias Körber, committed by Commit Bot

[Autofill][SlimShady] Component for structured names.

This CL adds the static address component tree for names that support
two surnames as they are common for Hispanic/Latinx names.

Change-Id: Icd54f986ab192101fbea5a8576c8abbabfffe636
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2264423
Commit-Queue: Matthias Körber <koerber@google.com>
Reviewed-by: Dominic Battré <battre@chromium.org>
Cr-Commit-Position: refs/heads/master@{#793110}
parent 73f0449d
......@@ -92,6 +92,8 @@ jumbo_static_library("browser") {
"data_model/autofill_structured_address_component.h",
"data_model/autofill_structured_address_constants.cc",
"data_model/autofill_structured_address_constants.h",
"data_model/autofill_structured_address_name.cc",
"data_model/autofill_structured_address_name.h",
"data_model/autofill_structured_address_regex_provider.cc",
"data_model/autofill_structured_address_regex_provider.h",
"data_model/autofill_structured_address_utils.cc",
......@@ -578,6 +580,7 @@ source_set("unit_tests") {
"data_model/autofill_profile_comparator_unittest.cc",
"data_model/autofill_profile_unittest.cc",
"data_model/autofill_structured_address_component_unittest.cc",
"data_model/autofill_structured_address_name_unittest.cc",
"data_model/autofill_structured_address_regex_provider_unittest.cc",
"data_model/autofill_structured_address_utils_unittest.cc",
"data_model/contact_info_unittest.cc",
......
......@@ -19,9 +19,21 @@ using data_util::bit_field_type_groups::kName;
using data_util::bit_field_type_groups::kPhone;
TEST(AutofillDataUtilTest, DetermineGroupsForHomeNameAndAddress) {
const std::vector<ServerFieldType> field_types{
NAME_FIRST, NAME_LAST, ADDRESS_HOME_LINE1,
ADDRESS_HOME_CITY, ADDRESS_HOME_STATE, ADDRESS_HOME_ZIP};
const std::vector<ServerFieldType> field_types{NAME_HONORIFIC_PREFIX,
NAME_FULL,
NAME_FIRST,
NAME_MIDDLE,
NAME_MIDDLE_INITIAL,
NAME_LAST,
NAME_LAST_FIRST,
NAME_LAST_CONJUNCTION,
NAME_LAST_SECOND,
NAME_FIRST,
NAME_LAST,
ADDRESS_HOME_LINE1,
ADDRESS_HOME_CITY,
ADDRESS_HOME_STATE,
ADDRESS_HOME_ZIP};
const uint32_t expected_group_bitmask = kName | kAddress;
const uint32_t group_bitmask = data_util::DetermineGroups(field_types);
......@@ -196,8 +208,7 @@ INSTANTIATE_TEST_SUITE_P(
// It occasionally happens that a full name is 2 characters, 1/1.
FullNameTestCase{"이도", "도", "", "이"}, // Korean name, Hangul
FullNameTestCase{"孫文", "文", "", "孫"} // Chinese name, Unihan
));
FullNameTestCase{"孫文", "文", "", "孫"})); // Chinese name, Unihan
class JoinNamePartsTest : public testing::TestWithParam<FullNameTestCase> {};
......@@ -229,9 +240,7 @@ INSTANTIATE_TEST_SUITE_P(
// These are no CJK names for us, they're just bogus.
FullNameTestCase{"Homer シンプソン", "Homer", "", "シンプソン"},
FullNameTestCase{"ホーマー Simpson", "ホーマー", "", "Simpson"},
FullNameTestCase{"반 기 문", "반", "기", "문"}
// Has a middle-name, too unusual
));
FullNameTestCase{"반 기 문", "반", "기", "문"}));
struct ValidCountryCodeTestCase {
std::string country_code;
......
......@@ -7,7 +7,7 @@
namespace autofill {
namespace structured_address {
const char kSingleWordRe[] = "(?:\\w+)";
const char kNameSeparators[] = " -";
} // namespace structured_address
} // namespace autofill
......@@ -8,8 +8,8 @@
namespace autofill {
namespace structured_address {
// Regular expression pattern to match a single word.
extern const char kSingleWordRe[];
// List of name separators.
extern const char kNameSeparators[];
} // namespace structured_address
} // namespace autofill
......
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/autofill/core/browser/data_model/autofill_structured_address_name.h"
#include <utility>
#include "base/i18n/case_conversion.h"
#include "base/strings/strcat.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "components/autofill/core/browser/autofill_type.h"
#include "components/autofill/core/browser/data_model/autofill_structured_address_constants.h"
#include "components/autofill/core/browser/data_model/autofill_structured_address_regex_provider.h"
#include "components/autofill/core/browser/data_model/autofill_structured_address_utils.h"
#include "components/autofill/core/browser/field_types.h"
namespace autofill {
namespace structured_address {
// Reduces |value| to the upper-case initials of its tokens.
//
// |value| is split at every character contained in |kNameSeparators|; empty
// tokens are dropped and surrounding whitespace is trimmed. The first
// character of every remaining token is collected and the collected sequence
// is converted to upper case. Returns an empty string for an empty |value|.
//
// Example: "George walker" -> "GW", "Hans-Peter" -> "HP".
base::string16 ReduceToInitials(const base::string16& value) {
  if (value.empty())
    return base::string16();

  const std::vector<base::string16> tokens =
      base::SplitString(value, base::ASCIIToUTF16(kNameSeparators),
                        base::WhitespaceHandling::TRIM_WHITESPACE,
                        base::SplitResult::SPLIT_WANT_NONEMPTY);

  base::string16 initials;
  // An initial occupies at most two UTF-16 code units (see below).
  initials.reserve(2 * tokens.size());
  for (const auto& token : tokens) {
    DCHECK(!token.empty());
    initials += token[0];

    // A character outside of the Basic Multilingual Plane is encoded as a
    // lead surrogate (U+D800..U+DBFF) followed by a trail surrogate
    // (U+DC00..U+DFFF). Copying only the first code unit would emit an
    // unpaired surrogate, i.e. an invalid string, so keep the pair together.
    const bool starts_with_lead_surrogate = (token[0] & 0xFC00) == 0xD800;
    if (starts_with_lead_surrogate && token.size() > 1 &&
        (token[1] & 0xFC00) == 0xDC00) {
      initials += token[1];
    }
  }
  return base::i18n::ToUpper(initials);
}
// Returns true if |name| shows at least one of the two signals that are used
// to assume a Hispanic/Latinx name:
//  1. It partially matches the expression for very common Hispanic/Latinx
//     last names.
//  2. It partially matches the expression for a last name conjunction.
// The second expression is only evaluated when the first one does not match.
// If neither matches, there is not sufficient reason to assume such a name.
bool HasHispanicLatinxNameCharaceristics(const std::string& name) {
  return IsPartialMatch(name,
                        RegEx::kMatchHispanicCommonNameCharacteristics) ||
         IsPartialMatch(
             name, RegEx::kMatchHispanicLastNameConjuctionCharacteristics);
}
// Returns true if |name| partially matches the expression that describes the
// characteristics of a CJK name.
bool HasCjkNameCharacteristics(const std::string& name) {
  const bool looks_like_cjk_name =
      IsPartialMatch(name, RegEx::kMatchCjkNameCharacteristics);
  return looks_like_cjk_name;
}
// Returns true if |middle_name| partially matches the expression that
// describes a middle name consisting only of initials.
bool HasMiddleNameInitialsCharacteristics(const std::string& middle_name) {
  const bool consists_of_initials = IsPartialMatch(
      middle_name, RegEx::kMatchMiddleNameInitialsCharacteristics);
  return consists_of_initials;
}
// Default constructor: delegates to the parent-taking constructor with a null
// parent.
NameHonorific::NameHonorific() : NameHonorific(nullptr) {}
// Creates the component for the |NAME_HONORIFIC_PREFIX| type and forwards
// |parent| to the AddressComponent base class.
NameHonorific::NameHonorific(AddressComponent* parent)
: AddressComponent(NAME_HONORIFIC_PREFIX, parent) {}
NameHonorific::~NameHonorific() = default;
// Default constructor: delegates to the parent-taking constructor with a null
// parent.
NameFirst::NameFirst() : NameFirst(nullptr) {}
// Creates the component for the |NAME_FIRST| type and forwards |parent| to
// the AddressComponent base class.
NameFirst::NameFirst(AddressComponent* parent)
: AddressComponent(NAME_FIRST, parent) {}
NameFirst::~NameFirst() = default;
// Default constructor: delegates to the parent-taking constructor with a null
// parent.
NameMiddle::NameMiddle() : NameMiddle(nullptr) {}
// Creates the component for the |NAME_MIDDLE| type and forwards |parent| to
// the AddressComponent base class.
NameMiddle::NameMiddle(AddressComponent* parent)
: AddressComponent(NAME_MIDDLE, parent) {}
NameMiddle::~NameMiddle() = default;
// Adds |NAME_MIDDLE_INITIAL| to |supported_types|, the one field type this
// component supports in addition to the type passed to the base class.
// |supported_types| is dereferenced unconditionally and must not be null.
void NameMiddle::GetAdditionalSupportedFieldTypes(
ServerFieldTypeSet* supported_types) const {
supported_types->insert(NAME_MIDDLE_INITIAL);
}
// Provides the value for the additional |NAME_MIDDLE_INITIAL| type.
//
// Returns false if |type_name| is not the name of |NAME_MIDDLE_INITIAL|.
// Otherwise returns true and, if |value| is non-null, writes the initials to
// |*value|.
bool NameMiddle::ConvertAndGetTheValueForAdditionalFieldTypeName(
    const std::string& type_name,
    base::string16* value) const {
  // Every type other than the middle name initial is not handled here.
  if (type_name != AutofillType(NAME_MIDDLE_INITIAL).ToString())
    return false;

  // Without an output location there is nothing to convert, but the type is
  // still reported as handled.
  if (!value)
    return true;

  // A stored value that already looks like a sequence of initials is returned
  // unchanged. Anything else is reduced to one upper case letter per space-
  // or hyphen-separated token.
  const bool already_initials =
      HasMiddleNameInitialsCharacteristics(base::UTF16ToUTF8(GetValue()));
  *value = already_initials ? GetValue() : ReduceToInitials(GetValue());
  return true;
}
// Accepts a value for the additional |NAME_MIDDLE_INITIAL| type.
//
// If |type_name| is the name of |NAME_MIDDLE_INITIAL|, |value| is stored
// unmodified as the value of this component together with |status| and true
// is returned. For every other |type_name| nothing is changed and false is
// returned.
bool NameMiddle::ConvertAndSetValueForAdditionalFieldTypeName(
    const std::string& type_name,
    const base::string16& value,
    const VerificationStatus& status) {
  const bool is_middle_initial =
      type_name == AutofillType(NAME_MIDDLE_INITIAL).ToString();
  if (!is_middle_initial)
    return false;

  SetValue(value, status);
  return true;
}
// Default constructor: delegates to the parent-taking constructor with a null
// parent.
NameLastFirst::NameLastFirst() : NameLastFirst(nullptr) {}
// Creates the component for the |NAME_LAST_FIRST| type and forwards |parent|
// to the AddressComponent base class.
NameLastFirst::NameLastFirst(AddressComponent* parent)
: AddressComponent(NAME_LAST_FIRST, parent) {}
NameLastFirst::~NameLastFirst() = default;
// Default constructor: delegates to the parent-taking constructor with a null
// parent.
NameLastConjunction::NameLastConjunction() : NameLastConjunction(nullptr) {}
// Creates the component for the |NAME_LAST_CONJUNCTION| type and forwards
// |parent| to the AddressComponent base class.
NameLastConjunction::NameLastConjunction(AddressComponent* parent)
: AddressComponent(NAME_LAST_CONJUNCTION, parent) {}
NameLastConjunction::~NameLastConjunction() = default;
// Returns the single expression that is used to parse a last name into its
// subcomponents. A value with Hispanic/Latinx characteristics is parsed with
// the dedicated Hispanic last name expression; every other value is parsed
// with the expression that moves the last name into the second last name.
std::vector<const RE2*> NameLast::GetParseRegularExpressionsByRelevance()
    const {
  auto* pattern_provider = StructuredAddressesRegExProvider::Instance();
  DCHECK(pattern_provider);

  const bool is_hispanic_latinx_name =
      HasHispanicLatinxNameCharaceristics(base::UTF16ToUTF8(GetValue()));
  const auto expression = is_hispanic_latinx_name
                              ? RegEx::kParseHispanicLastName
                              : RegEx::kParseLastNameIntoSecondLastName;
  return {pattern_provider->GetRegEx(expression)};
}
// Default constructor: delegates to the parent-taking constructor with a null
// parent.
NameLastSecond::NameLastSecond() : NameLastSecond(nullptr) {}
// Creates the component for the |NAME_LAST_SECOND| type and forwards |parent|
// to the AddressComponent base class.
NameLastSecond::NameLastSecond(AddressComponent* parent)
: AddressComponent(NAME_LAST_SECOND, parent) {}
NameLastSecond::~NameLastSecond() = default;
// Default constructor: delegates to the parent-taking constructor with a null
// parent.
NameLast::NameLast() : NameLast(nullptr) {}
// Creates the component for the |NAME_LAST| type. Besides |parent|, the
// addresses of the three member subcomponents are passed to the
// AddressComponent base class in the order first, conjunction, second.
// NOTE(review): the base class is constructed before the members, so the
// pointers refer to not-yet-constructed objects at this point; presumably the
// base constructor only stores them without dereferencing - confirm.
NameLast::NameLast(AddressComponent* parent)
: AddressComponent(NAME_LAST, parent, {&first_, &conjunction_, &second_}) {}
NameLast::~NameLast() = default;
// Fallback parsing: assigns the complete value of this component to
// |NAME_LAST_SECOND| with the verification status |kParsed|. The first last
// name and the conjunction are not touched here.
// NOTE(review): presumably invoked by the base class when no parse expression
// matched - confirm against AddressComponent.
void NameLast::ParseValueAndAssignSubcomponentsByFallbackMethod() {
SetValueForTypeIfPossible(NAME_LAST_SECOND, GetValue(),
VerificationStatus::kParsed);
}
// Default constructor: delegates to the parent-taking constructor with a null
// parent.
NameFull::NameFull() : NameFull(nullptr) {}
// Creates the component for the |NAME_FULL| type. Besides |parent|, the
// addresses of the four member subcomponents are passed to the
// AddressComponent base class in the order honorific, first, middle, last.
// NOTE(review): the base class is constructed before the members, so the
// pointers refer to not-yet-constructed objects at this point; presumably the
// base constructor only stores them without dereferencing - confirm.
NameFull::NameFull(AddressComponent* parent)
: AddressComponent(
NAME_FULL,
parent,
{&name_honorific_, &name_first_, &name_middle_, &name_last_}) {}
// Returns the expressions that are used to parse a full name into its
// subcomponents. The set of expressions depends on the characteristics of the
// stored value: CJK names, Hispanic/Latinx names and all remaining names each
// get their own list.
std::vector<const RE2*> NameFull::GetParseRegularExpressionsByRelevance()
    const {
  auto* pattern_provider = StructuredAddressesRegExProvider::Instance();
  DCHECK(pattern_provider);

  // Both characteristic checks operate on the UTF-8 form of the value.
  const std::string name = base::UTF16ToUTF8(GetValue());

  // CJK names are attempted with the following expressions, in this order:
  //
  // 1. CJK names that include a separator.
  //    With a separator, dividing the name into first and last name is
  //    trivial.
  //
  // 2. Korean names of 4+ characters with a two-character last name.
  //    Some two-character last names are ambiguous because they share a
  //    prefix with a single-character last name. For names of 4+ characters
  //    it is more likely that the first two characters form the last name.
  //
  // 3. Known two-character CJK last names.
  //    This expression only uses non-ambiguous two-character last names.
  //
  // 4. Only the first character as the last name.
  //    The catch-all expression: the first character becomes the last name,
  //    all remaining characters become the first name.
  if (HasCjkNameCharacteristics(name)) {
    return {
        pattern_provider->GetRegEx(RegEx::kParseSeparatedCjkName),
        pattern_provider->GetRegEx(RegEx::kParseKoreanTwoCharacterLastName),
        pattern_provider->GetRegEx(RegEx::kParseCommonCjkTwoCharacterLastName),
        pattern_provider->GetRegEx(RegEx::kParseCjkSingleCharacterLastName)};
  }

  // Hispanic/Latinx names have a single dedicated expression.
  if (HasHispanicLatinxNameCharaceristics(name))
    return {pattern_provider->GetRegEx(RegEx::kParseHispanicFullName)};

  // Every other name: only a last name, then "last, first middle", then
  // "first middle last".
  return {pattern_provider->GetRegEx(RegEx::kParseOnlyLastName),
          pattern_provider->GetRegEx(RegEx::kParseLastCommaFirstMiddleName),
          pattern_provider->GetRegEx(RegEx::kParseFirstMiddleLastName)};
}
// Defaulted out of line; the subcomponents are plain members and are
// destroyed together with this object.
NameFull::~NameFull() = default;
} // namespace structured_address
} // namespace autofill
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_NAME_H_
#define COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_NAME_H_
#include <string>
#include <vector>
#include "components/autofill/core/browser/data_model/autofill_structured_address_component.h"
using autofill::structured_address::AddressComponent;
namespace autofill {
namespace structured_address {
// Returns true if |name| has the characteristics of a Chinese, Japanese or
// Korean name:
// * It must only contain CJK characters with at most one separator in between.
bool HasCjkNameCharacteristics(const std::string& name);
// Returns true if |name| has one of the characteristics of an Hispanic/Latinx
// name:
// * Name contains a very common Hispanic/Latinx surname.
// * Name uses a surname conjunction.
bool HasHispanicLatinxNameCharaceristics(const std::string& name);
// Returns true if |middle_name| has the characteristics of containing only
// initials:
// * The string contains only upper case letters, each of which may be followed
// by a point.
// * Between each letter, there can be a space or a hyphen.
bool HasMiddleNameInitialsCharacteristics(const std::string& middle_name);
// Reduces a name to the initials in upper case.
// Example: George walker -> GW, Hans-Peter -> HP
base::string16 ReduceToInitials(const base::string16& value);
// Atomic component that represents the honorific prefix of a name
// (|NAME_HONORIFIC_PREFIX|).
class NameHonorific : public AddressComponent {
public:
// Creates the component with a null parent.
NameHonorific();
// Creates the component and forwards |parent| to the AddressComponent base.
explicit NameHonorific(AddressComponent* parent);
~NameHonorific() override;
};
// Atomic component that represents the first name (|NAME_FIRST|).
class NameFirst : public AddressComponent {
public:
// Creates the component with a null parent.
NameFirst();
// Creates the component and forwards |parent| to the AddressComponent base.
explicit NameFirst(AddressComponent* parent);
~NameFirst() override;
};
// Atomic component that represents the middle name (|NAME_MIDDLE|). In
// addition, the component can be read and written through the
// |NAME_MIDDLE_INITIAL| type.
class NameMiddle : public AddressComponent {
public:
// Creates the component with a null parent.
NameMiddle();
// Creates the component and forwards |parent| to the AddressComponent base.
explicit NameMiddle(AddressComponent* parent);
~NameMiddle() override;
// Adds |NAME_MIDDLE_INITIAL| to |supported_types|.
void GetAdditionalSupportedFieldTypes(
ServerFieldTypeSet* supported_types) const override;
protected:
// Implements support for getting the value for the |NAME_MIDDLE_INITIAL|
// type. Returns true if |type_name| names that type; in this case |*value|
// receives the initials when |value| is non-null.
bool ConvertAndGetTheValueForAdditionalFieldTypeName(
const std::string& type_name,
base::string16* value) const override;
// Implements support for setting the |NAME_MIDDLE_INITIAL| type. Returns
// true if |type_name| names that type; in this case |value| is stored as the
// value of this component with |status|.
bool ConvertAndSetValueForAdditionalFieldTypeName(
const std::string& type_name,
const base::string16& value,
const VerificationStatus& status) override;
};
// Atomic component that represents the first part of a last name
// (|NAME_LAST_FIRST|).
class NameLastFirst : public AddressComponent {
public:
// Creates the component with a null parent.
NameLastFirst();
// Creates the component and forwards |parent| to the AddressComponent base.
explicit NameLastFirst(AddressComponent* parent);
~NameLastFirst() override;
};
// Atomic component that represents the conjunction in a Hispanic/Latinx
// surname (|NAME_LAST_CONJUNCTION|).
class NameLastConjunction : public AddressComponent {
public:
// Creates the component with a null parent.
NameLastConjunction();
// Creates the component and forwards |parent| to the AddressComponent base.
explicit NameLastConjunction(AddressComponent* parent);
~NameLastConjunction() override;
};
// Atomic component that represents the second part of a surname
// (|NAME_LAST_SECOND|).
class NameLastSecond : public AddressComponent {
public:
// Creates the component with a null parent.
NameLastSecond();
// Creates the component and forwards |parent| to the AddressComponent base.
explicit NameLastSecond(AddressComponent* parent);
~NameLastSecond() override;
};
// Compound that represents a last name. It contains a first and second last
// name and a conjunction as it is used in Hispanic/Latinx names. Note that
// compound family names like Miller-Smith are not supposed to be split up into
// two components. If a name contains only a single component, the component is
// stored in the second part by default.
//
//               +-------+
//               | _LAST |
//               +-------+
//              /    |    \
//             /     |     \
//            /      |      \
// +--------+ +-----------+ +---------+
// | _FIRST | | _CONJUNC. | | _SECOND |
// +--------+ +-----------+ +---------+
//
class NameLast : public AddressComponent {
public:
// Creates the component with a null parent.
NameLast();
// Creates the component and forwards |parent| together with the three
// subcomponents to the AddressComponent base.
explicit NameLast(AddressComponent* parent);
~NameLast() override;
// Returns the expression used to parse the last name; the choice depends on
// whether the value has Hispanic/Latinx characteristics.
std::vector<const RE2*> GetParseRegularExpressionsByRelevance()
const override;
private:
// As the fallback, write everything to the second last name.
void ParseValueAndAssignSubcomponentsByFallbackMethod() override;
// Subcomponents, passed to the base class in this order.
NameLastFirst first_;
NameLastConjunction conjunction_;
NameLastSecond second_;
};
// Compound that represents a full name. It contains a honorific, a first
// name, a middle name and a last name. The last name is a compound itself.
//
//                     +----------+
//                     | NAME_FULL|
//                     +----------+
//                    /    |    |   \
//                   /     |    |    \
//                  /      |    |     \
//                 /       |    |      \
// +------------+ +--------+ +---------+ +-------+
// | _HONORIFIC | | _FIRST | | _MIDDLE | | _LAST |
// +------------+ +--------+ +---------+ +-------+
//                                      /    |    \
//                                     /     |     \
//                                    /      |      \
//                                   /       |       \
//                         +--------+ +-----------+ +---------+
//                         | _FIRST | | _CONJUNC. | | _SECOND |
//                         +--------+ +-----------+ +---------+
//
class NameFull : public AddressComponent {
public:
// Creates the component with a null parent.
NameFull();
// Creates the component and forwards |parent| together with the four
// subcomponents to the AddressComponent base.
explicit NameFull(AddressComponent* parent);
~NameFull() override;
// Returns the expressions used to parse the full name; the list depends on
// whether the value has CJK or Hispanic/Latinx characteristics.
std::vector<const RE2*> GetParseRegularExpressionsByRelevance()
const override;
private:
// Subcomponents, passed to the base class in this order.
NameHonorific name_honorific_;
NameFirst name_first_;
NameMiddle name_middle_;
NameLast name_last_;
};
} // namespace structured_address
} // namespace autofill
#endif // COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_NAME_H_
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/autofill/core/browser/data_model/autofill_structured_address_name.h"
#include <stddef.h>
#include <map>
#include <string>
#include <vector>
#include "base/strings/utf_string_conversions.h"
#include "testing/gtest/include/gtest/gtest.h"
using base::ASCIIToUTF16;
namespace autofill {
namespace structured_address {
namespace {
// A test record that contains all entries of the hybrid-structure name tree.
// |full| is the unstructured input that is parsed; every other member holds
// the value that is expected for the corresponding component afterwards.
struct NameParserTestRecord {
// Input: the unstructured full name (NAME_FULL).
std::string full;
// Expected NAME_HONORIFIC_PREFIX.
std::string honorific;
// Expected NAME_FIRST.
std::string first;
// Expected NAME_MIDDLE.
std::string middle;
// Expected NAME_LAST.
std::string last;
// Expected NAME_LAST_FIRST.
std::string last_first;
// Expected NAME_LAST_CONJUNCTION.
std::string last_conjunction;
// Expected NAME_LAST_SECOND.
std::string last_second;
};
// A test record that contains all entries of the hybrid-structure last name
// tree. |last_name| is the input that is parsed; the other members hold the
// values that are expected for the subcomponents afterwards.
struct LastNameParserTestRecord {
// Input: the unstructured last name (NAME_LAST).
std::string last_name;
// Expected NAME_LAST_FIRST.
std::string first;
// Expected NAME_LAST_CONJUNCTION.
std::string conjunction;
// Expected NAME_LAST_SECOND.
std::string second;
};
// Sets |full| as the observed |NAME_FULL| value of a fresh name tree,
// completes the tree and verifies that every node holds the expected value.
// |full| is used as the scoped trace so that a failure can be attributed to
// the name under test.
void TestNameParsing(const base::string16& full,
                     const base::string16& honorific,
                     const base::string16& first,
                     const base::string16& middle,
                     const base::string16& last,
                     const base::string16& last_first,
                     const base::string16& last_conjunction,
                     const base::string16& last_second) {
  SCOPED_TRACE(full);

  NameFull name_tree;
  name_tree.SetValueForTypeIfPossible(NAME_FULL, full,
                                      VerificationStatus::kObserved);
  name_tree.CompleteFullTree();

  // The root must still hold the unmodified input.
  EXPECT_EQ(name_tree.GetValueForType(NAME_FULL), full);

  // Direct children of the full name.
  EXPECT_EQ(name_tree.GetValueForType(NAME_HONORIFIC_PREFIX), honorific);
  EXPECT_EQ(name_tree.GetValueForType(NAME_FIRST), first);
  EXPECT_EQ(name_tree.GetValueForType(NAME_MIDDLE), middle);
  EXPECT_EQ(name_tree.GetValueForType(NAME_LAST), last);

  // Children of the last name.
  EXPECT_EQ(name_tree.GetValueForType(NAME_LAST_FIRST), last_first);
  EXPECT_EQ(name_tree.GetValueForType(NAME_LAST_CONJUNCTION),
            last_conjunction);
  EXPECT_EQ(name_tree.GetValueForType(NAME_LAST_SECOND), last_second);
}
// Sets |last_name| as the observed |NAME_LAST| value of a fresh last name
// tree, completes the tree and verifies the three subcomponents against
// |target_first|, |target_conjunction| and |target_second|.
void TestLastNameParsing(const base::string16& last_name,
                         const base::string16& target_first,
                         const base::string16& target_conjunction,
                         const base::string16& target_second) {
  SCOPED_TRACE(last_name);

  NameLast component;
  component.SetValueForTypeIfPossible(NAME_LAST, last_name,
                                      VerificationStatus::kObserved);
  component.CompleteFullTree();

  EXPECT_EQ(component.GetValueForType(NAME_LAST_FIRST), target_first);
  EXPECT_EQ(component.GetValueForType(NAME_LAST_CONJUNCTION),
            target_conjunction);
  EXPECT_EQ(component.GetValueForType(NAME_LAST_SECOND), target_second);
}
} // namespace
// Verifies that last names are split into the three nodes of the last name
// tree:
// * The first part, which is only used in Latinx/Hispanic names.
// * The conjunction, which is optional in Latinx/Hispanic names.
// * The second part, which is used for Latinx/Hispanic and all other names.
TEST(AutofillStructuredName, ParseLastName) {
  const LastNameParserTestRecord kTestCases[] = {
      // "von" is a known surname prefix and therefore stays with the second
      // last name.
      {"von Kitzling", "", "", "von Kitzling"},
      {"Bush", "", "", "Bush"},
      {"Picasso", "", "", "Picasso"},
      // "Ruiz" is a common Spanish name, so the split into a first and a
      // second last name is applied. "de la" are known surname prefixes and
      // belong to the subsequent token.
      {"Ruiz de la Torro", "Ruiz", "", "de la Torro"},
      {"Ruiz Picasso", "Ruiz", "", "Picasso"},
      // "y" and "i" are known conjunctions.
      {"Ruiz Y Picasso", "Ruiz", "Y", "Picasso"},
      {"Ruiz y Picasso", "Ruiz", "y", "Picasso"},
      {"Ruiz i Picasso", "Ruiz", "i", "Picasso"}};

  for (const LastNameParserTestRecord& test_case : kTestCases) {
    TestLastNameParsing(ASCIIToUTF16(test_case.last_name),
                        ASCIIToUTF16(test_case.first),
                        ASCIIToUTF16(test_case.conjunction),
                        ASCIIToUTF16(test_case.second));
  }
}
// Tests the parsing of full names into their subcomponents. Every record
// lists the unstructured input followed by the expected honorific, first,
// middle and last name and the expected three parts of the last name.
TEST(AutofillStructuredName, ParseFullName) {
NameParserTestRecord name_tests[] = {
// Name starting with a last name, followed by a comma and the first and
// middle name.
{"Mueller, Hans Peter", "", "Hans", "Peter", "Mueller", "", "",
"Mueller"},
// Same with an honorific prefix and multiple middle names.
{"Prof. Mueller, Hans Walter Peter", "Prof.", "Hans", "Walter Peter",
"Mueller", "", "", "Mueller"},
// Name that includes a hyphen.
{"Dr. Hans-Peter Mueller", "Dr.", "Hans-Peter", "", "Mueller", "", "",
"Mueller"},
// Name with honorific prefix but without a middle name.
{"Prof. Albert Einstein", "Prof.", "Albert", "", "Einstein", "", "",
"Einstein"},
// Name with honorific prefix and a middle name.
{"Dr. Richard Phillips Feynman", "Dr.", "Richard", "Phillips", "Feynman",
"", "", "Feynman"},
// Name with honorific prefix and multiple middle names.
{"Dr. Richard Phillips Isaac Feynman", "Dr.", "Richard", "Phillips Isaac",
"Feynman", "", "", "Feynman"},
// Hispanic/Latinx name with two surnames and a conjunction.
{"Pablo Diego Ruiz y Picasso", "", "Pablo Diego", "", "Ruiz y Picasso",
"Ruiz", "y", "Picasso"},
// Hispanic/Latinx name with two surnames and a conjunction with an
// honorific prefix.
{"Mr. Pablo Ruiz y Picasso", "Mr.", "Pablo", "", "Ruiz y Picasso", "Ruiz",
"y", "Picasso"},
// Name with multiple middle names.
{"George Walker Junior Bush", "", "George", "Walker Junior", "Bush", "",
"", "Bush"},
// Name with a single middle name.
{"George Walker Bush", "", "George", "Walker", "Bush", "", "", "Bush"},
// Name without a middle name.
{"George Bush", "", "George", "", "Bush", "", "", "Bush"},
// Three-character CJK name with a two-character surname.
// NOTE(review): the original comment called this a Korean name, but the
// literal is written in Chinese characters - confirm the intended language.
{"欧阳龙", "", "龙", "", "欧阳", "", "", "欧阳"},
// Four-character CJK name with a two-character surname (same remark as
// above regarding the language).
{"欧阳龙龙", "", "龙龙", "", "欧阳", "", "", "欧阳"},
// Full name including given, middle and family names.
{"Homer Jay Simpson", "", "Homer", "Jay", "Simpson", "", "", "Simpson"},
// No middle name.
{"Moe Szyslak", "", "Moe", "", "Szyslak", "", "", "Szyslak"},
// Common name prefixes parsed into the honorific prefix.
{"Reverend Timothy Lovejoy", "Reverend", "Timothy", "", "Lovejoy", "", "",
"Lovejoy"},
// Only a last name with a preposition.
{"von Gutenberg", "", "", "", "von Gutenberg", "", "", "von Gutenberg"},
// Common name suffixes removed.
{"John Frink Phd", "", "John", "", "Frink", "", "", "Frink"},
// Only a last name with common name suffixes removed.
{"Frink Phd", "", "", "", "Frink", "", "", "Frink"},
// Since "Ma" is a common last name, "Ma" was removed from the suffixes.
{"John Ma", "", "John", "", "Ma", "", "", "Ma"},
// Common family name prefixes not considered a middle name.
{"Milhouse Van Houten", "", "Milhouse", "", "Van Houten", "", "",
"Van Houten"},
// Chinese name, Unihan
{"孫 德明", "", "德明", "", "孫", "", "", "孫"},
// Chinese name, Unihan, 'IDEOGRAPHIC SPACE'
{"孫　德明", "", "德明", "", "孫", "", "", "孫"},
// Korean name, Hangul
{"홍 길동", "", "길동", "", "홍", "", "", "홍"},
// Japanese name, Unihan
{"山田 貴洋", "", "貴洋", "", "山田", "", "", "山田"},
// In Japanese, foreign names use 'KATAKANA MIDDLE DOT' (U+30FB) as a
// separator. There is no consensus for the ordering. For now, we use
// the same ordering as regular Japanese names ("last・first").
// Foreign name in Japanese, Katakana
{"ゲイツ・ビル", "", "ビル", "", "ゲイツ", "", "", "ゲイツ"},
// 'KATAKANA MIDDLE DOT' is occasionally typoed as 'MIDDLE DOT' (U+00B7).
{"ゲイツ·ビル", "", "ビル", "", "ゲイツ", "", "", "ゲイツ"},
// CJK names don't usually have a space in the middle, but most of the
// time, the surname is only one character (in Chinese & Korean).
{"최성훈", "", "성훈", "", "최", "", "", "최"}, // Korean name, Hangul
// (Simplified) Chinese name, Unihan
{"刘翔", "", "翔", "", "刘", "", "", "刘"},
// (Traditional) Chinese name, Unihan
{"劉翔", "", "翔", "", "劉", "", "", "劉"},
// Korean name, Hangul
{"남궁도", "", "도", "", "남궁", "", "", "남궁"},
// Korean name, Hangul
{"황보혜정", "", "혜정", "", "황보", "", "", "황보"},
// (Traditional) Chinese name, Unihan
{"歐陽靖", "", "靖", "", "歐陽", "", "", "歐陽"},
// In Korean, some 2-character surnames are rare/ambiguous, like "강전":
// "강" is a common surname, and "전" can be part of a given name. In
// those cases, we assume it's 1/2 for 3-character names, or 2/2 for
// 4-character names.
// Korean name, Hangul
{"강전희", "", "전희", "", "강", "", "", "강"},
// Korean name, Hangul
{"황목치승", "", "치승", "", "황목", "", "", "황목"},
// It occasionally happens that a full name is 2 characters, 1/1.
// Korean name, Hangul
{"이도", "", "도", "", "이", "", "", "이"},
// Chinese name, Unihan
{"孫文", "", "文", "", "孫", "", "", "孫"}};
// The records may contain non-ASCII characters, hence the UTF-8 conversion.
for (const auto& name_test : name_tests) {
TestNameParsing(base::UTF8ToUTF16(name_test.full),
base::UTF8ToUTF16(name_test.honorific),
base::UTF8ToUTF16(name_test.first),
base::UTF8ToUTF16(name_test.middle),
base::UTF8ToUTF16(name_test.last),
base::UTF8ToUTF16(name_test.last_first),
base::UTF8ToUTF16(name_test.last_conjunction),
base::UTF8ToUTF16(name_test.last_second));
}
}
// Tests the detection of CJK name characteristics: names consisting of CJK
// characters with at most one separator are accepted, everything else is
// rejected.
TEST(AutofillStructuredName, HasCjkNameCharacteristics) {
EXPECT_FALSE(HasCjkNameCharacteristics("Peterson"));
EXPECT_TRUE(HasCjkNameCharacteristics("ㅎ"));
EXPECT_TRUE(HasCjkNameCharacteristics("房仕龙"));
EXPECT_TRUE(HasCjkNameCharacteristics("房仕龙龙"));
// NOTE(review): this looks like a repetition of the three-character case
// above - verify whether a different input was intended.
EXPECT_TRUE(HasCjkNameCharacteristics("房仕龙"));
// A single separator is accepted.
EXPECT_TRUE(HasCjkNameCharacteristics("房仕・龙"));
// A separator on its own is not a name.
EXPECT_FALSE(HasCjkNameCharacteristics("・"));
// More than one separator is rejected.
EXPECT_FALSE(HasCjkNameCharacteristics("房・仕・龙"));
// Non-CJK language with only ASCII characters.
EXPECT_FALSE(HasCjkNameCharacteristics("Homer Jay Simpson"));
// Non-CJK language with some ASCII characters.
EXPECT_FALSE(HasCjkNameCharacteristics("Éloïse Paré"));
// Non-CJK language with no ASCII characters.
EXPECT_FALSE(HasCjkNameCharacteristics("Σωκράτης"));
// (Simplified) Chinese name, Unihan.
EXPECT_TRUE(HasCjkNameCharacteristics("刘翔"));
// (Simplified) Chinese name, Unihan, with an ASCII space.
EXPECT_TRUE(HasCjkNameCharacteristics("成 龙"));
// Korean name, Hangul.
EXPECT_TRUE(HasCjkNameCharacteristics("송지효"));
// Korean name, Hangul, with an 'IDEOGRAPHIC SPACE' (U+3000).
EXPECT_TRUE(HasCjkNameCharacteristics("김　종국"));
// Japanese name, Unihan.
EXPECT_TRUE(HasCjkNameCharacteristics("山田貴洋"));
// Japanese name, Katakana, with a 'KATAKANA MIDDLE DOT' (U+30FB).
EXPECT_TRUE(HasCjkNameCharacteristics("ビル・ゲイツ"));
// Japanese name, Katakana, with a 'MIDDLE DOT' (U+00B7) (likely a
// typo).
EXPECT_TRUE(HasCjkNameCharacteristics("ビル·ゲイツ"));
// CJK names don't have a middle name, so a 3-part name is bogus to us.
EXPECT_FALSE(HasCjkNameCharacteristics("반 기 문"));
}
// Verifies the detection of Hispanic/Latinx name characteristics.
TEST(AutofillStructuredName, HasHispanicLatinxNameCharaceristics) {
  // Detected because the name contains a common Hispanic/Latinx surname.
  EXPECT_TRUE(HasHispanicLatinxNameCharaceristics("Pablo Ruiz Picasso"));
  // Detected because the name contains a surname conjunction.
  EXPECT_TRUE(HasHispanicLatinxNameCharaceristics("SomeName y SomeOtherName"));

  // Neither a common surname nor a conjunction.
  EXPECT_FALSE(HasHispanicLatinxNameCharaceristics("Werner Heisenberg"));
}
// Verifies the detection of middle names that consist only of initials.
TEST(AutofillStructuredName, HasMiddleNameInitialsCharacteristics) {
  // One or more upper case letters.
  EXPECT_TRUE(HasMiddleNameInitialsCharacteristics("D"));
  EXPECT_TRUE(HasMiddleNameInitialsCharacteristics("DD"));

  // Upper case letters with points, optionally separated by spaces.
  EXPECT_TRUE(HasMiddleNameInitialsCharacteristics("D.D."));
  EXPECT_TRUE(HasMiddleNameInitialsCharacteristics("D. D. D."));

  // Upper case letters separated by a hyphen, with and without points.
  EXPECT_TRUE(HasMiddleNameInitialsCharacteristics("D-D"));
  EXPECT_TRUE(HasMiddleNameInitialsCharacteristics("D.-D."));

  // A regular name and a lower case letter are not initials.
  EXPECT_FALSE(HasMiddleNameInitialsCharacteristics("Diego"));
  EXPECT_FALSE(HasMiddleNameInitialsCharacteristics("d"));
}
// Verifies that names are reduced to their upper case initials.
TEST(AutofillStructuredName, ReduceToInitials) {
  // An empty name has no initials.
  EXPECT_EQ(ReduceToInitials(ASCIIToUTF16("")), ASCIIToUTF16(""));

  // A single token yields a single initial.
  EXPECT_EQ(ReduceToInitials(ASCIIToUTF16("George")), ASCIIToUTF16("G"));

  // Tokens may be separated by a space or by a hyphen.
  EXPECT_EQ(ReduceToInitials(ASCIIToUTF16("George Walker")),
            ASCIIToUTF16("GW"));
  EXPECT_EQ(ReduceToInitials(ASCIIToUTF16("Hans-Peter")), ASCIIToUTF16("HP"));

  // Lower case names are converted to upper case initials.
  EXPECT_EQ(ReduceToInitials(ASCIIToUTF16("michael myers")),
            ASCIIToUTF16("MM"));
}
// Verifies the value that is returned for |NAME_MIDDLE_INITIAL| after
// different |NAME_MIDDLE| values have been set on the same name tree.
TEST(AutofillStructuredName, GetNameMiddleInitial) {
  struct MiddleInitialCase {
    // Value that is set as the observed |NAME_MIDDLE|.
    const char* middle;
    // Value that is expected for |NAME_MIDDLE_INITIAL| afterwards.
    const char* expected_initial;
  };
  const MiddleInitialCase kCases[] = {
      // Regular middle names are reduced to upper case initials.
      {"Michael", "M"},
      {"Michael Myers", "MM"},
      {"george walker", "GW"},
      // If the set value already has the characteristics of initials, it
      // should be returned as it is.
      {"GW", "GW"},
      {"G. W.", "G. W."},
      {"G.-W.", "G.-W."}};

  // All cases are applied one after the other to a single instance.
  NameFull full_name;
  for (const MiddleInitialCase& test_case : kCases) {
    SCOPED_TRACE(test_case.middle);
    full_name.SetValueForTypeIfPossible(NAME_MIDDLE,
                                        base::ASCIIToUTF16(test_case.middle),
                                        VerificationStatus::kObserved);
    EXPECT_EQ(full_name.GetValueForType(NAME_MIDDLE_INITIAL),
              base::ASCIIToUTF16(test_case.expected_initial));
  }
}
// Verifies that the full name tree reports all name types, including the
// additional |NAME_MIDDLE_INITIAL| type of the middle name.
TEST(AutofillStructuredName, TestGetSupportedTypes) {
  const ServerFieldTypeSet expected_types(
      {NAME_FULL, NAME_HONORIFIC_PREFIX, NAME_FIRST, NAME_MIDDLE,
       NAME_MIDDLE_INITIAL, NAME_LAST, NAME_LAST_FIRST, NAME_LAST_CONJUNCTION,
       NAME_LAST_SECOND});

  NameFull full_name;
  ServerFieldTypeSet supported_types;
  full_name.GetSupportedTypes(&supported_types);

  EXPECT_EQ(expected_types, supported_types);
}
// Verifies that a value set through |NAME_MIDDLE_INITIAL| is accepted and is
// afterwards returned for both |NAME_MIDDLE_INITIAL| and |NAME_MIDDLE|.
TEST(AutofillStructuredName, TestSettingMiddleNameInitial) {
  const base::string16 initial = base::UTF8ToUTF16("M");

  NameFull full_name;
  // A fresh tree has no middle name.
  EXPECT_EQ(full_name.GetValueForType(NAME_MIDDLE), base::string16());

  EXPECT_TRUE(full_name.SetValueForTypeIfPossible(
      NAME_MIDDLE_INITIAL, initial, VerificationStatus::kObserved));

  EXPECT_EQ(full_name.GetValueForType(NAME_MIDDLE_INITIAL), initial);
  EXPECT_EQ(full_name.GetValueForType(NAME_MIDDLE), initial);
}
} // namespace structured_address
} // namespace autofill
......@@ -3,7 +3,10 @@
// found in the LICENSE file.
#include "components/autofill/core/browser/data_model/autofill_structured_address_regex_provider.h"
#include <utility>
#include "base/strings/strcat.h"
#include "components/autofill/core/browser/data_model/autofill_structured_address_constants.h"
#include "components/autofill/core/browser/data_model/autofill_structured_address_utils.h"
......@@ -13,6 +16,317 @@ namespace autofill {
namespace structured_address {
namespace {
// Best practices for writing regular expression snippets:
// By wrapping snippets in non-capture groups, i.e. (?: ... ), we ensure that a
// pending "?" is interpreted as "optional" instead of a modifier of a previous
// operator. E.g. `StrCat({"(?:a+)", "?"})` means an optional sequence of "a"
// characters. But `StrCat({"a+", "?"})` means lazily match one or more "a"
// characters. Prefer [^\s,] ('not a whitespace or a comma') over \w ('a word
// character') in names, when you have concerns about hyphens (e.g. the German
// name "Hans-Joachim") because '-' is not matched by \w.
// Regular expression pattern of common two-character CJK last names.
// Korean names are written in Hangul.
// Chinese names are written in their traditional and simplified version.
// The alternatives are wrapped in a non-capture group so that the pattern can
// be embedded into larger expressions as a single unit.
// Source:
// https://en.wikipedia.org/wiki/List_of_Korean_surnames
// https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93
const char kTwoCharacterCjkLastNamesRe[] =
"(?:남궁|사공|서문|선우|제갈|황보|독고|망절"
"|欧阳|令狐|皇甫|上官|司徒|诸葛|司马|宇文|呼延|端木"
"|張簡|歐陽|諸葛|申屠|尉遲|司馬|軒轅|夏侯)";
// Regular expression pattern for exactly one Hangul (Korean) character,
// wrapped in a non-capturing group.
const char kHangulCharacterRe[] = "(?:\\p{Hangul})";
// Regular expression pattern for a sequence of one or more Hangul (Korean)
// characters, wrapped in a non-capturing group.
const char kHangulCharactersRe[] = "(?:\\p{Hangul}+)";
// Regular expression pattern to match separators as used in CJK names.
// Included separators:
// * U+30FB (KATAKANA MIDDLE DOT)
// * U+00B7 (MIDDLE DOT)
// * U+3000 (IDEOGRAPHIC SPACE)
// * one or more whitespace characters as matched by \s.
// The non-ASCII separators are written as code-point escapes rather than as
// raw characters: U+3000 is visually indistinguishable from a plain space in
// most editors and is easily lost to whitespace normalization, and it is not
// covered by RE2's ASCII-only \s.
const char kCjkNameSeperatorsRe[] =
    "(?:\\x{30FB}|\\x{00B7}|\\x{3000}|\\s+)";
// Regular expression pattern for common honorific name prefixes.
// The pattern is one non-capturing alternation of literal prefixes; a few of
// them allow an optional trailing point (e.g. "Mr\\.?").
// The list is incomplete and focused on the English and German language.
// Some alternatives are listed more than once (e.g. "Master", "Ma'am",
// "His Holiness"); the repetition does not change what the alternation
// matches.
// Sources:
// * https://en.wikipedia.org/wiki/English_honorifics
// * https://en.wikipedia.org/wiki/German_honorifics
// TODO(crbug.com/1107770): Include more languages and categories.
const char kHonorificPrefixRe[] =
"(?:"
"Master|Mr\\.?|Miss\\.?|Mrs\\.?|Missus|Ms\\.?|Mx\\.?|M\\.?|Ma'am|Sir|"
"Gentleman|Sire|Mistress|Madam|Ma'am|Dame|Lord|Lady|Esq|Excellency|"
"Excellence|Her Honour|His Honour|Hon\\.?|The Right Honourable|The Most "
"Honourable|Dr\\.?|PhD|DPhil|MD|DO|Prof\\.|Professor|QC|CL|Chancellor|Vice-"
"Chancellor|Principle|Principal|President|Master|Warden|Dean|Regent|Rector|"
"Provost|Director|Chief Executive|Imām|Shaykh|Muftī|Hāfiz|Hāfizah|Qārī"
"|Mawlānā|Hājī|Sayyid|Sayyidah|Sharif|Eminent|Venerable|His Holiness"
"|His Holiness|His All Holiness|His Beatitude|The Most Blessed"
"|His Excellency|His Most Eminent Highness|His Eminence"
"|Most Reverend Eminence|The Most Reverend|His Grace|His Lordship"
"|The Reverend|Fr|Pr|Br|Sr|Elder|Rabbi|The Reverend|Cantor|Chief Rabbi"
"|Grand "
"Rabbi|Rebbetzin|Herr|Frau|Fräulein|Dame|PD|Doktor|Magister|Ingenieur"
"|1lt|1st|2lt|2nd|3rd|admiral|capt|captain|col|cpt|dr|gen|general|lcdr"
"|lt|ltc|ltg|ltjg|maj|major|mg|pastor|prof|rep|reverend"
"|rev|sen|st)";
// Regular expression pattern for an optional last name suffix such as "jr",
// "sr", "ph.d" or a roman numeral. The trailing "?" after the non-capturing
// group is what makes the suffix optional.
const char kOptionalLastNameSuffixRe[] =
"(?:b\\.a|ba|d\\.d\\.s|dds|ii|iii|iv|ix|jr|m\\.a|m\\.d|md|ms|"
"ph\\.?d|sr|v|vi|vii|viii|x)?";
// Regular expression pattern for a single CJK character, i.e. one character
// from the Han, Hangul, Katakana, Hiragana or Bopomofo script.
const char kCjkCharacterRe[] =
"(?:"
"\\p{Han}|"
"\\p{Hangul}|"
"\\p{Katakana}|"
"\\p{Hiragana}|"
"\\p{Bopomofo})";
// Regular expression pattern for a sequence of one or more CJK characters.
// Uses the same set of scripts as |kCjkCharacterRe|, repeated with "+".
const char kCjkCharactersRe[] =
"(?:(?:"
"\\p{Han}|"
"\\p{Hangul}|"
"\\p{Katakana}|"
"\\p{Hiragana}|"
"\\p{Bopomofo})+)";
// Regular expression pattern of common two-character Korean last names,
// expressed as a non-capturing alternation of literals.
// Korean last names are written in Hangul. Note, some last names are ambiguous
// in the sense that they share a common prefix with a single-character last
// name. Source: https://en.wikipedia.org/wiki/List_of_Korean_surnames
const char kTwoCharacterKoreanNamesRe[] =
"(?:강전|남궁|독고|동방|망절|사공|서문|선우"
"|소봉|어금|장곡|제갈|황목|황보)";
// Regular expression pattern to match if a string contains a common
// Hispanic/Latinx last name.
// The pattern is an unanchored, non-capturing alternation of literal last
// names; several names are listed both with and without diacritics
// (e.g. "Diaz" and "Díaz").
// It contains the most common names in Spain, Mexico, Cuba, Dominican Republic,
// Puerto Rico and Guatemala.
// Source: https://en.wikipedia.org/wiki/List_of_common_Spanish_surnames
const char kHispanicCommonLastNameCharacteristicsRe[] =
"(?:Aguilar|Alonso|Álvarez|Amador|Betancourt|Blanco|Burgos|Castillo|Castro|"
"Chávez|Colón|Contreras|Cortez|Cruz|Delgado|Diaz|Díaz|Domínguez|Estrada|"
"Fernandez|Fernández|Flores|Fuentes|Garcia|García|Garza|Gil|Gómez|González|"
"Guerrero|Gutiérrez|Guzmán|Hernández|Herrera|Iglesias|Jiménez|Juárez|Lopez|"
"López|Luna|Marín|Marroquín|Martín|Martinez|Martínez|Medina|Méndez|Mendoza|"
"Molina|Morales|Moreno|Muñoz|Narvaez|Navarro|Núñez|Ortega|Ortiz|Ortíz|Peña|"
"Perez|Pérez|Ramírez|Ramos|Reyes|Rivera|Rodriguez|Rodríguez|Rojas|Romero|"
"Rosario|Rubio|Ruiz|Ruíz|Salazar|Sanchez|Sánchez|Santana|Santiago|Santos|"
"Sanz|Serrano|Soto|Suárez|Toro|Torres|Vargas|Vasquez|Vásquez|Vázquez|"
"Velásquez)";
// Regular expression pattern to match a single word, where a word is a
// non-empty run of characters that are neither whitespace nor a comma.
const char kSingleWordRe[] = "(?:[^\\s,]+)";
// Regular expression pattern for multiple lazy words, meaning that the
// expression avoids matching more than one word if possible: the first word
// is required, and each further whitespace-separated word is added lazily
// ("*?").
const char kMultipleLazyWordsRe[] = "(?:[^\\s,]+(?:\\s+[^\\s,]+)*?)";
// Regular expression pattern to check if a name contains a Hispanic/Latinx
// last name conjunction, i.e. one of "y", "e" or "i" with a whitespace
// character on both sides. The pattern is not anchored. Unlike most other
// snippets here, the alternation uses a capturing group.
const char kHispanicLastNameConjunctionCharacteristicsRe[] = "\\s(y|e|i)\\s";
// Regular expression pattern to match the conjunction used between
// Hispanic/Latinx last names. Contains only the conjunction itself, without
// the surrounding whitespace.
const char kHispanicLastNameConjunctionsRe[] = "(?:y|e|i)";
// Regular expression pattern to match common prefixes belonging to a (single)
// last name.
// The prefix must be followed by one whitespace character ("\\s"), and the
// prefix together with that whitespace is optional (trailing "?").
// Source: https://en.wikipedia.org/wiki/List_of_family_name_affixes
// According to the source, the list is partial. Changes to the list:
// * "De la" and "De le" are added to support the combination of "de" and
//   "le"/"la" as used in Hispanic/Latinx names.
// * The matching of "i" is made lazy ("i??") to give the last name
//   conjunction precedence.
// A few prefixes appear more than once in the literal (e.g. "du", "ter");
// the repetition does not change what the alternation matches.
const char kOptionalLastNamePrefixRe[] =
"(?:(?:"
"a|ab|af|av|ap|abu|aït|al|ālam|aust|austre|bar|bath|bat|ben|bin|ibn|bet|"
"bint|binti|binte|da|das|de|degli|dele|del|du|della|der|di|dos|du|e|el|"
"fetch|vetch|fitz|i??|kil|gil|de le|de "
"la|la|le|lille|lu|m|mac|mc|mck|mhic|mic|mala|"
"mellom|myljom|na|ned|nedre|neder|nic|ni|nin|nord|norr|ny|o|ua|"
"ui|opp|upp|öfver|ost|öst|öster|øst|øst|østre|över|øvste|øvre|øver|öz|pour|"
"putra|putri|setia|tor|söder|sør|sønder|sør|syd|søndre|syndre|søre|ter|ter|"
"tre|van|väst|väster|verch|erch|vest|vestre|vesle|vetle|von|zu|von und "
"zu)\\s)?";
// Regular expression to characterize if a string consists of initials by
// checking that:
// * The whole string (the pattern is anchored with "^" and "$") contains only
//   letters from the range A-Z, each of which may be followed by a point.
// * Between two letters, there can be a single whitespace character or a
//   hyphen.
const char kMiddleNameInitialsCharacteristicsRe[] =
"^(?:[A-Z]\\.?(?:(?:\\s|-)?[A-Z]\\.?)*)$";
// Builds the expression for a CJK name that is split in two by a separator.
// The complete match is captured as |NAME_FULL|. The CJK characters in front
// of the separator are captured as |NAME_LAST| and the CJK characters behind
// the separator as |NAME_FIRST|.
std::string ParseSeparatedCJkNameExpression() {
  // One or more CJK characters that are terminated by a CJK name separator.
  const std::string last_name = CaptureTypeWithPattern(
      NAME_LAST, kCjkCharactersRe, {.separator = kCjkNameSeperatorsRe});
  // The CJK characters that follow the separator.
  const std::string first_name =
      CaptureTypeWithPattern(NAME_FIRST, kCjkCharactersRe);

  return CaptureTypeWithPattern(NAME_FULL, {last_name, first_name});
}
// Builds the expression for a CJK name that begins with one of the known
// two-character last names. The complete match is captured as |NAME_FULL|,
// the known last name as |NAME_LAST| and any CJK characters after it as
// |NAME_FIRST|.
std::string ParseCommonCjkTwoCharacterLastNameExpression() {
  // A known two-character CJK last name; no separator is expected behind it.
  const std::string last_name = CaptureTypeWithPattern(
      NAME_LAST, kTwoCharacterCjkLastNamesRe, {.separator = std::string()});
  // The optional remainder of the name, again without a separator.
  const std::string first_name = CaptureTypeWithPattern(
      NAME_FIRST, kCjkCharactersRe,
      {.separator = "", .quantifier = MATCH_OPTIONAL});

  return CaptureTypeWithPattern(NAME_FULL, {last_name, first_name});
}
// Builds the expression for a CJK name that is written without a separator.
// The complete match is captured as |NAME_FULL|, its first character as
// |NAME_LAST| and all following characters as |NAME_FIRST|.
std::string ParseCjkSingleCharacterLastNameExpression() {
  // Exactly one CJK character; no separator is expected behind it.
  const std::string last_name = CaptureTypeWithPattern(
      NAME_LAST, kCjkCharacterRe, {.separator = std::string()});
  // The optional remainder of the name, again without a separator.
  const std::string first_name = CaptureTypeWithPattern(
      NAME_FIRST, kCjkCharactersRe,
      {.separator = "", .quantifier = MATCH_OPTIONAL});

  return CaptureTypeWithPattern(NAME_FULL, {last_name, first_name});
}
// Builds the expression for a Korean name of at least four characters that
// begins with a common two-character Korean last name. The complete match is
// captured as |NAME_FULL|, the two-character last name as |NAME_LAST| and the
// remaining characters as |NAME_FIRST|.
std::string ParseKoreanTwoCharacterLastNameExpression() {
  // A known two-character Korean last name; no separator is expected behind
  // it.
  const std::string last_name = CaptureTypeWithPattern(
      NAME_LAST, kTwoCharacterKoreanNamesRe, {.separator = std::string()});
  // One Hangul character followed by one or more Hangul characters, i.e. a
  // first name that is at least two characters long.
  const std::string first_name = CaptureTypeWithPattern(
      NAME_FIRST, {kHangulCharacterRe, kHangulCharactersRe});

  return CaptureTypeWithPattern(NAME_FULL, {last_name, first_name});
}
// Builds the anchored expression that decides whether a string has the
// characteristics of a CJK name: one or more CJK characters, optionally
// followed by a separator and one or more additional CJK characters.
std::string MatchCjkNameExpression() {
  // Optional second part: a separator followed by further CJK characters.
  const std::string optional_second_part =
      base::StrCat({"(", kCjkNameSeperatorsRe, kCjkCharactersRe, ")?"});

  // Anchor the mandatory first part and the optional second part to the
  // beginning and the end of the string.
  return base::StrCat({"^", kCjkCharactersRe, optional_second_part, "$"});
}
// Builds the expression for a full name that consists of nothing but a last
// name. The complete match is captured as |NAME_FULL|. The single word,
// together with an optional last name prefix, is captured as |NAME_LAST|; an
// optional last name suffix may follow.
std::string ParseOnlyLastNameExpression() {
  const std::string last_name = CaptureTypeWithPattern(
      NAME_LAST, {kOptionalLastNamePrefixRe, kSingleWordRe});

  return CaptureTypeWithPattern(NAME_FULL,
                                {last_name, kOptionalLastNameSuffixRe});
}
// Returns an expression to parse a name that consists of a first, middle and
// last name with an optional honorific prefix. The full name is parsed into
// |NAME_FULL|. The name can start with an honorific prefix that is parsed
// into |NAME_HONORIFIC_PREFIX|. The last token is parsed into |NAME_LAST|.
// This token may be preceded by a last name prefix like "Mac" or
// "von" that is included in |NAME_LAST|. If the strings contains any
// remaining tokens, the first token is parsed into
// |NAME_FIRST| and all remaining tokens into |NAME_MIDDLE|.
std::string ParseFirstMiddleLastNameExpression() {
return CaptureTypeWithPattern(
NAME_FULL,
{CaptureTypeWithPattern(NAME_HONORIFIC_PREFIX, kHonorificPrefixRe,
{.quantifier = MATCH_OPTIONAL}),
CaptureTypeWithPattern(NAME_FIRST, kSingleWordRe,
{.quantifier = MATCH_OPTIONAL}),
CaptureTypeWithPattern(NAME_MIDDLE, kMultipleLazyWordsRe,
{.quantifier = MATCH_LAZY_OPTIONAL}),
CaptureTypeWithPattern(NAME_LAST,
{kOptionalLastNamePrefixRe, kSingleWordRe}),
kOptionalLastNameSuffixRe});
}
// Returns an expression to parse a name that starts with the last name,
// followed by a comma, and than the first and middle names.
// The full name is parsed into |NAME_FULL|. The name can start with an optional
// honorific prefix that is parsed into |HONORIFIC_PREFIX|, follow by a single
// token that is parsed into |LAST_NAME|. The |LAST_NAME| must be preceded by a
// comma with optional spaces. The next token is parsed into |NAME_FIRST| and
// all remaining tokens are parsed into |NAME_MIDDLE|.
std::string ParseLastCommaFirstMiddleExpression() {
return CaptureTypeWithPattern(
NAME_FULL,
{CaptureTypeWithPattern(NAME_HONORIFIC_PREFIX, kHonorificPrefixRe,
{.quantifier = MATCH_OPTIONAL}),
CaptureTypeWithPattern(NAME_LAST,
{kOptionalLastNamePrefixRe, kSingleWordRe},
{.separator = "\\s*,\\s*"}),
CaptureTypeWithPattern(NAME_FIRST, kSingleWordRe,
{.quantifier = MATCH_OPTIONAL}),
CaptureTypeWithPattern(NAME_MIDDLE, kMultipleLazyWordsRe,
{.quantifier = MATCH_LAZY_OPTIONAL})});
}
// Builds the expression for a Hispanic/Latinx last name that consists of two
// parts with an optional conjunction in between. The complete last name is
// captured as |NAME_LAST|, the first part as |NAME_LAST_FIRST|, the
// conjunction as |NAME_LAST_CONJUNCTION| and the second part as
// |NAME_LAST_SECOND|. Each part is a single token with an optional prefix
// like "de le".
// NOTE(review): an earlier version of this comment stated that a lone last
// name part is parsed into |NAME_LAST_SECOND|. This expression requires both
// parts, so that case is presumably covered by
// |ParseLastNameIntoSecondLastNameExpression()| - confirm.
std::string ParseHispanicLastNameExpression() {
  const std::string first_last_name = CaptureTypeWithPattern(
      NAME_LAST_FIRST, {kOptionalLastNamePrefixRe, kSingleWordRe});
  const std::string conjunction = CaptureTypeWithPattern(
      NAME_LAST_CONJUNCTION, kHispanicLastNameConjunctionsRe,
      {.quantifier = MATCH_OPTIONAL});
  const std::string second_last_name = CaptureTypeWithPattern(
      NAME_LAST_SECOND, {kOptionalLastNamePrefixRe, kSingleWordRe});

  return CaptureTypeWithPattern(
      NAME_LAST, {first_last_name, conjunction, second_last_name});
}
// Returns an expression to parse a full Hispanic/Latinx name that
// contains an optional honorific prefix, a first name, and a last name as
// specified by |ParseHispanicLastNameExpression()|.
std::string ParseHispanicFullNameExpression() {
return CaptureTypeWithPattern(
NAME_FULL,
{CaptureTypeWithPattern(NAME_HONORIFIC_PREFIX, kHonorificPrefixRe,
{.quantifier = MATCH_OPTIONAL}),
CaptureTypeWithPattern(NAME_FIRST, kMultipleLazyWordsRe,
{.quantifier = MATCH_LAZY_OPTIONAL}),
ParseHispanicLastNameExpression()});
}
// Builds the expression that captures the complete |NAME_LAST| and assigns
// all of its tokens to |NAME_LAST_SECOND|.
std::string ParseLastNameIntoSecondLastNameExpression() {
  const std::string second_last_name =
      CaptureTypeWithPattern(NAME_LAST_SECOND, kMultipleLazyWordsRe);

  return CaptureTypeWithPattern(NAME_LAST, {second_last_name});
}
} // namespace
// Compiler-generated default constructor; the constructor itself performs no
// additional work.
StructuredAddressesRegExProvider::StructuredAddressesRegExProvider() = default;
// static
......@@ -27,6 +341,34 @@ std::string StructuredAddressesRegExProvider::GetPattern(
switch (expression_identifier) {
case RegEx::kSingleWord:
return kSingleWordRe;
case RegEx::kParseSeparatedCjkName:
return ParseSeparatedCJkNameExpression();
case RegEx::kParseCommonCjkTwoCharacterLastName:
return ParseCommonCjkTwoCharacterLastNameExpression();
case RegEx::kParseKoreanTwoCharacterLastName:
return ParseKoreanTwoCharacterLastNameExpression();
case RegEx::kParseCjkSingleCharacterLastName:
return ParseCjkSingleCharacterLastNameExpression();
case RegEx::kMatchHispanicCommonNameCharacteristics:
return kHispanicCommonLastNameCharacteristicsRe;
case RegEx::kMatchHispanicLastNameConjuctionCharacteristics:
return kHispanicLastNameConjunctionCharacteristicsRe;
case RegEx::kMatchCjkNameCharacteristics:
return MatchCjkNameExpression();
case RegEx::kParseOnlyLastName:
return ParseOnlyLastNameExpression();
case RegEx::kParseLastCommaFirstMiddleName:
return ParseLastCommaFirstMiddleExpression();
case RegEx::kParseFirstMiddleLastName:
return ParseFirstMiddleLastNameExpression();
case RegEx::kParseHispanicLastName:
return ParseHispanicLastNameExpression();
case RegEx::kParseHispanicFullName:
return ParseHispanicFullNameExpression();
case RegEx::kMatchMiddleNameInitialsCharacteristics:
return kMiddleNameInitialsCharacteristicsRe;
case RegEx::kParseLastNameIntoSecondLastName:
return ParseLastNameIntoSecondLastNameExpression();
}
NOTREACHED();
}
......
......@@ -20,7 +20,21 @@ namespace structured_address {
// values in an AddressComponent tree.
// Identifies the regular expressions that can be requested from the
// |StructuredAddressesRegExProvider|. Enumerators prefixed with "kParse"
// select expressions that parse a value into its components; enumerators
// prefixed with "kMatch" select expressions that test whether a value has a
// certain characteristic.
enum class RegEx {
  kSingleWord,
  kParseSeparatedCjkName,
  kParseCommonCjkTwoCharacterLastName,
  kParseKoreanTwoCharacterLastName,
  kParseCjkSingleCharacterLastName,
  kMatchCjkNameCharacteristics,
  kMatchHispanicCommonNameCharacteristics,
  kMatchHispanicLastNameConjuctionCharacteristics,
  kParseOnlyLastName,
  kParseLastCommaFirstMiddleName,
  kParseFirstMiddleLastName,
  kParseHispanicLastName,
  kParseHispanicFullName,
  kParseLastNameIntoSecondLastName,
  kMatchMiddleNameInitialsCharacteristics,
  // Alias of the enumerator with the highest value. It is defined exactly
  // once and must be updated whenever a new enumerator is appended above;
  // otherwise code that walks the range [kSingleWord, kLastRegEx] silently
  // skips the newest expressions.
  kLastRegEx = kMatchMiddleNameInitialsCharacteristics,
};
// This singleton class builds and caches the regular expressions for value
......@@ -73,4 +87,5 @@ class StructuredAddressesRegExProvider {
} // namespace structured_address
} // namespace autofill
#endif // COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_PATTERN_REGEX_H_
#endif // COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_REGEX_PROVIDER_H_
......@@ -13,6 +13,7 @@
#include "base/debug/alias.h"
#include "base/debug/dump_without_crashing.h"
#include "base/strings/strcat.h"
#include "components/autofill/core/browser/data_model/autofill_structured_address_regex_provider.h"
namespace autofill {
namespace structured_address {
......@@ -45,9 +46,13 @@ const RE2* Re2RegExCache::GetRegEx(const std::string& pattern) {
return result.first->second.get();
}
std::unique_ptr<const RE2> BuildRegExFromPattern(std::string pattern) {
std::unique_ptr<const RE2> BuildRegExFromPattern(const std::string& pattern) {
RE2::Options opt;
opt.set_case_sensitive(false);
// By default, patterns are case sensitive.
// Note that the named-capture-group patterns built with
// |CaptureTypeWithPattern()| apply a flag to make the matching case
// insensitive.
opt.set_case_sensitive(true);
auto regex = std::make_unique<const RE2>(pattern, opt);
......@@ -108,12 +113,17 @@ bool ParseValueByRegularExpression(
return true;
}
bool IsPartialMatch(const std::string& value, RegEx regex) {
return IsPartialMatch(
value, StructuredAddressesRegExProvider::Instance()->GetRegEx(regex));
}
bool IsPartialMatch(const std::string& value, const std::string& pattern) {
const RE2* regex = Re2RegExCache::Instance()->GetRegEx(pattern);
if (!regex || !regex->ok())
return false;
return IsPartialMatch(value, Re2RegExCache::Instance()->GetRegEx(pattern));
}
return RE2::PartialMatch(value, *regex);
bool IsPartialMatch(const std::string& value, const RE2* expression) {
return RE2::PartialMatch(value, *expression);
}
std::vector<std::string> GetAllPartialMatches(const std::string& value,
......@@ -172,7 +182,8 @@ std::string CaptureTypeWithPattern(const ServerFieldType& type,
quantifier = "";
}
return base::StrCat({"(?:(?P<", AutofillType(type).ToString(), ">", pattern,
// By adding an "i" in the first group, the capturing is case insensitive.
return base::StrCat({"(?i:(?P<", AutofillType(type).ToString(), ">", pattern,
")(?:", options.separator, "))", quantifier});
}
......
......@@ -21,6 +21,8 @@
namespace autofill {
namespace structured_address {
enum class RegEx;
// Enum to express the few quantifiers needed to parse values.
enum MatchQuantifier {
// The capture group is required.
......@@ -39,7 +41,7 @@ struct CaptureOptions {
// By default, a group must be either followed by a space-like character (\s)
// or it must be the last group in the line. The separator is allowed to be
// empty.
std::string separator = "\\s|$";
std::string separator = "\\s+|$";
// Indicates if the group is required, optional or even lazy optional.
MatchQuantifier quantifier = MATCH_REQUIRED;
};
......@@ -55,7 +57,7 @@ class Re2RegExCache {
static Re2RegExCache* Instance();
// Returns a pointer to a constant compiled expression that matches |pattern|
// case-insensitively.
// case-sensitively.
const RE2* GetRegEx(const std::string& pattern);
#ifdef UNIT_TEST
......@@ -96,12 +98,19 @@ bool ParseValueByRegularExpression(
const RE2* regex,
std::map<std::string, std::string>* result_map);
// Returns a compiled case insensitive regular expression for |pattern|.
std::unique_ptr<const RE2> BuildRegExFromPattern(std::string pattern);
// Returns a compiled case sensitive regular expression for |pattern|.
std::unique_ptr<const RE2> BuildRegExFromPattern(const std::string& pattern);
// Returns true if |value| can be matched by the enumerated RegEx |regex|.
bool IsPartialMatch(const std::string& value, RegEx regex);
// Returns true if |value| can be matched with |pattern|.
bool IsPartialMatch(const std::string& value, const std::string& pattern);
// Same as above, but accepts a compiled regular expression instead of the
// pattern.
bool IsPartialMatch(const std::string& value, const RE2* expression);
// Returns a vector that contains all partial matches of |pattern| in |value|.
std::vector<std::string> GetAllPartialMatches(const std::string& value,
const std::string& pattern);
......
......@@ -193,17 +193,17 @@ TEST(AutofillStructuredAddressUtils, TestGetPlaceholderToken) {
}
// Verifies the exact pattern string produced by |CaptureTypeWithPattern()|:
// * the outer group carries the "i" flag ("(?i:"),
// * the default separator is "\s+|$",
// * the quantifier ("", "?" or "??") is appended behind the outer group,
// * a pattern given as a list of snippets is concatenated, and
// * a custom separator replaces the default one.
TEST(AutofillStructuredAddressUtils, CaptureTypeWithPattern) {
  // Optional group built from a list of snippets.
  EXPECT_EQ("(?i:(?P<NAME_FULL>abs\\w)(?:\\s+|$))?",
            CaptureTypeWithPattern(NAME_FULL, {"abs", "\\w"},
                                   {.quantifier = MATCH_OPTIONAL}));
  // Required group built from a list of snippets.
  EXPECT_EQ("(?i:(?P<NAME_FULL>abs\\w)(?:\\s+|$))",
            CaptureTypeWithPattern(NAME_FULL, {"abs", "\\w"}));
  // Lazy optional group built from a single pattern.
  EXPECT_EQ("(?i:(?P<NAME_FULL>abs\\w)(?:\\s+|$))??",
            CaptureTypeWithPattern(NAME_FULL, "abs\\w",
                                   {.quantifier = MATCH_LAZY_OPTIONAL}));
  // Required group built from a single pattern.
  EXPECT_EQ("(?i:(?P<NAME_FULL>abs\\w)(?:\\s+|$))",
            CaptureTypeWithPattern(NAME_FULL, "abs\\w"));
  // Custom separator.
  EXPECT_EQ("(?i:(?P<NAME_FULL>abs\\w)(?:_))",
            CaptureTypeWithPattern(NAME_FULL, "abs\\w", {.separator = "_"}));
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment