Commit deea5c39 authored by Matthias Körber, committed by Commit Bot

[Autofill][SlimShady] Component for structured names.

This CL adds the static address component tree for names that support
two surnames as they are common for Hispanic/Latinx names.

Change-Id: Icd54f986ab192101fbea5a8576c8abbabfffe636
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2264423
Commit-Queue: Matthias Körber <koerber@google.com>
Reviewed-by: Dominic Battré <battre@chromium.org>
Cr-Commit-Position: refs/heads/master@{#793110}
parent 73f0449d
...@@ -92,6 +92,8 @@ jumbo_static_library("browser") { ...@@ -92,6 +92,8 @@ jumbo_static_library("browser") {
"data_model/autofill_structured_address_component.h", "data_model/autofill_structured_address_component.h",
"data_model/autofill_structured_address_constants.cc", "data_model/autofill_structured_address_constants.cc",
"data_model/autofill_structured_address_constants.h", "data_model/autofill_structured_address_constants.h",
"data_model/autofill_structured_address_name.cc",
"data_model/autofill_structured_address_name.h",
"data_model/autofill_structured_address_regex_provider.cc", "data_model/autofill_structured_address_regex_provider.cc",
"data_model/autofill_structured_address_regex_provider.h", "data_model/autofill_structured_address_regex_provider.h",
"data_model/autofill_structured_address_utils.cc", "data_model/autofill_structured_address_utils.cc",
...@@ -578,6 +580,7 @@ source_set("unit_tests") { ...@@ -578,6 +580,7 @@ source_set("unit_tests") {
"data_model/autofill_profile_comparator_unittest.cc", "data_model/autofill_profile_comparator_unittest.cc",
"data_model/autofill_profile_unittest.cc", "data_model/autofill_profile_unittest.cc",
"data_model/autofill_structured_address_component_unittest.cc", "data_model/autofill_structured_address_component_unittest.cc",
"data_model/autofill_structured_address_name_unittest.cc",
"data_model/autofill_structured_address_regex_provider_unittest.cc", "data_model/autofill_structured_address_regex_provider_unittest.cc",
"data_model/autofill_structured_address_utils_unittest.cc", "data_model/autofill_structured_address_utils_unittest.cc",
"data_model/contact_info_unittest.cc", "data_model/contact_info_unittest.cc",
......
...@@ -19,9 +19,21 @@ using data_util::bit_field_type_groups::kName; ...@@ -19,9 +19,21 @@ using data_util::bit_field_type_groups::kName;
using data_util::bit_field_type_groups::kPhone; using data_util::bit_field_type_groups::kPhone;
TEST(AutofillDataUtilTest, DetermineGroupsForHomeNameAndAddress) { TEST(AutofillDataUtilTest, DetermineGroupsForHomeNameAndAddress) {
const std::vector<ServerFieldType> field_types{ const std::vector<ServerFieldType> field_types{NAME_HONORIFIC_PREFIX,
NAME_FIRST, NAME_LAST, ADDRESS_HOME_LINE1, NAME_FULL,
ADDRESS_HOME_CITY, ADDRESS_HOME_STATE, ADDRESS_HOME_ZIP}; NAME_FIRST,
NAME_MIDDLE,
NAME_MIDDLE_INITIAL,
NAME_LAST,
NAME_LAST_FIRST,
NAME_LAST_CONJUNCTION,
NAME_LAST_SECOND,
NAME_FIRST,
NAME_LAST,
ADDRESS_HOME_LINE1,
ADDRESS_HOME_CITY,
ADDRESS_HOME_STATE,
ADDRESS_HOME_ZIP};
const uint32_t expected_group_bitmask = kName | kAddress; const uint32_t expected_group_bitmask = kName | kAddress;
const uint32_t group_bitmask = data_util::DetermineGroups(field_types); const uint32_t group_bitmask = data_util::DetermineGroups(field_types);
...@@ -195,9 +207,8 @@ INSTANTIATE_TEST_SUITE_P( ...@@ -195,9 +207,8 @@ INSTANTIATE_TEST_SUITE_P(
"황목"}, // Korean name, Hangul "황목"}, // Korean name, Hangul
// It occasionally happens that a full name is 2 characters, 1/1. // It occasionally happens that a full name is 2 characters, 1/1.
FullNameTestCase{"이도", "도", "", "이"}, // Korean name, Hangul FullNameTestCase{"이도", "도", "", "이"}, // Korean name, Hangul
FullNameTestCase{"孫文", "文", "", "孫"} // Chinese name, Unihan FullNameTestCase{"孫文", "文", "", "孫"})); // Chinese name, Unihan
));
class JoinNamePartsTest : public testing::TestWithParam<FullNameTestCase> {}; class JoinNamePartsTest : public testing::TestWithParam<FullNameTestCase> {};
...@@ -229,9 +240,7 @@ INSTANTIATE_TEST_SUITE_P( ...@@ -229,9 +240,7 @@ INSTANTIATE_TEST_SUITE_P(
// These are no CJK names for us, they're just bogus. // These are no CJK names for us, they're just bogus.
FullNameTestCase{"Homer シンプソン", "Homer", "", "シンプソン"}, FullNameTestCase{"Homer シンプソン", "Homer", "", "シンプソン"},
FullNameTestCase{"ホーマー Simpson", "ホーマー", "", "Simpson"}, FullNameTestCase{"ホーマー Simpson", "ホーマー", "", "Simpson"},
FullNameTestCase{"반 기 문", "반", "기", "문"} FullNameTestCase{"반 기 문", "반", "기", "문"}));
// Has a middle-name, too unusual
));
struct ValidCountryCodeTestCase { struct ValidCountryCodeTestCase {
std::string country_code; std::string country_code;
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
namespace autofill { namespace autofill {
namespace structured_address { namespace structured_address {
const char kSingleWordRe[] = "(?:\\w+)"; const char kNameSeparators[] = " -";
} // namespace structured_address } // namespace structured_address
} // namespace autofill } // namespace autofill
...@@ -8,8 +8,8 @@ ...@@ -8,8 +8,8 @@
namespace autofill { namespace autofill {
namespace structured_address { namespace structured_address {
// Regular expression pattern to match a single word. // List of name separators.
extern const char kSingleWordRe[]; extern const char kNameSeparators[];
} // namespace structured_address } // namespace structured_address
} // namespace autofill } // namespace autofill
......
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/autofill/core/browser/data_model/autofill_structured_address_name.h"
#include <utility>
#include "base/i18n/case_conversion.h"
#include "base/strings/strcat.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "components/autofill/core/browser/autofill_type.h"
#include "components/autofill/core/browser/data_model/autofill_structured_address_constants.h"
#include "components/autofill/core/browser/data_model/autofill_structured_address_regex_provider.h"
#include "components/autofill/core/browser/data_model/autofill_structured_address_utils.h"
#include "components/autofill/core/browser/field_types.h"
namespace autofill {
namespace structured_address {
// Reduces |value| to the upper-cased initials of its tokens.
//
// |value| is split at the characters listed in |kNameSeparators|; surrounding
// whitespace is trimmed and empty tokens are dropped. The first character of
// every remaining token is collected and the collected string is upper-cased
// with base::i18n::ToUpper(). An empty |value| yields an empty string.
base::string16 ReduceToInitials(const base::string16& value) {
  if (value.empty())
    return base::string16();

  const std::vector<base::string16> tokens =
      base::SplitString(value, base::ASCIIToUTF16(kNameSeparators),
                        base::WhitespaceHandling::TRIM_WHITESPACE,
                        base::SplitResult::SPLIT_WANT_NONEMPTY);

  base::string16 initials;
  initials.reserve(tokens.size());
  for (const auto& token : tokens) {
    DCHECK(!token.empty());
    initials += token[0];
    // A character outside the Basic Multilingual Plane occupies two UTF-16
    // code units (lead surrogate 0xD800-0xDBFF followed by a trail surrogate
    // 0xDC00-0xDFFF). Copy the trail unit as well so that the initial is not
    // cut in half and turned into an unpaired surrogate.
    const bool starts_with_lead_surrogate = (token[0] & 0xFC00) == 0xD800;
    if (starts_with_lead_surrogate && token.size() > 1 &&
        (token[1] & 0xFC00) == 0xDC00) {
      initials += token[1];
    }
  }
  return base::i18n::ToUpper(initials);
}
// Reports whether |name| shows at least one Hispanic/Latinx signal. Two
// expressions are consulted in order, and the second one is only evaluated
// when the first one does not match:
//  1. RegEx::kMatchHispanicCommonNameCharacteristics
//  2. RegEx::kMatchHispanicLastNameConjuctionCharacteristics
// If neither matches, there is no sufficient reason to treat |name| as a
// Hispanic/Latinx name and false is returned.
bool HasHispanicLatinxNameCharaceristics(const std::string& name) {
  return IsPartialMatch(name, RegEx::kMatchHispanicCommonNameCharacteristics) ||
         IsPartialMatch(
             name, RegEx::kMatchHispanicLastNameConjuctionCharacteristics);
}
// Reports whether |name| is partially matched by
// RegEx::kMatchCjkNameCharacteristics.
bool HasCjkNameCharacteristics(const std::string& name) {
  const bool looks_like_cjk =
      IsPartialMatch(name, RegEx::kMatchCjkNameCharacteristics);
  return looks_like_cjk;
}
// Reports whether |middle_name| is partially matched by
// RegEx::kMatchMiddleNameInitialsCharacteristics.
bool HasMiddleNameInitialsCharacteristics(const std::string& middle_name) {
  const bool only_initials = IsPartialMatch(
      middle_name, RegEx::kMatchMiddleNameInitialsCharacteristics);
  return only_initials;
}
// Default constructor: delegates to the parent-taking constructor with a null
// parent.
NameHonorific::NameHonorific() : NameHonorific(nullptr) {}
// Forwards NAME_HONORIFIC_PREFIX and |parent| to the AddressComponent base.
NameHonorific::NameHonorific(AddressComponent* parent)
: AddressComponent(NAME_HONORIFIC_PREFIX, parent) {}
// Defaulted out-of-line destructor.
NameHonorific::~NameHonorific() = default;
// Default constructor: delegates to the parent-taking constructor with a null
// parent.
NameFirst::NameFirst() : NameFirst(nullptr) {}
// Forwards NAME_FIRST and |parent| to the AddressComponent base.
NameFirst::NameFirst(AddressComponent* parent)
: AddressComponent(NAME_FIRST, parent) {}
// Defaulted out-of-line destructor.
NameFirst::~NameFirst() = default;
// Default constructor: delegates to the parent-taking constructor with a null
// parent.
NameMiddle::NameMiddle() : NameMiddle(nullptr) {}
// Forwards NAME_MIDDLE and |parent| to the AddressComponent base.
NameMiddle::NameMiddle(AddressComponent* parent)
: AddressComponent(NAME_MIDDLE, parent) {}
// Defaulted out-of-line destructor.
NameMiddle::~NameMiddle() = default;
// Adds NAME_MIDDLE_INITIAL to |supported_types|. |supported_types| is
// dereferenced unconditionally and therefore must not be null.
void NameMiddle::GetAdditionalSupportedFieldTypes(
    ServerFieldTypeSet* supported_types) const {
  ServerFieldTypeSet& types = *supported_types;
  types.insert(NAME_MIDDLE_INITIAL);
}
// Serves read requests for the NAME_MIDDLE_INITIAL type.
//
// Returns false if |type_name| is not the string form of NAME_MIDDLE_INITIAL;
// |value| is left untouched in that case. Otherwise returns true and, when
// |value| is non-null, writes the middle-name initials to it.
bool NameMiddle::ConvertAndGetTheValueForAdditionalFieldTypeName(
    const std::string& type_name,
    base::string16* value) const {
  if (type_name != AutofillType(NAME_MIDDLE_INITIAL).ToString())
    return false;

  // The type is handled here, but without an output location there is
  // nothing to write.
  if (!value)
    return true;

  // A stored value that already looks like a sequence of initials is handed
  // out unchanged. Anything else is reduced to one upper-case letter per
  // space- or hyphen-separated token.
  const bool stored_value_is_initials =
      HasMiddleNameInitialsCharacteristics(base::UTF16ToUTF8(GetValue()));
  *value = stored_value_is_initials ? GetValue() : ReduceToInitials(GetValue());
  return true;
}
// Serves write requests for the NAME_MIDDLE_INITIAL type.
//
// When |type_name| is the string form of NAME_MIDDLE_INITIAL, |value| is
// stored together with |status| via SetValue() and true is returned. For any
// other |type_name| nothing is changed and false is returned.
bool NameMiddle::ConvertAndSetValueForAdditionalFieldTypeName(
    const std::string& type_name,
    const base::string16& value,
    const VerificationStatus& status) {
  const bool is_middle_initial =
      type_name == AutofillType(NAME_MIDDLE_INITIAL).ToString();
  if (!is_middle_initial)
    return false;

  SetValue(value, status);
  return true;
}
// Default constructor: delegates to the parent-taking constructor with a null
// parent.
NameLastFirst::NameLastFirst() : NameLastFirst(nullptr) {}
// Forwards NAME_LAST_FIRST and |parent| to the AddressComponent base.
NameLastFirst::NameLastFirst(AddressComponent* parent)
: AddressComponent(NAME_LAST_FIRST, parent) {}
// Defaulted out-of-line destructor.
NameLastFirst::~NameLastFirst() = default;
// Default constructor: delegates to the parent-taking constructor with a null
// parent.
NameLastConjunction::NameLastConjunction() : NameLastConjunction(nullptr) {}
// Forwards NAME_LAST_CONJUNCTION and |parent| to the AddressComponent base.
NameLastConjunction::NameLastConjunction(AddressComponent* parent)
: AddressComponent(NAME_LAST_CONJUNCTION, parent) {}
// Defaulted out-of-line destructor.
NameLastConjunction::~NameLastConjunction() = default;
std::vector<const RE2*> NameLast::GetParseRegularExpressionsByRelevance()
const {
auto* pattern_provider = StructuredAddressesRegExProvider::Instance();
DCHECK(pattern_provider);
// Check if the name has the characteristics of an Hispanic/Latinx name.
if (HasHispanicLatinxNameCharaceristics(base::UTF16ToUTF8(GetValue())))
return {pattern_provider->GetRegEx(RegEx::kParseHispanicLastName)};
return {pattern_provider->GetRegEx(RegEx::kParseLastNameIntoSecondLastName)};
}
// Default constructor: delegates to the parent-taking constructor with a null
// parent.
NameLastSecond::NameLastSecond() : NameLastSecond(nullptr) {}
// Forwards NAME_LAST_SECOND and |parent| to the AddressComponent base.
NameLastSecond::NameLastSecond(AddressComponent* parent)
: AddressComponent(NAME_LAST_SECOND, parent) {}
// Defaulted out-of-line destructor.
NameLastSecond::~NameLastSecond() = default;
// Default constructor: delegates to the parent-taking constructor with a null
// parent.
NameLast::NameLast() : NameLast(nullptr) {}
// Forwards NAME_LAST, |parent| and the addresses of the three member
// subcomponents (|first_|, |conjunction_|, |second_|) to the AddressComponent
// base.
NameLast::NameLast(AddressComponent* parent)
: AddressComponent(NAME_LAST, parent, {&first_, &conjunction_, &second_}) {}
// Defaulted out-of-line destructor.
NameLast::~NameLast() = default;
// Fallback parsing: hands the complete stored value to NAME_LAST_SECOND with
// the status VerificationStatus::kParsed. The result of
// SetValueForTypeIfPossible() is not inspected.
void NameLast::ParseValueAndAssignSubcomponentsByFallbackMethod() {
  const auto parsed_status = VerificationStatus::kParsed;
  SetValueForTypeIfPossible(NAME_LAST_SECOND, GetValue(), parsed_status);
}
// Default constructor: delegates to the parent-taking constructor with a null
// parent.
NameFull::NameFull() : NameFull(nullptr) {}
// Forwards NAME_FULL, |parent| and the addresses of the four member
// subcomponents (|name_honorific_|, |name_first_|, |name_middle_|,
// |name_last_|) to the AddressComponent base.
NameFull::NameFull(AddressComponent* parent)
: AddressComponent(
NAME_FULL,
parent,
{&name_honorific_, &name_first_, &name_middle_, &name_last_}) {}
std::vector<const RE2*> NameFull::GetParseRegularExpressionsByRelevance()
const {
auto* pattern_provider = StructuredAddressesRegExProvider::Instance();
DCHECK(pattern_provider);
// If the name is a CJK name, try to match in the following order:
//
// * Match CJK names that include a separator.
// If a separator is present, dividing the name between first and last name is
// trivial.
//
// * Match Korean 4+ character names with two-character last names.
// Note, although some of the two-character last names are ambiguous in the
// sense that they share a common prefix with single character last names. For
// 4+ character names, it is more likely that the first two characters belong
// to the last name.
//
// * Match known two-character CJK last names.
// Note, this expressions uses only non-ambiguous two-character last names.
//
// * Match only the first character into the last name.
// This is the catch all expression that uses only the first character for the
// last name and puts all other characters into the first name.
//
if (HasCjkNameCharacteristics(base::UTF16ToUTF8(GetValue()))) {
return {
pattern_provider->GetRegEx(RegEx::kParseSeparatedCjkName),
pattern_provider->GetRegEx(RegEx::kParseKoreanTwoCharacterLastName),
pattern_provider->GetRegEx(RegEx::kParseCommonCjkTwoCharacterLastName),
pattern_provider->GetRegEx(RegEx::kParseCjkSingleCharacterLastName)};
}
if (HasHispanicLatinxNameCharaceristics(base::UTF16ToUTF8(GetValue())))
return {pattern_provider->GetRegEx(RegEx::kParseHispanicFullName)};
return {pattern_provider->GetRegEx(RegEx::kParseOnlyLastName),
pattern_provider->GetRegEx(RegEx::kParseLastCommaFirstMiddleName),
pattern_provider->GetRegEx(RegEx::kParseFirstMiddleLastName)};
}
// Defaulted out-of-line destructor.
NameFull::~NameFull() = default;
} // namespace structured_address
} // namespace autofill
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_NAME_H_
#define COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_NAME_H_
#include <string>
#include <vector>
#include "components/autofill/core/browser/data_model/autofill_structured_address_component.h"
using autofill::structured_address::AddressComponent;
namespace autofill {
namespace structured_address {
// Returns true if |name| has the characteristics of a Chinese, Japanese or
// Korean name:
// * It must only contain CJK characters with at most one separator in between.
bool HasCjkNameCharacteristics(const std::string& name);
// Returns true if |name| has one of the characteristics of an Hispanic/Latinx
// name:
// * Name contains a very common Hispanic/Latinx surname.
// * Name uses a surname conjunction.
bool HasHispanicLatinxNameCharaceristics(const std::string& name);
// Returns true if |middle_name| has the characteristics of containing only
// initials:
// * The string contains only upper case letters that may be preceded by a
// point.
// * Between each letter, there can be a space or a hyphen.
bool HasMiddleNameInitialsCharacteristics(const std::string& middle_name);
// Reduces a name to the initials in upper case.
// Example: George walker -> GW, Hans-Peter -> HP
base::string16 ReduceToInitials(const base::string16& value);
// Atomic component that represents the honorific prefix of a name.
class NameHonorific : public AddressComponent {
public:
// Creates the component with a null parent.
NameHonorific();
// Creates the component and passes |parent| on to AddressComponent.
explicit NameHonorific(AddressComponent* parent);
~NameHonorific() override;
};
// Atomic component that represents the first name.
class NameFirst : public AddressComponent {
public:
// Creates the component with a null parent.
NameFirst();
// Creates the component and passes |parent| on to AddressComponent.
explicit NameFirst(AddressComponent* parent);
~NameFirst() override;
};
// Atomic component that represents the middle name. In addition to its own
// storage type it serves the |NAME_MIDDLE_INITIAL| type.
class NameMiddle : public AddressComponent {
public:
// Creates the component with a null parent.
NameMiddle();
// Creates the component and passes |parent| on to AddressComponent.
explicit NameMiddle(AddressComponent* parent);
~NameMiddle() override;
// Inserts |NAME_MIDDLE_INITIAL| into |supported_types|.
void GetAdditionalSupportedFieldTypes(
ServerFieldTypeSet* supported_types) const override;
protected:
// Implements support for getting the value for the |NAME_MIDDLE_INITIAL|
// type. Returns true if |type_name| names that type; |value| is only written
// when it is non-null.
bool ConvertAndGetTheValueForAdditionalFieldTypeName(
const std::string& type_name,
base::string16* value) const override;
// Implements support for setting the |NAME_MIDDLE_INITIAL| type. Returns true
// if |type_name| names that type and the value was stored.
bool ConvertAndSetValueForAdditionalFieldTypeName(
const std::string& type_name,
const base::string16& value,
const VerificationStatus& status) override;
};
// Atomic component that represents the first part of a last name.
class NameLastFirst : public AddressComponent {
public:
// Creates the component with a null parent.
NameLastFirst();
// Creates the component and passes |parent| on to AddressComponent.
explicit NameLastFirst(AddressComponent* parent);
~NameLastFirst() override;
};
// Atomic component that represents the conjunction in a Hispanic/Latinx
// surname.
class NameLastConjunction : public AddressComponent {
public:
// Creates the component with a null parent.
NameLastConjunction();
// Creates the component and passes |parent| on to AddressComponent.
explicit NameLastConjunction(AddressComponent* parent);
~NameLastConjunction() override;
};
// Atomic component that represents the second part of a surname.
class NameLastSecond : public AddressComponent {
public:
// Creates the component with a null parent.
NameLastSecond();
// Creates the component and passes |parent| on to AddressComponent.
explicit NameLastSecond(AddressComponent* parent);
~NameLastSecond() override;
};
// Compound that represents a last name. It contains a first and second last
// name and a conjunction as it is used in Hispanic/Latinx names. Note that
// compound family names like Miller-Smith are not supposed to be split up into
// two components. If a name contains only a single component, the component is
// stored in the second part by default.
//
// +-------+
// | _LAST |
// +-------+
// / | \
// / | \
// / | \
// +--------+ +-----------+ +---------+
// | _FIRST | | _CONJUNC. | | _SECOND |
// +--------+ +-----------+ +---------+
//
class NameLast : public AddressComponent {
public:
// Creates the component with a null parent.
NameLast();
// Creates the component, passing |parent| and pointers to the three member
// subcomponents on to AddressComponent.
explicit NameLast(AddressComponent* parent);
~NameLast() override;
// Returns one expression: the Hispanic/Latinx last-name expression if the
// stored value has Hispanic/Latinx characteristics, otherwise the expression
// that parses the last name into the second last name.
std::vector<const RE2*> GetParseRegularExpressionsByRelevance()
const override;
private:
// As the fallback, write everything to the second last name.
void ParseValueAndAssignSubcomponentsByFallbackMethod() override;
// Subcomponents, registered with the base class in the constructor.
NameLastFirst first_;
NameLastConjunction conjunction_;
NameLastSecond second_;
};
// Compound that represents a full name. It contains an honorific, a first
// name, a middle name and a last name. The last name is a compound itself.
//
// +----------+
// | NAME_FULL|
// +----------+
// / | | \
// / | | \
// / | | \
// / | | \
// +------------+ +--------+ +---------+ +-------+
// | _HONORIFIC | | _FIRST | | _MIDDLE | | _LAST |
// +------------+ +--------+ +---------+ +-------+
// / | \
// / | \
// / | \
// / | \
// +--------+ +-----------+ +---------+
// | _FIRST | | _CONJUNC. | | _SECOND |
// +--------+ +-----------+ +---------+
//
class NameFull : public AddressComponent {
public:
// Creates the component with a null parent.
NameFull();
// Creates the component, passing |parent| and pointers to the four member
// subcomponents on to AddressComponent.
explicit NameFull(AddressComponent* parent);
~NameFull() override;
// Returns the parse expressions ordered by relevance. Separate lists are
// used for values with CJK characteristics, values with Hispanic/Latinx
// characteristics, and all other values.
std::vector<const RE2*> GetParseRegularExpressionsByRelevance()
const override;
private:
// Subcomponents, registered with the base class in the constructor.
NameHonorific name_honorific_;
NameFirst name_first_;
NameMiddle name_middle_;
NameLast name_last_;
};
} // namespace structured_address
} // namespace autofill
#endif // COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_NAME_H_
...@@ -20,7 +20,21 @@ namespace structured_address { ...@@ -20,7 +20,21 @@ namespace structured_address {
// values in an AddressComponent tree. // values in an AddressComponent tree.
enum class RegEx {
  kSingleWord,
  kParseSeparatedCjkName,
  kParseCommonCjkTwoCharacterLastName,
  kParseKoreanTwoCharacterLastName,
  kParseCjkSingleCharacterLastName,
  kMatchCjkNameCharacteristics,
  kMatchHispanicCommonNameCharacteristics,
  kMatchHispanicLastNameConjuctionCharacteristics,
  kParseOnlyLastName,
  kParseLastCommaFirstMiddleName,
  kParseFirstMiddleLastName,
  kParseHispanicLastName,
  kParseHispanicFullName,
  kParseLastNameIntoSecondLastName,
  kMatchMiddleNameInitialsCharacteristics,
  // Sentinel that aliases the final enumerator. It has to be updated whenever
  // an enumerator is appended; otherwise code that walks or sizes by
  // |kLastRegEx| silently skips the trailing expressions.
  kLastRegEx = kMatchMiddleNameInitialsCharacteristics,
};
// This singleton class builds and caches the regular expressions for value // This singleton class builds and caches the regular expressions for value
...@@ -73,4 +87,5 @@ class StructuredAddressesRegExProvider { ...@@ -73,4 +87,5 @@ class StructuredAddressesRegExProvider {
} // namespace structured_address } // namespace structured_address
} // namespace autofill } // namespace autofill
#endif // COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_PATTERN_REGEX_H_
#endif // COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_REGEX_PROVIDER_H_
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include "base/debug/alias.h" #include "base/debug/alias.h"
#include "base/debug/dump_without_crashing.h" #include "base/debug/dump_without_crashing.h"
#include "base/strings/strcat.h" #include "base/strings/strcat.h"
#include "components/autofill/core/browser/data_model/autofill_structured_address_regex_provider.h"
namespace autofill { namespace autofill {
namespace structured_address { namespace structured_address {
...@@ -45,9 +46,13 @@ const RE2* Re2RegExCache::GetRegEx(const std::string& pattern) { ...@@ -45,9 +46,13 @@ const RE2* Re2RegExCache::GetRegEx(const std::string& pattern) {
return result.first->second.get(); return result.first->second.get();
} }
std::unique_ptr<const RE2> BuildRegExFromPattern(std::string pattern) { std::unique_ptr<const RE2> BuildRegExFromPattern(const std::string& pattern) {
RE2::Options opt; RE2::Options opt;
opt.set_case_sensitive(false); // By default, patterns are case sensitive.
// Note that the named-capture-group patterns built with
// |CaptureTypeWithPattern()| apply a flag to make the matching case
// insensitive.
opt.set_case_sensitive(true);
auto regex = std::make_unique<const RE2>(pattern, opt); auto regex = std::make_unique<const RE2>(pattern, opt);
...@@ -108,12 +113,17 @@ bool ParseValueByRegularExpression( ...@@ -108,12 +113,17 @@ bool ParseValueByRegularExpression(
return true; return true;
} }
bool IsPartialMatch(const std::string& value, RegEx regex) {
return IsPartialMatch(
value, StructuredAddressesRegExProvider::Instance()->GetRegEx(regex));
}
bool IsPartialMatch(const std::string& value, const std::string& pattern) { bool IsPartialMatch(const std::string& value, const std::string& pattern) {
const RE2* regex = Re2RegExCache::Instance()->GetRegEx(pattern); return IsPartialMatch(value, Re2RegExCache::Instance()->GetRegEx(pattern));
if (!regex || !regex->ok()) }
return false;
return RE2::PartialMatch(value, *regex); bool IsPartialMatch(const std::string& value, const RE2* expression) {
return RE2::PartialMatch(value, *expression);
} }
std::vector<std::string> GetAllPartialMatches(const std::string& value, std::vector<std::string> GetAllPartialMatches(const std::string& value,
...@@ -172,7 +182,8 @@ std::string CaptureTypeWithPattern(const ServerFieldType& type, ...@@ -172,7 +182,8 @@ std::string CaptureTypeWithPattern(const ServerFieldType& type,
quantifier = ""; quantifier = "";
} }
return base::StrCat({"(?:(?P<", AutofillType(type).ToString(), ">", pattern, // By adding an "i" in the first group, the capturing is case insensitive.
return base::StrCat({"(?i:(?P<", AutofillType(type).ToString(), ">", pattern,
")(?:", options.separator, "))", quantifier}); ")(?:", options.separator, "))", quantifier});
} }
......
...@@ -21,6 +21,8 @@ ...@@ -21,6 +21,8 @@
namespace autofill { namespace autofill {
namespace structured_address { namespace structured_address {
enum class RegEx;
// Enum to express the few quantifiers needed to parse values. // Enum to express the few quantifiers needed to parse values.
enum MatchQuantifier { enum MatchQuantifier {
// The capture group is required. // The capture group is required.
...@@ -39,7 +41,7 @@ struct CaptureOptions { ...@@ -39,7 +41,7 @@ struct CaptureOptions {
// By default, a group must be either followed by a space-like character (\s) // By default, a group must be either followed by a space-like character (\s)
// or it must be the last group in the line. The separator is allowed to be // or it must be the last group in the line. The separator is allowed to be
// empty. // empty.
std::string separator = "\\s|$"; std::string separator = "\\s+|$";
// Indicates if the group is required, optional or even lazy optional. // Indicates if the group is required, optional or even lazy optional.
MatchQuantifier quantifier = MATCH_REQUIRED; MatchQuantifier quantifier = MATCH_REQUIRED;
}; };
...@@ -55,7 +57,7 @@ class Re2RegExCache { ...@@ -55,7 +57,7 @@ class Re2RegExCache {
static Re2RegExCache* Instance(); static Re2RegExCache* Instance();
// Returns a pointer to a constant compiled expression that matches |pattern| // Returns a pointer to a constant compiled expression that matches |pattern|
// case-insensitively. // case-sensitively.
const RE2* GetRegEx(const std::string& pattern); const RE2* GetRegEx(const std::string& pattern);
#ifdef UNIT_TEST #ifdef UNIT_TEST
...@@ -96,12 +98,19 @@ bool ParseValueByRegularExpression( ...@@ -96,12 +98,19 @@ bool ParseValueByRegularExpression(
const RE2* regex, const RE2* regex,
std::map<std::string, std::string>* result_map); std::map<std::string, std::string>* result_map);
// Returns a compiled case insensitive regular expression for |pattern|. // Returns a compiled case sensitive regular expression for |pattern|.
std::unique_ptr<const RE2> BuildRegExFromPattern(std::string pattern); std::unique_ptr<const RE2> BuildRegExFromPattern(const std::string& pattern);
// Returns true if |value| can be matched by the enumerated RegEx |regex|.
bool IsPartialMatch(const std::string& value, RegEx regex);
// Returns true if |value| can be matched with |pattern|. // Returns true if |value| can be matched with |pattern|.
bool IsPartialMatch(const std::string& value, const std::string& pattern); bool IsPartialMatch(const std::string& value, const std::string& pattern);
// Same as above, but accepts a compiled regular expression instead of the
// pattern.
bool IsPartialMatch(const std::string& value, const RE2* expression);
// Returns a vector that contains all partial matches of |pattern| in |value|; // Returns a vector that contains all partial matches of |pattern| in |value|;
std::vector<std::string> GetAllPartialMatches(const std::string& value, std::vector<std::string> GetAllPartialMatches(const std::string& value,
const std::string& pattern); const std::string& pattern);
......
...@@ -193,17 +193,17 @@ TEST(AutofillStructuredAddressUtils, TestGetPlaceholderToken) { ...@@ -193,17 +193,17 @@ TEST(AutofillStructuredAddressUtils, TestGetPlaceholderToken) {
} }
TEST(AutofillStructuredAddressUtils, CaptureTypeWithPattern) { TEST(AutofillStructuredAddressUtils, CaptureTypeWithPattern) {
EXPECT_EQ("(?:(?P<NAME_FULL>abs\\w)(?:\\s|$))?", EXPECT_EQ("(?i:(?P<NAME_FULL>abs\\w)(?:\\s+|$))?",
CaptureTypeWithPattern(NAME_FULL, {"abs", "\\w"}, CaptureTypeWithPattern(NAME_FULL, {"abs", "\\w"},
{.quantifier = MATCH_OPTIONAL})); {.quantifier = MATCH_OPTIONAL}));
EXPECT_EQ("(?:(?P<NAME_FULL>abs\\w)(?:\\s|$))", EXPECT_EQ("(?i:(?P<NAME_FULL>abs\\w)(?:\\s+|$))",
CaptureTypeWithPattern(NAME_FULL, {"abs", "\\w"})); CaptureTypeWithPattern(NAME_FULL, {"abs", "\\w"}));
EXPECT_EQ("(?:(?P<NAME_FULL>abs\\w)(?:\\s|$))??", EXPECT_EQ("(?i:(?P<NAME_FULL>abs\\w)(?:\\s+|$))??",
CaptureTypeWithPattern(NAME_FULL, "abs\\w", CaptureTypeWithPattern(NAME_FULL, "abs\\w",
{.quantifier = MATCH_LAZY_OPTIONAL})); {.quantifier = MATCH_LAZY_OPTIONAL}));
EXPECT_EQ("(?:(?P<NAME_FULL>abs\\w)(?:\\s|$))", EXPECT_EQ("(?i:(?P<NAME_FULL>abs\\w)(?:\\s+|$))",
CaptureTypeWithPattern(NAME_FULL, "abs\\w")); CaptureTypeWithPattern(NAME_FULL, "abs\\w"));
EXPECT_EQ("(?:(?P<NAME_FULL>abs\\w)(?:_))", EXPECT_EQ("(?i:(?P<NAME_FULL>abs\\w)(?:_))",
CaptureTypeWithPattern(NAME_FULL, "abs\\w", {.separator = "_"})); CaptureTypeWithPattern(NAME_FULL, "abs\\w", {.separator = "_"}));
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment