[Autofill][SlimShady] Component for structured names.

This CL adds the static address component tree for names that support two surnames as they are common for Hispanic/Latinx names. Change-Id: Icd54f986ab192101fbea5a8576c8abbabfffe636 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2264423 Commit-Queue: Matthias Körber <koerber@google.com> Reviewed-by: Dominic Battré <battre@chromium.org> Cr-Commit-Position: refs/heads/master@{#793110}

[Autofill][SlimShady] Component for structured names.
This CL adds the static address component tree for names that support two surnames as they are common for Hispanic/Latinx names. Change-Id: Icd54f986ab192101fbea5a8576c8abbabfffe636 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2264423 Commit-Queue: Matthias Körber <koerber@google.com> Reviewed-by: Dominic Battré <battre@chromium.org> Cr-Commit-Position: refs/heads/master@{#793110}
deea5c39 · Matthias Körber · Commit Bot · 73f0449d · deea5c39 · deea5c39
Commit deea5c39 authored Jul 30, 2020 by Matthias Körber Committed by Commit Bot Jul 30, 2020
12 changed files
--- a/components/autofill/core/browser/BUILD.gn
+++ b/components/autofill/core/browser/BUILD.gn
@@ -92,6 +92,8 @@ jumbo_static_library("browser") {
    "data_model/autofill_structured_address_component.h",
    "data_model/autofill_structured_address_constants.cc",
    "data_model/autofill_structured_address_constants.h",
+    "data_model/autofill_structured_address_name.cc",
+    "data_model/autofill_structured_address_name.h",
    "data_model/autofill_structured_address_regex_provider.cc",
    "data_model/autofill_structured_address_regex_provider.h",
    "data_model/autofill_structured_address_utils.cc",
@@ -578,6 +580,7 @@ source_set("unit_tests") {
    "data_model/autofill_profile_comparator_unittest.cc",
    "data_model/autofill_profile_unittest.cc",
    "data_model/autofill_structured_address_component_unittest.cc",
+    "data_model/autofill_structured_address_name_unittest.cc",
    "data_model/autofill_structured_address_regex_provider_unittest.cc",
    "data_model/autofill_structured_address_utils_unittest.cc",
    "data_model/contact_info_unittest.cc",

--- a/components/autofill/core/browser/autofill_data_util_unittest.cc
+++ b/components/autofill/core/browser/autofill_data_util_unittest.cc
@@ -19,9 +19,21 @@ using data_util::bit_field_type_groups::kName;
 using data_util::bit_field_type_groups::kPhone;

 TEST(AutofillDataUtilTest, DetermineGroupsForHomeNameAndAddress) {
-  const std::vector<ServerFieldType> field_types{
-      NAME_FIRST,        NAME_LAST,          ADDRESS_HOME_LINE1,
-      ADDRESS_HOME_CITY, ADDRESS_HOME_STATE, ADDRESS_HOME_ZIP};
+  const std::vector<ServerFieldType> field_types{NAME_HONORIFIC_PREFIX,
+                                                 NAME_FULL,
+                                                 NAME_FIRST,
+                                                 NAME_MIDDLE,
+                                                 NAME_MIDDLE_INITIAL,
+                                                 NAME_LAST,
+                                                 NAME_LAST_FIRST,
+                                                 NAME_LAST_CONJUNCTION,
+                                                 NAME_LAST_SECOND,
+                                                 NAME_FIRST,
+                                                 NAME_LAST,
+                                                 ADDRESS_HOME_LINE1,
+                                                 ADDRESS_HOME_CITY,
+                                                 ADDRESS_HOME_STATE,
+                                                 ADDRESS_HOME_ZIP};

  const uint32_t expected_group_bitmask = kName | kAddress;
  const uint32_t group_bitmask = data_util::DetermineGroups(field_types);
@@ -196,8 +208,7 @@ INSTANTIATE_TEST_SUITE_P(

        // It occasionally happens that a full name is 2 characters, 1/1.
        FullNameTestCase{"이도", "도", "", "이"},    // Korean name, Hangul
-        FullNameTestCase{"孫文", "文", "", "孫"}   // Chinese name, Unihan
-        ));
+        FullNameTestCase{"孫文", "文", "", "孫"}));  // Chinese name, Unihan

 class JoinNamePartsTest : public testing::TestWithParam<FullNameTestCase> {};

@@ -229,9 +240,7 @@ INSTANTIATE_TEST_SUITE_P(
        // These are no CJK names for us, they're just bogus.
        FullNameTestCase{"Homer シンプソン", "Homer", "", "シンプソン"},
        FullNameTestCase{"ホーマー Simpson", "ホーマー", "", "Simpson"},
-        FullNameTestCase{"반 기 문", "반", "기", "문"}
-        // Has a middle-name, too unusual
-        ));
+        FullNameTestCase{"반 기 문", "반", "기", "문"}));

 struct ValidCountryCodeTestCase {
  std::string country_code;

--- a/components/autofill/core/browser/data_model/autofill_structured_address_constants.cc
+++ b/components/autofill/core/browser/data_model/autofill_structured_address_constants.cc
@@ -7,7 +7,7 @@
 namespace autofill {
 namespace structured_address {

-const char kSingleWordRe[] = "(?:\\w+)";
+const char kNameSeparators[] = " -";

 }  // namespace structured_address
 }  // namespace autofill
--- a/components/autofill/core/browser/data_model/autofill_structured_address_constants.h
+++ b/components/autofill/core/browser/data_model/autofill_structured_address_constants.h
@@ -8,8 +8,8 @@
 namespace autofill {
 namespace structured_address {

-// Regular expression pattern to match a single word.
-extern const char kSingleWordRe[];
+// List of name separators.
+extern const char kNameSeparators[];

 }  // namespace structured_address
 }  // namespace autofill

--- a/components/autofill/core/browser/data_model/autofill_structured_address_name.cc
+++ b/components/autofill/core/browser/data_model/autofill_structured_address_name.cc
+// Copyright 2020 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/autofill/core/browser/data_model/autofill_structured_address_name.h"
+
+#include <utility>
+
+#include "base/i18n/case_conversion.h"
+#include "base/strings/strcat.h"
+#include "base/strings/string_split.h"
+#include "base/strings/string_util.h"
+#include "base/strings/utf_string_conversions.h"
+#include "components/autofill/core/browser/autofill_type.h"
+#include "components/autofill/core/browser/data_model/autofill_structured_address_constants.h"
+#include "components/autofill/core/browser/data_model/autofill_structured_address_regex_provider.h"
+#include "components/autofill/core/browser/data_model/autofill_structured_address_utils.h"
+#include "components/autofill/core/browser/field_types.h"
+
+namespace autofill {
+
+namespace structured_address {
+
+// Builds the upper-case initials of |value|. The input is tokenized at the
+// characters listed in |kNameSeparators|, the first code unit of every
+// non-empty token is collected, and the collected string is upper-cased.
+// An empty |value| yields an empty string.
+base::string16 ReduceToInitials(const base::string16& value) {
+  base::string16 initials;
+  if (value.empty())
+    return initials;
+
+  const std::vector<base::string16> tokens = base::SplitString(
+      value, base::ASCIIToUTF16(kNameSeparators),
+      base::WhitespaceHandling::TRIM_WHITESPACE,
+      base::SplitResult::SPLIT_WANT_NONEMPTY);
+
+  // Exactly one code unit per token is appended below.
+  initials.reserve(tokens.size());
+  for (const base::string16& token : tokens) {
+    DCHECK(!token.empty());
+    initials.push_back(token.front());
+  }
+  return base::i18n::ToUpper(initials);
+}
+
+// Reports whether |name| shows Hispanic/Latinx traits: it partially matches
+// either the common-name expression or the last-name-conjunction expression.
+// If neither matches, there is not sufficient reason to assume a
+// Hispanic/Latinx name and false is returned.
+bool HasHispanicLatinxNameCharaceristics(const std::string& name) {
+  // The conjunction expression is only evaluated when the common-name
+  // expression did not match.
+  return IsPartialMatch(name,
+                        RegEx::kMatchHispanicCommonNameCharacteristics) ||
+         IsPartialMatch(
+             name, RegEx::kMatchHispanicLastNameConjuctionCharacteristics);
+}
+
+// Reports whether |name| partially matches the CJK name expression.
+bool HasCjkNameCharacteristics(const std::string& name) {
+  const bool matches_cjk_expression =
+      IsPartialMatch(name, RegEx::kMatchCjkNameCharacteristics);
+  return matches_cjk_expression;
+}
+
+// Reports whether |middle_name| partially matches the middle-name-initials
+// expression.
+bool HasMiddleNameInitialsCharacteristics(const std::string& middle_name) {
+  const bool matches_initials_expression = IsPartialMatch(
+      middle_name, RegEx::kMatchMiddleNameInitialsCharacteristics);
+  return matches_initials_expression;
+}
+
+// Default construction delegates to the parent-taking constructor with no
+// parent.
+NameHonorific::NameHonorific() : NameHonorific(/*parent=*/nullptr) {}
+
+// Initializes the base component with the NAME_HONORIFIC_PREFIX type and
+// forwards |parent| to it.
+NameHonorific::NameHonorific(AddressComponent* parent)
+    : AddressComponent(NAME_HONORIFIC_PREFIX, parent) {}
+
+NameHonorific::~NameHonorific() = default;
+
+// Default construction delegates to the parent-taking constructor with no
+// parent.
+NameFirst::NameFirst() : NameFirst(/*parent=*/nullptr) {}
+
+// Initializes the base component with the NAME_FIRST type and forwards
+// |parent| to it.
+NameFirst::NameFirst(AddressComponent* parent)
+    : AddressComponent(NAME_FIRST, parent) {}
+
+NameFirst::~NameFirst() = default;
+
+// Default construction delegates to the parent-taking constructor with no
+// parent.
+NameMiddle::NameMiddle() : NameMiddle(/*parent=*/nullptr) {}
+
+// Initializes the base component with the NAME_MIDDLE type and forwards
+// |parent| to it.
+NameMiddle::NameMiddle(AddressComponent* parent)
+    : AddressComponent(NAME_MIDDLE, parent) {}
+
+NameMiddle::~NameMiddle() = default;
+
+// Adds NAME_MIDDLE_INITIAL to |supported_types|; this is the one extra type
+// the middle name handles through its conversion hooks.
+void NameMiddle::GetAdditionalSupportedFieldTypes(
+    ServerFieldTypeSet* supported_types) const {
+  // |supported_types| is dereferenced unconditionally.
+  supported_types->insert(NAME_MIDDLE_INITIAL);
+}
+
+// Serves the NAME_MIDDLE_INITIAL type from the stored middle name. Returns
+// false for every other |type_name|, true otherwise. When |value| is null,
+// nothing is written and only the support for the type is reported.
+bool NameMiddle::ConvertAndGetTheValueForAdditionalFieldTypeName(
+    const std::string& type_name,
+    base::string16* value) const {
+  if (type_name != AutofillType(NAME_MIDDLE_INITIAL).ToString())
+    return false;
+
+  // Without an output location there is nothing to convert.
+  if (!value)
+    return true;
+
+  // A stored value that already has the characteristics of initials is handed
+  // out unchanged. Anything else is reduced to a sequence of upper case
+  // letters, one for each space- or hyphen-separated token.
+  const bool already_initials =
+      HasMiddleNameInitialsCharacteristics(base::UTF16ToUTF8(GetValue()));
+  *value = already_initials ? GetValue() : ReduceToInitials(GetValue());
+  return true;
+}
+
+// Accepts a value for the NAME_MIDDLE_INITIAL type by storing it, together
+// with |status|, as the value of this component. Returns true if |type_name|
+// names NAME_MIDDLE_INITIAL and false, without storing anything, otherwise.
+bool NameMiddle::ConvertAndSetValueForAdditionalFieldTypeName(
+    const std::string& type_name,
+    const base::string16& value,
+    const VerificationStatus& status) {
+  const bool is_middle_initial =
+      type_name == AutofillType(NAME_MIDDLE_INITIAL).ToString();
+  if (is_middle_initial)
+    SetValue(value, status);
+  return is_middle_initial;
+}
+
+// Default construction delegates to the parent-taking constructor with no
+// parent.
+NameLastFirst::NameLastFirst() : NameLastFirst(/*parent=*/nullptr) {}
+
+// Initializes the base component with the NAME_LAST_FIRST type and forwards
+// |parent| to it.
+NameLastFirst::NameLastFirst(AddressComponent* parent)
+    : AddressComponent(NAME_LAST_FIRST, parent) {}
+
+NameLastFirst::~NameLastFirst() = default;
+
+// Default construction delegates to the parent-taking constructor with no
+// parent.
+NameLastConjunction::NameLastConjunction()
+    : NameLastConjunction(/*parent=*/nullptr) {}
+
+// Initializes the base component with the NAME_LAST_CONJUNCTION type and
+// forwards |parent| to it.
+NameLastConjunction::NameLastConjunction(AddressComponent* parent)
+    : AddressComponent(NAME_LAST_CONJUNCTION, parent) {}
+
+NameLastConjunction::~NameLastConjunction() = default;
+
+// Returns the single expression used to parse a last name: the Hispanic
+// last-name expression if the stored value has Hispanic/Latinx name
+// characteristics, and the expression that parses the last name into the
+// second last name otherwise.
+std::vector<const RE2*> NameLast::GetParseRegularExpressionsByRelevance()
+    const {
+  auto* regex_provider = StructuredAddressesRegExProvider::Instance();
+  DCHECK(regex_provider);
+
+  const bool looks_hispanic =
+      HasHispanicLatinxNameCharaceristics(base::UTF16ToUTF8(GetValue()));
+  const auto parse_expression = looks_hispanic
+                                    ? RegEx::kParseHispanicLastName
+                                    : RegEx::kParseLastNameIntoSecondLastName;
+  return {regex_provider->GetRegEx(parse_expression)};
+}
+
+// Default construction delegates to the parent-taking constructor with no
+// parent.
+NameLastSecond::NameLastSecond() : NameLastSecond(/*parent=*/nullptr) {}
+
+// Initializes the base component with the NAME_LAST_SECOND type and forwards
+// |parent| to it.
+NameLastSecond::NameLastSecond(AddressComponent* parent)
+    : AddressComponent(NAME_LAST_SECOND, parent) {}
+
+NameLastSecond::~NameLastSecond() = default;
+
+// Default construction delegates to the parent-taking constructor with no
+// parent.
+NameLast::NameLast() : NameLast(/*parent=*/nullptr) {}
+
+// Initializes the base component with the NAME_LAST type, forwards |parent|
+// and hands over the addresses of the three member subcomponents.
+// NOTE(review): the members are not yet constructed when their addresses are
+// taken here; presumably the base class only stores the pointers - confirm.
+NameLast::NameLast(AddressComponent* parent)
+    : AddressComponent(NAME_LAST, parent, {&first_, &conjunction_, &second_}) {}
+
+NameLast::~NameLast() = default;
+
+// Fallback parsing: the complete value of this component is assigned to
+// NAME_LAST_SECOND with the status |kParsed|.
+void NameLast::ParseValueAndAssignSubcomponentsByFallbackMethod() {
+  const auto& whole_last_name = GetValue();
+  SetValueForTypeIfPossible(NAME_LAST_SECOND, whole_last_name,
+                            VerificationStatus::kParsed);
+}
+
+// Default construction delegates to the parent-taking constructor with no
+// parent.
+NameFull::NameFull() : NameFull(/*parent=*/nullptr) {}
+
+// Initializes the base component with the NAME_FULL type, forwards |parent|
+// and hands over the addresses of the four member subcomponents.
+// NOTE(review): the members are not yet constructed when their addresses are
+// taken here; presumably the base class only stores the pointers - confirm.
+NameFull::NameFull(AddressComponent* parent)
+    : AddressComponent(
+          NAME_FULL,
+          parent,
+          {&name_honorific_, &name_first_, &name_middle_, &name_last_}) {}
+
+// Returns the expressions used to parse a full name, ordered by relevance.
+// The selection depends on the stored value: CJK names, Hispanic/Latinx names
+// and all remaining names each get their own list.
+std::vector<const RE2*> NameFull::GetParseRegularExpressionsByRelevance()
+    const {
+  auto* pattern_provider = StructuredAddressesRegExProvider::Instance();
+  DCHECK(pattern_provider);
+
+  // Both characteristic checks inspect the same UTF-8 string, so the value is
+  // converted only once.
+  const std::string utf8_name = base::UTF16ToUTF8(GetValue());
+
+  // If the name is a CJK name, try to match in the following order:
+  //
+  // * Match CJK names that include a separator.
+  // If a separator is present, dividing the name between first and last name
+  // is trivial.
+  //
+  // * Match Korean 4+ character names with two-character last names.
+  // Some of the two-character last names are ambiguous in the sense that they
+  // share a common prefix with single-character last names. For 4+ character
+  // names, however, it is more likely that the first two characters belong to
+  // the last name.
+  //
+  // * Match known two-character CJK last names.
+  // Note, this expression uses only non-ambiguous two-character last names.
+  //
+  // * Match only the first character into the last name.
+  // This is the catch-all expression that uses only the first character for
+  // the last name and puts all other characters into the first name.
+  //
+  if (HasCjkNameCharacteristics(utf8_name)) {
+    return {
+        pattern_provider->GetRegEx(RegEx::kParseSeparatedCjkName),
+        pattern_provider->GetRegEx(RegEx::kParseKoreanTwoCharacterLastName),
+        pattern_provider->GetRegEx(RegEx::kParseCommonCjkTwoCharacterLastName),
+        pattern_provider->GetRegEx(RegEx::kParseCjkSingleCharacterLastName)};
+  }
+
+  // Hispanic/Latinx names are parsed with a single dedicated expression.
+  if (HasHispanicLatinxNameCharaceristics(utf8_name))
+    return {pattern_provider->GetRegEx(RegEx::kParseHispanicFullName)};
+
+  // All other names.
+  return {pattern_provider->GetRegEx(RegEx::kParseOnlyLastName),
+          pattern_provider->GetRegEx(RegEx::kParseLastCommaFirstMiddleName),
+          pattern_provider->GetRegEx(RegEx::kParseFirstMiddleLastName)};
+}
+
+NameFull::~NameFull() = default;
+
+}  // namespace structured_address
+
+}  // namespace autofill
--- a/components/autofill/core/browser/data_model/autofill_structured_address_name.h
+++ b/components/autofill/core/browser/data_model/autofill_structured_address_name.h
+// Copyright 2020 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_NAME_H_
+#define COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_NAME_H_
+
+#include <string>
+#include <vector>
+
+#include "components/autofill/core/browser/data_model/autofill_structured_address_component.h"
+
+using autofill::structured_address::AddressComponent;
+
+namespace autofill {
+namespace structured_address {
+
+// Returns true if |name| has the characteristics of a Chinese, Japanese or
+// Korean name:
+// * It must only contain CJK characters with at most one separator in between.
+bool HasCjkNameCharacteristics(const std::string& name);
+
+// Returns true if |name| has one of the characteristics of an Hispanic/Latinx
+// name:
+// * Name contains a very common Hispanic/Latinx surname.
+// * Name uses a surname conjunction.
+bool HasHispanicLatinxNameCharaceristics(const std::string& name);
+
+// Returns true if |middle_name| has the characteristics of containing only
+// initials:
+// * The string contains only upper case letters that may be followed by a
+// point.
+// * Between each letter, there can be a space or a hyphen.
+bool HasMiddleNameInitialsCharacteristics(const std::string& middle_name);
+
+// Reduces a name to the initials in upper case.
+// Example: George walker -> GW, Hans-Peter -> HP
+base::string16 ReduceToInitials(const base::string16& value);
+
+// Atomic component that represents the honorific prefix of a name.
+class NameHonorific : public AddressComponent {
+ public:
+  NameHonorific();
+  explicit NameHonorific(AddressComponent* parent);
+  ~NameHonorific() override;
+};
+
+// Atomic component that represents the first name.
+class NameFirst : public AddressComponent {
+ public:
+  NameFirst();
+  explicit NameFirst(AddressComponent* parent);
+  ~NameFirst() override;
+};
+
+// Atomic component that represents the middle name.
+class NameMiddle : public AddressComponent {
+ public:
+  NameMiddle();
+  explicit NameMiddle(AddressComponent* parent);
+  ~NameMiddle() override;
+
+  void GetAdditionalSupportedFieldTypes(
+      ServerFieldTypeSet* supported_types) const override;
+
+ protected:
+  // Implements support for getting a value for the |NAME_MIDDLE_INITIAL|
+  // type.
+  bool ConvertAndGetTheValueForAdditionalFieldTypeName(
+      const std::string& type_name,
+      base::string16* value) const override;
+
+  // Implements support for setting the |NAME_MIDDLE_INITIAL| type.
+  bool ConvertAndSetValueForAdditionalFieldTypeName(
+      const std::string& type_name,
+      const base::string16& value,
+      const VerificationStatus& status) override;
+};
+
+// Atomic component that represents the first part of a last name.
+class NameLastFirst : public AddressComponent {
+ public:
+  NameLastFirst();
+  explicit NameLastFirst(AddressComponent* parent);
+  ~NameLastFirst() override;
+};
+
+// Atomic component that represents the conjunction in a Hispanic/Latinx
+// surname.
+class NameLastConjunction : public AddressComponent {
+ public:
+  NameLastConjunction();
+  explicit NameLastConjunction(AddressComponent* parent);
+  ~NameLastConjunction() override;
+};
+
+// Atomic component that represents the second part of a surname.
+class NameLastSecond : public AddressComponent {
+ public:
+  NameLastSecond();
+  explicit NameLastSecond(AddressComponent* parent);
+  ~NameLastSecond() override;
+};
+
+// Compound that represents a last name. It contains a first and second last
+// name and a conjunction as it is used in Hispanic/Latinx names. Note that compound
+// family names like Miller-Smith are not supposed to be split up into two
+// components. If a name contains only a single component, the component is
+// stored in the second part by default.
+//
+//               +-------+
+//               | _LAST |
+//               +-------+
+//               /    |    \
+//             /      |      \
+//           /        |        \
+// +--------+ +-----------+ +---------+
+// | _FIRST | | _CONJUNC. | | _SECOND |
+// +--------+ +-----------+ +---------+
+//
+class NameLast : public AddressComponent {
+ public:
+  NameLast();
+  explicit NameLast(AddressComponent* parent);
+  ~NameLast() override;
+
+  std::vector<const RE2*> GetParseRegularExpressionsByRelevance()
+      const override;
+
+ private:
+  // As the fallback, write everything to the second last name.
+  void ParseValueAndAssignSubcomponentsByFallbackMethod() override;
+
+  NameLastFirst first_;
+  NameLastConjunction conjunction_;
+  NameLastSecond second_;
+};
+
+// Compound that represents a full name. It contains an honorific, a first
+// name, a middle name and a last name. The last name is a compound itself.
+//
+//                     +----------+
+//                     | NAME_FULL|
+//                     +----------+
+//                    /  |      |  \
+//                  /    |      |    \
+//                /      |      |      \
+//              /        |      |        \
+// +------------+ +--------+ +---------+ +-------+
+// | _HONORIFIC | | _FIRST | | _MIDDLE | | _LAST |
+// +------------+ +--------+ +---------+ +-------+
+//                                        /   |   \
+//                                      /     |     \
+//                                    /       |       \
+//                                  /         |         \
+//                         +--------+ +-----------+ +---------+
+//                         | _FIRST | | _CONJUNC. | | _SECOND |
+//                         +--------+ +-----------+ +---------+
+//
+class NameFull : public AddressComponent {
+ public:
+  NameFull();
+  explicit NameFull(AddressComponent* parent);
+  ~NameFull() override;
+
+  std::vector<const RE2*> GetParseRegularExpressionsByRelevance()
+      const override;
+
+ private:
+  NameHonorific name_honorific_;
+  NameFirst name_first_;
+  NameMiddle name_middle_;
+  NameLast name_last_;
+};
+
+}  // namespace structured_address
+
+}  // namespace autofill
+#endif  // COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_NAME_H_
--- a/components/autofill/core/browser/data_model/autofill_structured_address_name_unittest.cc
+++ b/components/autofill/core/browser/data_model/autofill_structured_address_name_unittest.cc
+// Copyright 2020 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/autofill/core/browser/data_model/autofill_structured_address_name.h"
+
+#include <stddef.h>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "base/strings/utf_string_conversions.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+using base::ASCIIToUTF16;
+
+namespace autofill {
+namespace structured_address {
+
+namespace {
+
+// A test record that contains all entries of the hybrid-structure name tree.
+struct NameParserTestRecord {
+  std::string full;
+  std::string honorific;
+  std::string first;
+  std::string middle;
+  std::string last;
+  std::string last_first;
+  std::string last_conjunction;
+  std::string last_second;
+};
+
+// A test record that contains all entries of the hybrid-structure last name
+// tree.
+struct LastNameParserTestRecord {
+  std::string last_name;
+  std::string first;
+  std::string conjunction;
+  std::string second;
+};
+
+// Sets |full| as the observed NAME_FULL value of a fresh NameFull tree,
+// completes the tree and checks the value of every node against the expected
+// value passed in for it.
+void TestNameParsing(const base::string16& full,
+                     const base::string16& honorific,
+                     const base::string16& first,
+                     const base::string16& middle,
+                     const base::string16& last,
+                     const base::string16& last_first,
+                     const base::string16& last_conjunction,
+                     const base::string16& last_second) {
+  SCOPED_TRACE(full);
+
+  NameFull name_tree;
+  name_tree.SetValueForTypeIfPossible(NAME_FULL, full,
+                                      VerificationStatus::kObserved);
+  name_tree.CompleteFullTree();
+
+  // The full name itself must be retained.
+  EXPECT_EQ(name_tree.GetValueForType(NAME_FULL), full);
+
+  // Direct children of the full name.
+  EXPECT_EQ(name_tree.GetValueForType(NAME_HONORIFIC_PREFIX), honorific);
+  EXPECT_EQ(name_tree.GetValueForType(NAME_FIRST), first);
+  EXPECT_EQ(name_tree.GetValueForType(NAME_MIDDLE), middle);
+  EXPECT_EQ(name_tree.GetValueForType(NAME_LAST), last);
+
+  // Children of the last name.
+  EXPECT_EQ(name_tree.GetValueForType(NAME_LAST_FIRST), last_first);
+  EXPECT_EQ(name_tree.GetValueForType(NAME_LAST_CONJUNCTION),
+            last_conjunction);
+  EXPECT_EQ(name_tree.GetValueForType(NAME_LAST_SECOND), last_second);
+}
+
+// Sets |last_name| as the observed NAME_LAST value of a fresh NameLast tree,
+// completes the tree and checks the three subcomponents against the expected
+// target values.
+void TestLastNameParsing(const base::string16& last_name,
+                         const base::string16& target_first,
+                         const base::string16& target_conjunction,
+                         const base::string16& target_second) {
+  SCOPED_TRACE(last_name);
+
+  NameLast last_name_tree;
+  last_name_tree.SetValueForTypeIfPossible(NAME_LAST, last_name,
+                                           VerificationStatus::kObserved);
+  last_name_tree.CompleteFullTree();
+
+  EXPECT_EQ(last_name_tree.GetValueForType(NAME_LAST_FIRST), target_first);
+  EXPECT_EQ(last_name_tree.GetValueForType(NAME_LAST_CONJUNCTION),
+            target_conjunction);
+  EXPECT_EQ(last_name_tree.GetValueForType(NAME_LAST_SECOND), target_second);
+}
+
+}  // namespace
+
+// Tests the parsing of last names into their tree components:
+// * The first part, that is only used in Latinx/Hispanic names.
+// * The conjunction, that is optional in Latinx/Hispanic names.
+// * The second part, for Latinx/Hispanic and all other last names.
+TEST(AutofillStructuredName, ParseLastName) {
+  LastNameParserTestRecord last_name_tests[] = {
+      // "von" is a known prefix for a surname and should be therefore parsed
+      // into the second last name
+      {"von Kitzling", "", "", "von Kitzling"},
+      {"Bush", "", "", "Bush"},
+      {"Picasso", "", "", "Picasso"},
+      // Ruiz is a common Spanish name and parsing into first and second last
+      // name should be applied. "de la" are known surname prefixes and should
+      // be included into the subsequent token.
+      {"Ruiz de la Torro", "Ruiz", "", "de la Torro"},
+      {"Ruiz Picasso", "Ruiz", "", "Picasso"},
+      // "y" and "i" are known conjunctions.
+      {"Ruiz Y Picasso", "Ruiz", "Y", "Picasso"},
+      {"Ruiz y Picasso", "Ruiz", "y", "Picasso"},
+      {"Ruiz i Picasso", "Ruiz", "i", "Picasso"}};
+
+  for (const auto& last_name_test : last_name_tests) {
+    TestLastNameParsing(ASCIIToUTF16(last_name_test.last_name),
+                        ASCIIToUTF16(last_name_test.first),
+                        ASCIIToUTF16(last_name_test.conjunction),
+                        ASCIIToUTF16(last_name_test.second));
+  }
+}
+
+// Tests the parsing of full names into their subcomponents.
+TEST(AutofillStructuredName, ParseFullName) {
+  NameParserTestRecord name_tests[] = {
+      // Name starting with a last name, followed by a comma and the first and
+      // middle name.
+      {"Mueller, Hans Peter", "", "Hans", "Peter", "Mueller", "", "",
+       "Mueller"},
+      // Same as above, but with an honorific prefix and multiple
+      // middle names.
+      {"Prof. Mueller, Hans Walter Peter", "Prof.", "Hans", "Walter Peter",
+       "Mueller", "", "", "Mueller"},
+      // Name that includes a hyphen.
+      {"Dr. Hans-Peter Mueller", "Dr.", "Hans-Peter", "", "Mueller", "", "",
+       "Mueller"},
+      // Name with honorific prefix but without a middle name.
+      {"Prof. Albert Einstein", "Prof.", "Albert", "", "Einstein", "", "",
+       "Einstein"},
+      // Name with honorific prefix and a middle name.
+      {"Dr. Richard Phillips Feynman", "Dr.", "Richard", "Phillips", "Feynman",
+       "", "", "Feynman"},
+      // Name with honorific prefix and multiple middle names.
+      {"Dr. Richard Phillips Isaac Feynman", "Dr.", "Richard", "Phillips Isaac",
+       "Feynman", "", "", "Feynman"},
+      // Hispanic/Latinx name with two surnames and a conjunction.
+      {"Pablo Diego Ruiz y Picasso", "", "Pablo Diego", "", "Ruiz y Picasso",
+       "Ruiz", "y", "Picasso"},
+      // Hispanic/Latinx name with two surnames and a conjunction with an
+      // honorific prefix.
+      {"Mr. Pablo Ruiz y Picasso", "Mr.", "Pablo", "", "Ruiz y Picasso", "Ruiz",
+       "y", "Picasso"},
+      // Name with multiple middle names.
+      {"George Walker Junior Bush", "", "George", "Walker Junior", "Bush", "",
+       "", "Bush"},
+      // Name with a single middle name.
+      {"George Walker Bush", "", "George", "Walker", "Bush", "", "", "Bush"},
+      // Name without middle names.
+      {"George Bush", "", "George", "", "Bush", "", "", "Bush"},
+      // Three-character Chinese name with a two-character surname.
+      {"欧阳龙", "", "龙", "", "欧阳", "", "", "欧阳"},
+      // Four-character Chinese name with a two-character surname.
+      {"欧阳龙龙", "", "龙龙", "", "欧阳", "", "", "欧阳"},
+      // Full name including given, middle and family names.
+      {"Homer Jay Simpson", "", "Homer", "Jay", "Simpson", "", "", "Simpson"},
+      // No middle name.
+      {"Moe Szyslak", "", "Moe", "", "Szyslak", "", "", "Szyslak"},
+      // Common name prefixes parsed into the honorific prefix.
+      {"Reverend Timothy Lovejoy", "Reverend", "Timothy", "", "Lovejoy", "", "",
+       "Lovejoy"},
+      // Only a last name with a preposition.
+      {"von Gutenberg", "", "", "", "von Gutenberg", "", "", "von Gutenberg"},
+      // Common name suffixes removed.
+      {"John Frink Phd", "", "John", "", "Frink", "", "", "Frink"},
+      // Only last name with common name suffixes removed.
+      {"Frink Phd", "", "", "", "Frink", "", "", "Frink"},
+      // Since "Ma" is a common last name, "Ma" was removed from the suffixes.
+      {"John Ma", "", "John", "", "Ma", "", "", "Ma"},
+      // Common family name prefixes not considered a middle name.
+      {"Milhouse Van Houten", "", "Milhouse", "", "Van Houten", "", "",
+       "Van Houten"},
+      // Chinese name, Unihan
+      {"孫 德明", "", "德明", "", "孫", "", "", "孫"},
+      // Chinese name, Unihan, 'IDEOGRAPHIC SPACE'
+      {"孫　德明", "", "德明", "", "孫", "", "", "孫"},
+      // Korean name, Hangul
+      {"홍 길동", "", "길동", "", "홍", "", "", "홍"},
+      // Japanese name, Unihan
+      {"山田 貴洋", "", "貴洋", "", "山田", "", "", "山田"},
+      // In Japanese, foreign names use 'KATAKANA MIDDLE DOT' (U+30FB) as a
+      // separator. There is no consensus for the ordering. For now, we use
+      // the same ordering as regular Japanese names ("last・first").
+      // Foreign name in Japanese, Katakana
+      {"ゲイツ・ビル", "", "ビル", "", "ゲイツ", "", "", "ゲイツ"},
+      // 'KATAKANA MIDDLE DOT' is occasionally typoed as 'MIDDLE DOT' (U+00B7).
+      {"ゲイツ·ビル", "", "ビル", "", "ゲイツ", "", "", "ゲイツ"},
+      // CJK names don't usually have a space in the middle, but most of the
+      // time, the surname is only one character (in Chinese & Korean).
+      {"최성훈", "", "성훈", "", "최", "", "", "최"},  // Korean name, Hangul
+      // (Simplified) Chinese name, Unihan
+      {"刘翔", "", "翔", "", "刘", "", "", "刘"},
+      // (Traditional) Chinese name, Unihan
+      {"劉翔", "", "翔", "", "劉", "", "", "劉"},
+      // Korean name, Hangul
+      {"남궁도", "", "도", "", "남궁", "", "", "남궁"},
+      // Korean name, Hangul
+      {"황보혜정", "", "혜정", "", "황보", "", "", "황보"},
+      // (Traditional) Chinese name, Unihan
+      {"歐陽靖", "", "靖", "", "歐陽", "", "", "歐陽"},
+      // In Korean, some 2-character surnames are rare/ambiguous, like "강전":
+      // "강" is a common surname, and "전" can be part of a given name. In
+      // those cases, we assume it's 1/2 for 3-character names, or 2/2 for
+      // 4-character names.
+      // Korean name, Hangul
+      {"강전희", "", "전희", "", "강", "", "", "강"},
+      // Korean name, Hangul
+      {"황목치승", "", "치승", "", "황목", "", "", "황목"},
+      // It occasionally happens that a full name is 2 characters, 1/1.
+      // Korean name, Hangul
+      {"이도", "", "도", "", "이", "", "", "이"},
+      // Chinese name, Unihan
+      {"孫文", "", "文", "", "孫", "", "", "孫"}};
+
+  for (const auto& name_test : name_tests) {
+    TestNameParsing(base::UTF8ToUTF16(name_test.full),
+                    base::UTF8ToUTF16(name_test.honorific),
+                    base::UTF8ToUTF16(name_test.first),
+                    base::UTF8ToUTF16(name_test.middle),
+                    base::UTF8ToUTF16(name_test.last),
+                    base::UTF8ToUTF16(name_test.last_first),
+                    base::UTF8ToUTF16(name_test.last_conjunction),
+                    base::UTF8ToUTF16(name_test.last_second));
+  }
+}
+
+// Verifies which strings are classified as having CJK name characteristics.
+TEST(AutofillStructuredName, HasCjkNameCharacteristics) {
+  // Every entry pairs a candidate name with the expected classification.
+  const struct {
+    const char* name;
+    bool expected_cjk;
+  } kExpectations[] = {
+      {"Peterson", false},
+      {"ㅎ", true},
+      {"房仕龙", true},
+      {"房仕龙龙", true},
+      {"房仕龙", true},
+      {"房仕・龙", true},
+      {"・", false},
+      {"房・仕・龙", false},
+      // Non-CJK language with only ASCII characters.
+      {"Homer Jay Simpson", false},
+      // Non-CJK language with some ASCII characters.
+      {"Éloïse Paré", false},
+      // Non-CJK language with no ASCII characters.
+      {"Σωκράτης", false},
+      // (Simplified) Chinese name, Unihan.
+      {"刘翔", true},
+      // (Simplified) Chinese name, Unihan, with an ASCII space.
+      {"成 龙", true},
+      // Korean name, Hangul.
+      {"송지효", true},
+      // Korean name, Hangul, with an 'IDEOGRAPHIC SPACE' (U+3000).
+      {"김　종국", true},
+      // Japanese name, Unihan.
+      {"山田貴洋", true},
+      // Japanese name, Katakana, with a 'KATAKANA MIDDLE DOT' (U+30FB).
+      {"ビル・ゲイツ", true},
+      // Japanese name, Katakana, with a 'MIDDLE DOT' (U+00B7) (likely a
+      // typo).
+      {"ビル·ゲイツ", true},
+      // CJK names don't have a middle name, so a 3-part name is bogus to us.
+      {"반 기 문", false},
+  };
+
+  for (const auto& expectation : kExpectations) {
+    EXPECT_EQ(expectation.expected_cjk,
+              HasCjkNameCharacteristics(expectation.name))
+        << expectation.name;
+  }
+}
+
+// Verifies the detection of Hispanic/Latinx name characteristics.
+TEST(AutofillStructuredName, HasHispanicLatinxNameCharaceristics) {
+  const struct {
+    const char* name;
+    bool expected_match;
+  } kExpectations[] = {
+      {"Pablo Ruiz Picasso", true},
+      {"Werner Heisenberg", false},
+      {"SomeName y SomeOtherName", true},
+  };
+
+  for (const auto& expectation : kExpectations) {
+    EXPECT_EQ(expectation.expected_match,
+              HasHispanicLatinxNameCharaceristics(expectation.name))
+        << expectation.name;
+  }
+}
+
+// Verifies which middle name values are recognized as initials.
+TEST(AutofillStructuredName, HasMiddleNameInitialsCharacteristics) {
+  const struct {
+    const char* middle_name;
+    bool expected_initials;
+  } kExpectations[] = {
+      {"Diego", false}, {"d", false},   {"D", true},        {"DD", true},
+      {"D.D.", true},   {"D. D. D.", true}, {"D-D", true}, {"D.-D.", true},
+  };
+
+  for (const auto& expectation : kExpectations) {
+    EXPECT_EQ(expectation.expected_initials,
+              HasMiddleNameInitialsCharacteristics(expectation.middle_name))
+        << expectation.middle_name;
+  }
+}
+
+// Verifies that names are reduced to their expected initials.
+TEST(AutofillStructuredName, ReduceToInitials) {
+  const struct {
+    const char* name;
+    const char* expected_initials;
+  } kCases[] = {
+      {"", ""},
+      {"George", "G"},
+      {"George Walker", "GW"},
+      {"michael myers", "MM"},
+      {"Hans-Peter", "HP"},
+  };
+
+  for (const auto& test_case : kCases) {
+    EXPECT_EQ(ReduceToInitials(base::ASCIIToUTF16(test_case.name)),
+              base::ASCIIToUTF16(test_case.expected_initials))
+        << test_case.name;
+  }
+}
+
+// Verifies the value returned for |NAME_MIDDLE_INITIAL| after |NAME_MIDDLE|
+// has been set.
+TEST(AutofillStructuredName, GetNameMiddleInitial) {
+  const struct {
+    const char* middle_name;
+    const char* expected_initial;
+  } kCases[] = {
+      {"Michael", "M"},
+      {"Michael Myers", "MM"},
+      {"george walker", "GW"},
+      // If the set value already has the characteristics of initials, the
+      // value should be returned as it is.
+      {"GW", "GW"},
+      {"G. W.", "G. W."},
+      {"G.-W.", "G.-W."},
+  };
+
+  // The same component is reused for all cases, in the order listed above.
+  NameFull full_name;
+  for (const auto& test_case : kCases) {
+    full_name.SetValueForTypeIfPossible(
+        NAME_MIDDLE, base::ASCIIToUTF16(test_case.middle_name),
+        VerificationStatus::kObserved);
+
+    EXPECT_EQ(full_name.GetValueForType(NAME_MIDDLE_INITIAL),
+              base::ASCIIToUTF16(test_case.expected_initial))
+        << test_case.middle_name;
+  }
+}
+
+// Verifies the set of field types that |NameFull| reports as supported.
+TEST(AutofillStructuredName, TestGetSupportedTypes) {
+  const ServerFieldTypeSet expected_types(
+      {NAME_FULL, NAME_HONORIFIC_PREFIX, NAME_FIRST, NAME_MIDDLE,
+       NAME_MIDDLE_INITIAL, NAME_LAST, NAME_LAST_FIRST, NAME_LAST_CONJUNCTION,
+       NAME_LAST_SECOND});
+
+  NameFull full_name;
+  ServerFieldTypeSet reported_types;
+  full_name.GetSupportedTypes(&reported_types);
+
+  EXPECT_EQ(expected_types, reported_types);
+}
+
+// Verifies that a value set for |NAME_MIDDLE_INITIAL| can be read back both as
+// |NAME_MIDDLE_INITIAL| and as |NAME_MIDDLE|.
+TEST(AutofillStructuredName, TestSettingMiddleNameInitial) {
+  const base::string16 initial = base::UTF8ToUTF16("M");
+
+  NameFull full_name;
+  // Before anything is set, the middle name reads as an empty string.
+  EXPECT_EQ(full_name.GetValueForType(NAME_MIDDLE), base::string16());
+
+  EXPECT_TRUE(full_name.SetValueForTypeIfPossible(
+      NAME_MIDDLE_INITIAL, initial, VerificationStatus::kObserved));
+
+  EXPECT_EQ(full_name.GetValueForType(NAME_MIDDLE_INITIAL), initial);
+  EXPECT_EQ(full_name.GetValueForType(NAME_MIDDLE), initial);
+}
+
+}  // namespace structured_address
+}  // namespace autofill
--- a/components/autofill/core/browser/data_model/autofill_structured_address_regex_provider.cc
+++ b/components/autofill/core/browser/data_model/autofill_structured_address_regex_provider.cc
@@ -3,7 +3,10 @@
 // found in the LICENSE file.

 #include "components/autofill/core/browser/data_model/autofill_structured_address_regex_provider.h"
+
 #include <utility>
+
+#include "base/strings/strcat.h"
 #include "components/autofill/core/browser/data_model/autofill_structured_address_constants.h"
 #include "components/autofill/core/browser/data_model/autofill_structured_address_utils.h"

@@ -13,6 +16,317 @@ namespace autofill {

 namespace structured_address {

+namespace {
+
+// Best practices for writing regular expression snippets:
+// By wrapping snippets in non-capture groups, i.e. (?: ... ), we ensure that a
+// pending "?" is interpreted as "optional" instead of a modifier of a previous
+// operator. E.g. `StrCat({"(?:a+)", "?"})` means an optional sequence of "a"
+// characters. But `StrCat({"a+", "?"})` means lazily match one or more "a"
+// characters. Prefer [^\s,] ('not a whitespace or a comma') over \w ('a word
+// character') in names, when you have concerns about hyphens (e.g. the German
+// name "Hans-Joachim") because '-' is not matched by \w.
+
+// Regular expression pattern of common two-character CJK last names.
+// Korean names are written in Hangul.
+// Chinese names are written in their traditional and simplified versions.
+// Sources:
+// https://en.wikipedia.org/wiki/List_of_Korean_surnames
+// https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93
+const char kTwoCharacterCjkLastNamesRe[] =
+    "(?:남궁|사공|서문|선우|제갈|황보|독고|망절"
+    "|欧阳|令狐|皇甫|上官|司徒|诸葛|司马|宇文|呼延|端木"
+    "|張簡|歐陽|諸葛|申屠|尉遲|司馬|軒轅|夏侯)";
+
+// Regular expression pattern matching exactly one Hangul (Korean) character.
+const char kHangulCharacterRe[] = "(?:\\p{Hangul})";
+
+// Regular expression pattern matching one or more Hangul (Korean) characters.
+const char kHangulCharactersRe[] = "(?:\\p{Hangul}+)";
+
+// Regular expression pattern to match separators as used in CJK names:
+// \u30FB, \u00B7, \u3000, or a run of one or more whitespace characters.
+const char kCjkNameSeperatorsRe[] = "(?:・|·|　|\\s+)";
+
+// Regular expression pattern for common honorific name prefixes.
+// The list is incomplete and focused on the English and German languages.
+// Sources:
+// * https://en.wikipedia.org/wiki/English_honorifics
+// * https://en.wikipedia.org/wiki/German_honorifics
+// TODO(crbug.com/1107770): Include more languages and categories.
+const char kHonorificPrefixRe[] =
+    "(?:"
+    "Master|Mr\\.?|Miss\\.?|Mrs\\.?|Missus|Ms\\.?|Mx\\.?|M\\.?|Ma'am|Sir|"
+    "Gentleman|Sire|Mistress|Madam|Ma'am|Dame|Lord|Lady|Esq|Excellency|"
+    "Excellence|Her Honour|His Honour|Hon\\.?|The Right Honourable|The Most "
+    "Honourable|Dr\\.?|PhD|DPhil|MD|DO|Prof\\.|Professor|QC|CL|Chancellor|Vice-"
+    "Chancellor|Principle|Principal|President|Master|Warden|Dean|Regent|Rector|"
+    "Provost|Director|Chief Executive|Imām|Shaykh|Muftī|Hāfiz|Hāfizah|Qārī"
+    "|Mawlānā|Hājī|Sayyid|Sayyidah|Sharif|Eminent|Venerable|His Holiness"
+    "|His Holiness|His All Holiness|His Beatitude|The Most Blessed"
+    "|His Excellency|His Most Eminent Highness|His Eminence"
+    "|Most Reverend Eminence|The Most Reverend|His Grace|His Lordship"
+    "|The Reverend|Fr|Pr|Br|Sr|Elder|Rabbi|The Reverend|Cantor|Chief Rabbi"
+    "|Grand "
+    "Rabbi|Rebbetzin|Herr|Frau|Fräulein|Dame|PD|Doktor|Magister|Ingenieur"
+    "|1lt|1st|2lt|2nd|3rd|admiral|capt|captain|col|cpt|dr|gen|general|lcdr"
+    "|lt|ltc|ltg|ltjg|maj|major|mg|pastor|prof|rep|reverend"
+    "|rev|sen|st)";
+
+// Regular expression pattern for a last name suffix that may be absent.
+const char kOptionalLastNameSuffixRe[] =
+    "(?:b\\.a|ba|d\\.d\\.s|dds|ii|iii|iv|ix|jr|m\\.a|m\\.d|md|ms|"
+    "ph\\.?d|sr|v|vi|vii|viii|x)?";
+
+// Regular expression pattern matching exactly one CJK character.
+const char kCjkCharacterRe[] =
+    "(?:"
+    "\\p{Han}|"
+    "\\p{Hangul}|"
+    "\\p{Katakana}|"
+    "\\p{Hiragana}|"
+    "\\p{Bopomofo})";
+
+// Regular expression pattern matching one or more CJK characters.
+const char kCjkCharactersRe[] =
+    "(?:(?:"
+    "\\p{Han}|"
+    "\\p{Hangul}|"
+    "\\p{Katakana}|"
+    "\\p{Hiragana}|"
+    "\\p{Bopomofo})+)";
+
+// Regular expression pattern of common two-character Korean last names.
+// Korean last names are written in Hangul. Note that some last names are
+// ambiguous in the sense that they share a common prefix with a
+// single-character last name. Source: https://en.wikipedia.org/wiki/List_of_Korean_surnames
+const char kTwoCharacterKoreanNamesRe[] =
+    "(?:강전|남궁|독고|동방|망절|사공|서문|선우"
+    "|소봉|어금|장곡|제갈|황목|황보)";
+
+// Regular expression pattern to match if a string contains a common
+// Hispanic/Latinx last name.
+// It lists the most common last names in Spain, Mexico, Cuba, the Dominican
+// Republic, Puerto Rico and Guatemala.
+// Source: https://en.wikipedia.org/wiki/List_of_common_Spanish_surnames
+const char kHispanicCommonLastNameCharacteristicsRe[] =
+    "(?:Aguilar|Alonso|Álvarez|Amador|Betancourt|Blanco|Burgos|Castillo|Castro|"
+    "Chávez|Colón|Contreras|Cortez|Cruz|Delgado|Diaz|Díaz|Domínguez|Estrada|"
+    "Fernandez|Fernández|Flores|Fuentes|Garcia|García|Garza|Gil|Gómez|González|"
+    "Guerrero|Gutiérrez|Guzmán|Hernández|Herrera|Iglesias|Jiménez|Juárez|Lopez|"
+    "López|Luna|Marín|Marroquín|Martín|Martinez|Martínez|Medina|Méndez|Mendoza|"
+    "Molina|Morales|Moreno|Muñoz|Narvaez|Navarro|Núñez|Ortega|Ortiz|Ortíz|Peña|"
+    "Perez|Pérez|Ramírez|Ramos|Reyes|Rivera|Rodriguez|Rodríguez|Rojas|Romero|"
+    "Rosario|Rubio|Ruiz|Ruíz|Salazar|Sanchez|Sánchez|Santana|Santiago|Santos|"
+    "Sanz|Serrano|Soto|Suárez|Toro|Torres|Vargas|Vasquez|Vásquez|Vázquez|"
+    "Velásquez)";
+
+// Regular expression pattern for one word: no whitespace and no comma.
+const char kSingleWordRe[] = "(?:[^\\s,]+)";
+
+// Regular expression pattern for multiple lazy words, meaning that the
+// expression avoids matching more than one word if possible.
+const char kMultipleLazyWordsRe[] = "(?:[^\\s,]+(?:\\s+[^\\s,]+)*?)";
+
+// Regular expression pattern to check if a name contains a Hispanic/Latinx
+// last name conjunction ("y", "e" or "i") with whitespace on both sides.
+const char kHispanicLastNameConjunctionCharacteristicsRe[] = "\\s(y|e|i)\\s";
+
+// Regular expression pattern to match the conjunctions ("y", "e", "i") used
+// between Hispanic/Latinx last names.
+const char kHispanicLastNameConjunctionsRe[] = "(?:y|e|i)";
+
+// Regular expression pattern to match common prefixes belonging to a (single)
+// last name.
+// Source: https://en.wikipedia.org/wiki/List_of_family_name_affixes
+// According to the source, the list is partial. Changes to the list:
+// * "De la" and "De le" are added to support the combination of "de" and
+// "le"/"la" as used in Hispanic/Latinx names.
+// * The matching of "i" is made lazy to give the last name conjunction
+// precedence.
+const char kOptionalLastNamePrefixRe[] =
+    "(?:(?:"
+    "a|ab|af|av|ap|abu|aït|al|ālam|aust|austre|bar|bath|bat|ben|bin|ibn|bet|"
+    "bint|binti|binte|da|das|de|degli|dele|del|du|della|der|di|dos|du|e|el|"
+    "fetch|vetch|fitz|i??|kil|gil|de le|de "
+    "la|la|le|lille|lu|m|mac|mc|mck|mhic|mic|mala|"
+    "mellom|myljom|na|ned|nedre|neder|nic|ni|nin|nord|norr|ny|o|ua|"
+    "ui|opp|upp|öfver|ost|öst|öster|øst|øst|østre|över|øvste|øvre|øver|öz|pour|"
+    "putra|putri|setia|tor|söder|sør|sønder|sør|syd|søndre|syndre|søre|ter|ter|"
+    "tre|van|väst|väster|verch|erch|vest|vestre|vesle|vetle|von|zu|von und "
+    "zu)\\s)?";
+
+// Regular expression to characterize if a string contains initials by
+// checking that:
+// * The string contains only upper case letters, each of which may be followed
+// by a point.
+// * Between each letter, there can be a space or a hyphen.
+const char kMiddleNameInitialsCharacteristicsRe[] =
+    "^(?:[A-Z]\\.?(?:(?:\\s|-)?[A-Z]\\.?)*)$";
+
+// Builds the expression for a CJK name whose two parts are divided by a
+// separator. The whole match is captured as |NAME_FULL|; the CJK characters in
+// front of the separator are captured as |NAME_LAST| and the CJK characters
+// behind it as |NAME_FIRST|.
+std::string ParseSeparatedCJkNameExpression() {
+  // One or more CJK characters, terminated by a CJK name separator.
+  const std::string last_name = CaptureTypeWithPattern(
+      NAME_LAST, kCjkCharactersRe, {.separator = kCjkNameSeperatorsRe});
+  // The CJK characters that follow form the first name.
+  const std::string first_name =
+      CaptureTypeWithPattern(NAME_FIRST, kCjkCharactersRe);
+
+  return CaptureTypeWithPattern(NAME_FULL, {last_name, first_name});
+}
+
+// Builds the expression for a CJK name that begins with one of the known
+// two-character last names. The whole match is captured as |NAME_FULL|.
+std::string ParseCommonCjkTwoCharacterLastNameExpression() {
+  // The known two-character last name goes into |NAME_LAST|; no separator is
+  // expected behind it.
+  const std::string last_name = CaptureTypeWithPattern(
+      NAME_LAST, kTwoCharacterCjkLastNamesRe, {.separator = std::string()});
+  // Any CJK characters left over go into |NAME_FIRST|; this part is optional.
+  const std::string first_name = CaptureTypeWithPattern(
+      NAME_FIRST, kCjkCharactersRe,
+      {.separator = "", .quantifier = MATCH_OPTIONAL});
+
+  return CaptureTypeWithPattern(NAME_FULL, {last_name, first_name});
+}
+
+// Builds the expression for a CJK name that contains no separator.
+// The whole match is captured as |NAME_FULL|, its first character as
+// |NAME_LAST| and everything after that as |NAME_FIRST|.
+std::string ParseCjkSingleCharacterLastNameExpression() {
+  // Exactly one CJK character, with no separator behind it.
+  const std::string last_name = CaptureTypeWithPattern(
+      NAME_LAST, kCjkCharacterRe, {.separator = std::string()});
+  // The optional rest of the CJK characters.
+  const std::string first_name = CaptureTypeWithPattern(
+      NAME_FIRST, kCjkCharactersRe,
+      {.separator = "", .quantifier = MATCH_OPTIONAL});
+
+  return CaptureTypeWithPattern(NAME_FULL, {last_name, first_name});
+}
+
+// Builds the expression for a Korean name of four or more characters that
+// begins with a common two-character Korean last name. The whole match is
+// captured as |NAME_FULL|, the two leading characters as |NAME_LAST| and the
+// remainder as |NAME_FIRST|.
+std::string ParseKoreanTwoCharacterLastNameExpression() {
+  // A known two-character last name, with no separator behind it.
+  const std::string last_name = CaptureTypeWithPattern(
+      NAME_LAST, kTwoCharacterKoreanNamesRe, {.separator = std::string()});
+  // One Hangul character followed by one or more Hangul characters, i.e. a
+  // first name of at least two characters.
+  const std::string first_name = CaptureTypeWithPattern(
+      NAME_FIRST, {kHangulCharacterRe, kHangulCharactersRe});
+
+  return CaptureTypeWithPattern(NAME_FULL, {last_name, first_name});
+}
+
+// Builds the anchored expression that decides whether a name has the
+// characteristics of a CJK name.
+std::string MatchCjkNameExpression() {
+  // An optional second part: a separator followed by one or more CJK
+  // characters.
+  const std::string optional_second_part =
+      base::StrCat({"(", kCjkNameSeperatorsRe, kCjkCharactersRe, ")?"});
+
+  // The name has to start with one or more CJK characters and nothing may
+  // follow the optional second part.
+  return base::StrCat({"^", kCjkCharactersRe, optional_second_part, "$"});
+}
+
+// Builds the expression for a full name that consists of a last name only.
+std::string ParseOnlyLastNameExpression() {
+  // An optional last name prefix followed by a single word.
+  const std::string last_name = CaptureTypeWithPattern(
+      NAME_LAST, {kOptionalLastNamePrefixRe, kSingleWordRe});
+
+  // The optional suffix belongs to |NAME_FULL| but not to |NAME_LAST|.
+  return CaptureTypeWithPattern(NAME_FULL,
+                                {last_name, kOptionalLastNameSuffixRe});
+}
+
+// Builds the expression for a name made of first, middle and last name that
+// may start with an honorific prefix. The whole match is captured as
+// |NAME_FULL| and the honorific prefix, if present, as
+// |NAME_HONORIFIC_PREFIX|. |NAME_LAST| is a single word that may be preceded
+// by a last name prefix like "Mac" or "von". Of the words in front of the last
+// name, the first one is captured as |NAME_FIRST| and the others as
+// |NAME_MIDDLE|. An optional last name suffix may close the name.
+std::string ParseFirstMiddleLastNameExpression() {
+  const std::string honorific_prefix =
+      CaptureTypeWithPattern(NAME_HONORIFIC_PREFIX, kHonorificPrefixRe,
+                             {.quantifier = MATCH_OPTIONAL});
+  const std::string first_name = CaptureTypeWithPattern(
+      NAME_FIRST, kSingleWordRe, {.quantifier = MATCH_OPTIONAL});
+  const std::string middle_names = CaptureTypeWithPattern(
+      NAME_MIDDLE, kMultipleLazyWordsRe, {.quantifier = MATCH_LAZY_OPTIONAL});
+  const std::string last_name = CaptureTypeWithPattern(
+      NAME_LAST, {kOptionalLastNamePrefixRe, kSingleWordRe});
+
+  return CaptureTypeWithPattern(
+      NAME_FULL, {honorific_prefix, first_name, middle_names, last_name,
+                  kOptionalLastNameSuffixRe});
+}
+
+// Builds the expression for a name that is written as last name, comma, and
+// then the first and middle names. The whole match is captured as |NAME_FULL|.
+// The name may start with an honorific prefix that is captured as
+// |NAME_HONORIFIC_PREFIX|. It is followed by the last name, captured as
+// |NAME_LAST|, which must be followed by a comma that may be surrounded by
+// spaces. The next word is captured as |NAME_FIRST| and all remaining words as
+// |NAME_MIDDLE|.
+std::string ParseLastCommaFirstMiddleExpression() {
+  const std::string honorific_prefix =
+      CaptureTypeWithPattern(NAME_HONORIFIC_PREFIX, kHonorificPrefixRe,
+                             {.quantifier = MATCH_OPTIONAL});
+  // The comma acts as the separator that terminates the last name.
+  const std::string last_name = CaptureTypeWithPattern(
+      NAME_LAST, {kOptionalLastNamePrefixRe, kSingleWordRe},
+      {.separator = "\\s*,\\s*"});
+  const std::string first_name = CaptureTypeWithPattern(
+      NAME_FIRST, kSingleWordRe, {.quantifier = MATCH_OPTIONAL});
+  const std::string middle_names = CaptureTypeWithPattern(
+      NAME_MIDDLE, kMultipleLazyWordsRe, {.quantifier = MATCH_LAZY_OPTIONAL});
+
+  return CaptureTypeWithPattern(
+      NAME_FULL, {honorific_prefix, last_name, first_name, middle_names});
+}
+
+// Builds the expression for a Hispanic/Latinx last name made of two parts
+// with an optional conjunction in between. The whole match is captured as
+// |NAME_LAST|, the first part as |NAME_LAST_FIRST|, the conjunction as
+// |NAME_LAST_CONJUNCTION| and the second part as |NAME_LAST_SECOND|.
+// Each part is a single word that may carry a prefix like "de le".
+std::string ParseHispanicLastNameExpression() {
+  const std::string first_last_name = CaptureTypeWithPattern(
+      NAME_LAST_FIRST, {kOptionalLastNamePrefixRe, kSingleWordRe});
+  const std::string conjunction = CaptureTypeWithPattern(
+      NAME_LAST_CONJUNCTION, kHispanicLastNameConjunctionsRe,
+      {.quantifier = MATCH_OPTIONAL});
+  const std::string second_last_name = CaptureTypeWithPattern(
+      NAME_LAST_SECOND, {kOptionalLastNamePrefixRe, kSingleWordRe});
+
+  return CaptureTypeWithPattern(
+      NAME_LAST, {first_last_name, conjunction, second_last_name});
+}
+
+// Builds the expression for a full Hispanic/Latinx name: an optional
+// honorific prefix, a lazily matched optional first name, and a last name in
+// the form described by |ParseHispanicLastNameExpression()|.
+std::string ParseHispanicFullNameExpression() {
+  const std::string honorific_prefix =
+      CaptureTypeWithPattern(NAME_HONORIFIC_PREFIX, kHonorificPrefixRe,
+                             {.quantifier = MATCH_OPTIONAL});
+  const std::string first_name = CaptureTypeWithPattern(
+      NAME_FIRST, kMultipleLazyWordsRe, {.quantifier = MATCH_LAZY_OPTIONAL});
+  const std::string last_name = ParseHispanicLastNameExpression();
+
+  return CaptureTypeWithPattern(NAME_FULL,
+                                {honorific_prefix, first_name, last_name});
+}
+
+// Builds the expression that captures the complete |NAME_LAST| value as
+// |NAME_LAST_SECOND|.
+std::string ParseLastNameIntoSecondLastNameExpression() {
+  const std::string second_last_name =
+      CaptureTypeWithPattern(NAME_LAST_SECOND, kMultipleLazyWordsRe);
+
+  return CaptureTypeWithPattern(NAME_LAST, {second_last_name});
+}
+
+}  // namespace
+
 StructuredAddressesRegExProvider::StructuredAddressesRegExProvider() = default;

 // static
@@ -27,6 +341,34 @@ std::string StructuredAddressesRegExProvider::GetPattern(
  switch (expression_identifier) {
    case RegEx::kSingleWord:
      return kSingleWordRe;
+    case RegEx::kParseSeparatedCjkName:
+      return ParseSeparatedCJkNameExpression();
+    case RegEx::kParseCommonCjkTwoCharacterLastName:
+      return ParseCommonCjkTwoCharacterLastNameExpression();
+    case RegEx::kParseKoreanTwoCharacterLastName:
+      return ParseKoreanTwoCharacterLastNameExpression();
+    case RegEx::kParseCjkSingleCharacterLastName:
+      return ParseCjkSingleCharacterLastNameExpression();
+    case RegEx::kMatchHispanicCommonNameCharacteristics:
+      return kHispanicCommonLastNameCharacteristicsRe;
+    case RegEx::kMatchHispanicLastNameConjuctionCharacteristics:
+      return kHispanicLastNameConjunctionCharacteristicsRe;
+    case RegEx::kMatchCjkNameCharacteristics:
+      return MatchCjkNameExpression();
+    case RegEx::kParseOnlyLastName:
+      return ParseOnlyLastNameExpression();
+    case RegEx::kParseLastCommaFirstMiddleName:
+      return ParseLastCommaFirstMiddleExpression();
+    case RegEx::kParseFirstMiddleLastName:
+      return ParseFirstMiddleLastNameExpression();
+    case RegEx::kParseHispanicLastName:
+      return ParseHispanicLastNameExpression();
+    case RegEx::kParseHispanicFullName:
+      return ParseHispanicFullNameExpression();
+    case RegEx::kMatchMiddleNameInitialsCharacteristics:
+      return kMiddleNameInitialsCharacteristicsRe;
+    case RegEx::kParseLastNameIntoSecondLastName:
+      return ParseLastNameIntoSecondLastNameExpression();
  }
  NOTREACHED();
 }

--- a/components/autofill/core/browser/data_model/autofill_structured_address_regex_provider.h
+++ b/components/autofill/core/browser/data_model/autofill_structured_address_regex_provider.h
@@ -20,7 +20,21 @@ namespace structured_address {
 // values in an AddressComponent tree.
 enum class RegEx {
  kSingleWord,
-  kLastRegEx = kSingleWord,
+  kParseSeparatedCjkName,
+  kParseCommonCjkTwoCharacterLastName,
+  kParseKoreanTwoCharacterLastName,
+  kParseCjkSingleCharacterLastName,
+  kMatchCjkNameCharacteristics,
+  kMatchHispanicCommonNameCharacteristics,
+  kMatchHispanicLastNameConjuctionCharacteristics,
+  kParseOnlyLastName,
+  kParseLastCommaFirstMiddleName,
+  kParseFirstMiddleLastName,
+  kParseHispanicLastName,
+  kParseHispanicFullName,
+  kParseLastNameIntoSecondLastName,
+  kMatchMiddleNameInitialsCharacteristics,
+  kLastRegEx = kMatchMiddleNameInitialsCharacteristics,
 };

 // This singleton class builds and caches the regular expressions for value
@@ -73,4 +87,5 @@ class StructuredAddressesRegExProvider {
 }  // namespace structured_address

 }  // namespace autofill
-#endif  // COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_PATTERN_REGEX_H_
+
+#endif  // COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_REGEX_PROVIDER_H_
--- a/components/autofill/core/browser/data_model/autofill_structured_address_utils.cc
+++ b/components/autofill/core/browser/data_model/autofill_structured_address_utils.cc
@@ -13,6 +13,7 @@
 #include "base/debug/alias.h"
 #include "base/debug/dump_without_crashing.h"
 #include "base/strings/strcat.h"
+#include "components/autofill/core/browser/data_model/autofill_structured_address_regex_provider.h"

 namespace autofill {
 namespace structured_address {
@@ -45,9 +46,13 @@ const RE2* Re2RegExCache::GetRegEx(const std::string& pattern) {
  return result.first->second.get();
 }

-std::unique_ptr<const RE2> BuildRegExFromPattern(std::string pattern) {
+std::unique_ptr<const RE2> BuildRegExFromPattern(const std::string& pattern) {
  RE2::Options opt;
-  opt.set_case_sensitive(false);
+  // By default, patterns are case sensitive.
+  // Note that the named-capture-group patterns built with
+  // |CaptureTypeWithPattern()| apply a flag to make the matching case
+  // insensitive.
+  opt.set_case_sensitive(true);

  auto regex = std::make_unique<const RE2>(pattern, opt);

@@ -108,12 +113,17 @@ bool ParseValueByRegularExpression(
  return true;
 }

+// Matches |value| against the compiled expression that the regex provider
+// returns for the enumerated |regex|.
+bool IsPartialMatch(const std::string& value, RegEx regex) {
+  return IsPartialMatch(
+      value, StructuredAddressesRegExProvider::Instance()->GetRegEx(regex));
+}
+
 bool IsPartialMatch(const std::string& value, const std::string& pattern) {
-  const RE2* regex = Re2RegExCache::Instance()->GetRegEx(pattern);
-  if (!regex || !regex->ok())
-    return false;
+  return IsPartialMatch(value, Re2RegExCache::Instance()->GetRegEx(pattern));
+}

-  return RE2::PartialMatch(value, *regex);
+bool IsPartialMatch(const std::string& value, const RE2* expression) {
+  // Keep the guard that the pattern-based overload had before it started
+  // delegating here: a null or malformed expression matches nothing instead of
+  // being dereferenced.
+  if (!expression || !expression->ok())
+    return false;
+
+  return RE2::PartialMatch(value, *expression);
 }

 std::vector<std::string> GetAllPartialMatches(const std::string& value,
@@ -172,7 +182,8 @@ std::string CaptureTypeWithPattern(const ServerFieldType& type,
      quantifier = "";
  }

-  return base::StrCat({"(?:(?P<", AutofillType(type).ToString(), ">", pattern,
+  // By adding an "i" in the first group, the capturing is case insensitive.
+  return base::StrCat({"(?i:(?P<", AutofillType(type).ToString(), ">", pattern,
                       ")(?:", options.separator, "))", quantifier});
 }


--- a/components/autofill/core/browser/data_model/autofill_structured_address_utils.h
+++ b/components/autofill/core/browser/data_model/autofill_structured_address_utils.h
@@ -21,6 +21,8 @@
 namespace autofill {
 namespace structured_address {

+enum class RegEx;
+
 // Enum to express the few quantifiers needed to parse values.
 enum MatchQuantifier {
  // The capture group is required.
@@ -39,7 +41,7 @@ struct CaptureOptions {
  // By default, a group must be either followed by a space-like character (\s)
  // or it must be the last group in the line. The separator is allowed to be
  // empty.
-  std::string separator = "\\s|$";
+  std::string separator = "\\s+|$";
  // Indicates if the group is required, optional or even lazy optional.
  MatchQuantifier quantifier = MATCH_REQUIRED;
 };
@@ -55,7 +57,7 @@ class Re2RegExCache {
  static Re2RegExCache* Instance();

  // Returns a pointer to a constant compiled expression that matches |pattern|
-  // case-insensitively.
+  // case-sensitively.
  const RE2* GetRegEx(const std::string& pattern);

 #ifdef UNIT_TEST
@@ -96,12 +98,19 @@ bool ParseValueByRegularExpression(
    const RE2* regex,
    std::map<std::string, std::string>* result_map);

-// Returns a compiled case insensitive regular expression for |pattern|.
-std::unique_ptr<const RE2> BuildRegExFromPattern(std::string pattern);
+// Returns a compiled case sensitive regular expression for |pattern|.
+std::unique_ptr<const RE2> BuildRegExFromPattern(const std::string& pattern);
+
+// Returns true if |value| can be matched by the enumerated RegEx |regex|.
+bool IsPartialMatch(const std::string& value, RegEx regex);

 // Returns true if |value| can be matched with |pattern|.
 bool IsPartialMatch(const std::string& value, const std::string& pattern);

+// Same as above, but accepts a compiled regular expression instead of the
+// pattern.
+bool IsPartialMatch(const std::string& value, const RE2* expression);
+
 // Returns a vector that contains all partial matches of |pattern| in |value|;
 std::vector<std::string> GetAllPartialMatches(const std::string& value,
                                              const std::string& pattern);

--- a/components/autofill/core/browser/data_model/autofill_structured_address_utils_unittest.cc
+++ b/components/autofill/core/browser/data_model/autofill_structured_address_utils_unittest.cc
@@ -193,17 +193,17 @@ TEST(AutofillStructuredAddressUtils, TestGetPlaceholderToken) {
 }

 TEST(AutofillStructuredAddressUtils, CaptureTypeWithPattern) {
-  EXPECT_EQ("(?:(?P<NAME_FULL>abs\\w)(?:\\s|$))?",
+  EXPECT_EQ("(?i:(?P<NAME_FULL>abs\\w)(?:\\s+|$))?",
            CaptureTypeWithPattern(NAME_FULL, {"abs", "\\w"},
                                   {.quantifier = MATCH_OPTIONAL}));
-  EXPECT_EQ("(?:(?P<NAME_FULL>abs\\w)(?:\\s|$))",
+  EXPECT_EQ("(?i:(?P<NAME_FULL>abs\\w)(?:\\s+|$))",
            CaptureTypeWithPattern(NAME_FULL, {"abs", "\\w"}));
-  EXPECT_EQ("(?:(?P<NAME_FULL>abs\\w)(?:\\s|$))??",
+  EXPECT_EQ("(?i:(?P<NAME_FULL>abs\\w)(?:\\s+|$))??",
            CaptureTypeWithPattern(NAME_FULL, "abs\\w",
                                   {.quantifier = MATCH_LAZY_OPTIONAL}));
-  EXPECT_EQ("(?:(?P<NAME_FULL>abs\\w)(?:\\s|$))",
+  EXPECT_EQ("(?i:(?P<NAME_FULL>abs\\w)(?:\\s+|$))",
            CaptureTypeWithPattern(NAME_FULL, "abs\\w"));
-  EXPECT_EQ("(?:(?P<NAME_FULL>abs\\w)(?:_))",
+  EXPECT_EQ("(?i:(?P<NAME_FULL>abs\\w)(?:_))",
            CaptureTypeWithPattern(NAME_FULL, "abs\\w", {.separator = "_"}));
 }