[Autofill][Slimshady] merging of names that contain one token less

Change-Id: I90f75fd591537eeb1c6b23892a4703289af2872f Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2367612 Reviewed-by: Christoph Schwering <schwering@google.com> Commit-Queue: Matthias Körber <koerber@google.com> Cr-Commit-Position: refs/heads/master@{#800452}

[Autofill][Slimshady] merging of names that contain one token less
Change-Id: I90f75fd591537eeb1c6b23892a4703289af2872f Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2367612 Reviewed-by: Christoph Schwering <schwering@google.com> Commit-Queue: Matthias Körber <koerber@google.com> Cr-Commit-Position: refs/heads/master@{#800452}
3738a022 · Matthias Körber · Commit Bot · 48c2e8ef · 3738a022 · 3738a022
Commit 3738a022 authored Aug 21, 2020 by Matthias Körber Committed by Commit Bot Aug 21, 2020
8 changed files
--- a/components/autofill/core/browser/data_model/autofill_structured_address_component.cc
+++ b/components/autofill/core/browser/data_model/autofill_structured_address_component.cc
@@ -9,6 +9,7 @@
 #include <string>
 #include <utility>
+#include "base/feature_list.h"
 #include "base/notreached.h"
 #include "base/strings/strcat.h"
 #include "base/strings/string16.h"
@@ -20,6 +21,7 @@
 #include "components/autofill/core/browser/data_model/autofill_structured_address_constants.h"
 #include "components/autofill/core/browser/data_model/autofill_structured_address_utils.h"
 #include "components/autofill/core/browser/field_types.h"
+#include "components/autofill/core/common/autofill_features.h"
 namespace autofill {
@@ -618,8 +620,19 @@ bool AddressComponent::IsMergeableWithComponent(
  if (*this == newer_component)
    return true;
-  return AreSortedTokensEqual(GetSortedTokens(),
+  SortedTokenComparisonResult token_comparison_result =
-                              newer_component.GetSortedTokens());
+      CompareSortedTokens(GetSortedTokens(), newer_component.GetSortedTokens());
+  if (token_comparison_result.status == MATCH)
+    return true;
+  if (base::FeatureList::IsEnabled(
+          features::kAutofillEnableSupportForMergingSubsetNames)) {
+    if (token_comparison_result.status == SINGLE_TOKEN_SUPERSET)
+      return true;
+  }
+  return false;
 }
 bool AddressComponent::MergeWithComponent(
@@ -628,19 +641,32 @@ bool AddressComponent::MergeWithComponent(
  if (*this == newer_component)
    return true;
-  if (!IsMergeableWithComponent(newer_component))
+  SortedTokenComparisonResult token_comparison_result =
-    return false;
+      CompareSortedTokens(GetSortedTokens(), newer_component.GetSortedTokens());
-  // Applies the merging strategy for two token-equivalent components.
+  switch (token_comparison_result.status) {
-  if (AreSortedTokensEqual(GetSortedTokens(),
+    case MATCH:
-                           newer_component.GetSortedTokens())) {
      return MergeTokenEquivalentComponent(newer_component);
+    case SINGLE_TOKEN_SUPERSET:
+      if (base::FeatureList::IsEnabled(
+              features::kAutofillEnableSupportForMergingSubsetNames)) {
+        return MergeSubsetComponent(newer_component, token_comparison_result);
+      }
+      break;
+    default:
+      return false;
  }
  return false;
 }
 bool AddressComponent::MergeTokenEquivalentComponent(
    const AddressComponent& newer_component) {
+  if (!AreSortedTokensEqual(GetSortedTokens(),
+                            newer_component.GetSortedTokens()))
+    return false;
  // Assumption:
  // The values of both components are a permutation of the same tokens.
  // The componentization of the components can be different in terms of
@@ -690,7 +716,7 @@ bool AddressComponent::MergeTokenEquivalentComponent(
  const std::vector<AddressComponent*> other_subcomponents =
      newer_component.Subcomponents();
-  DCHECK(subcomponents_.size() == newer_component.Subcomponents().size());
+  DCHECK(subcomponents_.size() == other_subcomponents.size());
  int this_component_verification_score = 0;
  int newer_component_verification_score = 0;
@@ -705,7 +731,8 @@ bool AddressComponent::MergeTokenEquivalentComponent(
    // If the components can't be merged directly, store the unmerged index and
    // sum the verification scores to decide which component's substructure to
    // use.
-    if (!subcomponents_[i]->MergeWithComponent(*other_subcomponents.at(i))) {
+    if (!subcomponents_[i]->MergeTokenEquivalentComponent(
+            *other_subcomponents.at(i))) {
      this_component_verification_score +=
          subcomponents_[i]->GetStructureVerificationScore();
      newer_component_verification_score +=
@@ -725,6 +752,113 @@ bool AddressComponent::MergeTokenEquivalentComponent(
  return true;
 }
+// Integrates the single additional token |token_value| into this component.
+// An atomic component takes the token as its own value, or appends it
+// separated by a space if it already has a value. A non-atomic component
+// stores the token in its first subcomponent with an empty value; if no
+// subcomponent is empty, the token is appended to the value of the first
+// subcomponent. Every value written here gets VerificationStatus::kParsed.
+void AddressComponent::ConsumeAdditionalToken(
+    const base::string16& token_value) {
+  if (IsAtomic()) {
+    if (GetValue().empty()) {
+      SetValue(token_value, VerificationStatus::kParsed);
+    } else {
+      SetValue(base::StrCat({GetValue(), base::ASCIIToUTF16(" "), token_value}),
+               VerificationStatus::kParsed);
+    }
+    return;
+  }
+  // Try the first free subcomponent.
+  for (auto* subcomponent : subcomponents_) {
+    if (subcomponent->GetValue().empty()) {
+      subcomponent->SetValue(token_value, VerificationStatus::kParsed);
+      return;
+    }
+  }
+  // Otherwise append the token to the value of the first subcomponent. The
+  // base of the new value must be the subcomponent's own value: using the
+  // value of this (parent) component would copy the tokens of all siblings
+  // into the first subcomponent.
+  AddressComponent* first_subcomponent = subcomponents_[0];
+  first_subcomponent->SetValue(
+      base::StrCat({first_subcomponent->GetValue(), base::ASCIIToUTF16(" "),
+                    token_value}),
+      VerificationStatus::kParsed);
+}
+// Merges |subset_component|, whose tokens are the tokens of this component
+// minus exactly one, into this component.
+// |token_comparison_result| must be the result of comparing the sorted tokens
+// of this component with those of |subset_component|: its status must be
+// SINGLE_TOKEN_SUPERSET and it must carry exactly one additional token.
+// Subcomponents are handled pairwise: the one holding the additional token is
+// kept or merged recursively, token-equivalent ones are merged directly, and
+// for the remaining ones the structure verification scores decide whether the
+// substructure of |subset_component| replaces the one of this component.
+// Returns true; in the current implementation the merge cannot fail.
+bool AddressComponent::MergeSubsetComponent(
+    const AddressComponent& subset_component,
+    const SortedTokenComparisonResult& token_comparison_result) {
+  DCHECK(token_comparison_result.status == SINGLE_TOKEN_SUPERSET);
+  DCHECK(token_comparison_result.additional_tokens.size() == 1);
+  const base::string16 token_to_consume =
+      token_comparison_result.additional_tokens.back().value;
+  int this_component_verification_score = 0;
+  int newer_component_verification_score = 0;
+  // Set once the subcomponent that accounts for the additional token is found.
+  bool found_subset_component = false;
+  const std::vector<AddressComponent*>& subset_subcomponents =
+      subset_component.Subcomponents();
+  // Both components are indexed in lockstep below.
+  DCHECK(subcomponents_.size() == subset_subcomponents.size());
+  // Indices of the subcomponents that could not be merged directly.
+  std::vector<size_t> unmerged_indices;
+  unmerged_indices.reserve(subcomponents_.size());
+  for (size_t i = 0; i < subcomponents_.size(); i++) {
+    DCHECK(subcomponents_[i]->GetStorageType() ==
+           subset_subcomponents.at(i)->GetStorageType());
+    AddressComponent* subcomponent = subcomponents_[i];
+    const AddressComponent* subset_subcomponent = subset_subcomponents.at(i);
+    // If the additional token is the whole value of this subcomponent and the
+    // subset has no value for it, just leave it in.
+    if (!found_subset_component &&
+        subcomponent->GetValue() == token_to_consume &&
+        subset_subcomponent->GetValue().empty()) {
+      found_subset_component = true;
+      continue;
+    }
+    SortedTokenComparisonResult subtoken_comparison_result =
+        CompareSortedTokens(subcomponent->GetSortedTokens(),
+                            subset_subcomponent->GetSortedTokens());
+    // Recursive case: the additional token lives inside this subcomponent.
+    if (!found_subset_component &&
+        subtoken_comparison_result.status == SINGLE_TOKEN_SUPERSET) {
+      found_subset_component = true;
+      subcomponent->MergeSubsetComponent(*subset_subcomponent,
+                                         subtoken_comparison_result);
+      continue;
+    }
+    // If the tokens are equivalent, they can directly be merged.
+    if (subtoken_comparison_result.status == MATCH) {
+      subcomponent->MergeTokenEquivalentComponent(*subset_subcomponent);
+      continue;
+    }
+    // Otherwise calculate the verification score.
+    this_component_verification_score +=
+        subcomponent->GetStructureVerificationScore();
+    newer_component_verification_score +=
+        subset_subcomponent->GetStructureVerificationScore();
+    unmerged_indices.push_back(i);
+  }
+  // If the total verification score of all unmerged components of the other
+  // component is equal or larger than the score of this component, use its
+  // subcomponents including their substructure for all unmerged components.
+  if (newer_component_verification_score >= this_component_verification_score) {
+    for (size_t i : unmerged_indices)
+      *subcomponents_[i] = *subset_subcomponents[i];
+    // The replaced substructure does not contain the additional token; put it
+    // back unless a subcomponent already accounts for it.
+    if (!found_subset_component)
+      this->ConsumeAdditionalToken(token_to_consume);
+  }
+  // In the current implementation it is always possible to merge.
+  // Once more tokens are supported this may change.
+  return true;
+}
 int AddressComponent::GetStructureVerificationScore() const {
  int result = 0;
  switch (GetVerificationStatus()) {

--- a/components/autofill/core/browser/data_model/autofill_structured_address_component.h
+++ b/components/autofill/core/browser/data_model/autofill_structured_address_component.h
@@ -20,6 +20,9 @@ class RE2;
 namespace autofill {
 namespace structured_address {
+struct AddressToken;
+struct SortedTokenComparisonResult;
 // Represents the validation status of value stored in the AutofillProfile.
 // The associated integer values used to store the verification code in SQL and
 // should not be modified.
@@ -269,7 +272,7 @@ class AddressComponent {
  // Returns a constant reference to the sorted canonicalized tokens of the
  // value of the component.
-  const std::vector<base::string16>& GetSortedTokens() const {
+  const std::vector<AddressToken>& GetSortedTokens() const {
    return sorted_normalized_tokens_;
  }
@@ -359,6 +362,17 @@ class AddressComponent {
  // Clears all parsed and formatted values.
  void ClearAllParsedAndFormattedValues();
+  // Merge a component that has exactly one token less.
+  bool MergeSubsetComponent(
+      const AddressComponent& subset_component,
+      const SortedTokenComparisonResult& token_comparison_result);
+  // Consumes an additional token into the most appropriate subcomponent.
+  // Can be implemented by the specific node types.
+  // The fall-back solution uses the first empty node.
+  // If no empty node is available, it appends the value to the first node.
+  virtual void ConsumeAdditionalToken(const base::string16& token_value);
 private:
  // Returns a reference to the constant root node of the tree.
  const AddressComponent& GetRootNode() const;
@@ -410,7 +424,7 @@ class AddressComponent {
  // meaning that it was converted to lower case and diacritics have been
  // removed. |value_| is tokenized by splitting the string by white spaces and
  // commas. It is calculated when |value_| is set.
-  std::vector<base::string16> sorted_normalized_tokens_;
+  std::vector<AddressToken> sorted_normalized_tokens_;
  // A pointer to the parent node. It is set to nullptr if the node is the root
  // node of the AddressComponent tree.

--- a/components/autofill/core/browser/data_model/autofill_structured_address_name_unittest.cc
+++ b/components/autofill/core/browser/data_model/autofill_structured_address_name_unittest.cc
@@ -9,8 +9,11 @@
 #include <string>
 #include <vector>
+#include "base/feature_list.h"
 #include "base/strings/utf_string_conversions.h"
+#include "base/test/scoped_feature_list.h"
 #include "components/autofill/core/browser/data_model/autofill_structured_address_utils.h"
+#include "components/autofill/core/common/autofill_features.h"
 #include "testing/gtest/include/gtest/gtest.h"
 using base::ASCIIToUTF16;
@@ -20,6 +23,38 @@ namespace structured_address {
 namespace {
+struct AddressComponentTestValue {  // One entry of a test fixture.
+  ServerFieldType type;  // The field type the entry refers to.
+  std::string value;  // UTF-8 value; the helpers convert it to UTF-16.
+  VerificationStatus status;  // Status that is set or expected for |type|.
+};
+struct AddressComponentTestValues {  // A list of test entries.
+  std::vector<AddressComponentTestValue> values;  // Processed in order.
+};
+// Writes every entry of |test_values| into |component| using
+// SetValueForTypeIfPossible(). Unless |finalize| is false, the tree of
+// |component| is completed afterwards via CompleteFullTree().
+void SetTestValues(AddressComponent* component,
+                   const AddressComponentTestValues& test_values,
+                   bool finalize = true) {
+  for (const AddressComponentTestValue& entry : test_values.values) {
+    const base::string16 utf16_value = base::UTF8ToUTF16(entry.value);
+    component->SetValueForTypeIfPossible(entry.type, utf16_value, entry.status);
+  }
+  if (!finalize)
+    return;
+  component->CompleteFullTree();
+}
+// Expects that, for every entry of |test_values|, |component| returns the
+// entry's value and verification status for the entry's type.
+// |test_values| is taken by const reference to avoid copying the whole list
+// of strings on every call.
+void VerifyTestValues(AddressComponent* component,
+                      const AddressComponentTestValues& test_values) {
+  for (const auto& test_value : test_values.values) {
+    EXPECT_EQ(component->GetValueForType(test_value.type),
+              base::UTF8ToUTF16(test_value.value));
+    EXPECT_EQ(component->GetVerificationStatusForType(test_value.type),
+              test_value.status);
+  }
+}
 // A test record that contains all entries of the hybrid-structure name tree.
 struct NameParserTestRecord {
  std::string full;
@@ -666,5 +701,133 @@ TEST(AutofillStructuredName, MigrationFromLegacyStructure_WithoutFullName) {
            VerificationStatus::kObserved);
 }
+TEST(AutofillStructuredName, MergeSubsetLastname) {
+  base::test::ScopedFeatureList scoped_feature;  // Enables subset merging.
+  scoped_feature.InitAndEnableFeature(
+      features::kAutofillEnableSupportForMergingSubsetNames);
+  NameFull name;  // Superset: last name "Anderson y Smith".
+  NameFull subset_name;  // Subset: same name tokens without "y".
+  AddressComponentTestValues name_values = {
+      .values =
+          {
+              {.type = NAME_FIRST,
+               .value = "Thomas",
+               .status = VerificationStatus::kObserved},
+              {.type = NAME_MIDDLE,
+               .value = "Neo",
+               .status = VerificationStatus::kObserved},
+              {.type = NAME_LAST,
+               .value = "Anderson y Smith",
+               .status = VerificationStatus::kObserved},
+          },
+  };
+  AddressComponentTestValues subset_name_values = {
+      .values =
+          {
+              {.type = NAME_FIRST,
+               .value = "Thomas",
+               .status = VerificationStatus::kObserved},
+              {.type = NAME_MIDDLE,
+               .value = "Neo",
+               .status = VerificationStatus::kObserved},
+              {.type = NAME_LAST_FIRST,
+               .value = "Anderson",
+               .status = VerificationStatus::kObserved},
+              {.type = NAME_LAST_SECOND,
+               .value = "Smith",
+               .status = VerificationStatus::kObserved},
+          },
+  };
+  AddressComponentTestValues expectation = {  // NOTE(review): never read.
+      .values =
+          {
+              {.type = NAME_FIRST,
+               .value = "Thomas",
+               .status = VerificationStatus::kObserved},
+              {.type = NAME_MIDDLE,
+               .value = "Neo",
+               .status = VerificationStatus::kObserved},
+              {.type = NAME_LAST_FIRST,
+               .value = "Anderson",
+               .status = VerificationStatus::kObserved},
+              {.type = NAME_LAST_CONJUNCTION,
+               .value = "y",
+               .status = VerificationStatus::kObserved},
+              {.type = NAME_LAST_SECOND,
+               .value = "Smith",
+               .status = VerificationStatus::kObserved},
+          },
+  };
+  SetTestValues(&name, name_values);
+  SetTestValues(&subset_name, subset_name_values);
+  EXPECT_TRUE(name.IsMergeableWithComponent(subset_name));
+  EXPECT_TRUE(name.MergeWithComponent(subset_name));
+  VerifyTestValues(&name, name_values);  // Checks |name_values| only.
+}
+TEST(AutofillStructuredName, MergeSubsetLastname2) {
+  base::test::ScopedFeatureList scoped_feature;  // Enables subset merging.
+  scoped_feature.InitAndEnableFeature(
+      features::kAutofillEnableSupportForMergingSubsetNames);
+  NameFull name;  // Superset: first, middle and last name.
+  NameFull subset_name;  // Subset: the same name without the middle name.
+  AddressComponentTestValues name_values = {
+      .values =
+          {
+              {.type = NAME_FIRST,
+               .value = "Thomas",
+               .status = VerificationStatus::kObserved},
+              {.type = NAME_MIDDLE,
+               .value = "Neo",
+               .status = VerificationStatus::kObserved},
+              {.type = NAME_LAST,
+               .value = "Anderson",
+               .status = VerificationStatus::kObserved},
+          },
+  };
+  AddressComponentTestValues subset_name_values = {
+      .values =
+          {
+              {.type = NAME_FIRST,
+               .value = "Thomas",
+               .status = VerificationStatus::kObserved},
+              {.type = NAME_LAST,
+               .value = "Anderson",
+               .status = VerificationStatus::kObserved},
+          },
+  };
+  AddressComponentTestValues expectation = {  // NOTE(review): never read.
+      .values =
+          {
+              {.type = NAME_FIRST,
+               .value = "Thomas",
+               .status = VerificationStatus::kObserved},
+              {.type = NAME_MIDDLE,
+               .value = "Neo",
+               .status = VerificationStatus::kObserved},
+              {.type = NAME_LAST,
+               .value = "Anderson",
+               .status = VerificationStatus::kObserved},
+          },
+  };
+  SetTestValues(&name, name_values);
+  SetTestValues(&subset_name, subset_name_values);
+  EXPECT_TRUE(name.IsMergeableWithComponent(subset_name));
+  EXPECT_TRUE(name.MergeWithComponent(subset_name));
+  VerifyTestValues(&name, name_values);  // Checks |name_values| only.
+}
 }  // namespace structured_address
 }  // namespace autofill
--- a/components/autofill/core/browser/data_model/autofill_structured_address_utils.cc
+++ b/components/autofill/core/browser/data_model/autofill_structured_address_utils.cc
@@ -26,6 +26,16 @@
 namespace autofill {
 namespace structured_address {
+SortedTokenComparisonResult::SortedTokenComparisonResult(
+    SortedTokenComparisonStatus status,
+    std::vector<AddressToken> additional_tokens)
+    : status(status), additional_tokens(additional_tokens) {}  // NOTE(review): copies the by-value argument; std::move would avoid the second copy.
+SortedTokenComparisonResult::~SortedTokenComparisonResult() = default;  // Defined out of line.
+SortedTokenComparisonResult::SortedTokenComparisonResult(
+    const SortedTokenComparisonResult& other) = default;  // Member-wise copy.
 bool StructuredNamesEnabled() {
  return base::FeatureList::IsEnabled(
      features::kAutofillEnableSupportForMoreStructureInNames);
@@ -241,41 +251,87 @@ bool AreStringTokenEquivalent(const base::string16& one,
  return AreSortedTokensEqual(TokenizeValue(one), TokenizeValue(other));
 }
-bool AreSortedTokensEqual(const std::vector<base::string16>& first,
+SortedTokenComparisonResult CompareSortedTokens(
-                          const std::vector<base::string16>& second) {
+    const std::vector<AddressToken>& first,
-  // It is assumed that the vectors are sorted.
+    const std::vector<AddressToken>& second) {
-  DCHECK(std::is_sorted(first.begin(), first.end()) &&
+  // Lambda to compare the normalized values of two AddressTokens.
-         std::is_sorted(second.begin(), second.end()));
+  auto cmp_normalized = [](const auto& a, const auto& b) {
-  // If there is a different number of tokens, it can't be a permutation.
+    return a.normalized_value < b.normalized_value;
-  if (first.size() != second.size())
+  };
-    return false;
-  // Return true if both vectors are component-wise equal.
+  // Verify that the two multi sets are sorted.
-  return std::equal(first.begin(), first.end(), second.begin());
+  DCHECK(std::is_sorted(first.begin(), first.end(), cmp_normalized) &&
+         std::is_sorted(second.begin(), second.end(), cmp_normalized));
+  bool is_supserset = std::includes(first.begin(), first.end(), second.begin(),
+                                    second.end(), cmp_normalized);
+  bool is_subset = std::includes(second.begin(), second.end(), first.begin(),
+                                 first.end(), cmp_normalized);
+  // If first is both a superset and a subset it is the same.
+  if (is_supserset && is_subset)
+    return SortedTokenComparisonResult(MATCH);
+  // If it is neither, both are distinct.
+  if (!is_supserset && !is_subset)
+    return SortedTokenComparisonResult(DISTINCT);
+  std::vector<AddressToken> additional_tokens;
+  // Collect the additional tokens from the superset.
+  // Note, that the superset property is already assured.
+  std::set_symmetric_difference(
+      first.begin(), first.end(), second.begin(), second.end(),
+      std::back_inserter(additional_tokens), cmp_normalized);
+  if (is_supserset) {
+    return SortedTokenComparisonResult(additional_tokens.size() == 1
+                                           ? SINGLE_TOKEN_SUPERSET
+                                           : MULTI_TOKEN_SUPERSET,
+                                       additional_tokens);
+  }
+  return SortedTokenComparisonResult(
+      additional_tokens.size() == 1 ? SINGLE_TOKEN_SUBSET : MULTI_TOKEN_SUBSET,
+      additional_tokens);
+}
+bool AreSortedTokensEqual(const std::vector<AddressToken>& first,
+                          const std::vector<AddressToken>& second) {
+  return CompareSortedTokens(first, second).status == MATCH;
 }
-std::vector<base::string16> TokenizeValue(const base::string16 value) {
+std::vector<AddressToken> TokenizeValue(const base::string16 value) {
-  // Canonicalize the value.
+  std::vector<AddressToken> tokens;
-  base::string16 cannonicalized_value = NormalizeValue(value);
+  int index = 0;
  // CJK names are a special case and are tokenized by character without the
  // separators.
-  std::vector<base::string16> tokens;
  if (HasCjkNameCharacteristics(base::UTF16ToUTF8(value))) {
    tokens.reserve(value.size());
    for (size_t i = 0; i < value.size(); i++) {
      base::string16 cjk_separators = base::UTF8ToUTF16("・·　 ");
      if (cjk_separators.find(value.substr(i, 1)) == base::string16::npos) {
-        tokens.emplace_back(value.substr(i, 1));
+        tokens.emplace_back(AddressToken{.value = value.substr(i, 1),
+                                         .normalized_value = value.substr(i, 1),
+                                         .position = index++});
      }
    }
  } else {
    // Split it by white spaces and commas into non-empty values.
-    tokens =
+    for (const auto& token :
-        base::SplitString(cannonicalized_value, base::ASCIIToUTF16(", "),
+         base::SplitString(value, base::ASCIIToUTF16(", "),
-                          base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
+                           base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY)) {
+      tokens.emplace_back(
+          AddressToken{.value = token,
+                       .normalized_value = NormalizeValue(token),
+                       .position = index++});
+    }
  }
-  // Sort the tokens lexicographically.
+  // Sort the tokens lexicographically by their normalized value.
-  std::sort(tokens.begin(), tokens.end());
+  std::sort(tokens.begin(), tokens.end(), [](const auto& a, const auto& b) {
+    return a.normalized_value < b.normalized_value;
+  });
  return tokens;
 }

--- a/components/autofill/core/browser/data_model/autofill_structured_address_utils.h
+++ b/components/autofill/core/browser/data_model/autofill_structured_address_utils.h
@@ -22,6 +22,15 @@
 namespace autofill {
 namespace structured_address {
+struct AddressToken {  // One token produced by TokenizeValue().
+  // The token as it appears in the original string.
+  base::string16 value;
+  // The value used for sorting and comparing tokens. TokenizeValue() sets it
+  // to NormalizeValue(|value|), except for CJK names where it equals |value|.
+  base::string16 normalized_value;
+  // The zero-based position of the token in the original string, assigned
+  // before the tokens are sorted.
+  int position;
+};
 enum class RegEx;
 // Enum to express the few quantifiers needed to parse values.
@@ -35,6 +44,40 @@ enum MatchQuantifier {
  MATCH_LAZY_OPTIONAL,
 };
+// The result status of comparing two sets of sorted tokens.
+enum SortedTokenComparisonStatus {
+  // The tokens are neither the same nor super/sub sets.
+  DISTINCT,
+  // The tokens match exactly.
+  MATCH,
+  // The first tokens are a superset of the second with only one additional
+  // element.
+  SINGLE_TOKEN_SUPERSET,
+  // The first tokens are a subset of the second, which has only one additional
+  // element.
+  SINGLE_TOKEN_SUBSET,
+  // The first tokens are a superset of the second with multiple additional
+  // elements.
+  MULTI_TOKEN_SUPERSET,
+  // The first tokens are a subset of the second, which has multiple additional
+  // elements.
+  MULTI_TOKEN_SUBSET
+};
+// The result of comparing two sets of sorted tokens: the comparison status
+// and the tokens that are contained in only one of the two sets.
+struct SortedTokenComparisonResult {
+  explicit SortedTokenComparisonResult(
+      SortedTokenComparisonStatus status,
+      std::vector<AddressToken> additional_tokens = {});
+  ~SortedTokenComparisonResult();
+  SortedTokenComparisonResult(const SortedTokenComparisonResult& other);
+  // The status of the token comparison.
+  SortedTokenComparisonStatus status = DISTINCT;
+  // The additional elements of the superset. CompareSortedTokens() leaves
+  // this empty for the MATCH and DISTINCT statuses.
+  std::vector<AddressToken> additional_tokens{};
+};
 // Options for capturing a named group using the
 // |CaptureTypeWithPattern(...)| functions.
 struct CaptureOptions {
@@ -176,8 +219,8 @@ std::string CaptureTypeWithPattern(const ServerFieldType& type,
 base::string16 NormalizeValue(const base::string16& value);
 // Returns true if both vectors contain the same tokens in the same order.
-bool AreSortedTokensEqual(const std::vector<base::string16>& first,
+bool AreSortedTokensEqual(const std::vector<AddressToken>& first,
-                          const std::vector<base::string16>& second);
+                          const std::vector<AddressToken>& second);
 // Returns true if both strings contain the same tokens after normalization.
 bool AreStringTokenEquivalent(const base::string16& one,
@@ -186,7 +229,13 @@ bool AreStringTokenEquivalent(const base::string16& one,
 // Returns a sorted vector containing the tokens of |value| after |value| was
 // canonicalized. |value| is tokenized by splitting it by white spaces and
 // commas.
-std::vector<base::string16> TokenizeValue(const base::string16 value);
+std::vector<AddressToken> TokenizeValue(const base::string16 value);
+// Compares two vectors of AddressTokens, both sorted by their normalized
+// value, and returns the SortedTokenComparisonResult. The status describes
+// |first| relative to |second|, e.g. SINGLE_TOKEN_SUPERSET means that |first|
+// contains all tokens of |second| plus one additional token.
+SortedTokenComparisonResult CompareSortedTokens(
+    const std::vector<AddressToken>& first,
+    const std::vector<AddressToken>& second);
 }  // namespace structured_address

--- a/components/autofill/core/browser/data_model/autofill_structured_address_utils_unittest.cc
+++ b/components/autofill/core/browser/data_model/autofill_structured_address_utils_unittest.cc
@@ -17,6 +17,13 @@
 namespace autofill {
 namespace structured_address {
+// Two AddressTokens are equal if their position, their original value and
+// their normalized value are all equal.
+bool operator==(const AddressToken& lhs, const AddressToken& rhs) {
+  if (lhs.position != rhs.position)
+    return false;
+  if (lhs.value != rhs.value)
+    return false;
+  return lhs.normalized_value == rhs.normalized_value;
+}
 // Regular expression with named capture groups for parsing US-style names.
 char kFirstMiddleLastRe[] =
    "^(?P<NAME_FULL>((?P<NAME_FIRST>\\w+)\\s)?"
@@ -210,16 +217,19 @@ TEST(AutofillStructuredAddressUtils, CaptureTypeWithPattern) {
 }
 TEST(AutofillStructuredAddressUtils, TokenizeValue) {
-  std::vector<base::string16> expected_tokens = {
+  std::vector<AddressToken> expected_tokens = {
-      base::ASCIIToUTF16("and"), base::ASCIIToUTF16("anotherone"),
+      {base::ASCIIToUTF16("AnD"), base::ASCIIToUTF16("and"), 1},
-      base::ASCIIToUTF16("value")};
+      {base::ASCIIToUTF16("anotherOne"), base::ASCIIToUTF16("anotherone"), 2},
+      {base::ASCIIToUTF16("valUe"), base::ASCIIToUTF16("value"), 0}};
  EXPECT_EQ(TokenizeValue(base::ASCIIToUTF16("  valUe AnD    anotherOne")),
            expected_tokens);
-  std::vector<base::string16> expected_cjk_tokens = {base::UTF8ToUTF16("영"),
+  std::vector<AddressToken> expected_cjk_tokens = {
-                                                     base::UTF8ToUTF16("이"),
+      {base::UTF8ToUTF16("영"), base::UTF8ToUTF16("영"), 1},
-                                                     base::UTF8ToUTF16("호")};
+      {base::UTF8ToUTF16("이"), base::UTF8ToUTF16("이"), 0},
+      {base::UTF8ToUTF16("호"), base::UTF8ToUTF16("호"), 2}};
  EXPECT_EQ(TokenizeValue(base::UTF8ToUTF16("이영 호")), expected_cjk_tokens);
  EXPECT_EQ(TokenizeValue(base::UTF8ToUTF16("이・영호")), expected_cjk_tokens);
  EXPECT_EQ(TokenizeValue(base::UTF8ToUTF16("이영 호")), expected_cjk_tokens);
@@ -230,25 +240,5 @@ TEST(AutofillStructuredAddressUtils, NormalizeValue) {
            base::UTF8ToUTF16("muller orber"));
 }
-TEST(AutofillStructuredAddressUtils, AreSortedTokensEqual) {
-  EXPECT_FALSE(AreSortedTokensEqual(
-      {base::ASCIIToUTF16("aaaa"), base::ASCIIToUTF16("bbb")},
-      {base::ASCIIToUTF16("aaa"), base::ASCIIToUTF16("bbb")}));
-  EXPECT_TRUE(AreSortedTokensEqual(
-      {base::ASCIIToUTF16("aaa"), base::ASCIIToUTF16("bbb")},
-      {base::ASCIIToUTF16("aaa"), base::ASCIIToUTF16("bbb")}));
-  EXPECT_FALSE(AreSortedTokensEqual(
-      {base::ASCIIToUTF16("aaa")},
-      {base::ASCIIToUTF16("aaa"), base::ASCIIToUTF16("bbb")}));
-}
-TEST(AutofillStructuredAddressUtils, AreStringTokenEquivalent) {
-  EXPECT_TRUE(AreStringTokenEquivalent(base::ASCIIToUTF16("A B C"),
-                                       base::ASCIIToUTF16("A C B")));
-  EXPECT_FALSE(AreStringTokenEquivalent(base::ASCIIToUTF16("A Bb C"),
-                                        base::ASCIIToUTF16("A C B")));
-}
 }  // namespace structured_address
 }  // namespace autofill
--- a/components/autofill/core/common/autofill_features.cc
+++ b/components/autofill/core/common/autofill_features.cc
@@ -97,6 +97,12 @@ const base::Feature kAutofillEnableSupportForMoreStructureInNames{
    "AutofillEnableSupportForMoreStructureInNames",
    base::FEATURE_DISABLED_BY_DEFAULT};
+// Controls if Autofill supports merging subset names.
+// TODO(crbug.com/1098943): Remove once launched.
+const base::Feature kAutofillEnableSupportForMergingSubsetNames{
+    "AutofillEnableSupportForMergingSubsetNames",
+    base::FEATURE_DISABLED_BY_DEFAULT};
 // Controls whether or not a minimum number of fields is required before
 // heuristic field type prediction is run for a form.
 const base::Feature kAutofillEnforceMinRequiredFieldsForHeuristics{

--- a/components/autofill/core/common/autofill_features.h
+++ b/components/autofill/core/common/autofill_features.h
@@ -35,6 +35,7 @@ extern const base::Feature kAutofillEnableAugmentedPhoneCountryCode;
 extern const base::Feature kAutofillEnableCompanyName;
 extern const base::Feature kAutofillEnableHideSuggestionsUI;
 extern const base::Feature kAutofillEnableSupportForMoreStructureInNames;
+extern const base::Feature kAutofillEnableSupportForMergingSubsetNames;
 extern const base::Feature kAutofillEnforceMinRequiredFieldsForHeuristics;
 extern const base::Feature kAutofillEnforceMinRequiredFieldsForQuery;
 extern const base::Feature kAutofillEnforceMinRequiredFieldsForUpload;