Commit 19e44bf3 authored by Matthias Körber's avatar Matthias Körber Committed by Commit Bot

[Autofill][SlimShady] Utils for hybrid structured address components.

Currently, the utils only contain functions to parse/match regular
expressions and a cache for compiled regular expressions.

I kept the RE2 engine since the support for named capture groups
is very limited in ICU: You can have named capture groups but there
seems to be no method to acquire the group names.

Change-Id: I664c4ce548d35971e43881c70a3849a5b09bbf33
Bug: 1099202
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2279980
Commit-Queue: Matthias Körber <koerber@google.com>
Reviewed-by: Dominic Battré <battre@chromium.org>
Cr-Commit-Position: refs/heads/master@{#789451}
parent c2c5230e
......@@ -88,6 +88,8 @@ jumbo_static_library("browser") {
"data_model/autofill_profile.h",
"data_model/autofill_profile_comparator.cc",
"data_model/autofill_profile_comparator.h",
"data_model/autofill_structured_address_utils.cc",
"data_model/autofill_structured_address_utils.h",
"data_model/contact_info.cc",
"data_model/contact_info.h",
"data_model/credit_card.cc",
......@@ -566,6 +568,7 @@ source_set("unit_tests") {
"data_model/autofill_data_model_unittest.cc",
"data_model/autofill_profile_comparator_unittest.cc",
"data_model/autofill_profile_unittest.cc",
"data_model/autofill_structured_address_utils_unittest.cc",
"data_model/contact_info_unittest.cc",
"data_model/credit_card_unittest.cc",
"data_model/phone_number_unittest.cc",
......
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
#include "components/autofill/core/browser/data_model/autofill_structured_address_utils.h"
#include <algorithm>
#include <map>
#include <string>
#include <utility>
#include "base/check.h"
#include "base/debug/alias.h"
#include "base/debug/dump_without_crashing.h"
#include "base/strings/strcat.h"
namespace autofill {
namespace structured_address {
// Default construction: the expression map and the lock are
// default-initialized, so a new cache holds no compiled expressions.
Re2ExpressionCache::Re2ExpressionCache() = default;
// static
Re2ExpressionCache* Re2ExpressionCache::Instance() {
  // A function-local static wrapped in |base::NoDestructor|: every call hands
  // out a pointer to the same cache object.
  static base::NoDestructor<Re2ExpressionCache> instance;
  return instance.get();
}
// Looks up the compiled expression for |pattern|, compiling and caching it on
// the first request. The returned pointer is owned by the cache. Note that an
// expression that failed to compile is cached as well; callers must check
// |ok()| before using the result.
const RE2* Re2ExpressionCache::GetExpression(const std::string& pattern) {
  // The map is shared state, so the whole lookup-or-insert sequence runs
  // under the lock.
  base::AutoLock lock(lock_);

  const auto cached = expression_map_.find(pattern);
  if (cached == expression_map_.end()) {
    // Cache miss: compile the pattern and store it. The insertion must
    // succeed because the key was just verified to be absent.
    auto inserted =
        expression_map_.emplace(pattern, BuildExpressionFromPattern(pattern));
    DCHECK(inserted.second);
    return inserted.first->second.get();
  }

  // Cache hit: hand out the already compiled expression.
  return cached->second.get();
}
std::unique_ptr<const RE2> BuildExpressionFromPattern(std::string pattern) {
RE2::Options opt;
opt.set_case_sensitive(false);
auto expression = std::make_unique<const RE2>(pattern, opt);
if (!expression->ok()) {
DEBUG_ALIAS_FOR_CSTR(pattern_copy, pattern.c_str(), 128);
base::debug::DumpWithoutCrashing();
}
return expression;
}
bool ParseValueByRegularExpression(
const std::string& value,
const std::string& pattern,
std::map<std::string, std::string>* result_map) {
DCHECK(result_map);
const RE2* expression =
Re2ExpressionCache::Instance()->GetExpression(pattern);
return ParseValueByRegularExpression(value, expression, result_map);
}
bool ParseValueByRegularExpression(
const std::string& value,
const RE2* expression,
std::map<std::string, std::string>* result_map) {
if (!expression || !expression->ok())
return false;
// Get the number of capturing groups in the expression.
// Note, the capturing group for the full match is not counted.
size_t number_of_capturing_groups = expression->NumberOfCapturingGroups() + 1;
// Create result vectors to get the matches for the capturing groups.
std::vector<std::string> results(number_of_capturing_groups);
std::vector<RE2::Arg> match_results(number_of_capturing_groups);
std::vector<RE2::Arg*> match_results_ptr(number_of_capturing_groups);
// Note, the capturing group for the full match is not counted by
// |NumberOfCapturingGroups|.
for (size_t i = 0; i < number_of_capturing_groups; i++) {
match_results[i] = &results[i];
match_results_ptr[i] = &match_results[i];
}
// One capturing group is not counted since it holds the full match.
if (!RE2::FullMatchN(value, *expression, match_results_ptr.data(),
number_of_capturing_groups - 1))
return false;
// If successful, write the values into the results map.
// Note, the capturing group for the full match creates an off-by-one scenario
// in the indexing.
for (auto named_group : expression->NamedCapturingGroups())
(*result_map)[named_group.first] =
std::move(results.at(named_group.second - 1));
return true;
}
bool IsPartialMatch(const std::string& value, const std::string& pattern) {
const RE2* expression =
Re2ExpressionCache::Instance()->GetExpression(pattern);
if (!expression || !expression->ok())
return false;
return RE2::PartialMatch(value, *expression);
}
// Scans |value| from left to right for successive, non-overlapping matches of
// |pattern| and returns, for every match, the text captured by the single
// output argument passed to |FindAndConsume| (the pattern's first capturing
// group, e.g. "(a.a)").
//
// Returns an empty vector if |pattern| does not compile or nothing matches.
//
// A match that consumes no input (for example with a pattern such as "(a*)")
// is recorded once; the scan then skips one UTF-8 code point before searching
// again. Without this step |FindAndConsume| would keep reporting the same
// empty match at the same position forever.
std::vector<std::string> GetAllPartialMatches(const std::string& value,
                                              const std::string& pattern) {
  const RE2* expression =
      Re2ExpressionCache::Instance()->GetExpression(pattern);
  if (!expression || !expression->ok())
    return {};

  re2::StringPiece input(value);
  std::string match;
  std::vector<std::string> matches;
  while (true) {
    const size_t remaining_before = input.size();
    if (!re2::RE2::FindAndConsume(&input, *expression, &match))
      break;
    matches.push_back(match);

    // The usual case: the match consumed input, so the next search starts
    // behind it.
    if (input.size() != remaining_before)
      continue;

    // Nothing was consumed. At the end of the input there is nothing left to
    // search; otherwise force progress by dropping one code point, i.e. the
    // lead byte plus any UTF-8 continuation bytes (0b10xxxxxx) following it.
    if (input.empty())
      break;
    input.remove_prefix(1);
    while (!input.empty() &&
           (static_cast<unsigned char>(input[0]) & 0xC0) == 0x80) {
      input.remove_prefix(1);
    }
  }
  return matches;
}
// Collects the names of all placeholders of the form ${NAME} that occur in
// |value|. Only the name inside the braces is returned for each placeholder.
std::vector<std::string> ExtractAllPlaceholders(const std::string& value) {
  // Matches "${", one or more word characters (captured), then "}".
  static constexpr char kPlaceholderPattern[] = "\\${([\\w]+)}";
  return GetAllPartialMatches(value, kPlaceholderPattern);
}
// Wraps |value| into the placeholder syntax, i.e. returns "${" + value + "}".
std::string GetPlaceholderToken(const std::string& value) {
  std::string token = base::StrCat({"${", value, "}"});
  return token;
}
} // namespace structured_address
} // namespace autofill
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_UTILS_H_
#define COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_UTILS_H_
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "base/macros.h"
#include "base/no_destructor.h"
#include "base/synchronization/lock.h"
#include "third_party/re2/src/re2/re2.h"
namespace autofill {
namespace structured_address {
// A cache for compiled RE2 regular expressions, keyed by the pattern string.
//
// The cache is accessed through the singleton returned by |Instance()|. All
// accesses to the underlying map are serialized by |lock_|. The class offers
// no way to remove an entry once it has been inserted.
class Re2ExpressionCache {
 public:
  // The cache is neither copyable nor destructible; the only instance is the
  // one created inside |Instance()|.
  Re2ExpressionCache& operator=(const Re2ExpressionCache&) = delete;
  Re2ExpressionCache(const Re2ExpressionCache&) = delete;
  ~Re2ExpressionCache() = delete;

  // Returns a singleton instance.
  static Re2ExpressionCache* Instance();

  // Returns a pointer to a constant compiled expression that matches |pattern|
  // case-insensitively. The expression is compiled on the first request and
  // served from the cache afterwards. The pointer is owned by the cache.
  const RE2* GetExpression(const std::string& pattern);

#ifdef UNIT_TEST
  // Returns true if the compiled regular expression corresponding to |pattern|
  // is cached.
  bool IsExpressionCachedForTesting(const std::string& pattern) {
    // Take the same lock as |GetExpression| so that the map is never read
    // while another thread is inserting into it.
    base::AutoLock lock(lock_);
    return expression_map_.count(pattern) > 0;
  }
#endif

 private:
  Re2ExpressionCache();

  // Since the constructor is private, |base::NoDestructor| must be friend to be
  // allowed to construct the cache.
  friend class base::NoDestructor<Re2ExpressionCache>;

  // Stores a compiled regular expression keyed by its corresponding |pattern|.
  std::map<std::string, std::unique_ptr<const RE2>> expression_map_;

  // A lock to prevent concurrent access to the map.
  base::Lock lock_;
};
// Parses |value| with a regular expression defined by |pattern|. The compiled
// expression for |pattern| is obtained from |Re2ExpressionCache|.
// Returns true on success, meaning that the expression matches |value| in
// full. Returns false if |pattern| does not compile or does not fully match.
// The matching results are written into the supplied |result_map|, keyed by the
// name of the capture group with the captured substrings as the value.
// |result_map| must not be null.
bool ParseValueByRegularExpression(
const std::string& value,
const std::string& pattern,
std::map<std::string, std::string>* result_map);
// Same as above, but accepts a compiled regular expression instead of the
// pattern. Returns false if |expression| is null or did not compile
// successfully.
bool ParseValueByRegularExpression(
const std::string& value,
const RE2* expression,
std::map<std::string, std::string>* result_map);
// Returns a compiled case-insensitive regular expression for |pattern|.
// An expression object is returned even if |pattern| is invalid; callers must
// check |ok()| on the result before using it for matching.
std::unique_ptr<const RE2> BuildExpressionFromPattern(std::string pattern);
// Returns true if |pattern| matches anywhere in |value|. Returns false if
// |pattern| does not compile.
bool IsPartialMatch(const std::string& value, const std::string& pattern);
// Returns a vector that contains all partial matches of |pattern| in |value|.
// Returns an empty vector if |pattern| does not compile.
std::vector<std::string> GetAllPartialMatches(const std::string& value,
const std::string& pattern);
// Extracts all placeholders of the format ${PLACEHOLDER} in |value|. Each
// returned entry is the placeholder name without the surrounding "${" and "}".
std::vector<std::string> ExtractAllPlaceholders(const std::string& value);
// Returns |value| as a placeholder token: ${value}.
std::string GetPlaceholderToken(const std::string& value);
} // namespace structured_address
} // namespace autofill
#endif // COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_UTILS_H_
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/autofill/core/browser/data_model/autofill_structured_address_utils.h"
#include <stddef.h>
#include <map>
#include <string>
#include <vector>
#include "base/test/gtest_util.h"
#include "testing/gtest/include/gtest/gtest.h"
namespace autofill {
namespace structured_address {
// Regular expression with named capture groups for parsing US-style names:
// an optional first name, an optional (possibly multi-word) middle name and a
// mandatory last name. NAME_FULL spans the whole value.
// Declared constexpr so that the pattern is immutable and has internal
// linkage instead of being a writable global.
constexpr char kFirstMiddleLastRe[] =
    "^(?P<NAME_FULL>((?P<NAME_FIRST>\\w+)\\s)?"
    "((?P<NAME_MIDDLE>(\\w+(?:\\s+\\w+)*))\\s)??"
    "(?P<NAME_LAST>\\w+))$";
// Verifies that a fully matching value is split into its named groups, and
// that a non-matching value leaves the result map untouched.
TEST(AutofillStructuredAddressUtils, TestParseValueByRegularExpression) {
  const std::string pattern = kFirstMiddleLastRe;

  // Each case lists the full name to parse and the expected group contents.
  struct NameCase {
    const char* full;
    const char* first;
    const char* middle;
    const char* last;
  };
  const NameCase kCases[] = {
      // Several middle names.
      {"first middle1 middle2 middle3 last", "first",
       "middle1 middle2 middle3", "last"},
      // Exactly one middle name.
      {"first middle1 last", "first", "middle1", "last"},
      // No middle name.
      {"first last", "first", "", "last"},
      // Only a last name.
      {"last", "", "", "last"},
  };

  for (const NameCase& name_case : kCases) {
    const std::string full_name = name_case.full;
    std::map<std::string, std::string> parsed;
    EXPECT_TRUE(ParseValueByRegularExpression(full_name, pattern, &parsed));
    EXPECT_EQ(parsed["NAME_FULL"], full_name);
    EXPECT_EQ(parsed["NAME_FIRST"], name_case.first);
    EXPECT_EQ(parsed["NAME_MIDDLE"], name_case.middle);
    EXPECT_EQ(parsed["NAME_LAST"], name_case.last);
  }

  // An empty name cannot be parsed and must not produce any results.
  const std::string empty_name;
  std::map<std::string, std::string> parsed;
  EXPECT_FALSE(ParseValueByRegularExpression(empty_name, pattern, &parsed));
  EXPECT_EQ(parsed.size(), 0u);
}
// Verifies that a value which merely contains a match for the expression, but
// is not matched by it in full, is rejected by the parser.
TEST(AutofillStructuredAddressUtils,
     TestParseValueByRegularExpression_OnlyPartialMatch) {
  // A well-formed named group. The previous pattern "(!<GROUP>this)" was a
  // plain literal that did not occur in |value| at all, so the test never
  // reached the "partial but not full match" situation it is named after.
  std::string regex = "(?P<GROUP>this)";
  std::string value = "this is missing";
  std::map<std::string, std::string> result_map;

  // The expression is found inside the value ...
  EXPECT_TRUE(IsPartialMatch(value, regex));
  // ... but does not cover it completely, so parsing fails and nothing is
  // written into the result map.
  EXPECT_FALSE(ParseValueByRegularExpression(value, regex, &result_map));
  EXPECT_TRUE(result_map.empty());
}
// Verifies that parsing fails for a pattern that does not compile, both when
// the pattern is passed as a string and when it is passed pre-compiled.
TEST(AutofillStructuredAddressUtils,
     TestParseValueByRegularExpression_InvalidExpression) {
  const std::string broken_pattern = "(!<INVALID";
  const std::string input = "first middle1 middle2 middle3 last";
  std::map<std::string, std::string> parsed;

  // Pattern overload.
  EXPECT_FALSE(ParseValueByRegularExpression(input, broken_pattern, &parsed));

  // Compiled-expression overload.
  const std::unique_ptr<const RE2> compiled =
      BuildExpressionFromPattern(broken_pattern);
  EXPECT_FALSE(ParseValueByRegularExpression(input, compiled.get(), &parsed));
}
// Verifies that passing a null result map trips the DCHECK in the pattern
// overload of the parser.
TEST(AutofillStructuredAddressUtils,
     TestParseValueByRegularExpression_UnintializedResultMap) {
  const std::string pattern = "(exp)";
  const std::string input = "first middle1 middle2 middle3 last";
  std::map<std::string, std::string>* const null_map = nullptr;
  ASSERT_DCHECK_DEATH(ParseValueByRegularExpression(input, pattern, null_map));
}
// Verifies partial matching: an unanchored pattern is found inside the value,
// whereas the same pattern anchored at both ends is not.
TEST(AutofillStructuredAddressUtils, TestIsPartialMatch) {
  const std::string haystack = "123 sdf 123";
  EXPECT_TRUE(IsPartialMatch(haystack, "sdf"));
  EXPECT_FALSE(IsPartialMatch(haystack, "^sdf$"));
}
// Verifies that a pattern that does not compile never yields a match.
TEST(AutofillStructuredAddressUtils, TestIsPartialMatch_InvalidExpression) {
  const std::string haystack = "123 sdf 123";
  const std::string broken_pattern = "(!<sdf";
  EXPECT_FALSE(IsPartialMatch(haystack, broken_pattern));
}
// Verifies that an expression enters the cache when it is first requested.
TEST(AutofillStructuredAddressUtils, TestExpressionCaching) {
  const std::string pattern = "(?P<SOME_EXPRESSION>.)";
  Re2ExpressionCache* const cache = Re2ExpressionCache::Instance();

  // Before the first request the pattern is unknown to the cache.
  EXPECT_FALSE(cache->IsExpressionCachedForTesting(pattern));

  // Requesting the expression compiles and stores it.
  cache->GetExpression(pattern);
  EXPECT_TRUE(cache->IsExpressionCachedForTesting(pattern));
}
// Verifies that all successive, non-overlapping matches are returned in order.
TEST(AutofillStructuredAddressUtils, TestGetAllPartialMatches) {
  const std::string haystack = "abaacaada";
  const std::string pattern = "(a.a)";
  const std::vector<std::string> expected_matches = {"aba", "aca", "ada"};

  EXPECT_TRUE(IsPartialMatch(haystack, pattern));
  EXPECT_EQ(GetAllPartialMatches(haystack, pattern), expected_matches);
}
// Verifies that a pattern that does not compile produces no matches at all.
TEST(AutofillStructuredAddressUtils, TestGetAllPartialMatches_InvalidPattern) {
  const std::string haystack = "abaacaada";
  const std::string broken_pattern = "(a.a";
  const std::vector<std::string> no_matches;

  EXPECT_FALSE(IsPartialMatch(haystack, broken_pattern));
  EXPECT_EQ(GetAllPartialMatches(haystack, broken_pattern), no_matches);
}
// Verifies extraction when the value consists of a single placeholder only.
TEST(AutofillStructuredAddressUtils,
     TestExtractAllPlaceholders_Isolated_Placeholder) {
  const std::vector<std::string> expected_names = {"HOLDER1"};
  const std::string text = "${HOLDER1}";
  EXPECT_EQ(ExtractAllPlaceholders(text), expected_names);
}
// Verifies extraction of a placeholder that is surrounded by regular text.
TEST(AutofillStructuredAddressUtils,
     TestExtractAllPlaceholders_Placeholder_In_Text) {
  const std::vector<std::string> expected_names = {"HOLDER1"};
  const std::string text = "Some ${HOLDER1} Text";
  EXPECT_EQ(ExtractAllPlaceholders(text), expected_names);
}
// Verifies that several placeholders are returned in order of appearance.
TEST(AutofillStructuredAddressUtils,
     TestExtractAllPlaceholders_Multiple_Placeholders_In_Text) {
  const std::vector<std::string> expected_names = {"HOLDER1", "HOLDER2"};
  const std::string text = "Some ${HOLDER1} Text ${HOLDER2}";
  EXPECT_EQ(ExtractAllPlaceholders(text), expected_names);
}
// Verifies that stray closing braces do not produce additional placeholders.
TEST(AutofillStructuredAddressUtils, TestExtractAllPlaceholders_Broken_Syntax) {
  const std::vector<std::string> expected_names = {"HOLDER1"};
  const std::string text = "Some ${HOLDER1} }} ";
  EXPECT_EQ(ExtractAllPlaceholders(text), expected_names);
}
// Verifies that for nested placeholders only the innermost, well-formed one is
// extracted.
TEST(AutofillStructuredAddressUtils,
     TestExtractAllPlaceholders_Nested_Placeholders) {
  const std::vector<std::string> expected_names = {"INANHOLDER"};
  const std::string text = "Some ${HOLDER${INANHOLDER}} }} ";
  EXPECT_EQ(ExtractAllPlaceholders(text), expected_names);
}
// Verifies that a name is wrapped into the ${...} placeholder syntax.
TEST(AutofillStructuredAddressUtils, TestGetPlaceholderToken) {
  const std::string name = "VAR";
  EXPECT_EQ("${VAR}", GetPlaceholderToken(name));
}
} // namespace structured_address
} // namespace autofill
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment