Commit f8db21ff authored by Matthias Körber, committed by Commit Bot

[Autofill][Leipzig] Fallback parsing for address lines

This CL improves parsing of multi-line addresses.
If the full street address cannot be parsed, try to parse only the first
line into the street name and house number.

Additionally, this CL adds assignment of the
ADDRESS_HOME_STREET_AND_DEPENDENT_STREET_NAME token.

Change-Id: I0e377095834a722a5f9971e77d67a41a43b5a398
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2438392
Commit-Queue: Matthias Körber <koerber@google.com>
Reviewed-by: Christoph Schwering <schwering@google.com>
Cr-Commit-Position: refs/heads/master@{#811969}
parent 02611dfd
...@@ -117,6 +117,21 @@ StreetAddress::GetParseRegularExpressionsByRelevance() const { ...@@ -117,6 +117,21 @@ StreetAddress::GetParseRegularExpressionsByRelevance() const {
RegEx::kParseStreetNameHouseNumberSuffixedFloor)}; RegEx::kParseStreetNameHouseNumberSuffixedFloor)};
} }
// Line-wise fallback for parsing a multi-line street address: only the first
// address line is matched against the parse expressions, in the order returned
// by GetParseRegularExpressionsByRelevance(). The method stops at the first
// expression that parses successfully and does nothing if there are fewer than
// two address lines.
void StreetAddress::ParseValueAndAssignSubcomponentsByFallbackMethod() {
  // There is no point in doing a line-wise approach if there aren't multiple
  // lines.
  if (address_lines_.size() < 2)
    return;

  // Try to parse the address using only the first line.
  const auto& first_line = address_lines_.at(0);
  for (const auto* parse_expression : GetParseRegularExpressionsByRelevance()) {
    // Skip unset expressions, mirroring the guard applied when the complete
    // value is parsed by regular expressions.
    if (!parse_expression)
      continue;
    if (ParseValueAndAssignSubcomponentsByRegularExpression(first_line,
                                                            parse_expression)) {
      return;
    }
  }
}
bool StreetAddress::HasNewerValuePrecendenceInMerging( bool StreetAddress::HasNewerValuePrecendenceInMerging(
const AddressComponent& newer_component) const { const AddressComponent& newer_component) const {
// If the newer component has a better verification status, use the newer one. // If the newer component has a better verification status, use the newer one.
......
...@@ -130,6 +130,9 @@ class StreetAddress : public AddressComponentWithRewriter { ...@@ -130,6 +130,9 @@ class StreetAddress : public AddressComponentWithRewriter {
// Recalculates the address line after an assignment. // Recalculates the address line after an assignment.
void PostAssignSanitization() override; void PostAssignSanitization() override;
// Apply line-wise parsing of the street address as a fallback method.
// Only the first address line is parsed into subcomponents; the method does
// nothing if the address has fewer than two lines.
void ParseValueAndAssignSubcomponentsByFallbackMethod() override;
protected: protected:
// Implements support for getting the value of the individual address lines. // Implements support for getting the value of the individual address lines.
bool ConvertAndGetTheValueForAdditionalFieldTypeName( bool ConvertAndGetTheValueForAdditionalFieldTypeName(
......
...@@ -406,28 +406,37 @@ bool AddressComponent::ParseValueAndAssignSubcomponentsByRegularExpressions() { ...@@ -406,28 +406,37 @@ bool AddressComponent::ParseValueAndAssignSubcomponentsByRegularExpressions() {
for (const auto* parse_expression : GetParseRegularExpressionsByRelevance()) { for (const auto* parse_expression : GetParseRegularExpressionsByRelevance()) {
if (!parse_expression) if (!parse_expression)
continue; continue;
std::map<std::string, std::string> result_map; if (ParseValueAndAssignSubcomponentsByRegularExpression(GetValue(),
if (ParseValueByRegularExpression(base::UTF16ToUTF8(GetValue()), parse_expression))
parse_expression, &result_map)) {
// Parsing was successful and results from the result map can be written
// to the structure.
for (const auto& result_entry : result_map) {
std::string field_type = result_entry.first;
base::string16 field_value = base::UTF8ToUTF16(result_entry.second);
// Do not reassign the value of this node.
if (field_type == GetStorageTypeName())
continue;
// crbug.com(1113617): Honorifics are temporally disabled.
if (field_type == AutofillType(NAME_HONORIFIC_PREFIX).ToString())
continue;
bool success = SetValueForTypeIfPossible(field_type, field_value,
VerificationStatus::kParsed);
// Setting the value should always work unless the regular expression is
// invalid.
DCHECK(success);
}
return true; return true;
}
return false;
}
// Parses |value| with the single regular expression |parse_expression|.
// On a successful parse, every entry of the result map is assigned to the
// type named by its key via SetValueForTypeIfPossible() with the status
// VerificationStatus::kParsed, and true is returned. Returns false if
// ParseValueByRegularExpression() reports no match.
bool AddressComponent::ParseValueAndAssignSubcomponentsByRegularExpression(
const base::string16& value,
const RE2* parse_expression) {
// Maps a field type name to the value found for it in |value|.
std::map<std::string, std::string> result_map;
// The parser operates on UTF-8, hence the conversion of the UTF-16 input.
if (ParseValueByRegularExpression(base::UTF16ToUTF8(value), parse_expression,
&result_map)) {
// Parsing was successful and results from the result map can be written
// to the structure.
for (const auto& result_entry : result_map) {
const std::string& field_type = result_entry.first;
// Values are converted back to UTF-16 before they are assigned.
base::string16 field_value = base::UTF8ToUTF16(result_entry.second);
// Do not reassign the value of this node.
if (field_type == GetStorageTypeName())
continue;
// crbug.com(1113617): Honorifics are temporarily disabled.
if (field_type == AutofillType(NAME_HONORIFIC_PREFIX).ToString())
continue;
bool success = SetValueForTypeIfPossible(field_type, field_value,
VerificationStatus::kParsed);
// Setting the value should always work unless the regular expression is
// invalid.
DCHECK(success);
} }
return true;
} }
return false; return false;
} }
......
...@@ -463,6 +463,12 @@ class AddressComponent { ...@@ -463,6 +463,12 @@ class AddressComponent {
virtual bool HasNewerValuePrecendenceInMerging( virtual bool HasNewerValuePrecendenceInMerging(
const AddressComponent& newer_component) const; const AddressComponent& newer_component) const;
// Parses |value| by using the single regular expression |parse_expression|
// and assigns the resulting values via SetValueForTypeIfPossible() with the
// verification status kParsed.
// Returns true on success.
bool ParseValueAndAssignSubcomponentsByRegularExpression(
const base::string16& value,
const re2::RE2* parse_expression);
private: private:
// Unsets the node and all of its children. // Unsets the node and all of its children.
void UnsetAddressComponentAndItsSubcomponents(); void UnsetAddressComponentAndItsSubcomponents();
......
...@@ -126,7 +126,8 @@ const char kSingleWordRe[] = "(?:[^\\s,]+)"; ...@@ -126,7 +126,8 @@ const char kSingleWordRe[] = "(?:[^\\s,]+)";
// Regular expression pattern for multiple lazy words meaning that the // Regular expression pattern for multiple lazy words meaning that the
// expression avoids to match more than one word if possible. // expression avoids to match more than one word if possible.
const char kMultipleLazyWordsRe[] = "(?:[^\\s,]+(?:\\s+[^\\s,]+)*?)"; // Words are separated by white spaces but not by newlines or carriage returns.
const char kMultipleLazyWordsRe[] = "(?:[^\\s,]+(?:[^\\S\\r\\n]+[^\\s,]+)*?)";
// Regular expression pattern to check if a name contains a Hispanic/Latinx // Regular expression pattern to check if a name contains a Hispanic/Latinx
// last name conjunction. // last name conjunction.
...@@ -347,7 +348,10 @@ std::string ParseLastNameIntoSecondLastNameExpression() { ...@@ -347,7 +348,10 @@ std::string ParseLastNameIntoSecondLastNameExpression() {
std::string ParseStreetNameHouseNumberExpression() { std::string ParseStreetNameHouseNumberExpression() {
return CaptureTypeWithPattern( return CaptureTypeWithPattern(
ADDRESS_HOME_STREET_ADDRESS, ADDRESS_HOME_STREET_ADDRESS,
{CaptureTypeWithPattern(ADDRESS_HOME_STREET_NAME, kMultipleLazyWordsRe), {CaptureTypeWithPattern(ADDRESS_HOME_STREET_AND_DEPENDENT_STREET_NAME,
CaptureTypeWithPattern(ADDRESS_HOME_STREET_NAME,
kMultipleLazyWordsRe),
{.separator = ""}),
CaptureTypeWithPrefixedPattern(ADDRESS_HOME_HOUSE_NUMBER, CaptureTypeWithPrefixedPattern(ADDRESS_HOME_HOUSE_NUMBER,
kHouseNumberOptionalPrefixRe, kHouseNumberOptionalPrefixRe,
"(?:\\d+\\w?)"), "(?:\\d+\\w?)"),
...@@ -373,21 +377,27 @@ std::string ParseStreetNameHouseNumberExpression() { ...@@ -373,21 +377,27 @@ std::string ParseStreetNameHouseNumberExpression() {
std::string ParseStreetNameHouseNumberExpressionSuffixedFloor() { std::string ParseStreetNameHouseNumberExpressionSuffixedFloor() {
return CaptureTypeWithPattern( return CaptureTypeWithPattern(
ADDRESS_HOME_STREET_ADDRESS, ADDRESS_HOME_STREET_ADDRESS,
{CaptureTypeWithPattern(ADDRESS_HOME_STREET_NAME, kMultipleLazyWordsRe), {
CaptureTypeWithPrefixedPattern(ADDRESS_HOME_HOUSE_NUMBER,
kHouseNumberOptionalPrefixRe, CaptureTypeWithPattern(
"(?:\\d+\\w?)"), ADDRESS_HOME_STREET_AND_DEPENDENT_STREET_NAME,
CaptureTypeWithPattern( CaptureTypeWithPattern(ADDRESS_HOME_STREET_NAME,
ADDRESS_HOME_SUBPREMISE, kMultipleLazyWordsRe),
{ {.separator = ""}),
CaptureTypeWithSuffixedPattern( CaptureTypeWithPrefixedPattern(ADDRESS_HOME_HOUSE_NUMBER,
ADDRESS_HOME_FLOOR, "(?:(\\d{0,3}\\w?))", kFloorAffixRe, kHouseNumberOptionalPrefixRe,
{.quantifier = MATCH_OPTIONAL}), "(?:\\d+\\w?)"),
CaptureTypeWithPrefixedPattern( CaptureTypeWithPattern(
ADDRESS_HOME_APT_NUM, kApartmentNumberPrefix, ADDRESS_HOME_SUBPREMISE,
"(?:(\\d{0,3}\\w?))", {.quantifier = MATCH_OPTIONAL}), {
}, CaptureTypeWithSuffixedPattern(
{.quantifier = MATCH_OPTIONAL})}); ADDRESS_HOME_FLOOR, "(?:(\\d{0,3}\\w?))", kFloorAffixRe,
{.quantifier = MATCH_OPTIONAL}),
CaptureTypeWithPrefixedPattern(
ADDRESS_HOME_APT_NUM, kApartmentNumberPrefix,
"(?:(\\d{0,3}\\w?))", {.quantifier = MATCH_OPTIONAL}),
},
{.quantifier = MATCH_OPTIONAL})});
} }
// Returns an expression to parse a street address into the street name, the // Returns an expression to parse a street address into the street name, the
...@@ -400,7 +410,10 @@ std::string ParseHouseNumberStreetNameExpression() { ...@@ -400,7 +410,10 @@ std::string ParseHouseNumberStreetNameExpression() {
return CaptureTypeWithPattern( return CaptureTypeWithPattern(
ADDRESS_HOME_STREET_ADDRESS, ADDRESS_HOME_STREET_ADDRESS,
{CaptureTypeWithPattern(ADDRESS_HOME_HOUSE_NUMBER, "(?:\\d+\\w{0,3})"), {CaptureTypeWithPattern(ADDRESS_HOME_HOUSE_NUMBER, "(?:\\d+\\w{0,3})"),
CaptureTypeWithPattern(ADDRESS_HOME_STREET_NAME, kMultipleLazyWordsRe), CaptureTypeWithPattern(ADDRESS_HOME_STREET_AND_DEPENDENT_STREET_NAME,
CaptureTypeWithPattern(ADDRESS_HOME_STREET_NAME,
kMultipleLazyWordsRe),
{.separator = ""}),
CaptureTypeWithPattern( CaptureTypeWithPattern(
ADDRESS_HOME_SUBPREMISE, ADDRESS_HOME_SUBPREMISE,
{ {
......
...@@ -175,6 +175,34 @@ TEST(AutofillStructuredAddress, ParseStreetAddress) { ...@@ -175,6 +175,34 @@ TEST(AutofillStructuredAddress, ParseStreetAddress) {
TestAddressLineParsing(test_case); TestAddressLineParsing(test_case);
} }
// Checks the parsing of street addresses that span two lines, including
// addresses whose second line cannot be parsed.
TEST(AutofillStructuredAddress, ParseMultiLineStreetAddress) {
  const std::vector<AddressLineParsingTestCase> multi_line_cases = {
      // Street name before the house number; the second line holds the floor
      // and the apartment.
      {.street_address = "Implerstr. 73a\nObergeschoss 2 Wohnung 3",
       .street_name = "Implerstr.",
       .house_number = "73a",
       .floor = "2",
       .apartment = "3"},
      // Same first line, but only street name and house number are expected
      // because the second line is not parsable.
      {.street_address = "Implerstr. 73a\nSome Unparsable Text",
       .street_name = "Implerstr.",
       .house_number = "73a"},
      // House number before the street name; the second line holds the floor
      // and the apartment.
      {.street_address = "1600 Amphitheatre Parkway\nFloor 6 Apt 12",
       .street_name = "Amphitheatre Parkway",
       .house_number = "1600",
       .floor = "6",
       .apartment = "12"},
      // House number first, followed by a second line that is not parsable.
      {.street_address = "1600 Amphitheatre Parkway\nSome UnparseableText",
       .street_name = "Amphitheatre Parkway",
       .house_number = "1600"},
      // Street name and house number separated by a comma; the second line
      // holds the floor and the apartment.
      {.street_address = "Av. Paulista, 1098\n1º andar, apto. 101",
       .street_name = "Av. Paulista",
       .house_number = "1098",
       .floor = "1",
       .apartment = "101"},
  };

  for (const AddressLineParsingTestCase& expectation : multi_line_cases) {
    TestAddressLineParsing(expectation);
  }
}
TEST(AutofillStructuredAddress, TestStreetAddressFormatting) { TEST(AutofillStructuredAddress, TestStreetAddressFormatting) {
Address address; Address address;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment