Commit 2a87c5fb authored by dhollowa@chromium.org

Heuristics for grabber-continental.com.out (select-one)

Adds specific input types to the field_type.h bitfield to utilize the "select-one" signal in identifying the Country, State, and Credit Card date fields. Other fields should no longer be classified if the input type is "select-one".

This is Phase 1 of the fixes for grabber-continental.com.html. Phase 2 will add lookahead logic for the "BusinessPhone" and "Email Address" issues.

BUG=76299
TEST=FormStructureBrowserTest.DataDrivenHeuristics with test file grabber-continental.com.html

Review URL: http://codereview.chromium.org/7063031

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@86717 0039d316-1c4b-4281-b951-d872f2087c98
parent 44c9416b
...@@ -195,7 +195,7 @@ bool AddressField::ParseAddressLines(AutofillScanner* scanner, ...@@ -195,7 +195,7 @@ bool AddressField::ParseAddressLines(AutofillScanner* scanner,
l10n_util::GetStringUTF16(IDS_AUTOFILL_ADDRESS_LINE_1_LABEL_RE); l10n_util::GetStringUTF16(IDS_AUTOFILL_ADDRESS_LINE_1_LABEL_RE);
if (!ParseField(scanner, pattern, &address_field->address1_) && if (!ParseField(scanner, pattern, &address_field->address1_) &&
!ParseFieldSpecifics(scanner, label_pattern, MATCH_LABEL, !ParseFieldSpecifics(scanner, label_pattern, MATCH_LABEL | MATCH_TEXT,
&address_field->address1_)) { &address_field->address1_)) {
return false; return false;
} }
...@@ -214,7 +214,7 @@ bool AddressField::ParseAddressLines(AutofillScanner* scanner, ...@@ -214,7 +214,7 @@ bool AddressField::ParseAddressLines(AutofillScanner* scanner,
l10n_util::GetStringUTF16(IDS_AUTOFILL_ADDRESS_LINE_1_LABEL_RE); l10n_util::GetStringUTF16(IDS_AUTOFILL_ADDRESS_LINE_1_LABEL_RE);
if (!ParseEmptyLabel(scanner, &address_field->address2_) && if (!ParseEmptyLabel(scanner, &address_field->address2_) &&
!ParseField(scanner, pattern, &address_field->address2_)) { !ParseField(scanner, pattern, &address_field->address2_)) {
ParseFieldSpecifics(scanner, label_pattern, MATCH_LABEL, ParseFieldSpecifics(scanner, label_pattern, MATCH_LABEL | MATCH_TEXT,
&address_field->address2_); &address_field->address2_);
} }
} }
...@@ -250,7 +250,8 @@ bool AddressField::ParseCountry(AutofillScanner* scanner, ...@@ -250,7 +250,8 @@ bool AddressField::ParseCountry(AutofillScanner* scanner,
else else
pattern = l10n_util::GetStringUTF16(IDS_AUTOFILL_COUNTRY_RE); pattern = l10n_util::GetStringUTF16(IDS_AUTOFILL_COUNTRY_RE);
return ParseField(scanner, pattern, &address_field->country_); return ParseFieldSpecifics(scanner, pattern, MATCH_DEFAULT | MATCH_SELECT,
&address_field->country_);
} }
// static // static
...@@ -334,7 +335,8 @@ bool AddressField::ParseState(AutofillScanner* scanner, ...@@ -334,7 +335,8 @@ bool AddressField::ParseState(AutofillScanner* scanner,
else else
pattern = l10n_util::GetStringUTF16(IDS_AUTOFILL_STATE_RE); pattern = l10n_util::GetStringUTF16(IDS_AUTOFILL_STATE_RE);
return ParseField(scanner, pattern, &address_field->state_); return ParseFieldSpecifics(scanner, pattern, MATCH_DEFAULT | MATCH_SELECT,
&address_field->state_);
} }
AddressType AddressField::AddressTypeFromText(const string16 &text) { AddressType AddressField::AddressTypeFromText(const string16 &text) {
......
...@@ -43,7 +43,7 @@ ...@@ -43,7 +43,7 @@
zip|^-$|post2<!-- pt-BR, pt-PT -->|codpos2 zip|^-$|post2<!-- pt-BR, pt-PT -->|codpos2
</message> </message>
<message name="IDS_AUTOFILL_CITY_RE"> <message name="IDS_AUTOFILL_CITY_RE">
city|town<!-- de-DE -->|ort|stadt<!-- en-AU -->|suburb<!-- es -->|ciudad|provincia|localidad|poblacion<!-- fr-FR -->|ville|commune<!-- it-IT -->|localita<!-- ja-JP -->|市区町村<!-- pt-BR, pt-PT -->|cidade<!-- ru -->|Город<!-- zh-CN -->|市<!-- zh-TW -->|分區 city|town<!-- de-DE -->|^ort$|stadt<!-- en-AU -->|suburb<!-- es -->|ciudad|provincia|localidad|poblacion<!-- fr-FR -->|ville|commune<!-- it-IT -->|localita<!-- ja-JP -->|市区町村<!-- pt-BR, pt-PT -->|cidade<!-- ru -->|Город<!-- zh-CN -->|市<!-- zh-TW -->|分區
</message> </message>
<message name="IDS_AUTOFILL_STATE_RE"> <message name="IDS_AUTOFILL_STATE_RE">
state|county|region|province<!-- de-DE -->|land<!-- en-UK -->|county|principality<!-- ja-JP -->|都道府県<!-- pt-BR, pt-PT -->|estado|provincia<!-- ru -->|область<!-- zh-CN -->|省<!-- zh-TW -->|地區 state|county|region|province<!-- de-DE -->|land<!-- en-UK -->|county|principality<!-- ja-JP -->|都道府県<!-- pt-BR, pt-PT -->|estado|provincia<!-- ru -->|область<!-- zh-CN -->|省<!-- zh-TW -->|地區
......
...@@ -122,14 +122,15 @@ CreditCardField* CreditCardField::Parse(AutofillScanner* scanner, ...@@ -122,14 +122,15 @@ CreditCardField* CreditCardField::Parse(AutofillScanner* scanner,
if ((!credit_card_field->expiration_month_ || if ((!credit_card_field->expiration_month_ ||
credit_card_field->expiration_month_->IsEmpty()) && credit_card_field->expiration_month_->IsEmpty()) &&
ParseField(scanner, pattern, &credit_card_field->expiration_month_)) { ParseFieldSpecifics(scanner, pattern, MATCH_DEFAULT | MATCH_SELECT,
&credit_card_field->expiration_month_)) {
if (is_ecml) if (is_ecml)
pattern = GetEcmlPattern(kEcmlCardExpireYear); pattern = GetEcmlPattern(kEcmlCardExpireYear);
else else
pattern = l10n_util::GetStringUTF16(IDS_AUTOFILL_EXPIRATION_DATE_RE); pattern = l10n_util::GetStringUTF16(IDS_AUTOFILL_EXPIRATION_DATE_RE);
if (!ParseField(scanner, pattern, if (!ParseFieldSpecifics(scanner, pattern, MATCH_DEFAULT | MATCH_SELECT,
&credit_card_field->expiration_year_)) { &credit_card_field->expiration_year_)) {
scanner->Rewind(); scanner->Rewind();
return NULL; return NULL;
} }
...@@ -137,8 +138,10 @@ CreditCardField* CreditCardField::Parse(AutofillScanner* scanner, ...@@ -137,8 +138,10 @@ CreditCardField* CreditCardField::Parse(AutofillScanner* scanner,
} }
} }
if (ParseField(scanner, GetEcmlPattern(kEcmlCardExpireDay), NULL)) if (ParseFieldSpecifics(scanner, GetEcmlPattern(kEcmlCardExpireDay),
MATCH_DEFAULT | MATCH_SELECT, NULL)) {
continue; continue;
}
// Some pages (e.g. ExpediaBilling.html) have a "card description" // Some pages (e.g. ExpediaBilling.html) have a "card description"
// field; we parse this field but ignore it. // field; we parse this field but ignore it.
......
...@@ -20,8 +20,10 @@ EmailField* EmailField::Parse(AutofillScanner* scanner, bool is_ecml) { ...@@ -20,8 +20,10 @@ EmailField* EmailField::Parse(AutofillScanner* scanner, bool is_ecml) {
pattern = l10n_util::GetStringUTF16(IDS_AUTOFILL_EMAIL_RE); pattern = l10n_util::GetStringUTF16(IDS_AUTOFILL_EMAIL_RE);
const AutofillField* field; const AutofillField* field;
if (ParseField(scanner, pattern, &field)) if (ParseFieldSpecifics(scanner, pattern,
MATCH_DEFAULT | MATCH_EMAIL, &field)) {
return new EmailField(field); return new EmailField(field);
}
return NULL; return NULL;
} }
......
...@@ -81,6 +81,26 @@ FormField* ParseFormField(AutofillScanner* scanner, bool is_ecml) { ...@@ -81,6 +81,26 @@ FormField* ParseFormField(AutofillScanner* scanner, bool is_ecml) {
return NameField::Parse(scanner, is_ecml); return NameField::Parse(scanner, is_ecml);
} }
bool IsTextField(const string16& type) {
return type == ASCIIToUTF16("text");
}
bool IsEmailField(const string16& type) {
return type == ASCIIToUTF16("email");
}
bool IsMonthField(const string16& type) {
return type == ASCIIToUTF16("month");
}
bool IsTelephoneField(const string16& type) {
return type == ASCIIToUTF16("tel");
}
bool IsSelectField(const string16& type) {
return type == ASCIIToUTF16("select-one");
}
} // namespace } // namespace
// static // static
...@@ -109,7 +129,7 @@ void FormField::ParseFormFields(const std::vector<AutofillField*>& fields, ...@@ -109,7 +129,7 @@ void FormField::ParseFormFields(const std::vector<AutofillField*>& fields,
bool FormField::ParseField(AutofillScanner* scanner, bool FormField::ParseField(AutofillScanner* scanner,
const string16& pattern, const string16& pattern,
const AutofillField** match) { const AutofillField** match) {
return ParseFieldSpecifics(scanner, pattern, MATCH_ALL, match); return ParseFieldSpecifics(scanner, pattern, MATCH_DEFAULT, match);
} }
// static // static
...@@ -121,20 +141,31 @@ bool FormField::ParseFieldSpecifics(AutofillScanner* scanner, ...@@ -121,20 +141,31 @@ bool FormField::ParseFieldSpecifics(AutofillScanner* scanner,
return false; return false;
const AutofillField* field = scanner->Cursor(); const AutofillField* field = scanner->Cursor();
if (Match(field, pattern, match_type)) {
if (match) if ((match_type & MATCH_TEXT) && IsTextField(field->form_control_type))
*match = field; return MatchAndAdvance(scanner, pattern, match_type, match);
scanner->Advance();
return true; if ((match_type & MATCH_EMAIL) && IsEmailField(field->form_control_type))
return MatchAndAdvance(scanner, pattern, match_type, match);
if ((match_type & MATCH_TELEPHONE) &&
IsTelephoneField(field->form_control_type)) {
return MatchAndAdvance(scanner, pattern, match_type, match);
} }
if ((match_type & MATCH_SELECT) && IsSelectField(field->form_control_type))
return MatchAndAdvance(scanner, pattern, match_type, match);
return false; return false;
} }
// static // static
bool FormField::ParseEmptyLabel(AutofillScanner* scanner, bool FormField::ParseEmptyLabel(AutofillScanner* scanner,
const AutofillField** match) { const AutofillField** match) {
return ParseFieldSpecifics(scanner, ASCIIToUTF16("^$"), MATCH_LABEL, match); return ParseFieldSpecifics(scanner,
ASCIIToUTF16("^$"),
MATCH_LABEL | MATCH_ALL_INPUTS,
match);
} }
// static // static
...@@ -148,6 +179,22 @@ bool FormField::AddClassification(const AutofillField* field, ...@@ -148,6 +179,22 @@ bool FormField::AddClassification(const AutofillField* field,
return map->insert(make_pair(field->unique_name(), type)).second; return map->insert(make_pair(field->unique_name(), type)).second;
} }
// static.
// Tests the field currently under the |scanner| cursor against |pattern|
// using FormField::Match() with the given |match_type| bits.
//
// On a match: stores the field in |*match| when |match| is non-NULL, moves
// the scanner forward by one field, and returns true.
// On no match: returns false; neither |*match| nor the scanner is touched.
bool FormField::MatchAndAdvance(AutofillScanner* scanner,
                                const string16& pattern,
                                int match_type,
                                const AutofillField** match) {
  const AutofillField* candidate = scanner->Cursor();
  if (!FormField::Match(candidate, pattern, match_type))
    return false;

  // |match| is an optional out-parameter; skip the store when it is NULL.
  if (match)
    *match = candidate;

  scanner->Advance();
  return true;
}
// static // static
bool FormField::Match(const AutofillField* field, bool FormField::Match(const AutofillField* field,
const string16& pattern, const string16& pattern,
......
...@@ -32,9 +32,20 @@ class FormField { ...@@ -32,9 +32,20 @@ class FormField {
protected: protected:
// A bit-field used for matching specific parts of a field in question. // A bit-field used for matching specific parts of a field in question.
enum MatchType { enum MatchType {
MATCH_LABEL = 1 << 0, // Attributes.
MATCH_NAME = 1 << 1, MATCH_LABEL = 1 << 0,
MATCH_ALL = MATCH_LABEL | MATCH_NAME MATCH_NAME = 1 << 1,
// Input types.
MATCH_TEXT = 1 << 2,
MATCH_EMAIL = 1 << 3,
MATCH_TELEPHONE = 1 << 4,
MATCH_SELECT = 1 << 5,
MATCH_ALL_INPUTS =
MATCH_TEXT | MATCH_EMAIL | MATCH_TELEPHONE | MATCH_SELECT,
// By default match label and name for input/text types.
MATCH_DEFAULT = MATCH_LABEL | MATCH_NAME | MATCH_TEXT,
}; };
// Only derived classes may instantiate. // Only derived classes may instantiate.
...@@ -75,6 +86,15 @@ class FormField { ...@@ -75,6 +86,15 @@ class FormField {
private: private:
FRIEND_TEST_ALL_PREFIXES(FormFieldTest, Match); FRIEND_TEST_ALL_PREFIXES(FormFieldTest, Match);
// Matches |pattern| to the contents of the field at the head of the
// |scanner|.
// Returns |true| if a match is found according to |match_type|, and |false|
// otherwise.
static bool MatchAndAdvance(AutofillScanner* scanner,
const string16& pattern,
int match_type,
const AutofillField** match);
// Matches the regular expression |pattern| against the components of |field| // Matches the regular expression |pattern| against the components of |field|
// as specified in the |match_type| bit field (see |MatchType|). // as specified in the |match_type| bit field (see |MatchType|).
static bool Match(const AutofillField* field, static bool Match(const AutofillField* field,
......
...@@ -269,9 +269,11 @@ bool PhoneField::ParseInternal(PhoneField *phone_field, ...@@ -269,9 +269,11 @@ bool PhoneField::ParseInternal(PhoneField *phone_field,
// Attempt to parse according to the next grammar. // Attempt to parse according to the next grammar.
for (; i < arraysize(phone_field_grammars_) && for (; i < arraysize(phone_field_grammars_) &&
phone_field_grammars_[i].regex != REGEX_SEPARATOR; ++i) { phone_field_grammars_[i].regex != REGEX_SEPARATOR; ++i) {
if (!ParseField(scanner, if (!ParseFieldSpecifics(
phone_field->GetRegExp(phone_field_grammars_[i].regex), scanner,
&parsed_fields[phone_field_grammars_[i].phone_part])) phone_field->GetRegExp(phone_field_grammars_[i].regex),
MATCH_DEFAULT | MATCH_TELEPHONE,
&parsed_fields[phone_field_grammars_[i].phone_part]))
break; break;
if (phone_field_grammars_[i].max_size && if (phone_field_grammars_[i].max_size &&
(!parsed_fields[phone_field_grammars_[i].phone_part]->max_length || (!parsed_fields[phone_field_grammars_[i].phone_part]->max_length ||
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment