Commit 7284655c authored by Maxim Kolosovskiy, committed by Commit Bot

[Password Manager] Polishing HTML based Username Detector


Bug: 699530
Change-Id: I6325ed2be5868d519c5b0d0cb720d09e7b80da4d
Reviewed-on: https://chromium-review.googlesource.com/759792
Commit-Queue: Maxim Kolosovskiy <kolos@chromium.org>
Reviewed-by: Vaclav Brozek <vabr@chromium.org>
Cr-Commit-Position: refs/heads/master@{#521018}
parent e59817eb
......@@ -14,6 +14,8 @@ static_library("renderer") {
"form_classifier.h",
"html_based_username_detector.cc",
"html_based_username_detector.h",
"html_based_username_detector_vocabulary.cc",
"html_based_username_detector_vocabulary.h",
"page_form_analyser_logger.cc",
"page_form_analyser_logger.h",
"page_passwords_analyser.cc",
......
......@@ -5,12 +5,15 @@
#include "components/autofill/content/renderer/html_based_username_detector.h"
#include <algorithm>
#include <map>
#include <tuple>
#include "base/containers/flat_set.h"
#include "base/i18n/case_conversion.h"
#include "base/macros.h"
#include "base/strings/string_split.h"
#include "base/strings/utf_string_conversions.h"
#include "components/autofill/content/renderer/form_autofill_util.h"
#include "components/autofill/content/renderer/html_based_username_detector_vocabulary.h"
#include "third_party/WebKit/public/web/WebFormElement.h"
using blink::WebFormControlElement;
......@@ -21,248 +24,38 @@ namespace autofill {
namespace {
// For each input element that can be username, we compute and save developer
// and user group, along with associated short tokens lists (to handle finding
// less than |kMinimumWordLength| letters long words).
// List of separators that can appear in HTML attribute values.
constexpr char kDelimiters[] = "$\"\'?%*@!\\/&^#:+~`;,>|<.[](){}-_ 0123456789";
// Minimum length of a word, in order not to be considered short word. Short
// words will not be searched in attribute values (especially after delimiters
// removing), because a short word may be a part of another word. A short word
// should be enclosed between delimiters, otherwise an occurrence doesn't count.
constexpr int kMinimumWordLength = 4;
// For each input element that can be a username, developer and user group
// values are computed. The user group value includes what a user sees: label,
// placeholder, aria-label (all are stored in FormFieldData.label). The
// developer group value consists of name and id attribute values.
// For each group the set of short tokens (tokens shorter than
// |kMinimumWordLength|) is computed as well.
struct UsernameFieldData {
WebInputElement input_element;
base::string16 developer_value;
std::vector<base::string16> developer_short_tokens;
base::flat_set<base::string16> developer_short_tokens;
base::string16 user_value;
std::vector<base::string16> user_short_tokens;
base::flat_set<base::string16> user_short_tokens;
};
// Words that the algorithm looks for are split into multiple categories based
// on feature reliability.
// A category may contain a latin dictionary and a non-latin dictionary. It is
// mandatory that it has a latin one, but a non-latin might be missing.
// "Latin" translations are the translations of the words for which the
// original translation is similar to the romanized translation (translation of
// the word only using ISO basic Latin alphabet).
// "Non-latin" translations are the translations of the words that have custom,
// country specific characters.
const char* const kNegativeLatin[] = {
"pin", "parola", "wagwoord", "wachtwoord",
"fake", "parole", "givenname", "achinsinsi",
"token", "parool", "firstname", "facalfaire",
"fname", "lozinka", "pasahitza", "focalfaire",
"lname", "passord", "pasiwedhi", "iphasiwedi",
"geslo", "huahuna", "passwuert", "katalaluan",
"heslo", "fullname", "phasewete", "adgangskode",
"parol", "optional", "wachtwurd", "contrasenya",
"sandi", "lastname", "cyfrinair", "contrasinal",
"senha", "kupuhipa", "katasandi", "kalmarsirri",
"hidden", "password", "loluszais", "tenimiafina",
"second", "passwort", "middlename", "paroladordine",
"codice", "pasvorto", "familyname", "inomboloyokuvula",
"modpas", "salasana", "motdepasse", "numeraeleiloaesesi"};
constexpr int kNegativeLatinSize = arraysize(kNegativeLatin);
const char* const kNegativeNonLatin[] = {"fjalëkalim",
"የይለፍቃል",
"كلمهالسر",
"գաղտնաբառ",
"пароль",
"পাসওয়ার্ড",
"парола",
"密码",
"密碼",
"დაგავიწყდათ",
"κωδικόςπρόσβασης",
"પાસવર્ડ",
"סיסמה",
"पासवर्ड",
"jelszó",
"lykilorð",
"paswọọdụ",
"パスワード",
"ಪಾಸ್ವರ್ಡ್",
"пароль",
"ការពាក្យសម្ងាត់",
"암호",
"şîfre",
"купуясөз",
"ລະຫັດຜ່ານ",
"slaptažodis",
"лозинка",
"पासवर्ड",
"нууцүг",
"စကားဝှက်ကို",
"पासवर्ड",
"رمز",
"کلمهعبور",
"hasło",
"пароль",
"лозинка",
"پاسورڊ",
"මුරපදය",
"contraseña",
"lösenord",
"гузарвожа",
"கடவுச்சொல்",
"పాస్వర్డ్",
"รหัสผ่าน",
"пароль",
"پاسورڈ",
"mậtkhẩu",
"פּאַראָל",
"ọrọigbaniwọle"};
constexpr int kNegativeNonLatinSize = arraysize(kNegativeNonLatin);
const char* const kUsernameLatin[] = {
"gatti", "uzantonomo", "solonanarana", "nombredeusuario",
"olumulo", "nomenusoris", "enwdefnyddiwr", "nomdutilisateur",
"lolowera", "notandanafn", "nomedeusuario", "vartotojovardas",
"username", "ahanjirimara", "gebruikersnaam", "numedeutilizator",
"brugernavn", "benotzernumm", "jinalamtumiaji", "erabiltzaileizena",
"brukernavn", "benutzername", "sunanmaiamfani", "foydalanuvchinomi",
"mosebedisi", "kasutajanimi", "ainmcleachdaidh", "igamalomsebenzisi",
"nomdusuari", "lomsebenzisi", "jenengpanganggo", "ingoakaiwhakamahi",
"nomeutente", "namapengguna"};
constexpr int kUsernameLatinSize = arraysize(kUsernameLatin);
const char* const kUsernameNonLatin[] = {"用户名",
"کاتيجونالو",
"用戶名",
"የተጠቃሚስም",
"логин",
"اسمالمستخدم",
"נאמען",
"کاصارفکانام",
"ユーザ名",
"όνομα χρήστη",
"brûkersnamme",
"корисничкоиме",
"nonitilizatè",
"корисничкоиме",
"ngaranpamaké",
"ຊື່ຜູ້ໃຊ້",
"användarnamn",
"యూజర్పేరు",
"korisničkoime",
"пайдаланушыаты",
"שםמשתמש",
"ім'якористувача",
"کارننوم",
"хэрэглэгчийннэр",
"nomedeusuário",
"имяпользователя",
"têntruynhập",
"பயனர்பெயர்",
"ainmúsáideora",
"ชื่อผู้ใช้",
"사용자이름",
"імякарыстальніка",
"lietotājvārds",
"потребителскоиме",
"uporabniškoime",
"колдонуучунунаты",
"kullanıcıadı",
"පරිශීලකනාමය",
"istifadəçiadı",
"օգտագործողիանունը",
"navêbikarhêner",
"ಬಳಕೆದಾರಹೆಸರು",
"emriipërdoruesit",
"वापरकर्तानाव",
"käyttäjätunnus",
"વપરાશકર્તાનામ",
"felhasználónév",
"उपयोगकर्तानाम",
"nazwaużytkownika",
"ഉപയോക്തൃനാമം",
"სახელი",
"အသုံးပြုသူအမည်",
"نامکاربری",
"प्रयोगकर्तानाम",
"uživatelskéjméno",
"ব্যবহারকারীরনাম",
"užívateľskémeno",
"ឈ្មោះអ្នកប្រើប្រាស់"};
constexpr int kUsernameNonLatinSize = arraysize(kUsernameNonLatin);
const char* const kUserLatin[] = {
"user", "wosuta", "gebruiker", "utilizator",
"usor", "notandi", "gumagamit", "vartotojas",
"fammi", "olumulo", "maiamfani", "cleachdaidh",
"utent", "pemakai", "mpampiasa", "umsebenzisi",
"bruger", "usuario", "panganggo", "utilisateur",
"bruker", "benotzer", "uporabnik", "doutilizador",
"numake", "benutzer", "covneegsiv", "erabiltzaile",
"usuari", "kasutaja", "defnyddiwr", "kaiwhakamahi",
"utente", "korisnik", "mosebedisi", "foydalanuvchi",
"uzanto", "pengguna", "mushandisi"};
constexpr int kUserLatinSize = arraysize(kUserLatin);
const char* const kUserNonLatin[] = {"用户",
"użytkownik",
"tagatafaʻaaogā",
"دکارونکيعکس",
"用戶",
"užívateľ",
"корисник",
"карыстальнік",
"brûker",
"kullanıcı",
"истифода",
"អ្នកប្រើ",
"ọrụ",
"ተጠቃሚ",
"באַניצער",
"хэрэглэгчийн",
"يوزر",
"istifadəçi",
"ຜູ້ໃຊ້",
"пользователь",
"صارف",
"meahoʻohana",
"потребител",
"वापरकर्ता",
"uživatel",
"ユーザー",
"מִשׁתַמֵשׁ",
"ผู้ใช้งาน",
"사용자",
"bikaranîvan",
"колдонуучу",
"વપરાશકર્તા",
"përdorues",
"ngườidùng",
"корисникот",
"उपयोगकर्ता",
"itilizatè",
"χρήστης",
"користувач",
"օգտվողիանձնագիրը",
"használó",
"faoiúsáideoir",
"შესახებ",
"ব্যবহারকারী",
"lietotājs",
"பயனர்",
"ಬಳಕೆದಾರ",
"ഉപയോക്താവ്",
"کاربر",
"యూజర్",
"පරිශීලක",
"प्रयोगकर्ता",
"användare",
"المستعمل",
"пайдаланушы",
"အသုံးပြုသူကို",
"käyttäjä"};
constexpr int kUserNonLatinSize = arraysize(kUserNonLatin);
const char* const kTechnicalWords[] = {
"uid", "newtel", "uaccount", "regaccount", "ureg",
"loginid", "laddress", "accountreg", "regid", "regname",
"loginname", "membername", "uname", "ucreate", "loginmail",
"accountname", "umail", "loginreg", "accountid", "loginaccount",
"ulogin", "regemail", "newmobile", "accountlogin"};
constexpr int kTechnicalWordsSize = arraysize(kTechnicalWords);
const char* const kWeakWords[] = {"id", "login", "mail"};
constexpr int kWeakWordsSize = arraysize(kWeakWords);
// Words that the algorithm looks for are split into multiple categories.
// A category may contain a latin dictionary and a non-latin dictionary. It is
// mandatory that it has a latin one, but the non-latin one might be missing.
struct CategoryOfWords {
const char* const* const latin_dictionary;
const size_t latin_dictionary_size;
......@@ -270,53 +63,65 @@ struct CategoryOfWords {
const size_t non_latin_dictionary_size;
};
// Minimum length of a word, in order not to be considered short word.
// Short words will have different treatment than the others.
constexpr int kMinimumWordLength = 4;
void BuildValueAndShortTokens(
// 1. Removes delimiters from |raw_value| and appends it to |*field_data_value|.
// A sentinel symbol is added first if |*field_data_value| is not empty.
// 2. Tokenizes and appends short tokens (shorter than |kMinimumWordLength|)
// from |raw_value| to |*field_data_short_tokens|, if any.
void AppendValueAndShortTokens(
const base::string16& raw_value,
base::string16* field_data_value,
std::vector<base::string16>* field_data_short_tokens) {
// List of separators that can appear in HTML attribute values.
static const std::string kDelimiters =
"\"\'?%*@!\\/&^#:+~`;,>|<.[](){}-_ 0123456789";
base::flat_set<base::string16>* field_data_short_tokens) {
base::string16 lowercase_value = base::i18n::ToLower(raw_value);
const base::string16 delimiters = base::ASCIIToUTF16(kDelimiters);
std::vector<base::StringPiece16> tokens =
base::SplitStringPiece(lowercase_value, base::ASCIIToUTF16(kDelimiters),
base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
*field_data_value = base::JoinString(tokens, base::string16());
std::vector<base::StringPiece16> short_tokens;
std::copy_if(tokens.begin(), tokens.end(), std::back_inserter(short_tokens),
[](const base::StringPiece16& token) {
return token.size() < kMinimumWordLength;
});
for (const base::StringPiece16& token : short_tokens) {
field_data_short_tokens->push_back(token.as_string());
}
}
base::SplitStringPiece(lowercase_value, delimiters, base::TRIM_WHITESPACE,
base::SPLIT_WANT_NONEMPTY);
// Modify |lowercase_value| only when |tokens| has been processed.
std::vector<base::string16> short_tokens;
std::transform(
std::find_if(tokens.begin(), tokens.end(),
[](const base::StringPiece16& token) {
return token.size() < kMinimumWordLength;
}),
tokens.end(), std::back_inserter(short_tokens),
[](const base::StringPiece16& token) { return token.as_string(); });
// It is better to insert elements to a |flat_map| in one operation.
field_data_short_tokens->insert(short_tokens.begin(), short_tokens.end());
// Now that tokens are processed, squeeze delimiters out of |lowercase_value|.
lowercase_value.erase(std::remove_if(
lowercase_value.begin(), lowercase_value.end(),
[delimiters](char c) { return delimiters.find(c) != delimiters.npos; }));
// For a given input element, compute developer and user value, along with
// developer and user short tokens.
UsernameFieldData ComputeFieldData(const blink::WebInputElement& input_element,
const FormFieldData& field) {
UsernameFieldData field_data;
field_data.input_element = input_element;
// When computing the developer value, '$' safety guard is being added
// between field name and id, so that forming of accidental words is
// prevented.
BuildValueAndShortTokens(field.name + base::ASCIIToUTF16("$") + field.id,
&field_data.developer_value,
&field_data.developer_short_tokens);
BuildValueAndShortTokens(field.label, &field_data.user_value,
&field_data.user_short_tokens);
if (!field_data_value->empty())
field_data_value->push_back('$');
*field_data_value += lowercase_value;
}
// For the given |input_element|, compute developer and user value, along with
// sets of short tokens, and returns it.
UsernameFieldData ComputeUsernameFieldData(
const blink::WebInputElement& input_element,
const FormFieldData& field) {
UsernameFieldData field_data;
field_data.input_element = input_element;
AppendValueAndShortTokens(field.name, &field_data.developer_value,
&field_data.developer_short_tokens);
AppendValueAndShortTokens(field.id, &field_data.developer_value,
&field_data.developer_short_tokens);
AppendValueAndShortTokens(field.label, &field_data.user_value,
&field_data.user_short_tokens);
return field_data;
}
// For the fields of the given form that can be username fields, compute data
// needed by the detector.
// For the fields of the given form that can be username fields
// (all_possible_usernames), computes |UsernameFieldData| needed by the
// detector.
void InferUsernameFieldData(
const std::vector<blink::WebInputElement>& all_possible_usernames,
const FormData& form_data,
......@@ -324,41 +129,43 @@ void InferUsernameFieldData(
// |all_possible_usernames| and |form_data.fields| may have different set of
// fields. Match them based on |WebInputElement.NameForAutofill| and
// |FormFieldData.name|.
size_t current_index = 0;
size_t next_element_range_begin = 0;
for (const blink::WebInputElement& input_element : all_possible_usernames) {
for (size_t i = current_index; i < form_data.fields.size(); ++i) {
const FormFieldData& field = form_data.fields[i];
const base::string16 element_name = input_element.NameForAutofill().Utf16();
for (size_t i = next_element_range_begin; i < form_data.fields.size();
++i) {
const FormFieldData& field_data = form_data.fields[i];
if (input_element.NameForAutofill().IsEmpty())
continue;
// Find matching form data and web input element.
if (field.name == input_element.NameForAutofill().Utf16()) {
current_index = i + 1;
// Find matching field data and web input element.
if (field_data.name == element_name) {
next_element_range_begin = i + 1;
possible_usernames_data->push_back(
ComputeFieldData(input_element, field));
ComputeUsernameFieldData(input_element, field_data));
break;
}
}
}
}
// Check if any word from the dictionary is encountered in computed field
// information.
bool SearchFieldInDictionary(const base::string16& value,
const std::vector<base::string16>& tokens,
const char* const* dictionary,
const size_t& dictionary_size) {
// Check if any word from |dictionary| is encountered in computed field
// information (i.e. |value|, |tokens|).
bool CheckFieldWithDictionary(
const base::string16& value,
const base::flat_set<base::string16>& short_tokens,
const char* const* dictionary,
const size_t& dictionary_size) {
for (size_t i = 0; i < dictionary_size; ++i) {
if (strlen(dictionary[i]) < kMinimumWordLength) {
// Treat short words by looking up for them in the tokens list.
for (const base::string16& token : tokens) {
if (token == base::UTF8ToUTF16(dictionary[i]))
return true;
}
const base::string16 word = base::UTF8ToUTF16(dictionary[i]);
if (word.length() < kMinimumWordLength) {
// Treat short words by looking them up in the tokens set.
if (short_tokens.find(word) != short_tokens.end())
return true;
} else {
// Treat long words by looking for them as a substring in |value|.
if (value.find(base::UTF8ToUTF16(dictionary[i])) != std::string::npos)
// Treat long words by looking them up as a substring in |value|.
if (value.find(word) != std::string::npos)
return true;
}
}
......@@ -366,33 +173,30 @@ bool SearchFieldInDictionary(const base::string16& value,
}
// Check if any word from |category| is encountered in computed field
// information.
// information (|possible_username|).
bool ContainsWordFromCategory(const UsernameFieldData& possible_username,
const CategoryOfWords& category) {
// For user value, search in latin and non-latin dictionaries, because this
// value is user visible.
return SearchFieldInDictionary(
// value is user visible. For developer value, only look up in latin
// dictionaries.
return CheckFieldWithDictionary(
possible_username.user_value, possible_username.user_short_tokens,
category.latin_dictionary, category.latin_dictionary_size) ||
SearchFieldInDictionary(possible_username.user_value,
possible_username.user_short_tokens,
category.non_latin_dictionary,
category.non_latin_dictionary_size) ||
// For developer value, only look up in latin dictionaries.
SearchFieldInDictionary(possible_username.developer_value,
possible_username.developer_short_tokens,
category.latin_dictionary,
category.latin_dictionary_size);
CheckFieldWithDictionary(possible_username.user_value,
possible_username.user_short_tokens,
category.non_latin_dictionary,
category.non_latin_dictionary_size) ||
CheckFieldWithDictionary(possible_username.developer_value,
possible_username.developer_short_tokens,
category.latin_dictionary,
category.latin_dictionary_size);
}
// Remove from |possible_usernames_data| the elements that definitely cannot be
// usernames, because their computed values contain at least one negative word.
void RemoveFieldsWithNegativeWords(
std::vector<UsernameFieldData>* possible_usernames_data) {
// Words that certainly point to a non-username field.
// If field values contain at least one negative word, then the field is
// excluded from the list of possible usernames.
static const CategoryOfWords kNegativeCategory{
static const CategoryOfWords kNegativeCategory = {
kNegativeLatin, kNegativeLatinSize, kNegativeNonLatin,
kNegativeNonLatinSize};
......@@ -406,11 +210,11 @@ void RemoveFieldsWithNegativeWords(
possible_usernames_data->end());
}
// Check if any word from the given category appears in fields from the form.
// If a word appears in more than 2 fields, we do not make a decision, because
// it may just be a prefix.
// If a word appears in 1 or 2 fields, we return the first field in which we
// found the substring as |username_element|.
// Check if any word from the given category (|category|) appears in fields from
// the form (|possible_usernames_data|). If the category words appear in more
// than 2 fields, do not make a decision, because it may just be a prefix. If
// the words appear in 1 or 2 fields, the first field is saved to
// |*username_element|.
bool FormContainsWordFromCategory(
const std::vector<UsernameFieldData>& possible_usernames_data,
const CategoryOfWords& category,
......@@ -419,20 +223,21 @@ bool FormContainsWordFromCategory(
// the form) in which a substring is encountered.
WebInputElement chosen_field;
size_t count = 0;
size_t fields_found = 0;
for (const UsernameFieldData& field_data : possible_usernames_data) {
if (ContainsWordFromCategory(field_data, category)) {
if (count == 0)
if (fields_found == 0)
chosen_field = field_data.input_element;
count++;
fields_found++;
}
}
if (count && count <= 2) {
if (fields_found > 0 && fields_found <= 2) {
*username_element = chosen_field;
return true;
} else {
return false;
}
return false;
}
// Find username element if there is no cached result for the given form.
......@@ -442,28 +247,18 @@ bool FindUsernameFieldInternal(
WebInputElement* username_element) {
DCHECK(username_element);
// Translations of "username".
static const CategoryOfWords kUsernameCategory{
static const CategoryOfWords kUsernameCategory = {
kUsernameLatin, kUsernameLatinSize, kUsernameNonLatin,
kUsernameNonLatinSize};
// Translations of "user".
static const CategoryOfWords kUserCategory{kUserLatin, kUserLatinSize,
kUserNonLatin, kUserNonLatinSize};
// Words that certainly point to a username field, if they appear in developer
// value. They are technical words, because they can only be used as variable
// names, and not as stand-alone words.
static const CategoryOfWords kTechnicalCategory{
static const CategoryOfWords kUserCategory = {
kUserLatin, kUserLatinSize, kUserNonLatin, kUserNonLatinSize};
static const CategoryOfWords kTechnicalCategory = {
kTechnicalWords, kTechnicalWordsSize, nullptr, 0};
// Words that might point to a username field.They have the smallest priority
// in the heuristic, because there are also field attribute values that
// contain them, but are not username fields.
static const CategoryOfWords kWeakCategory{kWeakWords, kWeakWordsSize,
nullptr, 0};
static const CategoryOfWords kWeakCategory = {kWeakWords, kWeakWordsSize,
nullptr, 0};
// These categories contain words that point to username field.
// Order of categories is vital: the detector searches for words in descending
// order of probability to point to a username field.
static const CategoryOfWords kPositiveCategories[] = {
kUsernameCategory, kUserCategory, kTechnicalCategory, kWeakCategory};
......@@ -473,8 +268,6 @@ bool FindUsernameFieldInternal(
RemoveFieldsWithNegativeWords(&possible_usernames_data);
// These are the searches performed by the username detector.
// Order of categories is vital: the detector searches for words in descending
// order of probability to point to a username field.
for (const CategoryOfWords& category : kPositiveCategories) {
if (FormContainsWordFromCategory(possible_usernames_data, category,
username_element)) {
......@@ -496,20 +289,34 @@ bool GetUsernameFieldBasedOnHtmlAttributes(
if (all_possible_usernames.empty())
return false;
// All elements in |all_possible_usernames| should have the same |Form()|.
DCHECK(
std::adjacent_find(
all_possible_usernames.begin(), all_possible_usernames.end(),
[](const blink::WebInputElement& a, const blink::WebInputElement& b) {
return a.Form() != b.Form();
}) == all_possible_usernames.end());
const blink::WebFormElement form = all_possible_usernames[0].Form();
if (!username_detector_cache ||
username_detector_cache->find(form) == username_detector_cache->end()) {
// True if the cache has no entry for |form|.
bool cache_miss = true;
// Iterator pointing to the entry for |form| if the entry for |form| is found.
UsernameDetectorCache::iterator form_position;
if (username_detector_cache) {
std::tie(form_position, cache_miss) = username_detector_cache->insert(
std::make_pair(form, blink::WebInputElement()));
}
if (!username_detector_cache || cache_miss) {
bool username_found = FindUsernameFieldInternal(
all_possible_usernames, form_data, username_element);
if (username_detector_cache) {
(*username_detector_cache)[form] =
username_found ? *username_element : blink::WebInputElement();
}
if (username_detector_cache && username_found)
form_position->second = *username_element;
return username_found;
} else {
*username_element = form_position->second;
return !username_element->IsNull();
}
// Use the cached value for |form|.
*username_element = (*username_detector_cache)[form];
return !username_element->IsNull();
}
} // namespace autofill
......@@ -2,6 +2,8 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <map>
#include "components/autofill/core/common/password_form.h"
#include "third_party/WebKit/public/web/WebFormControlElement.h"
#include "third_party/WebKit/public/web/WebInputElement.h"
......
// Copyright 2017 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/autofill/content/renderer/html_based_username_detector_vocabulary.h"
#include "base/macros.h"
namespace autofill {
// Latin-alphabet words that certainly point to a non-username field (password,
// PIN, first/last name, ...). If field values contain at least one of them, the
// field is excluded from the list of possible usernames.
// All entries are lowercase and contain no delimiters, matching the form in
// which the detector normalizes field values before the lookup.
const char* const kNegativeLatin[] = {
"pin", "parola", "wagwoord", "wachtwoord",
"fake", "parole", "givenname", "achinsinsi",
"token", "parool", "firstname", "facalfaire",
"fname", "lozinka", "pasahitza", "focalfaire",
"lname", "passord", "pasiwedhi", "iphasiwedi",
"geslo", "huahuna", "passwuert", "katalaluan",
"heslo", "fullname", "phasewete", "adgangskode",
"parol", "optional", "wachtwurd", "contrasenya",
"sandi", "lastname", "cyfrinair", "contrasinal",
"senha", "kupuhipa", "katasandi", "kalmarsirri",
"hidden", "password", "loluszais", "tenimiafina",
"second", "passwort", "middlename", "paroladordine",
"codice", "pasvorto", "familyname", "inomboloyokuvula",
"modpas", "salasana", "motdepasse", "numeraeleiloaesesi"};
// Number of entries in |kNegativeLatin|.
const int kNegativeLatinSize = arraysize(kNegativeLatin);
const char* const kNegativeNonLatin[] = {"fjalëkalim",
"የይለፍቃል",
"كلمهالسر",
"գաղտնաբառ",
"пароль",
"পাসওয়ার্ড",
"парола",
"密码",
"密碼",
"დაგავიწყდათ",
"κωδικόςπρόσβασης",
"પાસવર્ડ",
"סיסמה",
"पासवर्ड",
"jelszó",
"lykilorð",
"paswọọdụ",
"パスワード",
"ಪಾಸ್ವರ್ಡ್",
"пароль",
"ការពាក្យសម្ងាត់",
"암호",
"şîfre",
"купуясөз",
"ລະຫັດຜ່ານ",
"slaptažodis",
"лозинка",
"पासवर्ड",
"нууцүг",
"စကားဝှက်ကို",
"पासवर्ड",
"رمز",
"کلمهعبور",
"hasło",
"пароль",
"лозинка",
"پاسورڊ",
"මුරපදය",
"contraseña",
"lösenord",
"гузарвожа",
"கடவுச்சொல்",
"పాస్వర్డ్",
"รหัสผ่าน",
"пароль",
"پاسورڈ",
"mậtkhẩu",
"פּאַראָל",
"ọrọigbaniwọle"};
const int kNegativeNonLatinSize = arraysize(kNegativeNonLatin);
const char* const kUsernameLatin[] = {
"gatti", "uzantonomo", "solonanarana", "nombredeusuario",
"olumulo", "nomenusoris", "enwdefnyddiwr", "nomdutilisateur",
"lolowera", "notandanafn", "nomedeusuario", "vartotojovardas",
"username", "ahanjirimara", "gebruikersnaam", "numedeutilizator",
"brugernavn", "benotzernumm", "jinalamtumiaji", "erabiltzaileizena",
"brukernavn", "benutzername", "sunanmaiamfani", "foydalanuvchinomi",
"mosebedisi", "kasutajanimi", "ainmcleachdaidh", "igamalomsebenzisi",
"nomdusuari", "lomsebenzisi", "jenengpanganggo", "ingoakaiwhakamahi",
"nomeutente", "namapengguna"};
const int kUsernameLatinSize = arraysize(kUsernameLatin);
// Translations of "username" that use non-latin or country specific
// characters. These are only searched in the user-visible value of a field.
// Entries must be lowercase and must not contain any delimiter (space,
// apostrophe, digits, punctuation): delimiters are stripped from field values
// before matching, so an entry that contains one can never be found.
const char* const kUsernameNonLatin[] = {"用户名",
"کاتيجونالو",
"用戶名",
"የተጠቃሚስም",
"логин",
"اسمالمستخدم",
"נאמען",
"کاصارفکانام",
"ユーザ名",
"όνομαχρήστη",
"brûkersnamme",
"корисничкоиме",
"nonitilizatè",
"корисничкоиме",
"ngaranpamaké",
"ຊື່ຜູ້ໃຊ້",
"användarnamn",
"యూజర్పేరు",
"korisničkoime",
"пайдаланушыаты",
"שםמשתמש",
"імякористувача",
"کارننوم",
"хэрэглэгчийннэр",
"nomedeusuário",
"имяпользователя",
"têntruynhập",
"பயனர்பெயர்",
"ainmúsáideora",
"ชื่อผู้ใช้",
"사용자이름",
"імякарыстальніка",
"lietotājvārds",
"потребителскоиме",
"uporabniškoime",
"колдонуучунунаты",
"kullanıcıadı",
"පරිශීලකනාමය",
"istifadəçiadı",
"օգտագործողիանունը",
"navêbikarhêner",
"ಬಳಕೆದಾರಹೆಸರು",
"emriipërdoruesit",
"वापरकर्तानाव",
"käyttäjätunnus",
"વપરાશકર્તાનામ",
"felhasználónév",
"उपयोगकर्तानाम",
"nazwaużytkownika",
"ഉപയോക്തൃനാമം",
"სახელი",
"အသုံးပြုသူအမည်",
"نامکاربری",
"प्रयोगकर्तानाम",
"uživatelskéjméno",
"ব্যবহারকারীরনাম",
"užívateľskémeno",
"ឈ្មោះអ្នកប្រើប្រាស់"};
// Number of entries in |kUsernameNonLatin|.
const int kUsernameNonLatinSize = arraysize(kUsernameNonLatin);
// Latin-alphabet translations of "user". All entries are lowercase and contain
// no delimiters.
// NOTE(review): "olumulo" and "mosebedisi" are also listed in |kUsernameLatin|
// - confirm whether the duplication across the two categories is intended.
const char* const kUserLatin[] = {
"user", "wosuta", "gebruiker", "utilizator",
"usor", "notandi", "gumagamit", "vartotojas",
"fammi", "olumulo", "maiamfani", "cleachdaidh",
"utent", "pemakai", "mpampiasa", "umsebenzisi",
"bruger", "usuario", "panganggo", "utilisateur",
"bruker", "benotzer", "uporabnik", "doutilizador",
"numake", "benutzer", "covneegsiv", "erabiltzaile",
"usuari", "kasutaja", "defnyddiwr", "kaiwhakamahi",
"utente", "korisnik", "mosebedisi", "foydalanuvchi",
"uzanto", "pengguna", "mushandisi"};
// Number of entries in |kUserLatin|.
const int kUserLatinSize = arraysize(kUserLatin);
const char* const kUserNonLatin[] = {"用户",
"użytkownik",
"tagatafaʻaaogā",
"دکارونکيعکس",
"用戶",
"užívateľ",
"корисник",
"карыстальнік",
"brûker",
"kullanıcı",
"истифода",
"អ្នកប្រើ",
"ọrụ",
"ተጠቃሚ",
"באַניצער",
"хэрэглэгчийн",
"يوزر",
"istifadəçi",
"ຜູ້ໃຊ້",
"пользователь",
"صارف",
"meahoʻohana",
"потребител",
"वापरकर्ता",
"uživatel",
"ユーザー",
"מִשׁתַמֵשׁ",
"ผู้ใช้งาน",
"사용자",
"bikaranîvan",
"колдонуучу",
"વપરાશકર્તા",
"përdorues",
"ngườidùng",
"корисникот",
"उपयोगकर्ता",
"itilizatè",
"χρήστης",
"користувач",
"օգտվողիանձնագիրը",
"használó",
"faoiúsáideoir",
"შესახებ",
"ব্যবহারকারী",
"lietotājs",
"பயனர்",
"ಬಳಕೆದಾರ",
"ഉപയോക്താവ്",
"کاربر",
"యూజర్",
"පරිශීලක",
"प्रयोगकर्ता",
"användare",
"المستعمل",
"пайдаланушы",
"အသုံးပြုသူကို",
"käyttäjä"};
const int kUserNonLatinSize = arraysize(kUserNonLatin);
// Words that certainly point to a username field, if they appear in developer
// value. They are technical words, because they can only be used as variable
// names, and not as stand-alone words.
// "uid" is shorter than the detector's minimum word length, so it is matched
// only as a whole delimiter-separated token rather than as a substring.
const char* const kTechnicalWords[] = {
"uid", "newtel", "uaccount", "regaccount", "ureg",
"loginid", "laddress", "accountreg", "regid", "regname",
"loginname", "membername", "uname", "ucreate", "loginmail",
"accountname", "umail", "loginreg", "accountid", "loginaccount",
"ulogin", "regemail", "newmobile", "accountlogin"};
// Number of entries in |kTechnicalWords|.
const int kTechnicalWordsSize = arraysize(kTechnicalWords);
// Words that might point to a username field. They are tried last by the
// detector, because there are also field attribute values that contain them
// but do not belong to username fields.
// "id" is shorter than the detector's minimum word length, so it is matched
// only as a whole delimiter-separated token rather than as a substring.
const char* const kWeakWords[] = {"id", "login", "mail"};
// Number of entries in |kWeakWords|.
const int kWeakWordsSize = arraysize(kWeakWords);
} // namespace autofill
// Copyright 2017 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
namespace autofill {
// Words that certainly point to a non-username field.
// If field values contain at least one negative word, then the field is
// excluded from the list of possible usernames.
extern const char* const kNegativeLatin[];
extern const int kNegativeLatinSize;
extern const char* const kNegativeNonLatin[];
extern const int kNegativeNonLatinSize;
// Translations of "username".
extern const char* const kUsernameLatin[];
extern const int kUsernameLatinSize;
extern const char* const kUsernameNonLatin[];
extern const int kUsernameNonLatinSize;
// Translations of "user".
extern const char* const kUserLatin[];
extern const int kUserLatinSize;
extern const char* const kUserNonLatin[];
extern const int kUserNonLatinSize;
// Words that certainly point to a username field, if they appear in developer
// value. They are technical words, because they can only be used as variable
// names, and not as stand-alone words.
extern const char* const kTechnicalWords[];
extern const int kTechnicalWordsSize;
// Words that might point to a username field. They have the lowest priority
// in the heuristic, because there are also field attribute values that
// contain them, but are not username fields.
extern const char* const kWeakWords[];
extern const int kWeakWordsSize;
} // namespace autofill
......@@ -221,7 +221,8 @@ class MAYBE_PasswordFormConversionUtilsTest : public content::RenderViewTest {
}
return CreatePasswordFormFromWebForm(
form, with_user_input ? &user_input : nullptr, predictions, nullptr);
form, with_user_input ? &user_input : nullptr, predictions,
&username_detector_cache_);
}
// Iterates on the form generated by the |html| and adds the fields and type
......@@ -270,6 +271,8 @@ class MAYBE_PasswordFormConversionUtilsTest : public content::RenderViewTest {
*form = forms[0];
}
UsernameDetectorCache username_detector_cache_;
private:
DISALLOW_COPY_AND_ASSIGN(MAYBE_PasswordFormConversionUtilsTest);
};
......@@ -334,7 +337,7 @@ TEST_F(MAYBE_PasswordFormConversionUtilsTest, OnlyDisabledFields) {
}
TEST_F(MAYBE_PasswordFormConversionUtilsTest,
IdentifyingUsernameFieldsFromDeveloperGroupWithHTMLDetector) {
HTMLDetector_DeveloperGroupAttributes) {
base::test::ScopedFeatureList feature_list;
feature_list.InitAndEnableFeature(
password_manager::features::kEnableHtmlBasedUsernameDetector);
......@@ -408,17 +411,18 @@ TEST_F(MAYBE_PasswordFormConversionUtilsTest,
{"email", "", "js@google.com"},
"email",
"js@google.com"},
// If word matches in maximum 2 fields, it is accepted.
// If a word matches in maximum 2 fields, it is accepted.
// First encounter is selected as username.
{{"loginusername", "", "johnsmith"},
{"loginemail", "", "js@google.com"},
"loginusername",
{{"username", "", "johnsmith"},
{"repeat_username", "", "johnsmith"},
"username",
"johnsmith"},
// Check treatment for short dictionary words.
{{"identity_name", "", "johnsmith"},
{"email", "", "js@google.com"},
"email",
"js@google.com"}};
// A short word should be enclosed between delimiters. Otherwise, an
// occurrence doesn't count.
{{"identity_name", "idn", "johnsmith"},
{"id", "id", "123"},
"id",
"123"}};
for (size_t i = 0; i < arraysize(cases); ++i) {
SCOPED_TRACE(testing::Message() << "Iteration " << i);
......@@ -436,6 +440,7 @@ TEST_F(MAYBE_PasswordFormConversionUtilsTest,
builder.AddSubmitButton("submit");
std::string html = builder.ProduceHTML();
username_detector_cache_.clear();
std::unique_ptr<PasswordForm> password_form =
LoadHTMLAndConvertForm(html, nullptr, false);
......@@ -445,7 +450,19 @@ TEST_F(MAYBE_PasswordFormConversionUtilsTest,
password_form->username_element);
EXPECT_EQ(base::UTF8ToUTF16(cases[i].expected_username_value),
password_form->username_value);
// Check that the username field was found by HTML detector.
ASSERT_EQ(1u, username_detector_cache_.size());
ASSERT_FALSE(username_detector_cache_.begin()->second.IsNull());
EXPECT_EQ(
cases[i].expected_username_element,
username_detector_cache_.begin()->second.NameForAutofill().Utf8());
}
}
TEST_F(MAYBE_PasswordFormConversionUtilsTest, HTMLDetector_SeveralDetections) {
base::test::ScopedFeatureList feature_list;
feature_list.InitAndEnableFeature(
password_manager::features::kEnableHtmlBasedUsernameDetector);
// If a word matches in more than 2 fields, we don't match on it.
// We search for a match with another word.
......@@ -459,6 +476,7 @@ TEST_F(MAYBE_PasswordFormConversionUtilsTest,
builder.AddSubmitButton("submit");
std::string html = builder.ProduceHTML();
DCHECK(username_detector_cache_.empty());
std::unique_ptr<PasswordForm> password_form =
LoadHTMLAndConvertForm(html, nullptr, false);
......@@ -466,10 +484,15 @@ TEST_F(MAYBE_PasswordFormConversionUtilsTest,
EXPECT_EQ(base::UTF8ToUTF16("loginid"), password_form->username_element);
EXPECT_EQ(base::UTF8ToUTF16("johnsmith"), password_form->username_value);
// Check that the username field was found by HTML detector.
ASSERT_EQ(1u, username_detector_cache_.size());
ASSERT_FALSE(username_detector_cache_.begin()->second.IsNull());
EXPECT_EQ("loginid",
username_detector_cache_.begin()->second.NameForAutofill().Utf8());
}
TEST_F(MAYBE_PasswordFormConversionUtilsTest,
IdentifyingUsernameFieldsFromUserGroupWithHTMLDetector) {
HTMLDetector_UserGroupAttributes) {
base::test::ScopedFeatureList feature_list;
feature_list.InitAndEnableFeature(
password_manager::features::kEnableHtmlBasedUsernameDetector);
......@@ -483,12 +506,12 @@ TEST_F(MAYBE_PasswordFormConversionUtilsTest,
struct TestCase {
// Field parameters represent, in order of appearance, field name, field
// id, field value and field label or placeholder.
// Field name and field id don't contain any significant information.
const char* first_text_field_parameters[4];
const char* second_text_field_parameters[4];
const char* expected_username_element;
const char* expected_username_value;
} cases[] = {
// Developer group does not contain any significant information.
// Label information will decide username.
{{"name1", "id1", "johnsmith", "Username:"},
{"name2", "id2", "js@google.com", "Email:"},
......@@ -545,9 +568,10 @@ TEST_F(MAYBE_PasswordFormConversionUtilsTest,
{"username", "", "johnsmith", "Email:"},
"email",
"js@google.com"},
// Check treatment for short dictionary words.
// Check treatment for short dictionary words. "uid" has higher priority,
// but its occurrence is ignored because it is a part of another word.
{{"name1", "", "johnsmith", "Insert your id:"},
{"name2", "", "js@google.com", "Insert something:"},
{"name2", "uidentical", "js@google.com", "Insert something:"},
"name1",
"johnsmith"}};
......@@ -569,6 +593,7 @@ TEST_F(MAYBE_PasswordFormConversionUtilsTest,
builder.AddSubmitButton("submit");
std::string html = builder.ProduceHTML();
username_detector_cache_.clear();
std::unique_ptr<PasswordForm> password_form =
LoadHTMLAndConvertForm(html, nullptr, false);
......@@ -578,6 +603,12 @@ TEST_F(MAYBE_PasswordFormConversionUtilsTest,
password_form->username_element);
EXPECT_EQ(base::UTF8ToUTF16(cases[i].expected_username_value),
password_form->username_value);
// Check that the username field was found by HTML detector.
ASSERT_EQ(1u, username_detector_cache_.size());
ASSERT_FALSE(username_detector_cache_.begin()->second.IsNull());
EXPECT_EQ(
cases[i].expected_username_element,
username_detector_cache_.begin()->second.NameForAutofill().Utf8());
}
}
......@@ -613,7 +644,7 @@ TEST_F(MAYBE_PasswordFormConversionUtilsTest, HTMLDetectorCache) {
// will be the same because it was cached in |username_detector_cache|.
WebVector<WebFormControlElement> control_elements;
form.GetFormControlElements(control_elements);
control_elements[0].SetAttribute("name", "login");
control_elements[0].SetAttribute("name", "id");
password_form = CreatePasswordFormFromWebForm(form, nullptr, nullptr,
&username_detector_cache);
EXPECT_TRUE(password_form);
......@@ -633,7 +664,7 @@ TEST_F(MAYBE_PasswordFormConversionUtilsTest, HTMLDetectorCache) {
ASSERT_EQ(1u, username_detector_cache.size());
EXPECT_EQ(form, username_detector_cache.begin()->first);
ASSERT_FALSE(username_detector_cache.begin()->second.IsNull());
EXPECT_EQ("login",
EXPECT_EQ("id",
username_detector_cache.begin()->second.NameForAutofill().Utf8());
EXPECT_THAT(
histogram_tester.GetAllSamples("PasswordManager.UsernameDetectionMethod"),
......@@ -650,7 +681,7 @@ TEST_F(MAYBE_PasswordFormConversionUtilsTest, HTMLDetectorCache) {
ASSERT_EQ(1u, username_detector_cache.size());
EXPECT_EQ(form, username_detector_cache.begin()->first);
ASSERT_FALSE(username_detector_cache.begin()->second.IsNull());
EXPECT_EQ("login",
EXPECT_EQ("id",
username_detector_cache.begin()->second.NameForAutofill().Utf8());
EXPECT_THAT(
histogram_tester.GetAllSamples("PasswordManager.UsernameDetectionMethod"),
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment