Commit 04d9da4f authored by Jan Wilken Dörrie's avatar Jan Wilken Dörrie Committed by Commit Bot

[zxcvbn] Add UTF8 support to zxcvbn::repeat_match.

This change modifies zxcvbn's repeat_match to correctly detect
repeated sequences of non-ASCII characters. Since std::regex does not
natively understand UTF8 codepoints, and re2 does not support the
required backreferences, this change updates the regex engine to use
icu::Regex instead.

Furthermore, since zxcvbn only intends to supports UTF8 strings, the
fuzzers are modified to only produce valid UTF8.

TBR=dpranke

Fixed: 1111771
Change-Id: I4a9985997b6d6cd33558cf8607e195e61dac8109
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2377934
Commit-Queue: Jan Wilken Dörrie <jdoerrie@chromium.org>
Reviewed-by: default avatarDirk Pranke <dpranke@google.com>
Reviewed-by: default avatarAlex Gough <ajgo@chromium.org>
Cr-Commit-Position: refs/heads/master@{#806351}
parent 39104d5e
...@@ -30,7 +30,10 @@ source_set("zxcvbn-cpp") { ...@@ -30,7 +30,10 @@ source_set("zxcvbn-cpp") {
"native-src/zxcvbn/util.hpp", "native-src/zxcvbn/util.hpp",
] ]
deps = [ "//base" ] public_deps = [
"//base",
"//third_party/icu",
]
public_configs = [ ":zxcvbn_include_config" ] public_configs = [ ":zxcvbn_include_config" ]
} }
...@@ -52,9 +55,11 @@ test("zxcvbn_unittests") { ...@@ -52,9 +55,11 @@ test("zxcvbn_unittests") {
fuzzer_test("zxcvbn_matching_fuzzer") { fuzzer_test("zxcvbn_matching_fuzzer") {
sources = [ "test/matching_fuzzer.cc" ] sources = [ "test/matching_fuzzer.cc" ]
deps = [ ":zxcvbn-cpp" ] deps = [ ":zxcvbn-cpp" ]
libfuzzer_options = [ "max_len=128" ]
} }
fuzzer_test("zxcvbn_scoring_fuzzer") { fuzzer_test("zxcvbn_scoring_fuzzer") {
sources = [ "test/scoring_fuzzer.cc" ] sources = [ "test/scoring_fuzzer.cc" ]
deps = [ ":zxcvbn-cpp" ] deps = [ ":zxcvbn-cpp" ]
libfuzzer_options = [ "max_len=128" ]
} }
include_rules = [ include_rules = [
"+base/containers/flat_map.h", "+base",
"+base/no_destructor.h", "+testing",
"+base/strings", "+third_party/icu/source/common/unicode",
"+testing/gmock", "+third_party/icu/source/i18n/unicode",
"+testing/gtest",
] ]
...@@ -33,6 +33,10 @@ Applied the following patches: ...@@ -33,6 +33,10 @@ Applied the following patches:
number of computed guesses exceeds DBL_MAX. number of computed guesses exceeds DBL_MAX.
- patches/flat_map_ranked.diff: Use base::flat_map instead of std::unordered_map - patches/flat_map_ranked.diff: Use base::flat_map instead of std::unordered_map
for ranked dictionaries to reduce memory foot print. for ranked dictionaries to reduce memory foot print.
- patches/utf8_support.diff: Switched the regex engine in matching.cpp from
std::regex to ICU's regex, so that repeat matches work with multi-byte code
points. Also guarded an assert in scoring.cpp that was hit for non-ASCII
passwords.
- Dropped the data dictionary from the check-out, as the data will now be - Dropped the data dictionary from the check-out, as the data will now be
provided via component updater. The used dictionaries can be found at provided via component updater. The used dictionaries can be found at
https://github.com/rianhunter/zxcvbn-cpp/tree/cf092c952cd2325ce390b2691231a8f1cb195d59/data https://github.com/rianhunter/zxcvbn-cpp/tree/cf092c952cd2325ce390b2691231a8f1cb195d59/data
......
...@@ -20,6 +20,9 @@ ...@@ -20,6 +20,9 @@
#include <unordered_set> #include <unordered_set>
#include "base/no_destructor.h" #include "base/no_destructor.h"
#include "base/strings/string_util.h"
#include "third_party/icu/source/common/unicode/unistr.h"
#include "third_party/icu/source/i18n/unicode/regex.h"
namespace zxcvbn { namespace zxcvbn {
...@@ -453,69 +456,90 @@ std::vector<Match> spatial_match_helper(const std::string & password, ...@@ -453,69 +456,90 @@ std::vector<Match> spatial_match_helper(const std::string & password,
// repeats (aaa, abcabcabc) and sequences (abcdef) ------------------------------ // repeats (aaa, abcabcabc) and sequences (abcdef) ------------------------------
//------------------------------------------------------------------------------- //-------------------------------------------------------------------------------
std::vector<Match> repeat_match(const std::string & password) { std::vector<Match> repeat_match(const std::string& password) {
std::vector<Match> matches; std::vector<Match> matches;
std::regex greedy(R"((.+)\1+)");
std::regex lazy(R"((.+?)\1+)"); auto unicode_password = icu::UnicodeString::fromUTF8(password);
std::regex lazy_anchored(R"(^(.+?)\1+$)");
idx_t lastIndex = 0; UErrorCode status = U_ZERO_ERROR;
std::unique_ptr<icu::RegexPattern> greedy_pattern(icu::RegexPattern::compile(
icu::UnicodeString::fromUTF8(R"((.+)\1+)"), 0, status));
std::unique_ptr<icu::RegexMatcher> greedy_matcher(
greedy_pattern->matcher(unicode_password, status));
std::unique_ptr<icu::RegexPattern> lazy_pattern(icu::RegexPattern::compile(
icu::UnicodeString::fromUTF8(R"((.+?)\1+)"), 0, status));
std::unique_ptr<icu::RegexMatcher> lazy_matcher(
lazy_pattern->matcher(unicode_password, status));
std::unique_ptr<icu::RegexPattern> lazy_anchored_pattern(
icu::RegexPattern::compile(icu::UnicodeString::fromUTF8(R"(^(.+?)\1+$)"),
0, status));
int lastUnicodeIndex = 0;
size_t lastIndex = 0;
while (lastIndex < password.length()) { while (lastIndex < password.length()) {
auto start_iter = lastIndex + password.begin(); if (!greedy_matcher->find(lastUnicodeIndex, status) ||
std::smatch greedy_match, lazy_match; !lazy_matcher->find(lastUnicodeIndex, status)) {
std::regex_search(start_iter, password.end(), break;
greedy_match, greedy); }
std::regex_search(start_iter, password.end(),
lazy_match, lazy); icu::RegexMatcher* matcher = nullptr;
if (!greedy_match.size()) break; icu::UnicodeString base_token;
std::smatch match; if (greedy_matcher->group(status).length() >
std::string base_token; lazy_matcher->group(status).length()) {
if (greedy_match[0].length() > lazy_match[0].length()) {
// greedy beats lazy for 'aabaab' // greedy beats lazy for 'aabaab'
// greedy: [aabaab, aab] // greedy: [aabaab, aab]
// lazy: [aa, a] // lazy: [aa, a]
match = greedy_match; matcher = greedy_matcher.get();
// greedy's repeated string might itself be repeated, eg. // greedy's repeated string might itself be repeated, eg.
// aabaab in aabaabaabaab. // aabaab in aabaabaabaab.
// run an anchored lazy match on greedy's repeated string // run an anchored lazy match on greedy's repeated string
// to find the shortest repeated string // to find the shortest repeated string
std::smatch lazy_anchored_match; auto greedy_found = matcher->group(status);
auto greedy_found = match.str(0);
auto ret = std::regex_search(greedy_found, lazy_anchored_match, lazy_anchored); std::unique_ptr<icu::RegexMatcher> lazy_anchored_matcher(
lazy_anchored_pattern->matcher(greedy_found, status));
auto ret = lazy_anchored_matcher->find(status);
assert(ret); assert(ret);
(void) ret; (void) ret;
base_token = lazy_anchored_match.str(1); base_token = lazy_anchored_matcher->group(1, status);
} } else {
else {
// lazy beats greedy for 'aaaaa' // lazy beats greedy for 'aaaaa'
// greedy: [aaaa, aa] // greedy: [aaaa, aa]
// lazy: [aaaaa, a] // lazy: [aaaaa, a]
match = std::move(lazy_match); matcher = lazy_matcher.get();
base_token = match.str(1); base_token = matcher->group(1, status);
} }
auto idx = lastIndex + match.position();
auto jdx = lastIndex + match.position() + match[0].length(); std::string matched_string;
matcher->group(status).toUTF8String(matched_string);
auto idx = password.find(matched_string, lastIndex);
auto jdx = idx + matched_string.size();
auto i = util::character_len(password, 0, idx); auto i = util::character_len(password, 0, idx);
auto j = i + util::character_len(password, idx, jdx) - 1; auto j = i + util::character_len(password, idx, jdx) - 1;
// recursively match and score the base string // recursively match and score the base string
auto sub_matches = omnimatch(base_token); std::string base_string;
auto base_analysis = most_guessable_match_sequence( base_token.toUTF8String(base_string);
base_token, auto sub_matches = omnimatch(base_string);
sub_matches, auto base_analysis =
false most_guessable_match_sequence(base_string, sub_matches, false);
);
std::vector<Match> base_matches; std::vector<Match> base_matches;
std::move(base_analysis.sequence.begin(), base_analysis.sequence.end(), std::move(base_analysis.sequence.begin(), base_analysis.sequence.end(),
std::back_inserter(base_matches)); std::back_inserter(base_matches));
auto & base_guesses = base_analysis.guesses; auto& base_guesses = base_analysis.guesses;
matches.push_back(Match(i, j, match.str(0), matches.push_back(Match(i, j, matched_string,
RepeatMatch{ RepeatMatch{
base_token, base_string,
base_guesses, base_guesses,
std::move(base_matches), std::move(base_matches),
match[0].length() / base_token.length(), matched_string.size() / base_string.size(),
})); }));
matches.back().idx = idx; matches.back().idx = idx;
matches.back().jdx = jdx; matches.back().jdx = jdx;
lastUnicodeIndex = matcher->end(status);
lastIndex = jdx; lastIndex = jdx;
} }
return matches; return matches;
......
...@@ -75,6 +75,10 @@ std::size_t token_len(const Match & m) __attribute__((pure)); ...@@ -75,6 +75,10 @@ std::size_t token_len(const Match & m) __attribute__((pure));
static static
std::size_t token_len(const Match & m) { std::size_t token_len(const Match & m) {
std::size_t result = m.j - m.i + 1; std::size_t result = m.j - m.i + 1;
// Bruteforce matches might be any substring of the original string, which are
// not necessarily aligned to UTF8 code points, and thus m.token might not be
// a valid UTF8 string.
if (m.get_pattern() != MatchPattern::BRUTEFORCE)
assert(result == util::character_len(m.token)); assert(result == util::character_len(m.token));
return result; return result;
} }
......
diff --git a/third_party/zxcvbn-cpp/native-src/zxcvbn/matching.cpp b/third_party/zxcvbn-cpp/native-src/zxcvbn/matching.cpp
index 8f4e6d2f0e00..13465dee1cd7 100644
--- a/third_party/zxcvbn-cpp/native-src/zxcvbn/matching.cpp
+++ b/third_party/zxcvbn-cpp/native-src/zxcvbn/matching.cpp
@@ -20,6 +20,9 @@
#include <unordered_set>
#include "base/no_destructor.h"
+#include "base/strings/string_util.h"
+#include "third_party/icu/source/common/unicode/unistr.h"
+#include "third_party/icu/source/i18n/unicode/regex.h"
namespace zxcvbn {
@@ -453,69 +456,91 @@ std::vector<Match> spatial_match_helper(const std::string & password,
// repeats (aaa, abcabcabc) and sequences (abcdef) ------------------------------
//-------------------------------------------------------------------------------
-std::vector<Match> repeat_match(const std::string & password) {
+std::vector<Match> repeat_match(const std::string& password) {
std::vector<Match> matches;
- std::regex greedy(R"((.+)\1+)");
- std::regex lazy(R"((.+?)\1+)");
- std::regex lazy_anchored(R"(^(.+?)\1+$)");
- idx_t lastIndex = 0;
+
+ auto unicode_password = icu::UnicodeString::fromUTF8(password);
+
+ UErrorCode status = U_ZERO_ERROR;
+ std::unique_ptr<icu::RegexPattern> greedy_pattern(icu::RegexPattern::compile(
+ icu::UnicodeString::fromUTF8(R"((.+)\1+)"), 0, status));
+ std::unique_ptr<icu::RegexMatcher> greedy_matcher(
+ greedy_pattern->matcher(unicode_password, status));
+
+ std::unique_ptr<icu::RegexPattern> lazy_pattern(icu::RegexPattern::compile(
+ icu::UnicodeString::fromUTF8(R"((.+?)\1+)"), 0, status));
+ std::unique_ptr<icu::RegexMatcher> lazy_matcher(
+ lazy_pattern->matcher(unicode_password, status));
+
+ std::unique_ptr<icu::RegexPattern> lazy_anchored_pattern(
+ icu::RegexPattern::compile(icu::UnicodeString::fromUTF8(R"(^(.+?)\1+$)"),
+ 0, status));
+
+ int lastUnicodeIndex = 0;
+ size_t lastIndex = 0;
while (lastIndex < password.length()) {
- auto start_iter = lastIndex + password.begin();
- std::smatch greedy_match, lazy_match;
- std::regex_search(start_iter, password.end(),
- greedy_match, greedy);
- std::regex_search(start_iter, password.end(),
- lazy_match, lazy);
- if (!greedy_match.size()) break;
- std::smatch match;
- std::string base_token;
- if (greedy_match[0].length() > lazy_match[0].length()) {
+ if (!greedy_matcher->find(lastUnicodeIndex, status) ||
+ !lazy_matcher->find(lastUnicodeIndex, status)) {
+ break;
+ }
+
+ icu::RegexMatcher* matcher = nullptr;
+ icu::UnicodeString base_token;
+ if (greedy_matcher->group(status).length() >
+ lazy_matcher->group(status).length()) {
// greedy beats lazy for 'aabaab'
// greedy: [aabaab, aab]
// lazy: [aa, a]
- match = greedy_match;
+ matcher = greedy_matcher.get();
// greedy's repeated string might itself be repeated, eg.
// aabaab in aabaabaabaab.
// run an anchored lazy match on greedy's repeated string
// to find the shortest repeated string
- std::smatch lazy_anchored_match;
- auto greedy_found = match.str(0);
- auto ret = std::regex_search(greedy_found, lazy_anchored_match, lazy_anchored);
+ auto greedy_found = matcher->group(status);
+
+ std::unique_ptr<icu::RegexMatcher> lazy_anchored_matcher(
+ lazy_anchored_pattern->matcher(greedy_found, status));
+ auto ret = lazy_anchored_matcher->find(status);
assert(ret);
(void) ret;
- base_token = lazy_anchored_match.str(1);
+ base_token = lazy_anchored_matcher->group(1, status);
}
else {
// lazy beats greedy for 'aaaaa'
// greedy: [aaaa, aa]
// lazy: [aaaaa, a]
- match = std::move(lazy_match);
- base_token = match.str(1);
+ matcher = lazy_matcher.get();
+ base_token = matcher->group(1, status);
}
- auto idx = lastIndex + match.position();
- auto jdx = lastIndex + match.position() + match[0].length();
+
+ std::string matched_string;
+ matcher->group(status).toUTF8String(matched_string);
+
+ auto idx = password.find(matched_string, lastIndex);
+ auto jdx = idx + matched_string.size();
+
auto i = util::character_len(password, 0, idx);
auto j = i + util::character_len(password, idx, jdx) - 1;
// recursively match and score the base string
- auto sub_matches = omnimatch(base_token);
- auto base_analysis = most_guessable_match_sequence(
- base_token,
- sub_matches,
- false
- );
+ std::string base_string;
+ base_token.toUTF8String(base_string);
+ auto sub_matches = omnimatch(base_string);
+ auto base_analysis =
+ most_guessable_match_sequence(base_string, sub_matches, false);
std::vector<Match> base_matches;
std::move(base_analysis.sequence.begin(), base_analysis.sequence.end(),
std::back_inserter(base_matches));
- auto & base_guesses = base_analysis.guesses;
- matches.push_back(Match(i, j, match.str(0),
+ auto& base_guesses = base_analysis.guesses;
+ matches.push_back(Match(i, j, matched_string,
RepeatMatch{
- base_token,
+ base_string,
base_guesses,
std::move(base_matches),
- match[0].length() / base_token.length(),
- }));
+ matched_string.size() / base_string.size(),
+ }));
matches.back().idx = idx;
matches.back().jdx = jdx;
+ lastUnicodeIndex = matcher->end(status);
lastIndex = jdx;
}
return matches;
diff --git a/third_party/zxcvbn-cpp/native-src/zxcvbn/scoring.cpp b/third_party/zxcvbn-cpp/native-src/zxcvbn/scoring.cpp
index a4e341935ffb..e5c120a86a5c 100644
--- a/third_party/zxcvbn-cpp/native-src/zxcvbn/scoring.cpp
+++ b/third_party/zxcvbn-cpp/native-src/zxcvbn/scoring.cpp
@@ -75,7 +75,11 @@ std::size_t token_len(const Match & m) __attribute__((pure));
static
std::size_t token_len(const Match & m) {
std::size_t result = m.j - m.i + 1;
- assert(result == util::character_len(m.token));
+ // Bruteforce matches might be any substring of the original string, which are
+ // not necessarily aligned to UTF8 code points, and thus m.token might not be
+ // a valid UTF8 string.
+ if (m.get_pattern() != MatchPattern::BRUTEFORCE)
+ assert(result == util::character_len(m.token));
return result;
}
...@@ -9,8 +9,12 @@ ...@@ -9,8 +9,12 @@
#include <string> #include <string>
#include "base/strings/string_util.h"
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
std::string password(reinterpret_cast<const char*>(data), size); std::string password(reinterpret_cast<const char*>(data), size);
if (!base::IsStringUTF8(password))
return 0;
zxcvbn::dictionary_match(password, {}); zxcvbn::dictionary_match(password, {});
zxcvbn::reverse_dictionary_match(password, {}); zxcvbn::reverse_dictionary_match(password, {});
......
...@@ -812,6 +812,20 @@ TEST(ZxcvbnTest, RepeatMatching) { ...@@ -812,6 +812,20 @@ TEST(ZxcvbnTest, RepeatMatching) {
.base_token = "ab", .base_token = "ab",
})); }));
} }
{
// identifies äö as repeat string, even though äöäö is also repeated.
// verifies that match.i and match.j operate in code point counts, and not
// in bytes.
std::string pattern = "\u00E4\u00F6\u00E4\u00F6\u00E4\u00F6\u00E4\u00F6";
std::vector<Match> matches = repeat_match(pattern);
EXPECT_THAT(matches, ElementsAre(ExpectedRepeatMatch{
.i = 0,
.j = 7,
.token = pattern,
.base_token = "\u00E4\u00F6",
}));
}
} }
TEST(ZxcvbnTest, RegexMatching) { TEST(ZxcvbnTest, RegexMatching) {
......
...@@ -9,10 +9,13 @@ ...@@ -9,10 +9,13 @@
#include <string> #include <string>
#include "base/strings/string_util.h"
#include "third_party/zxcvbn-cpp/native-src/zxcvbn/matching.hpp" #include "third_party/zxcvbn-cpp/native-src/zxcvbn/matching.hpp"
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
std::string password(reinterpret_cast<const char*>(data), size); std::string password(reinterpret_cast<const char*>(data), size);
if (!base::IsStringUTF8(password))
return 0;
auto matches = zxcvbn::omnimatch(password); auto matches = zxcvbn::omnimatch(password);
zxcvbn::most_guessable_match_sequence(password, matches); zxcvbn::most_guessable_match_sequence(password, matches);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment