[zxcvbn] Add UTF8 support to zxcvbn::repeat_match.

This change modifies zxcvbn's repeat_match to correctly detect repeated sequences of non-ASCII characters. Since std::regex does not natively understand UTF8 codepoints, and re2 does not support the required backreferences, this change updates the regex engine to use icu::Regex instead. Furthermore, since zxcvbn only intends to supports UTF8 strings, the fuzzers are modified to only produce valid UTF8. TBR=dpranke Fixed: 1111771 Change-Id: I4a9985997b6d6cd33558cf8607e195e61dac8109 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2377934 Commit-Queue: Jan Wilken Dörrie <jdoerrie@chromium.org> Reviewed-by: Dirk Pranke <dpranke@google.com> Reviewed-by: Alex Gough <ajgo@chromium.org> Cr-Commit-Position: refs/heads/master@{#806351}

[zxcvbn] Add UTF8 support to zxcvbn::repeat_match.
This change modifies zxcvbn's repeat_match to correctly detect repeated sequences of non-ASCII characters. Since std::regex does not natively understand UTF8 codepoints, and re2 does not support the required backreferences, this change updates the regex engine to use icu::Regex instead. Furthermore, since zxcvbn only intends to supports UTF8 strings, the fuzzers are modified to only produce valid UTF8. TBR=dpranke Fixed: 1111771 Change-Id: I4a9985997b6d6cd33558cf8607e195e61dac8109 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2377934 Commit-Queue: Jan Wilken Dörrie <jdoerrie@chromium.org> Reviewed-by: Dirk Pranke <dpranke@google.com> Reviewed-by: Alex Gough <ajgo@chromium.org> Cr-Commit-Position: refs/heads/master@{#806351}
04d9da4f · Jan Wilken Dörrie · Commit Bot · 39104d5e · 04d9da4f · 04d9da4f
Commit 04d9da4f authored Sep 11, 2020 by Jan Wilken Dörrie Committed by Commit Bot Sep 11, 2020
9 changed files
--- a/third_party/zxcvbn-cpp/BUILD.gn
+++ b/third_party/zxcvbn-cpp/BUILD.gn
@@ -30,7 +30,10 @@ source_set("zxcvbn-cpp") {
    "native-src/zxcvbn/util.hpp",
  ]
-  deps = [ "//base" ]
+  public_deps = [
+    "//base",
+    "//third_party/icu",
+  ]
  public_configs = [ ":zxcvbn_include_config" ]
 }
@@ -52,9 +55,11 @@ test("zxcvbn_unittests") {
 fuzzer_test("zxcvbn_matching_fuzzer") {
  sources = [ "test/matching_fuzzer.cc" ]
  deps = [ ":zxcvbn-cpp" ]
+  libfuzzer_options = [ "max_len=128" ]
 }
 fuzzer_test("zxcvbn_scoring_fuzzer") {
  sources = [ "test/scoring_fuzzer.cc" ]
  deps = [ ":zxcvbn-cpp" ]
+  libfuzzer_options = [ "max_len=128" ]
 }
--- a/third_party/zxcvbn-cpp/DEPS
+++ b/third_party/zxcvbn-cpp/DEPS
 include_rules = [
-    "+base/containers/flat_map.h",
+    "+base",
-    "+base/no_destructor.h",
+    "+testing",
-    "+base/strings",
+    "+third_party/icu/source/common/unicode",
-    "+testing/gmock",
+    "+third_party/icu/source/i18n/unicode",
-    "+testing/gtest",
 ]
--- a/third_party/zxcvbn-cpp/README.chromium
+++ b/third_party/zxcvbn-cpp/README.chromium
@@ -33,6 +33,10 @@ Applied the following patches:
  number of computed guesses exceeds DBL_MAX.
 - patches/flat_map_ranked.diff: Use base::flat_map instead of std::unordered_map
  for ranked dictionaries to reduce memory foot print.
+- patches/utf8_support.diff: Switched the regex engine in matching.cpp from
+  std::regex to ICU's regex, so that repeat matches work with multi-byte code
+  points. Also guarded an assert in scoring.cpp that was hit for non-ASCII
+  passwords.
 - Dropped the data dictionary from the check-out, as the data will now be
  provided via component updater. The used dictionaries can be found at
  https://github.com/rianhunter/zxcvbn-cpp/tree/cf092c952cd2325ce390b2691231a8f1cb195d59/data

--- a/third_party/zxcvbn-cpp/native-src/zxcvbn/matching.cpp
+++ b/third_party/zxcvbn-cpp/native-src/zxcvbn/matching.cpp
@@ -20,6 +20,9 @@
 #include <unordered_set>
 #include "base/no_destructor.h"
+#include "base/strings/string_util.h"
+#include "third_party/icu/source/common/unicode/unistr.h"
+#include "third_party/icu/source/i18n/unicode/regex.h"
 namespace zxcvbn {
@@ -453,69 +456,90 @@ std::vector<Match> spatial_match_helper(const std::string & password,
 // repeats (aaa, abcabcabc) and sequences (abcdef) ------------------------------
 //-------------------------------------------------------------------------------
-std::vector<Match> repeat_match(const std::string & password) {
+std::vector<Match> repeat_match(const std::string& password) {
  std::vector<Match> matches;
-  std::regex greedy(R"((.+)\1+)");
-  std::regex lazy(R"((.+?)\1+)");
+  auto unicode_password = icu::UnicodeString::fromUTF8(password);
-  std::regex lazy_anchored(R"(^(.+?)\1+$)");
-  idx_t lastIndex = 0;
+  UErrorCode status = U_ZERO_ERROR;
+  std::unique_ptr<icu::RegexPattern> greedy_pattern(icu::RegexPattern::compile(
+      icu::UnicodeString::fromUTF8(R"((.+)\1+)"), 0, status));
+  std::unique_ptr<icu::RegexMatcher> greedy_matcher(
+      greedy_pattern->matcher(unicode_password, status));
+  std::unique_ptr<icu::RegexPattern> lazy_pattern(icu::RegexPattern::compile(
+      icu::UnicodeString::fromUTF8(R"((.+?)\1+)"), 0, status));
+  std::unique_ptr<icu::RegexMatcher> lazy_matcher(
+      lazy_pattern->matcher(unicode_password, status));
+  std::unique_ptr<icu::RegexPattern> lazy_anchored_pattern(
+      icu::RegexPattern::compile(icu::UnicodeString::fromUTF8(R"(^(.+?)\1+$)"),
+                                 0, status));
+  int lastUnicodeIndex = 0;
+  size_t lastIndex = 0;
  while (lastIndex < password.length()) {
-    auto start_iter = lastIndex + password.begin();
+    if (!greedy_matcher->find(lastUnicodeIndex, status) ||
-    std::smatch greedy_match, lazy_match;
+        !lazy_matcher->find(lastUnicodeIndex, status)) {
-    std::regex_search(start_iter, password.end(),
+      break;
-                      greedy_match, greedy);
+    }
-    std::regex_search(start_iter, password.end(),
-                      lazy_match, lazy);
+    icu::RegexMatcher* matcher = nullptr;
-    if (!greedy_match.size()) break;
+    icu::UnicodeString base_token;
-    std::smatch match;
+    if (greedy_matcher->group(status).length() >
-    std::string base_token;
+        lazy_matcher->group(status).length()) {
-    if (greedy_match[0].length() > lazy_match[0].length()) {
      // greedy beats lazy for 'aabaab'
      //   greedy: [aabaab, aab]
      //   lazy:   [aa,     a]
-      match = greedy_match;
+      matcher = greedy_matcher.get();
      // greedy's repeated string might itself be repeated, eg.
      // aabaab in aabaabaabaab.
      // run an anchored lazy match on greedy's repeated string
      // to find the shortest repeated string
-      std::smatch lazy_anchored_match;
+      auto greedy_found = matcher->group(status);
-      auto greedy_found = match.str(0);
-      auto ret = std::regex_search(greedy_found, lazy_anchored_match, lazy_anchored);
+      std::unique_ptr<icu::RegexMatcher> lazy_anchored_matcher(
+          lazy_anchored_pattern->matcher(greedy_found, status));
+      auto ret = lazy_anchored_matcher->find(status);
      assert(ret);
      (void) ret;
-      base_token = lazy_anchored_match.str(1);
+      base_token = lazy_anchored_matcher->group(1, status);
-    }
+    } else {
-    else {
      // lazy beats greedy for 'aaaaa'
      //   greedy: [aaaa,  aa]
      //   lazy:   [aaaaa, a]
-      match = std::move(lazy_match);
+      matcher = lazy_matcher.get();
-      base_token = match.str(1);
+      base_token = matcher->group(1, status);
    }
-    auto idx = lastIndex + match.position();
-    auto jdx = lastIndex + match.position() + match[0].length();
+    std::string matched_string;
+    matcher->group(status).toUTF8String(matched_string);
+    auto idx = password.find(matched_string, lastIndex);
+    auto jdx = idx + matched_string.size();
    auto i = util::character_len(password, 0, idx);
    auto j = i + util::character_len(password, idx, jdx) - 1;
    // recursively match and score the base string
-    auto sub_matches = omnimatch(base_token);
+    std::string base_string;
-    auto base_analysis = most_guessable_match_sequence(
+    base_token.toUTF8String(base_string);
-      base_token,
+    auto sub_matches = omnimatch(base_string);
-      sub_matches,
+    auto base_analysis =
-      false
+        most_guessable_match_sequence(base_string, sub_matches, false);
-      );
    std::vector<Match> base_matches;
    std::move(base_analysis.sequence.begin(), base_analysis.sequence.end(),
              std::back_inserter(base_matches));
-    auto & base_guesses = base_analysis.guesses;
+    auto& base_guesses = base_analysis.guesses;
-    matches.push_back(Match(i, j, match.str(0),
+    matches.push_back(Match(i, j, matched_string,
                            RepeatMatch{
-                              base_token,
+                                base_string,
                                base_guesses,
                                std::move(base_matches),
-                                match[0].length() / base_token.length(),
+                                matched_string.size() / base_string.size(),
                            }));
    matches.back().idx = idx;
    matches.back().jdx = jdx;
+    lastUnicodeIndex = matcher->end(status);
    lastIndex = jdx;
  }
  return matches;

--- a/third_party/zxcvbn-cpp/native-src/zxcvbn/scoring.cpp
+++ b/third_party/zxcvbn-cpp/native-src/zxcvbn/scoring.cpp
@@ -75,6 +75,10 @@ std::size_t token_len(const Match & m) __attribute__((pure));
 static
 std::size_t token_len(const Match & m) {
  std::size_t result = m.j - m.i + 1;
+  // Bruteforce matches might be any substring of the original string, which are
+  // not necessarily aligned to UTF8 code points, and thus m.token might not be
+  // a valid UTF8 string.
+  if (m.get_pattern() != MatchPattern::BRUTEFORCE)
    assert(result == util::character_len(m.token));
  return result;
 }

--- a/third_party/zxcvbn-cpp/patches/utf8_support.diff
+++ b/third_party/zxcvbn-cpp/patches/utf8_support.diff
+diff --git a/third_party/zxcvbn-cpp/native-src/zxcvbn/matching.cpp b/third_party/zxcvbn-cpp/native-src/zxcvbn/matching.cpp
+index 8f4e6d2f0e00..13465dee1cd7 100644
+--- a/third_party/zxcvbn-cpp/native-src/zxcvbn/matching.cpp
+++ b/third_party/zxcvbn-cpp/native-src/zxcvbn/matching.cpp
+@@ -20,6 +20,9 @@
+ #include <unordered_set>
+ #include "base/no_destructor.h"
+#include "base/strings/string_util.h"
+#include "third_party/icu/source/common/unicode/unistr.h"
+#include "third_party/icu/source/i18n/unicode/regex.h"
+ namespace zxcvbn {
+@@ -453,69 +456,91 @@ std::vector<Match> spatial_match_helper(const std::string & password,
+ // repeats (aaa, abcabcabc) and sequences (abcdef) ------------------------------
+ //-------------------------------------------------------------------------------
+-std::vector<Match> repeat_match(const std::string & password) {
+std::vector<Match> repeat_match(const std::string& password) {
+   std::vector<Match> matches;
+-  std::regex greedy(R"((.+)\1+)");
+-  std::regex lazy(R"((.+?)\1+)");
+-  std::regex lazy_anchored(R"(^(.+?)\1+$)");
+-  idx_t lastIndex = 0;
+
+  auto unicode_password = icu::UnicodeString::fromUTF8(password);
+
+  UErrorCode status = U_ZERO_ERROR;
+  std::unique_ptr<icu::RegexPattern> greedy_pattern(icu::RegexPattern::compile(
+      icu::UnicodeString::fromUTF8(R"((.+)\1+)"), 0, status));
+  std::unique_ptr<icu::RegexMatcher> greedy_matcher(
+      greedy_pattern->matcher(unicode_password, status));
+
+  std::unique_ptr<icu::RegexPattern> lazy_pattern(icu::RegexPattern::compile(
+      icu::UnicodeString::fromUTF8(R"((.+?)\1+)"), 0, status));
+  std::unique_ptr<icu::RegexMatcher> lazy_matcher(
+      lazy_pattern->matcher(unicode_password, status));
+
+  std::unique_ptr<icu::RegexPattern> lazy_anchored_pattern(
+      icu::RegexPattern::compile(icu::UnicodeString::fromUTF8(R"(^(.+?)\1+$)"),
+                                 0, status));
+
+  int lastUnicodeIndex = 0;
+  size_t lastIndex = 0;
+   while (lastIndex < password.length()) {
+-    auto start_iter = lastIndex + password.begin();
+-    std::smatch greedy_match, lazy_match;
+-    std::regex_search(start_iter, password.end(),
+-                      greedy_match, greedy);
+-    std::regex_search(start_iter, password.end(),
+-                      lazy_match, lazy);
+-    if (!greedy_match.size()) break;
+-    std::smatch match;
+-    std::string base_token;
+-    if (greedy_match[0].length() > lazy_match[0].length()) {
+    if (!greedy_matcher->find(lastUnicodeIndex, status) ||
+        !lazy_matcher->find(lastUnicodeIndex, status)) {
+      break;
+    }
+
+    icu::RegexMatcher* matcher = nullptr;
+    icu::UnicodeString base_token;
+    if (greedy_matcher->group(status).length() >
+        lazy_matcher->group(status).length()) {
+       // greedy beats lazy for 'aabaab'
+       //   greedy: [aabaab, aab]
+       //   lazy:   [aa,     a]
+-      match = greedy_match;
+      matcher = greedy_matcher.get();
+       // greedy's repeated string might itself be repeated, eg.
+       // aabaab in aabaabaabaab.
+       // run an anchored lazy match on greedy's repeated string
+       // to find the shortest repeated string
+-      std::smatch lazy_anchored_match;
+-      auto greedy_found = match.str(0);
+-      auto ret = std::regex_search(greedy_found, lazy_anchored_match, lazy_anchored);
+      auto greedy_found = matcher->group(status);
+
+      std::unique_ptr<icu::RegexMatcher> lazy_anchored_matcher(
+          lazy_anchored_pattern->matcher(greedy_found, status));
+      auto ret = lazy_anchored_matcher->find(status);
+       assert(ret);
+       (void) ret;
+-      base_token = lazy_anchored_match.str(1);
+      base_token = lazy_anchored_matcher->group(1, status);
+     }
+     else {
+       // lazy beats greedy for 'aaaaa'
+       //   greedy: [aaaa,  aa]
+       //   lazy:   [aaaaa, a]
+-      match = std::move(lazy_match);
+-      base_token = match.str(1);
+      matcher = lazy_matcher.get();
+      base_token = matcher->group(1, status);
+     }
+-    auto idx = lastIndex + match.position();
+-    auto jdx = lastIndex + match.position() + match[0].length();
+
+    std::string matched_string;
+    matcher->group(status).toUTF8String(matched_string);
+
+    auto idx = password.find(matched_string, lastIndex);
+    auto jdx = idx + matched_string.size();
+
+     auto i = util::character_len(password, 0, idx);
+     auto j = i + util::character_len(password, idx, jdx) - 1;
+     // recursively match and score the base string
+-    auto sub_matches = omnimatch(base_token);
+-    auto base_analysis = most_guessable_match_sequence(
+-      base_token,
+-      sub_matches,
+-      false
+-      );
+    std::string base_string;
+    base_token.toUTF8String(base_string);
+    auto sub_matches = omnimatch(base_string);
+    auto base_analysis =
+        most_guessable_match_sequence(base_string, sub_matches, false);
+     std::vector<Match> base_matches;
+     std::move(base_analysis.sequence.begin(), base_analysis.sequence.end(),
+               std::back_inserter(base_matches));
+-    auto & base_guesses = base_analysis.guesses;
+-    matches.push_back(Match(i, j, match.str(0),
+    auto& base_guesses = base_analysis.guesses;
+    matches.push_back(Match(i, j, matched_string,
+                             RepeatMatch{
+-                              base_token,
+                                base_string,
+                                 base_guesses,
+                                 std::move(base_matches),
+-                                match[0].length() / base_token.length(),
+-                                }));
+                                matched_string.size() / base_string.size(),
+                            }));
+     matches.back().idx = idx;
+     matches.back().jdx = jdx;
+    lastUnicodeIndex = matcher->end(status);
+     lastIndex = jdx;
+   }
+   return matches;
+diff --git a/third_party/zxcvbn-cpp/native-src/zxcvbn/scoring.cpp b/third_party/zxcvbn-cpp/native-src/zxcvbn/scoring.cpp
+index a4e341935ffb..e5c120a86a5c 100644
+--- a/third_party/zxcvbn-cpp/native-src/zxcvbn/scoring.cpp
+++ b/third_party/zxcvbn-cpp/native-src/zxcvbn/scoring.cpp
+@@ -75,7 +75,11 @@ std::size_t token_len(const Match & m) __attribute__((pure));
+ static
+ std::size_t token_len(const Match & m) {
+   std::size_t result = m.j - m.i + 1;
+-  assert(result == util::character_len(m.token));
+  // Bruteforce matches might be any substring of the original string, which are
+  // not necessarily aligned to UTF8 code points, and thus m.token might not be
+  // a valid UTF8 string.
+  if (m.get_pattern() != MatchPattern::BRUTEFORCE)
+    assert(result == util::character_len(m.token));
+   return result;
+ }
--- a/third_party/zxcvbn-cpp/test/matching_fuzzer.cc
+++ b/third_party/zxcvbn-cpp/test/matching_fuzzer.cc
@@ -9,8 +9,12 @@
 #include <string>
+#include "base/strings/string_util.h"
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
  std::string password(reinterpret_cast<const char*>(data), size);
+  if (!base::IsStringUTF8(password))
+    return 0;
  zxcvbn::dictionary_match(password, {});
  zxcvbn::reverse_dictionary_match(password, {});

--- a/third_party/zxcvbn-cpp/test/matching_unittest.cc
+++ b/third_party/zxcvbn-cpp/test/matching_unittest.cc
@@ -812,6 +812,20 @@ TEST(ZxcvbnTest, RepeatMatching) {
                             .base_token = "ab",
                         }));
  }
+  {
+    // identifies äö as repeat string, even though äöäö is also repeated.
+    // verifies that match.i and match.j operate in code point counts, and not
+    // in bytes.
+    std::string pattern = "\u00E4\u00F6\u00E4\u00F6\u00E4\u00F6\u00E4\u00F6";
+    std::vector<Match> matches = repeat_match(pattern);
+    EXPECT_THAT(matches, ElementsAre(ExpectedRepeatMatch{
+                             .i = 0,
+                             .j = 7,
+                             .token = pattern,
+                             .base_token = "\u00E4\u00F6",
+                         }));
+  }
 }
 TEST(ZxcvbnTest, RegexMatching) {

--- a/third_party/zxcvbn-cpp/test/scoring_fuzzer.cc
+++ b/third_party/zxcvbn-cpp/test/scoring_fuzzer.cc
@@ -9,10 +9,13 @@
 #include <string>
+#include "base/strings/string_util.h"
 #include "third_party/zxcvbn-cpp/native-src/zxcvbn/matching.hpp"
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
  std::string password(reinterpret_cast<const char*>(data), size);
+  if (!base::IsStringUTF8(password))
+    return 0;
  auto matches = zxcvbn::omnimatch(password);
  zxcvbn::most_guessable_match_sequence(password, matches);