Provide the equivalent of GURL::DomainIs for url::Origin.

Move the implementation of GURL::DomainIs into a shared function, url::DomainIs, in url/url_util.{h,cc}, to be used both by GURL and url::Origin.

The reason for this refactoring is to reap significant performance gains in scenarios where previously url::Origins had to be converted to GURLs for DomainIs checks. That conversion involved string copying, allocations, and parsing the entire URL once again.

BUG=517560

Review-Url: https://codereview.chromium.org/2287483002
Cr-Commit-Position: refs/heads/master@{#415606}

Provide the equivalent of GURL::DomainIs for url::Origin.
Move the implementation of GURL::DomainIs into a shared function, url::DomainIs, in url/url_util.{h,cc}, to be used both by GURL and url::Origin.

The reason for this refactoring is to reap significant performance gains in scenarios where previously url::Origins had to be converted to GURLs for DomainIs checks. That conversion involved string copying, allocations, and parsing the entire URL once again.

BUG=517560

Review-Url: https://codereview.chromium.org/2287483002
Cr-Commit-Position: refs/heads/master@{#415606}
054f4033 · pkalinnikov · Commit bot · 6c92845a · 054f4033 · 054f4033
Commit 054f4033 authored Aug 31, 2016 by pkalinnikov Committed by Commit bot Aug 31, 2016
7 changed files
--- a/url/gurl.cc
+++ b/url/gurl.cc
@@ -491,47 +491,13 @@ const GURL& GURL::EmptyGURL() {
 #endif  // WIN32

 bool GURL::DomainIs(base::StringPiece lower_ascii_domain) const {
-  if (!is_valid_ || lower_ascii_domain.empty())
+  if (!is_valid_)
    return false;

-  // FileSystem URLs have empty parsed_.host, so check this first.
+  // FileSystem URLs have empty host_piece, so check this first.
  if (SchemeIsFileSystem() && inner_url_)
    return inner_url_->DomainIs(lower_ascii_domain);
-
-  if (!parsed_.host.is_nonempty())
-    return false;
-
-  // If the host name ends with a dot but the input domain doesn't,
-  // then we ignore the dot in the host name.
-  const char* host_last_pos = spec_.data() + parsed_.host.end() - 1;
-  int host_len = parsed_.host.len;
-  int domain_len = lower_ascii_domain.length();
-  if ('.' == *host_last_pos && '.' != lower_ascii_domain[domain_len - 1]) {
-    host_last_pos--;
-    host_len--;
-  }
-
-  if (host_len < domain_len)
-    return false;
-
-  // |host_first_pos| is the start of the compared part of the host name, not
-  // start of the whole host name.
-  const char* host_first_pos = spec_.data() + parsed_.host.begin +
-                               host_len - domain_len;
-
-  if (!base::LowerCaseEqualsASCII(
-           base::StringPiece(host_first_pos, domain_len), lower_ascii_domain))
-    return false;
-
-  // Make sure there aren't extra characters in host before the compared part;
-  // if the host name is longer than the input domain name, then the character
-  // immediately before the compared part should be a dot. For example,
-  // www.google.com has domain "google.com", but www.iamnotgoogle.com does not.
-  if ('.' != lower_ascii_domain[0] && host_len > domain_len &&
-      '.' != *(host_first_pos - 1))
-    return false;
-
-  return true;
+  return url::DomainIs(host_piece(), lower_ascii_domain);
 }

 void GURL::Swap(GURL* other) {

--- a/url/origin.cc
+++ b/url/origin.cc
@@ -72,6 +72,10 @@ bool Origin::IsSameOriginWith(const Origin& other) const {
  return tuple_.Equals(other.tuple_);
 }

+bool Origin::DomainIs(base::StringPiece lower_ascii_domain) const {
+  return !unique_ && url::DomainIs(tuple_.host(), lower_ascii_domain);
+}
+
 bool Origin::operator<(const Origin& other) const {
  return tuple_ < other.tuple_;
 }

--- a/url/origin.h
+++ b/url/origin.h
@@ -122,6 +122,9 @@ class URL_EXPORT Origin {
    return IsSameOriginWith(other);
  }

+  // Same as GURL::DomainIs. If |this| origin is unique, then returns false.
+  bool DomainIs(base::StringPiece lower_ascii_domain) const;
+
  // Allows Origin to be used as a key in STL (for example, a std::set or
  // std::map).
  bool operator<(const Origin& other) const;

--- a/url/origin_unittest.cc
+++ b/url/origin_unittest.cc
@@ -252,4 +252,61 @@ TEST(OriginTest, UnsafelyCreateUniqueViaEmbeddedNulls) {
  }
 }

+TEST(OriginTest, DomainIs) {
+  const struct {
+    const char* url;
+    const char* lower_ascii_domain;
+    bool expected_domain_is;
+  } kTestCases[] = {
+      {"http://google.com/foo", "google.com", true},
+      {"http://www.google.com:99/foo", "google.com", true},
+      {"http://www.google.com.cn/foo", "google.com", false},
+      {"http://www.google.comm", "google.com", false},
+      {"http://www.iamnotgoogle.com/foo", "google.com", false},
+      {"http://www.google.com/foo", "Google.com", false},
+
+      // If the host ends with a dot, it matches domains with or without a dot.
+      {"http://www.google.com./foo", "google.com", true},
+      {"http://www.google.com./foo", "google.com.", true},
+      {"http://www.google.com./foo", ".com", true},
+      {"http://www.google.com./foo", ".com.", true},
+
+      // But, if the host doesn't end with a dot and the input domain does, then
+      // it's considered to not match.
+      {"http://google.com/foo", "google.com.", false},
+
+      // If the host ends with two dots, it doesn't match.
+      {"http://www.google.com../foo", "google.com", false},
+
+      // Filesystem scheme.
+      {"filesystem:http://www.google.com:99/foo/", "google.com", true},
+      {"filesystem:http://www.iamnotgoogle.com/foo/", "google.com", false},
+
+      // File scheme.
+      {"file:///home/user/text.txt", "", false},
+      {"file:///home/user/text.txt", "txt", false},
+  };
+
+  for (const auto& test_case : kTestCases) {
+    SCOPED_TRACE(testing::Message() << "(url, domain): (" << test_case.url
+                                    << ", " << test_case.lower_ascii_domain
+                                    << ")");
+    GURL url(test_case.url);
+    ASSERT_TRUE(url.is_valid());
+    url::Origin origin(url);
+
+    EXPECT_EQ(test_case.expected_domain_is,
+              origin.DomainIs(test_case.lower_ascii_domain));
+  }
+
+  // If the URL is invalid, DomainIs returns false.
+  GURL invalid_url("google.com");
+  ASSERT_FALSE(invalid_url.is_valid());
+  EXPECT_FALSE(url::Origin(invalid_url).DomainIs("google.com"));
+
+  // Unique origins.
+  EXPECT_FALSE(url::Origin().DomainIs(""));
+  EXPECT_FALSE(url::Origin().DomainIs("com"));
+}
+
 }  // namespace url
--- a/url/url_util.cc
+++ b/url/url_util.cc
@@ -488,6 +488,43 @@ bool FindAndCompareScheme(const base::char16* str,
  return DoFindAndCompareScheme(str, str_len, compare, found_scheme);
 }

+bool DomainIs(base::StringPiece canonicalized_host,
+              base::StringPiece lower_ascii_domain) {
+  if (canonicalized_host.empty() || lower_ascii_domain.empty())
+    return false;
+
+  // If the host name ends with a dot but the input domain doesn't, then we
+  // ignore the dot in the host name.
+  size_t host_len = canonicalized_host.length();
+  if (canonicalized_host.back() == '.' && lower_ascii_domain.back() != '.')
+    --host_len;
+
+  if (host_len < lower_ascii_domain.length())
+    return false;
+
+  // |host_first_pos| is the start of the compared part of the host name, not
+  // start of the whole host name.
+  const char* host_first_pos =
+      canonicalized_host.data() + host_len - lower_ascii_domain.length();
+
+  if (!base::LowerCaseEqualsASCII(
+          base::StringPiece(host_first_pos, lower_ascii_domain.length()),
+          lower_ascii_domain)) {
+    return false;
+  }
+
+  // Make sure there aren't extra characters in host before the compared part;
+  // if the host name is longer than the input domain name, then the character
+  // immediately before the compared part should be a dot. For example,
+  // www.google.com has domain "google.com", but www.iamnotgoogle.com does not.
+  if (lower_ascii_domain[0] != '.' && host_len > lower_ascii_domain.length() &&
+      *(host_first_pos - 1) != '.') {
+    return false;
+  }
+
+  return true;
+}
+
 bool Canonicalize(const char* spec,
                  int spec_len,
                  bool trim_path_end,

--- a/url/url_util.h
+++ b/url/url_util.h
@@ -8,6 +8,7 @@
 #include <string>

 #include "base/strings/string16.h"
+#include "base/strings/string_piece.h"
 #include "url/third_party/mozilla/url_parse.h"
 #include "url/url_canon.h"
 #include "url/url_constants.h"
@@ -35,7 +36,7 @@ URL_EXPORT void Initialize();
 // library.
 URL_EXPORT void Shutdown();

-// Schemes --------------------------------------------------------------------
+// Schemes ---------------------------------------------------------------------

 // Types of a scheme representing the requirements on the data represented by
 // the authority component of a URL with the scheme.
@@ -132,7 +133,20 @@ URL_EXPORT bool GetStandardSchemeType(const char* spec,
                                      const Component& scheme,
                                      SchemeType* type);

-// URL library wrappers -------------------------------------------------------
+// Domains ---------------------------------------------------------------------
+
+// Returns true if the |canonicalized_host| matches or is in the same domain as
+// the given |lower_ascii_domain| string. For example, if the canonicalized
+// hostname is "www.google.com", this will return true for "com", "google.com",
+// and "www.google.com" domains.
+//
+// If either of the input StringPieces is empty, the return value is false. The
+// input domain should be a lower-case ASCII string in order to match the
+// canonicalized host.
+URL_EXPORT bool DomainIs(base::StringPiece canonicalized_host,
+                         base::StringPiece lower_ascii_domain);
+
+// URL library wrappers --------------------------------------------------------

 // Parses the given spec according to the extracted scheme type. Normal users
 // should use the URL object, although this may be useful if performance is
@@ -204,7 +218,7 @@ URL_EXPORT bool ReplaceComponents(
    CanonOutput* output,
    Parsed* out_parsed);

-// String helper functions ----------------------------------------------------
+// String helper functions -----------------------------------------------------

 // Unescapes the given string using URL escaping rules.
 URL_EXPORT void DecodeURLEscapeSequences(const char* input,

--- a/url/url_util_unittest.cc
+++ b/url/url_util_unittest.cc
@@ -374,4 +374,47 @@ TEST(URLUtilTest, TestNoRefComponent) {
  EXPECT_FALSE(resolved_parsed.ref.is_valid());
 }

+TEST(URLUtilTest, TestDomainIs) {
+  const struct {
+    const char* canonicalized_host;
+    const char* lower_ascii_domain;
+    bool expected_domain_is;
+  } kTestCases[] = {
+      {"google.com", "google.com", true},
+      {"www.google.com", "google.com", true},      // Subdomain is ignored.
+      {"www.google.com.cn", "google.com", false},  // Different TLD.
+      {"www.google.comm", "google.com", false},
+      {"www.iamnotgoogle.com", "google.com", false},  // Different hostname.
+      {"www.google.com", "Google.com", false},  // The input is not lower-cased.
+
+      // If the host ends with a dot, it matches domains with or without a dot.
+      {"www.google.com.", "google.com", true},
+      {"www.google.com.", "google.com.", true},
+      {"www.google.com.", ".com", true},
+      {"www.google.com.", ".com.", true},
+
+      // But, if the host doesn't end with a dot and the input domain does, then
+      // it's considered to not match.
+      {"www.google.com", "google.com.", false},
+
+      // If the host ends with two dots, it doesn't match.
+      {"www.google.com..", "google.com", false},
+
+      // Empty parameters.
+      {"www.google.com", "", false},
+      {"", "www.google.com", false},
+      {"", "", false},
+  };
+
+  for (const auto& test_case : kTestCases) {
+    SCOPED_TRACE(testing::Message() << "(host, domain): ("
+                                    << test_case.canonicalized_host << ", "
+                                    << test_case.lower_ascii_domain << ")");
+
+    EXPECT_EQ(
+        test_case.expected_domain_is,
+        DomainIs(test_case.canonicalized_host, test_case.lower_ascii_domain));
+  }
+}
+
 }  // namespace url