Commit 833fa265 authored by pkasting@chromium.org

Loosen RFC 1738 compliance check to allow underscores where we already allowed...

Loosen RFC 1738 compliance check to allow underscores where we already allowed hyphens, to match real-world needs.

I don't believe further loosening will be required, but data to confirm that will hopefully be coming soon.  In the meantime, people are asking for this fix.

BUG=25714
TEST=Entering "a_b.com" in the omnibox should default to navigate, not search
Review URL: http://codereview.chromium.org/339017

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@30245 0039d316-1c4b-4281-b951-d872f2087c98
parent e33972ea
......@@ -175,14 +175,14 @@ AutocompleteInput::Type AutocompleteInput::Parse(
if (registry_length == std::wstring::npos)
return QUERY; // Could be a broken IP address, etc.
// See if the hostname is valid per RFC 1738. While IE and GURL allow
// hostnames to contain many other characters (perhaps for weird intranet
// machines), it's extremely unlikely that a user would be trying to type
// those in for anything other than a search query.
// See if the hostname is valid. While IE and GURL allow hostnames to contain
// many other characters (perhaps for weird intranet machines), it's extremely
// unlikely that a user would be trying to type those in for anything other
// than a search query.
url_canon::CanonHostInfo host_info;
const std::string canonicalized_host(net::CanonicalizeHost(host, &host_info));
if ((host_info.family == url_canon::CanonHostInfo::NEUTRAL) &&
!net::IsCanonicalizedHostRFC1738Compliant(canonicalized_host))
!net::IsCanonicalizedHostCompliant(canonicalized_host))
return QUERY;
// Presence of a port means this is likely a URL, if the port is really a port
......
......@@ -243,7 +243,9 @@ TEST(AutocompleteTest, InputType) {
{ L"http://foo", AutocompleteInput::URL },
{ L"http://foo.c", AutocompleteInput::URL },
{ L"http://foo.com", AutocompleteInput::URL },
{ L"http://foo_bar.com", AutocompleteInput::URL },
{ L"http://-.com", AutocompleteInput::QUERY },
{ L"http://_foo_.com", AutocompleteInput::QUERY },
{ L"http://foo.com:abc", AutocompleteInput::QUERY },
{ L"http://foo.com:123456", AutocompleteInput::QUERY },
{ L"http:user@foo.com", AutocompleteInput::URL },
......
......@@ -961,7 +961,7 @@ inline bool IsHostCharDigit(char c) {
return (c >= '0') && (c <= '9');
}
bool IsCanonicalizedHostRFC1738Compliant(const std::string& host) {
bool IsCanonicalizedHostCompliant(const std::string& host) {
if (host.empty())
return false;
......@@ -970,7 +970,7 @@ bool IsCanonicalizedHostRFC1738Compliant(const std::string& host) {
IN_COMPONENT_STARTED_DIGIT,
IN_COMPONENT_STARTED_ALPHA
} state = NOT_IN_COMPONENT;
bool last_char_was_hyphen = false;
bool last_char_was_hyphen_or_underscore = false;
for (std::string::const_iterator i(host.begin()); i != host.end(); ++i) {
const char c = *i;
......@@ -983,13 +983,13 @@ bool IsCanonicalizedHostRFC1738Compliant(const std::string& host) {
return false;
} else {
if (c == '.') {
if (last_char_was_hyphen)
if (last_char_was_hyphen_or_underscore)
return false;
state = NOT_IN_COMPONENT;
} else if (IsHostCharAlpha(c) || IsHostCharDigit(c)) {
last_char_was_hyphen = false;
} else if (c == '-') {
last_char_was_hyphen = true;
last_char_was_hyphen_or_underscore = false;
} else if ((c == '-') || (c == '_')) {
last_char_was_hyphen_or_underscore = true;
} else {
return false;
}
......
......@@ -158,16 +158,17 @@ std::string CanonicalizeHost(const std::string& host,
std::string CanonicalizeHost(const std::wstring& host,
url_canon::CanonHostInfo* host_info);
// Returns true if |host| is RFC 1738-compliant (and not an IP address). The
// rules are:
// Returns true if |host| is not an IP address and is compliant with a set of
// rules based on RFC 1738 and tweaked to be compatible with the real world.
// The rules are:
// * One or more components separated by '.'
// * Each component begins and ends with an alphanumeric character
// * Each component contains only alphanumeric characters and '-'
// * Each component contains only alphanumeric characters and '-' or '_'
// * The last component does not begin with a digit
//
// NOTE: You should only pass in hosts that have been returned from
// CanonicalizeHost(), or you may not get accurate results.
bool IsCanonicalizedHostRFC1738Compliant(const std::string& host);
bool IsCanonicalizedHostCompliant(const std::string& host);
// Call these functions to get the html snippet for a directory listing.
// The return values of both functions are in UTF-8.
......
......@@ -344,7 +344,7 @@ const IDNTestCase idn_cases[] = {
#endif
};
struct RFC1738Case {
struct CompliantHostCase {
const char* host;
bool expected_output;
};
......@@ -815,8 +815,8 @@ TEST(NetUtilTest, IDNToUnicodeSlow) {
}
}
TEST(NetUtilTest, RFC1738) {
const RFC1738Case rfc1738_cases[] = {
TEST(NetUtilTest, CompliantHost) {
const CompliantHostCase compliant_host_cases[] = {
{"", false},
{"a", true},
{"-", false},
......@@ -825,19 +825,20 @@ TEST(NetUtilTest, RFC1738) {
{"a.a", true},
{"9.a", true},
{"a.9", false},
{"_9a", false},
{"a.a9", true},
{"a.9a", false},
{"a+9a", false},
{"1-.a-b", false},
{"1-2.a-b", true},
{"1-2.a_b", true},
{"a.b.c.d.e", true},
{"1.2.3.4.e", true},
{"a.b.c.d.5", false},
};
for (size_t i = 0; i < ARRAYSIZE_UNSAFE(rfc1738_cases); ++i) {
EXPECT_EQ(rfc1738_cases[i].expected_output,
net::IsCanonicalizedHostRFC1738Compliant(rfc1738_cases[i].host));
for (size_t i = 0; i < ARRAYSIZE_UNSAFE(compliant_host_cases); ++i) {
EXPECT_EQ(compliant_host_cases[i].expected_output,
net::IsCanonicalizedHostCompliant(compliant_host_cases[i].host));
}
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment