Commit aada226a authored by Matt Menke's avatar Matt Menke Committed by Commit Bot

Rework Content-Type parsing code to more closely resemble spec.

This aligns behavior more closely FireFox, though less closely
with Edge and Safari.  See RFC at https://mimesniff.spec.whatwg.org/
(This behavior is also more consistent with
https://tools.ietf.org/html/rfc7231#section-3.1.1.1)

In particular, no longer split the string around semi-colons first,
but instead parse character-by-character (Which only really affects
weird things like charset=";"), and use backslash as an escape
character in quoted string values.  This affects both charset and
boundary parameters, so we should keep an eye out for bug reports
about either of those breaking in corner cases.

This also applies the quoted string parsing logic to Content-Type
boundary parameters (It was just used for charsets before) - this is
unlikely to break anything, since the one consumer of that field
arbitrarily removes all leading/trailing spaces and quotes from
boundary strings.

Bug: 774429
Change-Id: I28c9c035e1c1c28e181bd0e8005266cfd63af7e9
Reviewed-on: https://chromium-review.googlesource.com/990995
Commit-Queue: Matt Menke <mmenke@chromium.org>
Reviewed-by: default avatarBence Béky <bnc@chromium.org>
Cr-Commit-Position: refs/heads/master@{#548008}
parent f15d3c2e
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
namespace net { namespace net {
namespace { namespace {
template <typename ConstIterator> template <typename ConstIterator>
void TrimLWSImplementation(ConstIterator* begin, ConstIterator* end) { void TrimLWSImplementation(ConstIterator* begin, ConstIterator* end) {
// leading whitespace // leading whitespace
...@@ -33,23 +34,6 @@ void TrimLWSImplementation(ConstIterator* begin, ConstIterator* end) { ...@@ -33,23 +34,6 @@ void TrimLWSImplementation(ConstIterator* begin, ConstIterator* end) {
--(*end); --(*end);
} }
// Helpers --------------------------------------------------------------------
// Returns the index of the closing quote of the string, if any. |start| points
// at the opening quote.
size_t FindStringEnd(const std::string& line, size_t start, char delim) {
DCHECK_LT(start, line.length());
DCHECK_EQ(line[start], delim);
DCHECK((delim == '"') || (delim == '\''));
const char set[] = { delim, '\\', '\0' };
for (size_t end = line.find_first_of(set, start + 1);
end != std::string::npos; end = line.find_first_of(set, end + 2)) {
if (line[end] != '\\')
return end;
}
return line.length();
}
} // namespace } // namespace
// HttpUtil ------------------------------------------------------------------- // HttpUtil -------------------------------------------------------------------
...@@ -79,65 +63,111 @@ void HttpUtil::ParseContentType(const std::string& content_type_str, ...@@ -79,65 +63,111 @@ void HttpUtil::ParseContentType(const std::string& content_type_str,
if (type_end == std::string::npos) if (type_end == std::string::npos)
type_end = content_type_str.length(); type_end = content_type_str.length();
size_t charset_val = 0; std::string charset_value;
size_t charset_end = 0;
bool type_has_charset = false; bool type_has_charset = false;
// Iterate over parameters // Iterate over parameters. Can't split the string around semicolons
size_t param_start = content_type_str.find_first_of(';', type_end); // preemptively because quoted strings may include semicolons. Matches logic
if (param_start != std::string::npos) { // in https://mimesniff.spec.whatwg.org/.
base::StringTokenizer tokenizer(begin + param_start, content_type_str.end(), std::string::size_type offset = content_type_str.find_first_of(';', type_end);
";"); while (offset < content_type_str.size()) {
tokenizer.set_quote_chars("\""); DCHECK_EQ(';', content_type_str[offset]);
while (tokenizer.GetNext()) { // Trim off the semicolon.
std::string::const_iterator equals_sign = ++offset;
std::find(tokenizer.token_begin(), tokenizer.token_end(), '=');
if (equals_sign == tokenizer.token_end()) // Trim off any following spaces.
continue; offset = content_type_str.find_first_not_of(HTTP_LWS, offset);
std::string::size_type param_name_start = offset;
// Extend parameter name until run into a semicolon or equals sign. Per
// spec, trailing spaces are not removed.
offset = content_type_str.find_first_of(";=", offset);
// Nothing more to do if at end of string, or if there's no parameter
// value, since names without values aren't allowed.
if (offset == std::string::npos || content_type_str[offset] == ';')
continue;
std::string::const_iterator param_name_begin = tokenizer.token_begin(); base::StringPiece param_name(content_type_str.begin() + param_name_start,
std::string::const_iterator param_name_end = equals_sign; content_type_str.begin() + offset);
std::string::const_iterator param_value_begin = equals_sign + 1;
std::string::const_iterator param_value_end = tokenizer.token_end(); // Now parse the value.
DCHECK(param_value_begin <= tokenizer.token_end()); DCHECK_EQ('=', content_type_str[offset]);
// Trim off the '='.
// From parameter name, only trim leading whitespace. offset++;
// From parameter value, only trim trailing whitespace.
// See https://crbug.com/772834. // Remove leading spaces. This violates the spec, though it matches
TrimLWS(&param_name_begin, &param_value_end); // pre-existing behavior.
//
if (base::LowerCaseEqualsASCII( // TODO(mmenke): Consider doing this (only?) after parsing quotes, which
base::StringPiece(param_name_begin, param_name_end), "charset")) { // seems to align more with the spec - not the content-type spec, but the
// TODO(abarth): Refactor this function to consistently use iterators. // GET spec's way of getting an encoding, and the spec for handling
charset_val = param_value_begin - begin; // boundary values as well.
charset_end = param_value_end - begin; // See https://encoding.spec.whatwg.org/#names-and-labels.
type_has_charset = true; offset = content_type_str.find_first_not_of(HTTP_LWS, offset);
} else if (base::LowerCaseEqualsASCII(
base::StringPiece(param_name_begin, param_name_end), std::string param_value;
"boundary")) { if (offset == std::string::npos) {
if (boundary) // Nothing to do here - an unquoted string of only whitespace is
boundary->assign(param_value_begin, param_value_end); // considered to have an empty value.
} else if (content_type_str[offset] != '"') {
// If the first character is not a quotation mark, copy data directly.
std::string::size_type value_start = offset;
offset = content_type_str.find_first_of(';', offset);
std::string::size_type value_end = offset;
// Remove terminal whitespace. If ran off the end of the string, have to
// update |value_end| first.
if (value_end == std::string::npos)
value_end = content_type_str.size();
while (value_end > value_start &&
IsLWS(content_type_str[value_end - 1])) {
--value_end;
} }
}
}
if (type_has_charset) { param_value =
// Trim leading whitespace from charset_val. content_type_str.substr(value_start, value_end - value_start);
charset_val = content_type_str.find_first_not_of(HTTP_LWS, charset_val);
charset_val = std::min(charset_val, charset_end);
char first_char = content_type_str[charset_val];
// RFC 7231 Section 3.1.1.1 allows double quotes around charset.
if (first_char == '"') {
charset_end = FindStringEnd(content_type_str, charset_val, first_char);
++charset_val;
DCHECK(charset_end >= charset_val);
} else { } else {
// Ignore the part after '('. This is not in the standard, but may occur // Otherwise, append data, with special handling for backslashes, until
// in rare cases. // a close quote.
// Skip open quote.
DCHECK_EQ('"', content_type_str[offset]);
++offset;
while (offset < content_type_str.size() &&
content_type_str[offset] != '"') {
// Skip over backslash and append the next character, when not at
// the end of the string. Otherwise, copy the next character (Which may
// be a backslash).
if (content_type_str[offset] == '\\' &&
offset + 1 < content_type_str.size()) {
++offset;
}
param_value += content_type_str[offset];
++offset;
}
offset = content_type_str.find_first_of(';', offset);
}
// 0-length parameter values are not considered valid.
if (!param_value.size())
continue;
// TODO(mmenke): Take first, rather than last, value in the case of
// duplicates.
// TODO(mmenke): Check that name has only valid characters.
if (base::LowerCaseEqualsASCII(param_name, "charset")) {
// Ignore the part after '('. This is not in the standard, but may
// occur in rare cases.
// TODO(bnc): Do not ignore the part after '('. // TODO(bnc): Do not ignore the part after '('.
// See https://crbug.com/772343. // See https://crbug.com/772343.
charset_end = std::min(content_type_str.find_first_of("(", charset_val), charset_value = param_value.substr(0, param_value.find("("));
charset_end); type_has_charset = true;
} else if (base::LowerCaseEqualsASCII(param_name, "boundary")) {
if (boundary)
boundary->assign(std::move(param_value));
} }
} }
...@@ -164,8 +194,7 @@ void HttpUtil::ParseContentType(const std::string& content_type_str, ...@@ -164,8 +194,7 @@ void HttpUtil::ParseContentType(const std::string& content_type_str,
} }
if ((!eq && *had_charset) || type_has_charset) { if ((!eq && *had_charset) || type_has_charset) {
*had_charset = true; *had_charset = true;
*charset = base::ToLowerASCII( *charset = base::ToLowerASCII(charset_value);
base::StringPiece(begin + charset_val, begin + charset_end));
} }
} }
......
...@@ -39,8 +39,8 @@ class NET_EXPORT HttpUtil { ...@@ -39,8 +39,8 @@ class NET_EXPORT HttpUtil {
// charset values are normalized to lowercase. The mime_type and charset // charset values are normalized to lowercase. The mime_type and charset
// output values are only modified if the content_type_str contains a mime // output values are only modified if the content_type_str contains a mime
// type and charset value, respectively. The boundary output value is // type and charset value, respectively. The boundary output value is
// optional and will be assigned the (quoted) value of the boundary // optional and will be assigned the (unquoted) value of the boundary
// paramter, if any. // parameter, if any.
static void ParseContentType(const std::string& content_type_str, static void ParseContentType(const std::string& content_type_str,
std::string* mime_type, std::string* mime_type,
std::string* charset, std::string* charset,
......
...@@ -734,6 +734,18 @@ TEST(HttpUtilTest, ParseContentType) { ...@@ -734,6 +734,18 @@ TEST(HttpUtilTest, ParseContentType) {
const bool expected_had_charset; const bool expected_had_charset;
const char* const expected_boundary; const char* const expected_boundary;
} tests[] = { } tests[] = {
{ "text/html",
"text/html",
"",
false,
""
},
{ "text/html;",
"text/html",
"",
false,
""
},
{ "text/html; charset=utf-8", { "text/html; charset=utf-8",
"text/html", "text/html",
"utf-8", "utf-8",
...@@ -763,7 +775,7 @@ TEST(HttpUtilTest, ParseContentType) { ...@@ -763,7 +775,7 @@ TEST(HttpUtilTest, ParseContentType) {
"text/html", "text/html",
"", "",
false, false,
"\"WebKit-ada-df-dsf-adsfadsfs\"" "WebKit-ada-df-dsf-adsfadsfs"
}, },
// Parameter name is "boundary ", not "boundary". // Parameter name is "boundary ", not "boundary".
// See https://crbug.com/772834. // See https://crbug.com/772834.
...@@ -778,20 +790,20 @@ TEST(HttpUtilTest, ParseContentType) { ...@@ -778,20 +790,20 @@ TEST(HttpUtilTest, ParseContentType) {
"text/html", "text/html",
"", "",
false, false,
" \"WebKit-ada-df-dsf-adsfadsfs\"" "WebKit-ada-df-dsf-adsfadsfs"
}, },
// Parameter value includes leading space. See https://crbug.com/772834. // Parameter value includes leading space. See https://crbug.com/772834.
{ "text/html; boundary= \"WebKit-ada-df-dsf-adsfadsfs\" ", { "text/html; boundary= \"WebKit-ada-df-dsf-adsfadsfs\" ",
"text/html", "text/html",
"", "",
false, false,
" \"WebKit-ada-df-dsf-adsfadsfs\"" "WebKit-ada-df-dsf-adsfadsfs"
}, },
{ "text/html; boundary=\"WebKit-ada-df-dsf-adsfadsfs \"", { "text/html; boundary=\"WebKit-ada-df-dsf-adsfadsfs \"",
"text/html", "text/html",
"", "",
false, false,
"\"WebKit-ada-df-dsf-adsfadsfs \"" "WebKit-ada-df-dsf-adsfadsfs "
}, },
{ "text/html; boundary=WebKit-ada-df-dsf-adsfadsfs", { "text/html; boundary=WebKit-ada-df-dsf-adsfadsfs",
"text/html", "text/html",
...@@ -799,12 +811,110 @@ TEST(HttpUtilTest, ParseContentType) { ...@@ -799,12 +811,110 @@ TEST(HttpUtilTest, ParseContentType) {
false, false,
"WebKit-ada-df-dsf-adsfadsfs" "WebKit-ada-df-dsf-adsfadsfs"
}, },
{ "text/html; charset",
"text/html",
"",
false,
""
},
{ "text/html; charset=",
"text/html",
"",
false,
""
},
{ "text/html; charset= ",
"text/html",
"",
false,
""
},
{ "text/html; charset= ;",
"text/html",
"",
false,
""
},
{ "text/html; charset=\"\"",
"text/html",
"",
false,
""
},
{ "text/html; charset=\" \"",
"text/html",
" ",
true,
""
},
{ "text/html; charset; charset=; charset=utf-8",
"text/html",
"utf-8",
true,
""
},
{ "text/html; charset=utf-8; charset=; charset;",
"text/html",
"utf-8",
true,
""
},
// Stray quotes ignored.
{ "text/html; \"; \"\"; charset=utf-8",
"text/html",
"utf-8",
true,
""
},
// Non-leading quotes kept as-is.
{ "text/html; charset=u\"tf-8\"",
"text/html",
"u\"tf-8\"",
true,
""
},
{ "text/html; charset=\"utf-8\"", { "text/html; charset=\"utf-8\"",
"text/html", "text/html",
"utf-8", "utf-8",
true, true,
"" ""
}, },
// No closing quote.
{ "text/html; charset=\"utf-8",
"text/html",
"utf-8",
true,
""
},
// Check that \ is treated as an escape character.
{ "text/html; charset=\"\\utf\\-\\8\"",
"text/html",
"utf-8",
true,
""
},
// More interseting escape character test - test escaped backslash, escaped
// quote, and backslash at end of input in unterminated quoted string.
{ "text/html; charset=\"\\\\\\\"\\",
"text/html",
"\\\"\\",
true,
""
},
// Check quoted semicolon.
{ "text/html; charset=\";charset=utf-8;\"",
"text/html",
";charset=utf-8;",
true,
""
},
// Unclear if this one should just return utf-8 or not.
{ "text/html; charset= \"utf-8\"",
"text/html",
"utf-8",
true,
""
},
// Regression test for https://crbug.com/772350: // Regression test for https://crbug.com/772350:
// Single quotes are not delimiters but must be treated as part of charset. // Single quotes are not delimiters but must be treated as part of charset.
{ "text/html; charset='utf-8'", { "text/html; charset='utf-8'",
...@@ -823,10 +933,14 @@ TEST(HttpUtilTest, ParseContentType) { ...@@ -823,10 +933,14 @@ TEST(HttpUtilTest, ParseContentType) {
std::string boundary; std::string boundary;
HttpUtil::ParseContentType(tests[i].content_type, &mime_type, &charset, HttpUtil::ParseContentType(tests[i].content_type, &mime_type, &charset,
&had_charset, &boundary); &had_charset, &boundary);
EXPECT_EQ(tests[i].expected_mime_type, mime_type) << "i=" << i; EXPECT_EQ(tests[i].expected_mime_type, mime_type)
EXPECT_EQ(tests[i].expected_charset, charset) << "i=" << i; << "content_type=" << tests[i].content_type;
EXPECT_EQ(tests[i].expected_had_charset, had_charset) << "i=" << i; EXPECT_EQ(tests[i].expected_charset, charset)
EXPECT_EQ(tests[i].expected_boundary, boundary) << "i=" << i; << "content_type=" << tests[i].content_type;
EXPECT_EQ(tests[i].expected_had_charset, had_charset)
<< "content_type=" << tests[i].content_type;
EXPECT_EQ(tests[i].expected_boundary, boundary)
<< "content_type=" << tests[i].content_type;
} }
} }
......
...@@ -15,7 +15,7 @@ PASS text/html;charset=gbk' ...@@ -15,7 +15,7 @@ PASS text/html;charset=gbk'
PASS text/html;test;charset=gbk PASS text/html;test;charset=gbk
PASS text/html;test=;charset=gbk PASS text/html;test=;charset=gbk
PASS text/html;';charset=gbk PASS text/html;';charset=gbk
FAIL text/html;";charset=gbk assert_equals: expected "GBK" but got "UTF-8" PASS text/html;";charset=gbk
PASS text/html ; ; charset=gbk PASS text/html ; ; charset=gbk
PASS text/html;;;;charset=gbk PASS text/html;;;;charset=gbk
PASS text/html;charset="gbk" PASS text/html;charset="gbk"
...@@ -23,7 +23,7 @@ PASS text/html;charset="gbk ...@@ -23,7 +23,7 @@ PASS text/html;charset="gbk
PASS text/html;charset=gbk" PASS text/html;charset=gbk"
FAIL text/html;charset=" gbk" assert_equals: expected "GBK" but got "UTF-8" FAIL text/html;charset=" gbk" assert_equals: expected "GBK" but got "UTF-8"
FAIL text/html;charset="\ gbk" assert_equals: expected "GBK" but got "UTF-8" FAIL text/html;charset="\ gbk" assert_equals: expected "GBK" but got "UTF-8"
FAIL text/html;charset="\g\b\k" assert_equals: expected "GBK" but got "UTF-8" PASS text/html;charset="\g\b\k"
PASS text/html;charset="gbk"x PASS text/html;charset="gbk"x
PASS text/html;charset={gbk} PASS text/html;charset={gbk}
PASS text/html;0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789=x;charset=gbk PASS text/html;0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789=x;charset=gbk
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment