Rework Content-Type parsing code to more closely resemble spec.

This aligns behavior more closely FireFox, though less closely with Edge and Safari. See RFC at https://mimesniff.spec.whatwg.org/ (This behavior is also more consistent with https://tools.ietf.org/html/rfc7231#section-3.1.1.1) In particular, no longer split the string around semi-colons first, but instead parse character-by-character (Which only really affects weird things like charset=";"), and use backslash as an escape character in quoted string values. This affects both charset and boundary parameters, so we should keep an eye out for bug reports about either of those breaking in corner cases. This also applies the quoted string parsing logic to Content-Type boundary parameters (It was just used for charsets before) - this is unlikely to break anything, since the one consumer of that field arbitrarily removes all leading/trailing spaces and quotes from boundary strings. Bug: 774429 Change-Id: I28c9c035e1c1c28e181bd0e8005266cfd63af7e9 Reviewed-on: https://chromium-review.googlesource.com/990995 Commit-Queue: Matt Menke <mmenke@chromium.org> Reviewed-by: Bence Béky <bnc@chromium.org> Cr-Commit-Position: refs/heads/master@{#548008}

Rework Content-Type parsing code to more closely resemble spec.
This aligns behavior more closely FireFox, though less closely with Edge and Safari. See RFC at https://mimesniff.spec.whatwg.org/ (This behavior is also more consistent with https://tools.ietf.org/html/rfc7231#section-3.1.1.1) In particular, no longer split the string around semi-colons first, but instead parse character-by-character (Which only really affects weird things like charset=";"), and use backslash as an escape character in quoted string values. This affects both charset and boundary parameters, so we should keep an eye out for bug reports about either of those breaking in corner cases. This also applies the quoted string parsing logic to Content-Type boundary parameters (It was just used for charsets before) - this is unlikely to break anything, since the one consumer of that field arbitrarily removes all leading/trailing spaces and quotes from boundary strings. Bug: 774429 Change-Id: I28c9c035e1c1c28e181bd0e8005266cfd63af7e9 Reviewed-on: https://chromium-review.googlesource.com/990995 Commit-Queue: Matt Menke <mmenke@chromium.org> Reviewed-by: Bence Béky <bnc@chromium.org> Cr-Commit-Position: refs/heads/master@{#548008}
aada226a · Matt Menke · Commit Bot · f15d3c2e · aada226a · aada226a
Commit aada226a authored Apr 04, 2018 by Matt Menke Committed by Commit Bot Apr 04, 2018
4 changed files
--- a/net/http/http_util.cc
+++ b/net/http/http_util.cc
@@ -22,6 +22,7 @@
 namespace net {
 namespace {
 template <typename ConstIterator>
 void TrimLWSImplementation(ConstIterator* begin, ConstIterator* end) {
  // leading whitespace
@@ -33,23 +34,6 @@ void TrimLWSImplementation(ConstIterator* begin, ConstIterator* end) {
    --(*end);
 }
-// Helpers --------------------------------------------------------------------
-// Returns the index of the closing quote of the string, if any.  |start| points
-// at the opening quote.
-size_t FindStringEnd(const std::string& line, size_t start, char delim) {
-  DCHECK_LT(start, line.length());
-  DCHECK_EQ(line[start], delim);
-  DCHECK((delim == '"') || (delim == '\''));
-  const char set[] = { delim, '\\', '\0' };
-  for (size_t end = line.find_first_of(set, start + 1);
-       end != std::string::npos; end = line.find_first_of(set, end + 2)) {
-    if (line[end] != '\\')
-      return end;
-  }
-  return line.length();
-}
 }  // namespace
 // HttpUtil -------------------------------------------------------------------
@@ -79,65 +63,111 @@ void HttpUtil::ParseContentType(const std::string& content_type_str,
  if (type_end == std::string::npos)
    type_end = content_type_str.length();
-  size_t charset_val = 0;
+  std::string charset_value;
-  size_t charset_end = 0;
  bool type_has_charset = false;
-  // Iterate over parameters
+  // Iterate over parameters. Can't split the string around semicolons
-  size_t param_start = content_type_str.find_first_of(';', type_end);
+  // preemptively because quoted strings may include semicolons. Matches logic
-  if (param_start != std::string::npos) {
+  // in https://mimesniff.spec.whatwg.org/.
-    base::StringTokenizer tokenizer(begin + param_start, content_type_str.end(),
+  std::string::size_type offset = content_type_str.find_first_of(';', type_end);
-                                    ";");
+  while (offset < content_type_str.size()) {
-    tokenizer.set_quote_chars("\"");
+    DCHECK_EQ(';', content_type_str[offset]);
-    while (tokenizer.GetNext()) {
+    // Trim off the semicolon.
-      std::string::const_iterator equals_sign =
+    ++offset;
-          std::find(tokenizer.token_begin(), tokenizer.token_end(), '=');
-      if (equals_sign == tokenizer.token_end())
+    // Trim off any following spaces.
-        continue;
+    offset = content_type_str.find_first_not_of(HTTP_LWS, offset);
+    std::string::size_type param_name_start = offset;
+    // Extend parameter name until run into a semicolon or equals sign.  Per
+    // spec, trailing spaces are not removed.
+    offset = content_type_str.find_first_of(";=", offset);
+    // Nothing more to do if at end of string, or if there's no parameter
+    // value, since names without values aren't allowed.
+    if (offset == std::string::npos || content_type_str[offset] == ';')
+      continue;
-      std::string::const_iterator param_name_begin = tokenizer.token_begin();
+    base::StringPiece param_name(content_type_str.begin() + param_name_start,
-      std::string::const_iterator param_name_end = equals_sign;
+                                 content_type_str.begin() + offset);
-      std::string::const_iterator param_value_begin = equals_sign + 1;
-      std::string::const_iterator param_value_end = tokenizer.token_end();
+    // Now parse the value.
-      DCHECK(param_value_begin <= tokenizer.token_end());
+    DCHECK_EQ('=', content_type_str[offset]);
+    // Trim off the '='.
-      // From parameter name, only trim leading whitespace.
+    offset++;
-      // From parameter value, only trim trailing whitespace.
-      // See https://crbug.com/772834.
+    // Remove leading spaces. This violates the spec, though it matches
-      TrimLWS(&param_name_begin, &param_value_end);
+    // pre-existing behavior.
+    //
-      if (base::LowerCaseEqualsASCII(
+    // TODO(mmenke): Consider doing this (only?) after parsing quotes, which
-              base::StringPiece(param_name_begin, param_name_end), "charset")) {
+    // seems to align more with the spec - not the content-type spec, but the
-        // TODO(abarth): Refactor this function to consistently use iterators.
+    // GET spec's way of getting an encoding, and the spec for handling
-        charset_val = param_value_begin - begin;
+    // boundary values as well.
-        charset_end = param_value_end - begin;
+    // See https://encoding.spec.whatwg.org/#names-and-labels.
-        type_has_charset = true;
+    offset = content_type_str.find_first_not_of(HTTP_LWS, offset);
-      } else if (base::LowerCaseEqualsASCII(
-                     base::StringPiece(param_name_begin, param_name_end),
+    std::string param_value;
-                     "boundary")) {
+    if (offset == std::string::npos) {
-        if (boundary)
+      // Nothing to do here - an unquoted string of only whitespace is
-          boundary->assign(param_value_begin, param_value_end);
+      // considered to have an empty value.
+    } else if (content_type_str[offset] != '"') {
+      // If the first character is not a quotation mark, copy data directly.
+      std::string::size_type value_start = offset;
+      offset = content_type_str.find_first_of(';', offset);
+      std::string::size_type value_end = offset;
+      // Remove terminal whitespace. If ran off the end of the string, have to
+      // update |value_end| first.
+      if (value_end == std::string::npos)
+        value_end = content_type_str.size();
+      while (value_end > value_start &&
+             IsLWS(content_type_str[value_end - 1])) {
+        --value_end;
      }
-    }
-  }
-  if (type_has_charset) {
+      param_value =
-    // Trim leading whitespace from charset_val.
+          content_type_str.substr(value_start, value_end - value_start);
-    charset_val = content_type_str.find_first_not_of(HTTP_LWS, charset_val);
-    charset_val = std::min(charset_val, charset_end);
-    char first_char = content_type_str[charset_val];
-    // RFC 7231 Section 3.1.1.1 allows double quotes around charset.
-    if (first_char == '"') {
-      charset_end = FindStringEnd(content_type_str, charset_val, first_char);
-      ++charset_val;
-      DCHECK(charset_end >= charset_val);
    } else {
-      // Ignore the part after '('.  This is not in the standard, but may occur
+      // Otherwise, append data, with special handling for backslashes, until
-      // in rare cases.
+      // a close quote.
+      // Skip open quote.
+      DCHECK_EQ('"', content_type_str[offset]);
+      ++offset;
+      while (offset < content_type_str.size() &&
+             content_type_str[offset] != '"') {
+        // Skip over backslash and append the next character, when not at
+        // the end of the string. Otherwise, copy the next character (Which may
+        // be a backslash).
+        if (content_type_str[offset] == '\\' &&
+            offset + 1 < content_type_str.size()) {
+          ++offset;
+        }
+        param_value += content_type_str[offset];
+        ++offset;
+      }
+      offset = content_type_str.find_first_of(';', offset);
+    }
+    // 0-length parameter values are not considered valid.
+    if (!param_value.size())
+      continue;
+    // TODO(mmenke): Take first, rather than last, value in the case of
+    // duplicates.
+    // TODO(mmenke): Check that name has only valid characters.
+    if (base::LowerCaseEqualsASCII(param_name, "charset")) {
+      // Ignore the part after '('.  This is not in the standard, but may
+      // occur in rare cases.
      // TODO(bnc): Do not ignore the part after '('.
      // See https://crbug.com/772343.
-      charset_end = std::min(content_type_str.find_first_of("(", charset_val),
+      charset_value = param_value.substr(0, param_value.find("("));
-                             charset_end);
+      type_has_charset = true;
+    } else if (base::LowerCaseEqualsASCII(param_name, "boundary")) {
+      if (boundary)
+        boundary->assign(std::move(param_value));
    }
  }
@@ -164,8 +194,7 @@ void HttpUtil::ParseContentType(const std::string& content_type_str,
  }
  if ((!eq && *had_charset) || type_has_charset) {
    *had_charset = true;
-    *charset = base::ToLowerASCII(
+    *charset = base::ToLowerASCII(charset_value);
-        base::StringPiece(begin + charset_val, begin + charset_end));
  }
 }

--- a/net/http/http_util.h
+++ b/net/http/http_util.h
@@ -39,8 +39,8 @@ class NET_EXPORT HttpUtil {
  // charset values are normalized to lowercase.  The mime_type and charset
  // output values are only modified if the content_type_str contains a mime
  // type and charset value, respectively.  The boundary output value is
-  // optional and will be assigned the (quoted) value of the boundary
+  // optional and will be assigned the (unquoted) value of the boundary
-  // paramter, if any.
+  // parameter, if any.
  static void ParseContentType(const std::string& content_type_str,
                               std::string* mime_type,
                               std::string* charset,

--- a/net/http/http_util_unittest.cc
+++ b/net/http/http_util_unittest.cc
@@ -734,6 +734,18 @@ TEST(HttpUtilTest, ParseContentType) {
    const bool expected_had_charset;
    const char* const expected_boundary;
  } tests[] = {
+    { "text/html",
+      "text/html",
+      "",
+      false,
+      ""
+    },
+    { "text/html;",
+      "text/html",
+      "",
+      false,
+      ""
+    },
    { "text/html; charset=utf-8",
      "text/html",
      "utf-8",
@@ -763,7 +775,7 @@ TEST(HttpUtilTest, ParseContentType) {
      "text/html",
      "",
      false,
-      "\"WebKit-ada-df-dsf-adsfadsfs\""
+      "WebKit-ada-df-dsf-adsfadsfs"
    },
    // Parameter name is "boundary ", not "boundary".
    // See https://crbug.com/772834.
@@ -778,20 +790,20 @@ TEST(HttpUtilTest, ParseContentType) {
      "text/html",
      "",
      false,
-      " \"WebKit-ada-df-dsf-adsfadsfs\""
+      "WebKit-ada-df-dsf-adsfadsfs"
    },
    // Parameter value includes leading space.  See https://crbug.com/772834.
    { "text/html; boundary= \"WebKit-ada-df-dsf-adsfadsfs\"   ",
      "text/html",
      "",
      false,
-      " \"WebKit-ada-df-dsf-adsfadsfs\""
+      "WebKit-ada-df-dsf-adsfadsfs"
    },
    { "text/html; boundary=\"WebKit-ada-df-dsf-adsfadsfs  \"",
      "text/html",
      "",
      false,
-      "\"WebKit-ada-df-dsf-adsfadsfs  \""
+      "WebKit-ada-df-dsf-adsfadsfs  "
    },
    { "text/html; boundary=WebKit-ada-df-dsf-adsfadsfs",
      "text/html",
@@ -799,12 +811,110 @@ TEST(HttpUtilTest, ParseContentType) {
      false,
      "WebKit-ada-df-dsf-adsfadsfs"
    },
+    { "text/html; charset",
+      "text/html",
+      "",
+      false,
+      ""
+    },
+    { "text/html; charset=",
+      "text/html",
+      "",
+      false,
+      ""
+    },
+    { "text/html; charset= ",
+      "text/html",
+      "",
+      false,
+      ""
+    },
+    { "text/html; charset= ;",
+      "text/html",
+      "",
+      false,
+      ""
+    },
+    { "text/html; charset=\"\"",
+      "text/html",
+      "",
+      false,
+      ""
+    },
+    { "text/html; charset=\" \"",
+      "text/html",
+      " ",
+      true,
+      ""
+    },
+    { "text/html; charset; charset=; charset=utf-8",
+      "text/html",
+      "utf-8",
+      true,
+      ""
+    },
+    { "text/html; charset=utf-8; charset=; charset;",
+      "text/html",
+      "utf-8",
+      true,
+      ""
+    },
+    // Stray quotes ignored.
+    { "text/html; \"; \"\"; charset=utf-8",
+      "text/html",
+      "utf-8",
+      true,
+      ""
+    },
+    // Non-leading quotes kept as-is.
+    { "text/html; charset=u\"tf-8\"",
+      "text/html",
+      "u\"tf-8\"",
+      true,
+      ""
+    },
    { "text/html; charset=\"utf-8\"",
      "text/html",
      "utf-8",
      true,
      ""
    },
+    // No closing quote.
+    { "text/html; charset=\"utf-8",
+      "text/html",
+      "utf-8",
+      true,
+      ""
+    },
+    // Check that \ is treated as an escape character.
+    { "text/html; charset=\"\\utf\\-\\8\"",
+      "text/html",
+      "utf-8",
+      true,
+      ""
+    },
+    // More interseting escape character test - test escaped backslash, escaped
+    // quote, and backslash at end of input in unterminated quoted string.
+    { "text/html; charset=\"\\\\\\\"\\",
+      "text/html",
+      "\\\"\\",
+      true,
+      ""
+    },
+    // Check quoted semicolon.
+    { "text/html; charset=\";charset=utf-8;\"",
+      "text/html",
+      ";charset=utf-8;",
+      true,
+      ""
+    },
+    // Unclear if this one should just return utf-8 or not.
+    { "text/html; charset= \"utf-8\"",
+      "text/html",
+      "utf-8",
+      true,
+      ""
+    },
    // Regression test for https://crbug.com/772350:
    // Single quotes are not delimiters but must be treated as part of charset.
    { "text/html; charset='utf-8'",
@@ -823,10 +933,14 @@ TEST(HttpUtilTest, ParseContentType) {
    std::string boundary;
    HttpUtil::ParseContentType(tests[i].content_type, &mime_type, &charset,
                               &had_charset, &boundary);
-    EXPECT_EQ(tests[i].expected_mime_type, mime_type) << "i=" << i;
+    EXPECT_EQ(tests[i].expected_mime_type, mime_type)
-    EXPECT_EQ(tests[i].expected_charset, charset) << "i=" << i;
+        << "content_type=" << tests[i].content_type;
-    EXPECT_EQ(tests[i].expected_had_charset, had_charset) << "i=" << i;
+    EXPECT_EQ(tests[i].expected_charset, charset)
-    EXPECT_EQ(tests[i].expected_boundary, boundary) << "i=" << i;
+        << "content_type=" << tests[i].content_type;
+    EXPECT_EQ(tests[i].expected_had_charset, had_charset)
+        << "content_type=" << tests[i].content_type;
+    EXPECT_EQ(tests[i].expected_boundary, boundary)
+        << "content_type=" << tests[i].content_type;
  }
 }

--- a/third_party/WebKit/LayoutTests/external/wpt/mimesniff/mime-types/charset-parameter.window-expected.txt
+++ b/third_party/WebKit/LayoutTests/external/wpt/mimesniff/mime-types/charset-parameter.window-expected.txt
@@ -15,7 +15,7 @@ PASS text/html;charset=gbk'
 PASS text/html;test;charset=gbk
 PASS text/html;test=;charset=gbk
 PASS text/html;';charset=gbk
-FAIL text/html;";charset=gbk assert_equals: expected "GBK" but got "UTF-8"
+PASS text/html;";charset=gbk
 PASS text/html ; ; charset=gbk
 PASS text/html;;;;charset=gbk
 PASS text/html;charset="gbk"
@@ -23,7 +23,7 @@ PASS text/html;charset="gbk
 PASS text/html;charset=gbk"
 FAIL text/html;charset=" gbk" assert_equals: expected "GBK" but got "UTF-8"
 FAIL text/html;charset="\ gbk" assert_equals: expected "GBK" but got "UTF-8"
-FAIL text/html;charset="\g\b\k" assert_equals: expected "GBK" but got "UTF-8"
+PASS text/html;charset="\g\b\k"
 PASS text/html;charset="gbk"x
 PASS text/html;charset={gbk}
 PASS text/html;0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789=x;charset=gbk