Fixes for SafeXmlParser choking on DTD and DOCTYPE.

Changing the SafeXmlParser so it ignores nodes like DTDs and processing instructions. Bug: 789838 Change-Id: I0bf4a07509309f9e7720fdcea6283542a0b489ff Reviewed-on: https://chromium-review.googlesource.com/804946 Reviewed-by: Scott Graham <scottmg@chromium.org> Reviewed-by: Robert Sesek <rsesek@chromium.org> Commit-Queue: Jay Civelli <jcivelli@chromium.org> Cr-Commit-Position: refs/heads/master@{#521392}

Fixes for SafeXmlParser choking on DTD and DOCTYPE.
Changing the SafeXmlParser so it ignores nodes like DTDs and processing instructions. Bug: 789838 Change-Id: I0bf4a07509309f9e7720fdcea6283542a0b489ff Reviewed-on: https://chromium-review.googlesource.com/804946 Reviewed-by: Scott Graham <scottmg@chromium.org> Reviewed-by: Robert Sesek <rsesek@chromium.org> Commit-Queue: Jay Civelli <jcivelli@chromium.org> Cr-Commit-Position: refs/heads/master@{#521392}
acc25ce9 · Jay Civelli · Commit Bot · a1b4f009 · acc25ce9 · acc25ce9
Commit acc25ce9 authored Dec 04, 2017 by Jay Civelli Committed by Commit Bot Dec 04, 2017
7 changed files
--- a/services/data_decoder/xml_parser.cc
+++ b/services/data_decoder/xml_parser.cc
@@ -101,9 +101,6 @@ void XmlParser::Parse(const std::string& xml, ParseCallback callback) {
  base::Value root_element;
  std::vector<base::Value*> element_stack;
  while (xml_reader.Read()) {
-    if (xml_reader.IsWhiteSpace() || xml_reader.IsComment())
-      continue;
-
    if (xml_reader.IsClosingElement()) {
      if (element_stack.empty()) {
        ReportError(std::move(callback), "Invalid XML: unbalanced elements");
@@ -123,14 +120,17 @@ void XmlParser::Parse(const std::string& xml, ParseCallback callback) {
      new_element = CreateTextNode(text, TextNodeType::kText);
    } else if (xml_reader.GetTextIfCDataElement(&text)) {
      new_element = CreateTextNode(text, TextNodeType::kCData);
-    } else {
-      // Element node.
+    } else if (xml_reader.IsElement()) {
      new_element = CreateNewElement(xml_reader.NodeFullName());
      PopulateNamespaces(&new_element, &xml_reader);
      PopulateAttributes(&new_element, &xml_reader);
      // Self-closing (empty) elements have no close tag (or children); don't
      // push them on the element stack.
      push_new_node_to_stack = !xml_reader.IsEmptyElement();
+    } else {
+      // Ignore all other node types (spaces, comments, processing instructions,
+      // DTDs...).
+      continue;
    }

    base::Value* new_element_ptr;
@@ -139,6 +139,7 @@ void XmlParser::Parse(const std::string& xml, ParseCallback callback) {
          AddChildToElement(current_element, std::move(new_element));
    } else {
      // First element we are parsing, it becomes the root element.
+      DCHECK(xml_reader.IsElement());
      DCHECK(root_element.is_none());
      root_element = std::move(new_element);
      new_element_ptr = &root_element;

--- a/services/data_decoder/xml_parser_fuzzer_corpus/input10
+++ b/services/data_decoder/xml_parser_fuzzer_corpus/input10
+<!-- This is a comment. -->
+<html>Some HTML</html>
--- a/services/data_decoder/xml_parser_fuzzer_corpus/input11
+++ b/services/data_decoder/xml_parser_fuzzer_corpus/input11
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+    Some fine HTML
+</html>
\ No newline at end of file
--- a/services/data_decoder/xml_parser_fuzzer_corpus/input9
+++ b/services/data_decoder/xml_parser_fuzzer_corpus/input9
+<?xml-stylesheet href="mystyle.css" type="text/css"?>
+<html>Some HTML</html>
+
--- a/services/data_decoder/xml_parser_unittest.cc
+++ b/services/data_decoder/xml_parser_unittest.cc
@@ -63,6 +63,33 @@ TEST_F(XmlParserTest, ParseBadXml) {
    TestParseXml(xml, "");
 }

+TEST_F(XmlParserTest, IgnoreComments) {
+  TestParseXml("<!-- This is the best XML document IN THE WORLD! --><a></a>",
+               R"( {"type": "element", "tag": "a"} )");
+}
+
+TEST_F(XmlParserTest, IgnoreProcessingCommands) {
+  TestParseXml(R"(<?xml-stylesheet href="mystyle.css" type="text/css"?>
+                  <a></a>)",
+               R"( {"type": "element", "tag": "a"} )");
+  TestParseXml("<a/><?hello?>", R"( {"type": "element", "tag": "a"} )");
+}
+
+TEST_F(XmlParserTest, IgnoreDocumentTypes) {
+  TestParseXml(
+      R"(<?xml version="1.0" encoding="utf-8"?>
+         <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+             "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+         <html xmlns="http://www.w3.org/1999/xhtml">Some HTML</html>
+      )",
+      R"( {"type": "element",
+           "namespaces": {"": "http://www.w3.org/1999/xhtml"},
+           "tag": "html",
+           "children":[{"type": "text", "text": "Some HTML"}]
+          }
+      )");
+}
+
 TEST_F(XmlParserTest, ParseSelfClosingTag) {
  TestParseXml("<a/>", R"( {"type": "element", "tag": "a"} )");
  TestParseXml("<a><b/></a>",

--- a/third_party/libxml/chromium/libxml_utils.cc
+++ b/third_party/libxml/chromium/libxml_utils.cc
@@ -158,6 +158,10 @@ bool XmlReader::GetTextIfCDataElement(std::string* content) {
  return true;
 }

+bool XmlReader::IsElement() {
+  return NodeType() == XML_READER_TYPE_ELEMENT;
+}
+
 bool XmlReader::IsClosingElement() {
  return NodeType() == XML_READER_TYPE_END_ELEMENT;
 }
@@ -166,15 +170,6 @@ bool XmlReader::IsEmptyElement() {
  return xmlTextReaderIsEmptyElement(reader_);
 }

-bool XmlReader::IsWhiteSpace() {
-  return NodeType() == XML_READER_TYPE_WHITESPACE ||
-         NodeType() == XML_READER_TYPE_SIGNIFICANT_WHITESPACE;
-}
-
-bool XmlReader::IsComment() {
-  return NodeType() == XML_READER_TYPE_COMMENT;
-}
-
 bool XmlReader::ReadElementContent(std::string* content) {
  const int start_depth = Depth();


--- a/third_party/libxml/chromium/libxml_utils.h
+++ b/third_party/libxml/chromium/libxml_utils.h
@@ -89,6 +89,11 @@ class XmlReader {
  bool GetTextIfTextElement(std::string* content);
  bool GetTextIfCDataElement(std::string* content);

+  // Returns true if the node is an element start tag (e.g. <foo>). This is
+  // also true for self-closing elements (e.g. <foo/>); use IsEmptyElement() to
+  // tell those apart.
+  bool IsElement();
+
  // Returns true if the node is a closing element (e.g. </foo>).
  bool IsClosingElement();

@@ -96,12 +101,6 @@ class XmlReader {
  // <foo/>).
  bool IsEmptyElement();

-  // Returns true if the current node is a white-space node.
-  bool IsWhiteSpace();
-
-  // Returns true if the current node is a comment (e.g. <!-- comment -->).
-  bool IsComment();
-
  // Helper functions not provided by libxml ----------------------------------

  // Return the string content within an element.