[Presubmit Speed] Make pretty-printing histograms faster

What: This CL dramatically speeds up pretty printing of xml files. It speeds up 'git cl format' for CLs with histograms.xml changes from 9 seconds to 3 seconds. This also speeds up other common operations like 'git cl upload'. How: The pretty printer now processes an instance of ElementTree instead of minidom (and will convert minidom to etree if necessary). There is overhead converting minidom to etree but it's still an improvement for those cases where clients only speak minidom. The histograms pretty printer has been rewritten to speak ElementTree. The others (rappor, ukm, actions) still speak minidom and they should be converted in followup CLs but it's less important since their XML files are much smaller. Note: Top-level comments aren't supported by ElementTree, so I've written a SAX parser to copy everything above the first tag in the document and paste it into the result of pretty printing. This is in etree_util.py Otherwise, pretty printing should behave identically. I've improved the testing to catch any mistakes I may have made, but given this involves parsing I wouldn't be surprised if a follow-up CL or two isn't required. Bug: 945354 Change-Id: I28990d245f5d471203f6430e08c8a98e3ba70d03 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1538783 Commit-Queue: Alexei Svitkine <asvitkine@chromium.org> Reviewed-by: Alexei Svitkine <asvitkine@chromium.org> Cr-Commit-Position: refs/heads/master@{#648683}

[Presubmit Speed] Make pretty-printing histograms faster
What: This CL dramatically speeds up pretty printing of xml files. It speeds up 'git cl format' for CLs with histograms.xml changes from 9 seconds to 3 seconds. This also speeds up other common operations like 'git cl upload'. How: The pretty printer now processes an instance of ElementTree instead of minidom (and will convert minidom to etree if necessary). There is overhead converting minidom to etree but it's still an improvement for those cases where clients only speak minidom. The histograms pretty printer has been rewritten to speak ElementTree. The others (rappor, ukm, actions) still speak minidom and they should be converted in followup CLs but it's less important since their XML files are much smaller. Note: Top-level comments aren't supported by ElementTree, so I've written a SAX parser to copy everything above the first tag in the document and paste it into the result of pretty printing. This is in etree_util.py Otherwise, pretty printing should behave identically. I've improved the testing to catch any mistakes I may have made, but given this involves parsing I wouldn't be surprised if a follow-up CL or two isn't required. Bug: 945354 Change-Id: I28990d245f5d471203f6430e08c8a98e3ba70d03 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1538783 Commit-Queue: Alexei Svitkine <asvitkine@chromium.org> Reviewed-by: Alexei Svitkine <asvitkine@chromium.org> Cr-Commit-Position: refs/heads/master@{#648683}
13a5c886 · Josh Karlin · Commit Bot · 8211fc72 · 13a5c886 · 13a5c886
Commit 13a5c886 authored Apr 08, 2019 by Josh Karlin Committed by Commit Bot Apr 08, 2019
8 changed files
--- a/tools/metrics/BUILD.gn
+++ b/tools/metrics/BUILD.gn
@@ -97,6 +97,7 @@ group("metrics_python_tests") {
    "//tools/metrics/common/path_util.py",
    "//tools/metrics/common/presubmit_util.py",
    "//tools/metrics/common/pretty_print_xml.py",
+    "//tools/metrics/common/etree_util.py",
    "//tools/metrics/histograms/extract_histograms.py",
    "//tools/metrics/histograms/generate_expired_histograms_array.py",

--- a/tools/metrics/actions/actions_print_style.py
+++ b/tools/metrics/actions/actions_print_style.py
@@ -50,7 +50,7 @@ TAGS_THAT_HAVE_EXTRA_NEWLINE = {
 # Tags that we allow to be squished into a single line for brevity.
 TAGS_THAT_ALLOW_SINGLE_LINE = ['obsolete', 'owner', 'description']
-LOWERCASE_NAME_FN = lambda n: n.attributes['name'].value.lower()
+LOWERCASE_NAME_FN = lambda n: n.get('name').lower()
 # Tags whose children we want to alphabetize. The key is the parent tag name,
 # and the value is a list of pairs of tag name and key functions that maps each

--- a/tools/metrics/common/etree_util.py
+++ b/tools/metrics/common/etree_util.py
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+"""Utility functions for parsing XML strings into ElementTree nodes."""
+import xml.etree.ElementTree as ET
+import xml.sax
+class _FirstTagFoundError(Exception):
+  """Raised when the first tag is found in an XML document.
+  This isn't actually an error. Raising this exception is how we end parsing XML
+  documents early.
+  """
+  pass
+class _FirstTagFinder(xml.sax.ContentHandler):
+  """An XML SAX parser that raises as soon as a tag is found.
+  Call getFirstTagLine to determine which line the tag was found on.
+  """
+  def __init__(self):
+    xml.sax.ContentHandler.__init__(self)
+    self.first_tag_line = 0
+    self.first_tag_column = 0
+  def GetFirstTagLine(self):
+    return self.first_tag_line
+  def GetFirstTagColumn(self):
+    return self.first_tag_column
+  def setDocumentLocator(self, locator):
+    self.location = locator
+  def startElement(self, tag, attributes):
+    del tag, attributes  # Unused.
+    # Now that the first tag is found, remember the location of it.
+    self.first_tag_line = self.location.getLineNumber()
+    self.first_tag_column = self.location.getColumnNumber()
+    # End parsing by throwing.
+    raise _FirstTagFoundError()
+class _CommentedXMLParser(ET.XMLParser):
+  """An ElementTree builder that preserves comments."""
+  def __init__(self, *args, **kwargs):
+    super(_CommentedXMLParser, self).__init__(*args, **kwargs)
+    self._parser.CommentHandler = self.comment
+  def comment(self, data):  # pylint: disable=invalid-name
+    self._target.start(ET.Comment, {})
+    self._target.data(data)
+    self._target.end(ET.Comment)
+def GetTopLevelContent(file_content):
+  """Returns a string of all the text in the xml file before the first tag."""
+  handler = _FirstTagFinder()
+  first_tag_line = 0
+  first_tag_column = 0
+  try:
+    xml.sax.parseString(file_content, handler)
+  except _FirstTagFoundError:
+    # This is the expected case, it means a tag was found in the doc.
+    first_tag_line = handler.GetFirstTagLine()
+    first_tag_column = handler.GetFirstTagColumn()
+  if first_tag_line == 0 and first_tag_column == 0:
+    return ''
+  char = 0
+  for _ in range(first_tag_line - 1):
+    char = file_content.index('\n', char) + 1
+  char += first_tag_column - 1
+  # |char| is now pointing at the final character before the opening tag '<'.
+  top_content = file_content[:char + 1].strip()
+  if not top_content:
+    return ''
+  return top_content + '\n\n'
+def ParseXMLString(raw_xml):
+  """Parses raw_xml and returns an ElementTree node that includes comments."""
+  return ET.fromstring(raw_xml, _CommentedXMLParser())
--- a/tools/metrics/common/pretty_print_xml.py
+++ b/tools/metrics/common/pretty_print_xml.py
--- a/tools/metrics/histograms/histograms_print_style.py
+++ b/tools/metrics/histograms/histograms_print_style.py
@@ -65,13 +65,13 @@ TAGS_THAT_HAVE_EXTRA_NEWLINE = {
 # Tags that we allow to be squished into a single line for brevity.
 TAGS_THAT_ALLOW_SINGLE_LINE = ['summary', 'int', 'owner']
-LOWERCASE_NAME_FN = lambda n: n.attributes['name'].value.lower()
+LOWERCASE_NAME_FN = lambda n: n.get('name').lower()
 def _NaturalSortByName(node):
  """Sort by name, ordering numbers in the way humans expect."""
  # See: https://blog.codinghorror.com/sorting-for-humans-natural-sort-order/
-  name = node.attributes['name'].value.lower()
+  name = node.get('name').lower()
  convert = lambda text: int(text) if text.isdigit() else text
  return [convert(c) for c in re.split('([0-9]+)', name)]
@@ -82,7 +82,7 @@ def _NaturalSortByName(node):
 TAGS_ALPHABETIZATION_RULES = {
    'histograms': [('histogram', LOWERCASE_NAME_FN)],
    'enums': [('enum', LOWERCASE_NAME_FN)],
-    'enum': [('int', lambda n: int(n.attributes['value'].value))],
+    'enum': [('int', lambda n: int(n.get('value')))],
    'histogram_suffixes_list': [('histogram_suffixes', LOWERCASE_NAME_FN)],
    'histogram_suffixes': [
        ('obsolete', lambda n: None),

--- a/tools/metrics/histograms/pretty_print.py
+++ b/tools/metrics/histograms/pretty_print.py
@@ -18,19 +18,17 @@ import logging
 import os
 import shutil
 import sys
-import xml.dom.minidom
 sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
 import diff_util
 import presubmit_util
+import etree_util
 import histograms_print_style
 class Error(Exception):
  pass
 UNIT_REWRITES = {
  'microsecond': 'microseconds',
  'us': 'microseconds',
@@ -49,34 +47,39 @@ UNIT_REWRITES = {
  'percentage': '%',
 }
 def canonicalizeUnits(tree):
  """Canonicalize the spelling of certain units in histograms."""
-  histograms = tree.getElementsByTagName('histogram')
+  if tree.tag == 'histogram':
-  for histogram in histograms:
+    units = tree.get('units')
-    units = histogram.attributes.get('units')
+    if units and units in UNIT_REWRITES:
-    if units and units.value in UNIT_REWRITES:
+      tree.set('units', UNIT_REWRITES[units])
-      histogram.attributes['units'] = UNIT_REWRITES[units.value]
+  for child in tree:
+    canonicalizeUnits(child)
 def fixObsoleteOrder(tree):
  """Put obsolete tags at the beginning of histogram tags."""
-  histograms = tree.getElementsByTagName('histogram')
+  for child in tree:
-  for histogram in histograms:
+    obsoletes = []
-    obsoletes = histogram.getElementsByTagName('obsolete')
+    if child.tag == 'obsolete':
-    if obsoletes:
+      obsoletes.append(child)
-      histogram.insertBefore(obsoletes[0], histogram.firstChild)
+    for obsolete in obsoletes:
+      tree.remove(obsolete)
+      tree.insert(0, obsolete)
+    fixObsoleteOrder(child)
 def DropNodesByTagName(tree, tag):
  """Drop all nodes with named tag from the XML tree."""
-  nodes = tree.getElementsByTagName(tag)
+  for child in tree:
-  for node in nodes:
+    removes = []
-    node.parentNode.removeChild(node)
+    if child.tag == tag:
+      removes.append(child)
+    for child in removes:
+      tree.remove(child)
+    DropNodesByTagName(child, tag)
 def PrettyPrintHistograms(raw_xml):
-  """Pretty-print the given XML.
+  """Pretty-print the given histograms XML.
  Args:
    raw_xml: The contents of the histograms XML file, as a string.
@@ -84,43 +87,48 @@ def PrettyPrintHistograms(raw_xml):
  Returns:
    The pretty-printed version.
  """
-  tree = xml.dom.minidom.parseString(raw_xml)
+  top_level_content = etree_util.GetTopLevelContent(raw_xml)
-  return PrettyPrintHistogramsTree(tree)
+  root = etree_util.ParseXMLString(raw_xml)
+  return top_level_content + PrettyPrintHistogramsTree(root)
 def PrettyPrintHistogramsTree(tree):
-  """Pretty-print the given xml.dom.minidom.Document object.
+  """Pretty-print the given ElementTree element.
  Args:
-    tree: The xml.dom.minidom.Document object.
+    tree: The ElementTree element.
  Returns:
    The pretty-printed version as an XML string.
  """
-  assert isinstance(tree, xml.dom.minidom.Document)
  # Prevent accidentally adding enums to histograms.xml
  DropNodesByTagName(tree, 'enums')
  canonicalizeUnits(tree)
  fixObsoleteOrder(tree)
  return histograms_print_style.GetPrintStyle().PrettyPrintXml(tree)
 def PrettyPrintEnums(raw_xml):
-  """Pretty print the enums.xml file."""
+  """Pretty print the given enums XML."""
-  tree = xml.dom.minidom.parseString(raw_xml)
+  root = etree_util.ParseXMLString(raw_xml)
  # Prevent accidentally adding histograms to enums.xml
-  DropNodesByTagName(tree, 'histograms')
+  DropNodesByTagName(root, 'histograms')
-  DropNodesByTagName(tree, 'histogram_suffixes_list')
+  DropNodesByTagName(root, 'histogram_suffixes_list')
-  return histograms_print_style.GetPrintStyle().PrettyPrintXml(tree)
+  top_level_content = etree_util.GetTopLevelContent(raw_xml)
+  formatted_xml = (histograms_print_style.GetPrintStyle()
+                  .PrettyPrintXml(root))
+  return top_level_content + formatted_xml
 def main():
  status1 = presubmit_util.DoPresubmit(sys.argv, 'enums.xml',
                                       'enums.before.pretty-print.xml',
                                       'pretty_print.py', PrettyPrintEnums)
  status2 = presubmit_util.DoPresubmit(sys.argv, 'histograms.xml',
-                                       'histograms.before.pretty-print.xml',
+                                        'histograms.before.pretty-print.xml',
-                                       'pretty_print.py', PrettyPrintHistograms)
+                                        'pretty_print.py',
+                                        PrettyPrintHistograms)
  sys.exit(status1 or status2)
 if __name__ == '__main__':

--- a/tools/metrics/histograms/pretty_print_test.py
+++ b/tools/metrics/histograms/pretty_print_test.py
@@ -9,30 +9,55 @@ import pretty_print
 ORIGINAL_XML = """
+<!-- Top level Comment 1 -->
+<!-- Top level Comment 2 -->
 <histogram-configuration>
 <histograms>
- <histogram name="Test.Histogram" units="things">
+ <histogram name="Test.Histogram" units="us">
   <owner>person@chromium.org</owner>
   <summary>A long line that should be formatted in a way that does not result
     in extra whitespace between words.
+        It has multiple paragraphs.
   </summary>
+   Mixed content.
+   <obsolete>
+       Removed 1/2019.
+   </obsolete>
+ </histogram>
+ <histogram name="Foo.Bar" units="xxxxxxxxxxxxxxxxxxxyyyyyyyyyyyyyyyyyyyyyyzzzz">
+  <summary>Foo</summary>
 </histogram>
 </histograms>
+<enums>This shouldn't be here</enums>
 </histogram-configuration>
 """.strip()
 PRETTY_XML = """
+<!-- Top level Comment 1 -->
+<!-- Top level Comment 2 -->
 <histogram-configuration>
 <histograms>
-<histogram name="Test.Histogram" units="things">
+<histogram name="Foo.Bar" units="xxxxxxxxxxxxxxxxxxxyyyyyyyyyyyyyyyyyyyyyyzzzz">
+  <summary>Foo</summary>
+</histogram>
+<histogram name="Test.Histogram" units="microseconds">
+  <obsolete>
+    Removed 1/2019.
+  </obsolete>
  <owner>person@chromium.org</owner>
  <summary>
    A long line that should be formatted in a way that does not result in extra
    whitespace between words.
+    It has multiple paragraphs.
  </summary>
+  Mixed content.
 </histogram>
 </histograms>
@@ -43,11 +68,10 @@ PRETTY_XML = """
 class PrettyPrintHistogramsXmlTest(unittest.TestCase):
-  def testWhitespaceWrapping(self):
+  def testPrettyPrinting(self):
    result = pretty_print.PrettyPrintHistograms(ORIGINAL_XML)
    self.maxDiff = None
    self.assertMultiLineEqual(PRETTY_XML, result.strip())
 if __name__ == '__main__':
  unittest.main()
--- a/tools/metrics/ukm/ukm_model.py
+++ b/tools/metrics/ukm/ukm_model.py
@@ -14,7 +14,7 @@ _OBSOLETE_TYPE = models.TextNodeType('obsolete')
 _OWNER_TYPE = models.TextNodeType('owner', single_line=True)
 _SUMMARY_TYPE = models.TextNodeType('summary')
-_LOWERCASE_NAME_FN = lambda n: n.attributes['name'].value.lower()
+_LOWERCASE_NAME_FN = lambda n: n.get('name').lower()
 _ENUMERATION_TYPE = models.ObjectNodeType(
    'enumeration',