Commit 13a5c886 authored by Josh Karlin's avatar Josh Karlin Committed by Commit Bot

[Presubmit Speed] Make pretty-printing histograms faster

What: This CL dramatically speeds up pretty printing of xml files. It
speeds up 'git cl format' for CLs with histograms.xml changes from 9
seconds to 3 seconds. This also speeds up other common operations like
'git cl upload'.

How: The pretty printer now processes an instance of ElementTree
instead of minidom (and will convert minidom to etree if necessary).
There is overhead converting minidom to etree but it's still an
improvement for those cases where clients only speak minidom.

The histograms pretty printer has been rewritten to speak
ElementTree. The others (rappor, ukm, actions) still speak
minidom and they should be converted in followup CLs but it's
less important since their XML files are much smaller.

Note: Top-level comments aren't supported by ElementTree, so I've
written a SAX parser to copy everything above the first tag in
the document and paste it into the result of pretty printing.
This is in etree_util.py

Otherwise, pretty printing should behave identically. I've improved
the testing to catch any mistakes I may have made, but given this
involves parsing I wouldn't be surprised if a follow-up CL or two
isn't required.

Bug: 945354
Change-Id: I28990d245f5d471203f6430e08c8a98e3ba70d03
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1538783
Commit-Queue: Alexei Svitkine <asvitkine@chromium.org>
Reviewed-by: default avatarAlexei Svitkine <asvitkine@chromium.org>
Cr-Commit-Position: refs/heads/master@{#648683}
parent 8211fc72
...@@ -97,6 +97,7 @@ group("metrics_python_tests") { ...@@ -97,6 +97,7 @@ group("metrics_python_tests") {
"//tools/metrics/common/path_util.py", "//tools/metrics/common/path_util.py",
"//tools/metrics/common/presubmit_util.py", "//tools/metrics/common/presubmit_util.py",
"//tools/metrics/common/pretty_print_xml.py", "//tools/metrics/common/pretty_print_xml.py",
"//tools/metrics/common/etree_util.py",
"//tools/metrics/histograms/extract_histograms.py", "//tools/metrics/histograms/extract_histograms.py",
"//tools/metrics/histograms/generate_expired_histograms_array.py", "//tools/metrics/histograms/generate_expired_histograms_array.py",
......
...@@ -50,7 +50,7 @@ TAGS_THAT_HAVE_EXTRA_NEWLINE = { ...@@ -50,7 +50,7 @@ TAGS_THAT_HAVE_EXTRA_NEWLINE = {
# Tags that we allow to be squished into a single line for brevity. # Tags that we allow to be squished into a single line for brevity.
TAGS_THAT_ALLOW_SINGLE_LINE = ['obsolete', 'owner', 'description'] TAGS_THAT_ALLOW_SINGLE_LINE = ['obsolete', 'owner', 'description']
LOWERCASE_NAME_FN = lambda n: n.attributes['name'].value.lower() LOWERCASE_NAME_FN = lambda n: n.get('name').lower()
# Tags whose children we want to alphabetize. The key is the parent tag name, # Tags whose children we want to alphabetize. The key is the parent tag name,
# and the value is a list of pairs of tag name and key functions that maps each # and the value is a list of pairs of tag name and key functions that maps each
......
# Copyright 2019 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Utility functions for parsing XML strings into ElementTree nodes."""
import xml.etree.ElementTree as ET
import xml.sax
class _FirstTagFoundError(Exception):
"""Raised when the first tag is found in an XML document.
This isn't actually an error. Raising this exception is how we end parsing XML
documents early.
"""
pass
class _FirstTagFinder(xml.sax.ContentHandler):
"""An XML SAX parser that raises as soon as a tag is found.
Call getFirstTagLine to determine which line the tag was found on.
"""
def __init__(self):
xml.sax.ContentHandler.__init__(self)
self.first_tag_line = 0
self.first_tag_column = 0
def GetFirstTagLine(self):
return self.first_tag_line
def GetFirstTagColumn(self):
return self.first_tag_column
def setDocumentLocator(self, locator):
self.location = locator
def startElement(self, tag, attributes):
del tag, attributes # Unused.
# Now that the first tag is found, remember the location of it.
self.first_tag_line = self.location.getLineNumber()
self.first_tag_column = self.location.getColumnNumber()
# End parsing by throwing.
raise _FirstTagFoundError()
class _CommentedXMLParser(ET.XMLParser):
"""An ElementTree builder that preserves comments."""
def __init__(self, *args, **kwargs):
super(_CommentedXMLParser, self).__init__(*args, **kwargs)
self._parser.CommentHandler = self.comment
def comment(self, data): # pylint: disable=invalid-name
self._target.start(ET.Comment, {})
self._target.data(data)
self._target.end(ET.Comment)
def GetTopLevelContent(file_content):
"""Returns a string of all the text in the xml file before the first tag."""
handler = _FirstTagFinder()
first_tag_line = 0
first_tag_column = 0
try:
xml.sax.parseString(file_content, handler)
except _FirstTagFoundError:
# This is the expected case, it means a tag was found in the doc.
first_tag_line = handler.GetFirstTagLine()
first_tag_column = handler.GetFirstTagColumn()
if first_tag_line == 0 and first_tag_column == 0:
return ''
char = 0
for _ in range(first_tag_line - 1):
char = file_content.index('\n', char) + 1
char += first_tag_column - 1
# |char| is now pointing at the final character before the opening tag '<'.
top_content = file_content[:char + 1].strip()
if not top_content:
return ''
return top_content + '\n\n'
def ParseXMLString(raw_xml):
"""Parses raw_xml and returns an ElementTree node that includes comments."""
return ET.fromstring(raw_xml, _CommentedXMLParser())
This diff is collapsed.
...@@ -65,13 +65,13 @@ TAGS_THAT_HAVE_EXTRA_NEWLINE = { ...@@ -65,13 +65,13 @@ TAGS_THAT_HAVE_EXTRA_NEWLINE = {
# Tags that we allow to be squished into a single line for brevity. # Tags that we allow to be squished into a single line for brevity.
TAGS_THAT_ALLOW_SINGLE_LINE = ['summary', 'int', 'owner'] TAGS_THAT_ALLOW_SINGLE_LINE = ['summary', 'int', 'owner']
LOWERCASE_NAME_FN = lambda n: n.attributes['name'].value.lower() LOWERCASE_NAME_FN = lambda n: n.get('name').lower()
def _NaturalSortByName(node): def _NaturalSortByName(node):
"""Sort by name, ordering numbers in the way humans expect.""" """Sort by name, ordering numbers in the way humans expect."""
# See: https://blog.codinghorror.com/sorting-for-humans-natural-sort-order/ # See: https://blog.codinghorror.com/sorting-for-humans-natural-sort-order/
name = node.attributes['name'].value.lower() name = node.get('name').lower()
convert = lambda text: int(text) if text.isdigit() else text convert = lambda text: int(text) if text.isdigit() else text
return [convert(c) for c in re.split('([0-9]+)', name)] return [convert(c) for c in re.split('([0-9]+)', name)]
...@@ -82,7 +82,7 @@ def _NaturalSortByName(node): ...@@ -82,7 +82,7 @@ def _NaturalSortByName(node):
TAGS_ALPHABETIZATION_RULES = { TAGS_ALPHABETIZATION_RULES = {
'histograms': [('histogram', LOWERCASE_NAME_FN)], 'histograms': [('histogram', LOWERCASE_NAME_FN)],
'enums': [('enum', LOWERCASE_NAME_FN)], 'enums': [('enum', LOWERCASE_NAME_FN)],
'enum': [('int', lambda n: int(n.attributes['value'].value))], 'enum': [('int', lambda n: int(n.get('value')))],
'histogram_suffixes_list': [('histogram_suffixes', LOWERCASE_NAME_FN)], 'histogram_suffixes_list': [('histogram_suffixes', LOWERCASE_NAME_FN)],
'histogram_suffixes': [ 'histogram_suffixes': [
('obsolete', lambda n: None), ('obsolete', lambda n: None),
......
...@@ -18,19 +18,17 @@ import logging ...@@ -18,19 +18,17 @@ import logging
import os import os
import shutil import shutil
import sys import sys
import xml.dom.minidom
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common')) sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
import diff_util import diff_util
import presubmit_util import presubmit_util
import etree_util
import histograms_print_style import histograms_print_style
class Error(Exception): class Error(Exception):
pass pass
UNIT_REWRITES = { UNIT_REWRITES = {
'microsecond': 'microseconds', 'microsecond': 'microseconds',
'us': 'microseconds', 'us': 'microseconds',
...@@ -49,34 +47,39 @@ UNIT_REWRITES = { ...@@ -49,34 +47,39 @@ UNIT_REWRITES = {
'percentage': '%', 'percentage': '%',
} }
def canonicalizeUnits(tree): def canonicalizeUnits(tree):
"""Canonicalize the spelling of certain units in histograms.""" """Canonicalize the spelling of certain units in histograms."""
histograms = tree.getElementsByTagName('histogram') if tree.tag == 'histogram':
for histogram in histograms: units = tree.get('units')
units = histogram.attributes.get('units') if units and units in UNIT_REWRITES:
if units and units.value in UNIT_REWRITES: tree.set('units', UNIT_REWRITES[units])
histogram.attributes['units'] = UNIT_REWRITES[units.value]
for child in tree:
canonicalizeUnits(child)
def fixObsoleteOrder(tree): def fixObsoleteOrder(tree):
"""Put obsolete tags at the beginning of histogram tags.""" """Put obsolete tags at the beginning of histogram tags."""
histograms = tree.getElementsByTagName('histogram') for child in tree:
for histogram in histograms: obsoletes = []
obsoletes = histogram.getElementsByTagName('obsolete') if child.tag == 'obsolete':
if obsoletes: obsoletes.append(child)
histogram.insertBefore(obsoletes[0], histogram.firstChild) for obsolete in obsoletes:
tree.remove(obsolete)
tree.insert(0, obsolete)
fixObsoleteOrder(child)
def DropNodesByTagName(tree, tag): def DropNodesByTagName(tree, tag):
"""Drop all nodes with named tag from the XML tree.""" """Drop all nodes with named tag from the XML tree."""
nodes = tree.getElementsByTagName(tag) for child in tree:
for node in nodes: removes = []
node.parentNode.removeChild(node) if child.tag == tag:
removes.append(child)
for child in removes:
tree.remove(child)
DropNodesByTagName(child, tag)
def PrettyPrintHistograms(raw_xml): def PrettyPrintHistograms(raw_xml):
"""Pretty-print the given XML. """Pretty-print the given histograms XML.
Args: Args:
raw_xml: The contents of the histograms XML file, as a string. raw_xml: The contents of the histograms XML file, as a string.
...@@ -84,43 +87,48 @@ def PrettyPrintHistograms(raw_xml): ...@@ -84,43 +87,48 @@ def PrettyPrintHistograms(raw_xml):
Returns: Returns:
The pretty-printed version. The pretty-printed version.
""" """
tree = xml.dom.minidom.parseString(raw_xml) top_level_content = etree_util.GetTopLevelContent(raw_xml)
return PrettyPrintHistogramsTree(tree) root = etree_util.ParseXMLString(raw_xml)
return top_level_content + PrettyPrintHistogramsTree(root)
def PrettyPrintHistogramsTree(tree): def PrettyPrintHistogramsTree(tree):
"""Pretty-print the given xml.dom.minidom.Document object. """Pretty-print the given ElementTree element.
Args: Args:
tree: The xml.dom.minidom.Document object. tree: The ElementTree element.
Returns: Returns:
The pretty-printed version as an XML string. The pretty-printed version as an XML string.
""" """
assert isinstance(tree, xml.dom.minidom.Document)
# Prevent accidentally adding enums to histograms.xml # Prevent accidentally adding enums to histograms.xml
DropNodesByTagName(tree, 'enums') DropNodesByTagName(tree, 'enums')
canonicalizeUnits(tree) canonicalizeUnits(tree)
fixObsoleteOrder(tree) fixObsoleteOrder(tree)
return histograms_print_style.GetPrintStyle().PrettyPrintXml(tree) return histograms_print_style.GetPrintStyle().PrettyPrintXml(tree)
def PrettyPrintEnums(raw_xml): def PrettyPrintEnums(raw_xml):
"""Pretty print the enums.xml file.""" """Pretty print the given enums XML."""
tree = xml.dom.minidom.parseString(raw_xml)
root = etree_util.ParseXMLString(raw_xml)
# Prevent accidentally adding histograms to enums.xml # Prevent accidentally adding histograms to enums.xml
DropNodesByTagName(tree, 'histograms') DropNodesByTagName(root, 'histograms')
DropNodesByTagName(tree, 'histogram_suffixes_list') DropNodesByTagName(root, 'histogram_suffixes_list')
return histograms_print_style.GetPrintStyle().PrettyPrintXml(tree)
top_level_content = etree_util.GetTopLevelContent(raw_xml)
formatted_xml = (histograms_print_style.GetPrintStyle()
.PrettyPrintXml(root))
return top_level_content + formatted_xml
def main(): def main():
status1 = presubmit_util.DoPresubmit(sys.argv, 'enums.xml', status1 = presubmit_util.DoPresubmit(sys.argv, 'enums.xml',
'enums.before.pretty-print.xml', 'enums.before.pretty-print.xml',
'pretty_print.py', PrettyPrintEnums) 'pretty_print.py', PrettyPrintEnums)
status2 = presubmit_util.DoPresubmit(sys.argv, 'histograms.xml', status2 = presubmit_util.DoPresubmit(sys.argv, 'histograms.xml',
'histograms.before.pretty-print.xml', 'histograms.before.pretty-print.xml',
'pretty_print.py', PrettyPrintHistograms) 'pretty_print.py',
PrettyPrintHistograms)
sys.exit(status1 or status2) sys.exit(status1 or status2)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -9,30 +9,55 @@ import pretty_print ...@@ -9,30 +9,55 @@ import pretty_print
ORIGINAL_XML = """ ORIGINAL_XML = """
<!-- Top level Comment 1 -->
<!-- Top level Comment 2 -->
<histogram-configuration> <histogram-configuration>
<histograms> <histograms>
<histogram name="Test.Histogram" units="things"> <histogram name="Test.Histogram" units="us">
<owner>person@chromium.org</owner> <owner>person@chromium.org</owner>
<summary>A long line that should be formatted in a way that does not result <summary>A long line that should be formatted in a way that does not result
in extra whitespace between words. in extra whitespace between words.
It has multiple paragraphs.
</summary> </summary>
Mixed content.
<obsolete>
Removed 1/2019.
</obsolete>
</histogram>
<histogram name="Foo.Bar" units="xxxxxxxxxxxxxxxxxxxyyyyyyyyyyyyyyyyyyyyyyzzzz">
<summary>Foo</summary>
</histogram> </histogram>
</histograms> </histograms>
<enums>This shouldn't be here</enums>
</histogram-configuration> </histogram-configuration>
""".strip() """.strip()
PRETTY_XML = """ PRETTY_XML = """
<!-- Top level Comment 1 -->
<!-- Top level Comment 2 -->
<histogram-configuration> <histogram-configuration>
<histograms> <histograms>
<histogram name="Test.Histogram" units="things"> <histogram name="Foo.Bar" units="xxxxxxxxxxxxxxxxxxxyyyyyyyyyyyyyyyyyyyyyyzzzz">
<summary>Foo</summary>
</histogram>
<histogram name="Test.Histogram" units="microseconds">
<obsolete>
Removed 1/2019.
</obsolete>
<owner>person@chromium.org</owner> <owner>person@chromium.org</owner>
<summary> <summary>
A long line that should be formatted in a way that does not result in extra A long line that should be formatted in a way that does not result in extra
whitespace between words. whitespace between words.
It has multiple paragraphs.
</summary> </summary>
Mixed content.
</histogram> </histogram>
</histograms> </histograms>
...@@ -43,11 +68,10 @@ PRETTY_XML = """ ...@@ -43,11 +68,10 @@ PRETTY_XML = """
class PrettyPrintHistogramsXmlTest(unittest.TestCase): class PrettyPrintHistogramsXmlTest(unittest.TestCase):
def testWhitespaceWrapping(self): def testPrettyPrinting(self):
result = pretty_print.PrettyPrintHistograms(ORIGINAL_XML) result = pretty_print.PrettyPrintHistograms(ORIGINAL_XML)
self.maxDiff = None self.maxDiff = None
self.assertMultiLineEqual(PRETTY_XML, result.strip()) self.assertMultiLineEqual(PRETTY_XML, result.strip())
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -14,7 +14,7 @@ _OBSOLETE_TYPE = models.TextNodeType('obsolete') ...@@ -14,7 +14,7 @@ _OBSOLETE_TYPE = models.TextNodeType('obsolete')
_OWNER_TYPE = models.TextNodeType('owner', single_line=True) _OWNER_TYPE = models.TextNodeType('owner', single_line=True)
_SUMMARY_TYPE = models.TextNodeType('summary') _SUMMARY_TYPE = models.TextNodeType('summary')
_LOWERCASE_NAME_FN = lambda n: n.attributes['name'].value.lower() _LOWERCASE_NAME_FN = lambda n: n.get('name').lower()
_ENUMERATION_TYPE = models.ObjectNodeType( _ENUMERATION_TYPE = models.ObjectNodeType(
'enumeration', 'enumeration',
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment