Improve performance of generating histograms.xml

This CL implements IterElementsWithTag to avoid not needed recursion. On local machine it saved around 1s (from 7.6s to 6.6s). Bug: 1016281 Change-Id: I0fea65892bc52574fd4e6fd3fd2455ae3ec646e0 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1872052 Commit-Queue: Alexei Svitkine <asvitkine@chromium.org> Reviewed-by: Alexei Svitkine <asvitkine@chromium.org> Cr-Commit-Position: refs/heads/master@{#709564}

Improve performance of generating histograms.xml
This CL implements IterElementsWithTag to avoid not needed recursion. On local machine it saved around 1s (from 7.6s to 6.6s). Bug: 1016281 Change-Id: I0fea65892bc52574fd4e6fd3fd2455ae3ec646e0 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1872052 Commit-Queue: Alexei Svitkine <asvitkine@chromium.org> Reviewed-by: Alexei Svitkine <asvitkine@chromium.org> Cr-Commit-Position: refs/heads/master@{#709564}
f2d92284 · Pawel Pluciennik · Commit Bot · 4ac8b448 · f2d92284 · f2d92284
Commit f2d92284 authored Oct 25, 2019 by Pawel Pluciennik Committed by Commit Bot Oct 25, 2019
3 changed files
--- a/tools/metrics/histograms/expand_owners.py
+++ b/tools/metrics/histograms/expand_owners.py
@@ -303,9 +303,10 @@ def ExpandHistogramsOWNERS(histograms):
    Error: Raised if the OWNERS file with the given path does not exist.
  """
  email_pattern = re.compile(_EMAIL_PATTERN)
+  iter_matches = extract_histograms.IterElementsWithTag

-  for histogram in histograms.getElementsByTagName('histogram'):
-    owners = histogram.getElementsByTagName('owner')
+  for histogram in iter_matches(histograms, 'histogram'):
+    owners = [owner for owner in iter_matches(histogram, 'owner', 1)]

    # owner is a DOM Element with a single child, which is a DOM Text Node.
    emails_with_dom_elements = set([
@@ -316,12 +317,10 @@ def ExpandHistogramsOWNERS(histograms):
    # component is a DOM Element with a single child, which is a DOM Text Node.
    components_with_dom_elements = set([
      extract_histograms.NormalizeString(component.childNodes[0].data)
-      for component in histogram.getElementsByTagName('component')])
+      for component in iter_matches(histogram, 'component', 1)])

-    for index in range(len(owners)):
-      owner = owners[index]
+    for index, owner in enumerate(owners):
      owner_text = owner.childNodes[0].data
-
      name = histogram.getAttribute('name')
      if _IsEmailOrPlaceholder(index == 0, owner_text, name):
        continue

--- a/tools/metrics/histograms/extract_histograms.py
+++ b/tools/metrics/histograms/extract_histograms.py
@@ -55,6 +55,7 @@ XML below will generate the following five histograms:
 </histogram-configuration>
 """

+import bisect
 import copy
 import datetime

@@ -82,10 +83,59 @@ EXPIRY_DATE_PATTERN = "%Y-%m-%d"
 EXPIRY_MILESTONE_RE = re.compile(r'M[0-9]{2,3}\Z')


+_ELEMENT_NODE = xml.dom.minidom.Node.ELEMENT_NODE
+
+# List of fields that need to copied by copy.copy to suffixed histogram.
+# Other fields may be copied by reference.
+_HISTOGRAM_COPY_FIELDS = [
+    'fieldtrial_groups', 'fieldtrial_names', 'fieldtrial_labels']
+
+
 class Error(Exception):
  pass


+def IterElementsWithTag(root, tag, depth=-1):
+  """Iterates over DOM tree and yields elements matching tag name.
+
+  It's meant to be replacement for `getElementsByTagName`,
+  (which does recursive search) but without recursive search
+  (nested tags are not supported in histograms files).
+
+  Note: This generator stops going deeper in the tree when it detects
+  that there are elements with given tag.
+
+  Args:
+    root: XML dom tree.
+    tag: Element's tag name.
+    depth: Defines how deep in the tree function should search for a match.
+
+  Yields:
+    xml.dom.minidom.Node: Element matching criteria.
+  """
+  if depth == 0 and root.nodeType == _ELEMENT_NODE and root.tagName == tag:
+    yield root
+    return
+
+  had_tag = False
+
+  skipped = 0
+
+  for child in root.childNodes:
+    if child.nodeType == _ELEMENT_NODE and child.tagName == tag:
+      had_tag = True
+      yield child
+    else:
+      skipped += 1
+
+  depth -= 1
+
+  if not had_tag and depth != 0:
+    for child in root.childNodes:
+      for match in IterElementsWithTag(child, tag, depth):
+        yield match
+
+
 def _GetTextFromChildNodes(node):
  """Returns a string concatenation of the text of the given node's children.

@@ -160,7 +210,7 @@ def _NormalizeAllAttributeValues(node):
  Returns:
    The normalized minidom node.
  """
-  if node.nodeType == xml.dom.minidom.Node.ELEMENT_NODE:
+  if node.nodeType == _ELEMENT_NODE:
    for a in node.attributes.keys():
      node.attributes[a].value = NormalizeString(node.attributes[a].value)

@@ -234,7 +284,7 @@ def ExtractEnumsFromXmlTree(tree):
  have_errors = False

  last_name = None
-  for enum in tree.getElementsByTagName('enum'):
+  for enum in IterElementsWithTag(tree, 'enum'):
    name = enum.getAttribute('name')
    if last_name is not None and name.lower() < last_name.lower():
      logging.error('Enums %s and %s are not in alphabetical order', last_name,
@@ -251,7 +301,9 @@ def ExtractEnumsFromXmlTree(tree):
    enum_dict['name'] = name
    enum_dict['values'] = {}

-    for int_tag in enum.getElementsByTagName('int'):
+    nodes = list(IterElementsWithTag(enum, 'int'))
+
+    for int_tag in nodes:
      value_dict = {}
      int_value = int(int_tag.getAttribute('value'))
      if int_value in enum_dict['values']:
@@ -265,7 +317,7 @@ def ExtractEnumsFromXmlTree(tree):
    enum_int_values = sorted(enum_dict['values'].keys())

    last_int_value = None
-    for int_tag in enum.getElementsByTagName('int'):
+    for int_tag in nodes:
      int_value = int(int_tag.getAttribute('value'))
      if last_int_value is not None and int_value < last_int_value:
        logging.error('Enum %s int values %d and %d are not in numerical order',
@@ -282,9 +334,9 @@ def ExtractEnumsFromXmlTree(tree):
      else:
        last_int_value = int_value

-    summary_nodes = enum.getElementsByTagName('summary')
-    if summary_nodes:
-      enum_dict['summary'] = _GetTextFromChildNodes(summary_nodes[0])
+    for summary in IterElementsWithTag(enum, 'summary'):
+      enum_dict['summary'] = _GetTextFromChildNodes(summary)
+      break

    enums[name] = enum_dict

@@ -309,8 +361,9 @@ def _ExtractOwners(histogram):
  owners = []
  has_owner = False

-  for owner_node in histogram.getElementsByTagName('owner'):
-    owner_text = _GetTextFromChildNodes(owner_node)
+  for owner_node in IterElementsWithTag(histogram, 'owner', 1):
+    child = owner_node.firstChild
+    owner_text = (child and child.nodeValue) or ''
    is_email = email_pattern.match(owner_text)

    if owner_text and (is_email or OWNER_PLACEHOLDER in owner_text):
@@ -355,7 +408,7 @@ def _ExtractHistogramsFromXmlTree(tree, enums):
  histograms = {}
  have_errors = False
  last_name = None
-  for histogram in tree.getElementsByTagName('histogram'):
+  for histogram in IterElementsWithTag(tree, 'histogram'):
    name = histogram.getAttribute('name')
    if last_name is not None and name.lower() < last_name.lower():
      logging.error('Histograms %s and %s are not in alphabetical order',
@@ -382,12 +435,12 @@ def _ExtractHistogramsFromXmlTree(tree, enums):
        have_errors = True

    # Find <owner> tag.
-    owners, hasOwner = _ExtractOwners(histogram)
+    owners, has_owner = _ExtractOwners(histogram)
    if owners:
      histogram_entry['owners'] = owners

    # Find <summary> tag.
-    summary_nodes = histogram.getElementsByTagName('summary')
+    summary_nodes = list(IterElementsWithTag(histogram, 'summary'))

    if summary_nodes:
      histogram_entry['summary'] = _GetTextFromChildNodes(summary_nodes[0])
@@ -395,7 +448,7 @@ def _ExtractHistogramsFromXmlTree(tree, enums):
      histogram_entry['summary'] = 'TBD'

    # Find <obsolete> tag.
-    obsolete_nodes = histogram.getElementsByTagName('obsolete')
+    obsolete_nodes = list(IterElementsWithTag(histogram, 'obsolete', 1))
    if obsolete_nodes:
      reason = _GetTextFromChildNodes(obsolete_nodes[0])
      histogram_entry['obsolete'] = reason
@@ -406,7 +459,7 @@ def _ExtractHistogramsFromXmlTree(tree, enums):
      have_errors = True

    # Non-obsolete histograms should specify <owner>s.
-    if not obsolete_nodes and not hasOwner:
+    if not obsolete_nodes and not has_owner:
      logging.error('histogram %s should specify <owner>s', name)
      have_errors = True

@@ -421,9 +474,9 @@ def _ExtractHistogramsFromXmlTree(tree, enums):
      histogram_entry['units'] = histogram.getAttribute('units')

    # Find <details> tag.
-    details_nodes = histogram.getElementsByTagName('details')
-    if details_nodes:
-      histogram_entry['details'] = _GetTextFromChildNodes(details_nodes[0])
+    for node in IterElementsWithTag(histogram, 'details'):
+      histogram_entry['details'] = _GetTextFromChildNodes(node)
+      break

    # Handle enum types.
    if histogram.hasAttribute('enum'):
@@ -476,7 +529,9 @@ def _UpdateHistogramsWithSuffixes(tree, histograms):

  # Verify order of histogram_suffixes fields first.
  last_name = None
-  for histogram_suffixes in tree.getElementsByTagName(histogram_suffix_tag):
+
+  for histogram_suffixes in IterElementsWithTag(
+      tree, histogram_suffix_tag, depth=1):
    name = histogram_suffixes.getAttribute('name')
    if last_name is not None and name.lower() < last_name.lower():
      logging.error('histogram_suffixes %s and %s are not in alphabetical '
@@ -491,7 +546,7 @@ def _UpdateHistogramsWithSuffixes(tree, histograms):
  reprocess_queue = []

  def GenerateHistogramSuffixes():
-    for f in tree.getElementsByTagName(histogram_suffix_tag):
+    for f in IterElementsWithTag(tree, histogram_suffix_tag):
      yield 0, f
    for r, f in reprocess_queue:
      yield r, f
@@ -499,8 +554,8 @@ def _UpdateHistogramsWithSuffixes(tree, histograms):
  for reprocess_count, histogram_suffixes in GenerateHistogramSuffixes():
    # Check dependencies first
    dependencies_valid = True
-    affected_histograms = histogram_suffixes.getElementsByTagName(
-        'affected-histogram')
+    affected_histograms = list(IterElementsWithTag(
+        histogram_suffixes, 'affected-histogram', 1))
    for affected_histogram in affected_histograms:
      histogram_name = affected_histogram.getAttribute('name')
      if histogram_name not in histograms:
@@ -524,7 +579,7 @@ def _UpdateHistogramsWithSuffixes(tree, histograms):
    group_obsolete_reason = _GetObsoleteReason(histogram_suffixes)

    name = histogram_suffixes.getAttribute('name')
-    suffix_nodes = histogram_suffixes.getElementsByTagName(suffix_tag)
+    suffix_nodes = list(IterElementsWithTag(histogram_suffixes, suffix_tag, 1))
    suffix_labels = {}
    for suffix in suffix_nodes:
      suffix_name = suffix.getAttribute('name')
@@ -546,7 +601,7 @@ def _UpdateHistogramsWithSuffixes(tree, histograms):
                      histogram_name, name)
        have_errors = True
      last_histogram_name = histogram_name
-      with_suffixes = affected_histogram.getElementsByTagName(with_tag)
+      with_suffixes = list(IterElementsWithTag(affected_histogram, with_tag, 1))
      if with_suffixes:
        suffixes_to_add = with_suffixes
      else:
@@ -571,26 +626,20 @@ def _UpdateHistogramsWithSuffixes(tree, histograms):

          suffix_label = suffix_labels.get(suffix_name, '')

+          histogram_entry = histograms[new_histogram_name]
+          for field_name in _HISTOGRAM_COPY_FIELDS:
+            histogram_entry.setdefault(field_name, [])
+
          # TODO(yiyaoliu): Rename these to be consistent with the new naming.
          # It is kept unchanged for now to be it's used by dashboards.
-          if 'fieldtrial_groups' not in histograms[new_histogram_name]:
-            histograms[new_histogram_name]['fieldtrial_groups'] = []
-          histograms[new_histogram_name]['fieldtrial_groups'].append(
-              suffix_name)
-
-          if 'fieldtrial_names' not in histograms[new_histogram_name]:
-            histograms[new_histogram_name]['fieldtrial_names'] = []
-          histograms[new_histogram_name]['fieldtrial_names'].append(name)
-
-          if 'fieldtrial_labels' not in histograms[new_histogram_name]:
-            histograms[new_histogram_name]['fieldtrial_labels'] = []
-          histograms[new_histogram_name]['fieldtrial_labels'].append(
-              suffix_label)
+          histogram_entry['fieldtrial_groups'].append(suffix_name)
+          histogram_entry['fieldtrial_names'].append(name)
+          histogram_entry['fieldtrial_labels'].append(suffix_label)

          # If no owners are added for this histogram-suffixes, it inherits the
          # owners of its parents.
          if owners:
-            histograms[new_histogram_name]['owners'] = owners
+            histogram_entry['owners'] = owners

          # If a suffix has an obsolete node, it's marked as obsolete for the
          # specified reason, overwriting its group's obsoletion reason if the
@@ -602,9 +651,9 @@ def _UpdateHistogramsWithSuffixes(tree, histograms):
          # If the suffix has an obsolete tag, all histograms it generates
          # inherit it.
          if obsolete_reason:
-            histograms[new_histogram_name]['obsolete'] = obsolete_reason
+            histogram_entry['obsolete'] = obsolete_reason

-          _ProcessBaseHistogramAttribute(suffix, histograms[new_histogram_name])
+          _ProcessBaseHistogramAttribute(suffix, histogram_entry)

        except Error:
          have_errors = True
@@ -612,6 +661,26 @@ def _UpdateHistogramsWithSuffixes(tree, histograms):
  return have_errors


+def _GetTagSubTree(tree, tag, depth):
+  """Returns sub tree with tag element as a root.
+
+  When no element with tag name is found or there are many of them
+  original tree is returned.
+
+  Args:
+    tree: XML dom tree.
+    tag: Element's tag name.
+    depth: Defines how deep in the tree function should search for a match.
+
+  Returns:
+    xml.dom.minidom.Node: Sub tree (matching criteria) or original one.
+  """
+  entries = list(IterElementsWithTag(tree, tag, depth))
+  if len(entries) == 1:
+    tree = entries[0]
+  return tree
+
+
 def ExtractHistogramsFromDom(tree):
  """Computes the histogram names and descriptions from the XML representation.

@@ -621,13 +690,18 @@ def ExtractHistogramsFromDom(tree):
  Returns:
    a tuple of (histograms, status) where histograms is a dictionary mapping
    histogram names to dictionaries containing histogram descriptions and status
-    is a boolean indicating if errros were encoutered in processing.
+    is a boolean indicating if errros were encountered in processing.
  """
  _NormalizeAllAttributeValues(tree)

-  enums, enum_errors = ExtractEnumsFromXmlTree(tree)
-  histograms, histogram_errors = _ExtractHistogramsFromXmlTree(tree, enums)
-  update_errors = _UpdateHistogramsWithSuffixes(tree, histograms)
+  enums_tree = _GetTagSubTree(tree, 'enums', 2)
+  histograms_tree = _GetTagSubTree(tree, 'histograms', 2)
+  histogram_suffixes_tree = _GetTagSubTree(tree, 'histogram_suffixes_list', 2)
+  enums, enum_errors = ExtractEnumsFromXmlTree(enums_tree)
+  histograms, histogram_errors = _ExtractHistogramsFromXmlTree(
+      histograms_tree, enums)
+  update_errors = _UpdateHistogramsWithSuffixes(
+      histogram_suffixes_tree, histograms)

  return histograms, enum_errors or histogram_errors or update_errors


--- a/tools/metrics/histograms/merge_xml.py
+++ b/tools/metrics/histograms/merge_xml.py
@@ -6,9 +6,11 @@
 """A script to merge multiple source xml files into a single histograms.xml."""

 import argparse
-import expand_owners
 import xml.dom.minidom

+import expand_owners
+import extract_histograms
+

 def GetElementsByTagName(trees, tag):
  """Gets all elements with the specified tag from a set of DOM trees.
@@ -19,7 +21,8 @@ def GetElementsByTagName(trees, tag):
  Returns:
    A list of DOM nodes with the specified tag.
  """
-  return [e for t in trees for e in t.getElementsByTagName(tag)]
+  iterator = extract_histograms.IterElementsWithTag
+  return list(e for t in trees for e in iterator(t, tag, 2))


 def MakeNodeWithChildren(doc, tag, children):