Improves function that gets text from a node's children.

Paragraphs are now respected when a node with a child node whose text is split into paragraphs is encountered. Example of such a node: <tag> Paragraph 1. Still paragraph 1. Paragraph 2. </tag> Also, adds additional tests. Bug: 993948 Change-Id: I07c257e52a33026f5607946aa36b5e9ee5480056 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1754124 Commit-Queue: Alexei Svitkine <asvitkine@chromium.org> Reviewed-by: Alexei Svitkine <asvitkine@chromium.org> Cr-Commit-Position: refs/heads/master@{#687404}

Improves function that gets text from a node's children.
Paragraphs are now respected when a node with a child node whose text is split into paragraphs is encountered. Example of such a node: <tag> Paragraph 1. Still paragraph 1. Paragraph 2. </tag> Also, adds additional tests. Bug: 993948 Change-Id: I07c257e52a33026f5607946aa36b5e9ee5480056 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1754124 Commit-Queue: Alexei Svitkine <asvitkine@chromium.org> Reviewed-by: Alexei Svitkine <asvitkine@chromium.org> Cr-Commit-Position: refs/heads/master@{#687404}
b0c10683 · Caitlin Fischer · Commit Bot · 524cbc9a · b0c10683 · b0c10683
Commit b0c10683 authored Aug 15, 2019 by Caitlin Fischer Committed by Commit Bot Aug 15, 2019
Showing with 363 additions and 31 deletions

tools/metrics/histograms/extract_histograms.py tools/metrics/histograms/extract_histograms.py +61 -31

tools/metrics/histograms/extract_histograms_test.py tools/metrics/histograms/extract_histograms_test.py +302 -0

No files found.
--- a/tools/metrics/histograms/extract_histograms.py
+++ b/tools/metrics/histograms/extract_histograms.py
@@ -18,11 +18,14 @@ XML below will generate the following five histograms:
 <histograms>

 <histogram name="HistogramTime" units="milliseconds">
+  <owner>person@chromium.org</owner>
+  <owner>some-team@chromium.org</owner>
  <summary>A brief description.</summary>
  <details>This is a more thorough description of this histogram.</details>
 </histogram>

 <histogram name="HistogramEnum" enum="MyEnumType">
+  <owner>person@chromium.org</owner>
  <summary>This histogram sports an enum value type.</summary>
 </histogram>

@@ -40,7 +43,7 @@ XML below will generate the following five histograms:

 <histogram_suffixes_list>

-<histogram_suffixes name="BrowserType">
+<histogram_suffixes name="BrowserType" separator="_">
  <suffix name="Chrome"/>
  <suffix name="IE"/>
  <suffix name="Firefox"/>
@@ -50,13 +53,11 @@ XML below will generate the following five histograms:
 </histogram_suffixes_list>

 </histogram-configuration>
-
 """

-import HTMLParser
-import bisect
 import copy
 import datetime
+import HTMLParser
 import logging
 import re
 import xml.dom.minidom
@@ -79,39 +80,69 @@ class Error(Exception):
  pass


-def _JoinChildNodes(tag):
-  """Joins child nodes into a single text.
+def _GetTextFromChildNodes(node):
+  """Returns a string concatenation of the text of the given node's children.

-  Applicable to leafs like 'summary' and 'detail'. Removes any comment in the
-  node.
+  Comments are ignored, consecutive lines of text are joined with a single
+  space, and paragraphs are maintained so that long text is more readable on
+  dashboards.

  Args:
-    tag: parent node
-
-  Returns:
-    a string with concatenated nodes' text representation.
+    node: The DOM Element whose children's text is to be extracted, processed,
+      and returned.
  """
-  return ''.join(c.toxml()
-                 for c in tag.childNodes
-                 if c.nodeType != xml.dom.minidom.Node.COMMENT_NODE).strip()
+  paragraph_break = '\n\n'
+  text_parts = []

+  for child in node.childNodes:
+    if child.nodeType != xml.dom.minidom.Node.COMMENT_NODE:
+      child_text = child.toxml()
+      if not child_text:
+        continue

-def NormalizeString(s):
-  r"""Replaces all whitespace sequences with a single space.
+      # If the given node has the below XML representation, then the text
+      # added to the list is 'Some words.\n\nWords.'
+      # <tag>
+      #   Some
+      #   words.
+      #
+      #   <!--Child comment node.-->
+      #
+      #   Words.
+      # </tag>

-  The function properly handles multi-line strings and XML escaped characters.
+      # In the case of the first child text node, raw_paragraphs would store
+      # ['\n  Some\n  words.', '  '], and in the case of the second,
+      # raw_paragraphs would store ['', '  Words.\n'].
+      raw_paragraphs = child_text.split(paragraph_break)
+
+      # In the case of the first child text node, processed_paragraphs would
+      # store ['Some words.', ''], and in the case of the second,
+      # processed_paragraphs would store ['Words.'].
+      processed_paragraphs = [NormalizeString(text)
+                              for text in raw_paragraphs
+                              if text]
+      text_parts.append(paragraph_break.join(processed_paragraphs))
+
+  return ''.join(text_parts).strip()
+
+
+def NormalizeString(text):
+  r"""Replaces all white space sequences with a single space.
+
+  Also, unescapes any HTML escaped characters, e.g. &quot; or &gt;.

  Args:
-    s: The string to normalize, ('  \\n a  b c\\n d  ').
+    text: The string to normalize, '\n\n a \n b&gt;c  '.

  Returns:
-    The normalized string (a b c d).
+    The normalized string 'a b>c'.
  """
-  singleline_value = ' '.join(s.split())
+  line = ' '.join(text.split())

  # Unescape using default ASCII encoding. Unescapes any HTML escaped character
  # like &quot; etc.
-  return HTMLParser.HTMLParser().unescape(singleline_value)
+  return HTMLParser.HTMLParser().unescape(line)


 def _NormalizeAllAttributeValues(node):
@@ -222,7 +253,7 @@ def ExtractEnumsFromXmlTree(tree):
        have_errors = True
        continue
      value_dict['label'] = int_tag.getAttribute('label')
-      value_dict['summary'] = _JoinChildNodes(int_tag)
+      value_dict['summary'] = _GetTextFromChildNodes(int_tag)
      enum_dict['values'][int_value] = value_dict

    enum_int_values = sorted(enum_dict['values'].keys())
@@ -247,7 +278,7 @@ def ExtractEnumsFromXmlTree(tree):

    summary_nodes = enum.getElementsByTagName('summary')
    if summary_nodes:
-      enum_dict['summary'] = NormalizeString(_JoinChildNodes(summary_nodes[0]))
+      enum_dict['summary'] = _GetTextFromChildNodes(summary_nodes[0])

    enums[name] = enum_dict

@@ -273,7 +304,7 @@ def _ExtractOwners(histogram):
  has_owner = False

  for owner_node in histogram.getElementsByTagName('owner'):
-    owner_text = NormalizeString(_JoinChildNodes(owner_node))
+    owner_text = _GetTextFromChildNodes(owner_node)
    is_email = email_pattern.match(owner_text)

    if owner_text and (is_email or OWNER_PLACEHOLDER in owner_text):
@@ -351,16 +382,16 @@ def _ExtractHistogramsFromXmlTree(tree, enums):

    # Find <summary> tag.
    summary_nodes = histogram.getElementsByTagName('summary')
+
    if summary_nodes:
-      histogram_entry['summary'] = NormalizeString(
-          _JoinChildNodes(summary_nodes[0]))
+      histogram_entry['summary'] = _GetTextFromChildNodes(summary_nodes[0])
    else:
      histogram_entry['summary'] = 'TBD'

    # Find <obsolete> tag.
    obsolete_nodes = histogram.getElementsByTagName('obsolete')
    if obsolete_nodes:
-      reason = _JoinChildNodes(obsolete_nodes[0])
+      reason = _GetTextFromChildNodes(obsolete_nodes[0])
      histogram_entry['obsolete'] = reason

    # Non-obsolete histograms should provide a <summary>.
@@ -380,8 +411,7 @@ def _ExtractHistogramsFromXmlTree(tree, enums):
    # Find <details> tag.
    details_nodes = histogram.getElementsByTagName('details')
    if details_nodes:
-      histogram_entry['details'] = NormalizeString(
-          _JoinChildNodes(details_nodes[0]))
+      histogram_entry['details'] = _GetTextFromChildNodes(details_nodes[0])

    # Handle enum types.
    if histogram.hasAttribute('enum'):
@@ -408,7 +438,7 @@ def _GetObsoleteReason(node):
  for child in node.childNodes:
    if child.localName == 'obsolete':
      # There can be at most 1 obsolete element per node.
-      return _JoinChildNodes(child)
+      return _GetTextFromChildNodes(child)
  return None



--- a/tools/metrics/histograms/extract_histograms_test.py
+++ b/tools/metrics/histograms/extract_histograms_test.py
@@ -8,9 +8,311 @@ import xml.dom.minidom

 import extract_histograms

+TEST_SUFFIX_OBSOLETION_XML_CONTENT = """
+<histogram-configuration>
+<histograms>
+  <histogram name="Test.Test1">
+    <owner>chrome-metrics-team@google.com</owner>
+    <summary>
+      Sample description.
+    </summary>
+  </histogram>
+  <histogram name="Test.Test2">
+    <owner>chrome-metrics-team@google.com</owner>
+    <summary>
+      Sample description.
+    </summary>
+  </histogram>
+</histograms>
+<histogram_suffixes_list>
+  <histogram_suffixes name="NonObsoleteSuffix" separator="_">
+    <suffix name="NonObsolete1" label="First non-obsolete suffix"/>
+    <suffix name="NonObsolete2" label="Second non-obsolete suffix"/>
+    <affected-histogram name="Test.Test1"/>
+    <affected-histogram name="Test.Test2"/>
+  </histogram_suffixes>
+
+  <histogram_suffixes name="ObsoleteSuffixGroup" separator="_">
+    <obsolete>This suffix group is obsolete</obsolete>
+    <suffix name="ObsoleteSuffixGroup1" label="First obsolete suffix"/>
+    <suffix name="ObsoleteSuffixGroup2" label="Second obsolete suffix"/>
+    <affected-histogram name="Test.Test1"/>
+  </histogram_suffixes>
+
+  <histogram_suffixes name="ObsoleteSuffixNonObsoleteGroup" separator="_">
+    <suffix name="ObsoleteSuffixNonObsoleteGroup1" label="Obsolete suffix">
+      <obsolete>This suffix is obsolete</obsolete>
+    </suffix>
+    <suffix name="NonObsoleteSuffixNonObsoleteGroup2" label="Non obsolete suffix"/>
+    <affected-histogram name="Test.Test2"/>
+  </histogram_suffixes>
+
+  <histogram_suffixes name="ObsoleteSuffixObsoleteGroup" separator="_">
+    <obsolete>This suffix group is obsolete</obsolete>
+    <suffix name="ObsoleteSuffixObsoleteGroup1" label="First obsolete suffix">
+      <obsolete>This suffix is obsolete</obsolete>
+    </suffix>
+    <suffix name="NonObsoleteSuffixObsoleteGroup2" label="Second obsolete suffix"/>
+    <affected-histogram name="Test.Test2"/>
+  </histogram_suffixes>
+</histogram_suffixes_list>
+</histogram-configuration>
+"""
+
+TEST_BASE_HISTOGRAM_XML_CONTENT = """
+<histogram-configuration>
+<histograms>
+  <histogram base="true" name="Test.Base" expires_after="2211-11-22">
+    <owner>chrome-metrics-team@google.com</owner>
+    <summary>
+      Base histogram.
+    </summary>
+  </histogram>
+  <histogram base="true" name="Test.Base.Obsolete">
+    <owner>chrome-metrics-team@google.com</owner>
+    <summary>
+      Obsolete base histogram.
+    </summary>
+    <obsolete>
+      The whole related set of histograms is obsolete!
+    </obsolete>
+  </histogram>
+  <histogram base="false" name="Test.NotBase.Explicit">
+    <owner>chrome-metrics-team@google.com</owner>
+    <summary>
+      Not a base histogram: base attribute explicitly set to "false".
+    </summary>
+  </histogram>
+  <histogram name="Test.NotBase.Implicit" expires_after="M100">
+    <owner>chrome-metrics-team@google.com</owner>
+    <summary>
+      Not a base histogram: no base attribute specified.
+    </summary>
+  </histogram>
+</histograms>
+<histogram_suffixes_list>
+  <histogram_suffixes name="Suffixes" separator=".">
+    <suffix base="true" name="BaseSuffix" label="A base suffix"/>
+    <suffix name="NonBaseSuffix" label="A non-base suffix"/>
+    <suffix name="ObsoleteSuffix" label="An obsolete suffix">
+      <obsolete>This suffix is obsolete!</obsolete>
+    </suffix>
+    <affected-histogram name="Test.Base"/>
+    <affected-histogram name="Test.Base.Obsolete"/>
+  </histogram_suffixes>
+
+  <histogram_suffixes name="SuffixesPlusPlus" separator=".">
+    <suffix name="One" label="One suffix"/>
+    <suffix name="Two" label="Another suffix"/>
+    <affected-histogram name="Test.Base.BaseSuffix"/>
+    <affected-histogram name="Test.Base.NonBaseSuffix"/>
+  </histogram_suffixes>
+</histogram_suffixes_list>
+</histogram-configuration>
+"""
+

 class ExtractHistogramsTest(unittest.TestCase):

+  def testSuffixObsoletion(self):
+    histograms, had_errors = extract_histograms.ExtractHistogramsFromDom(
+        xml.dom.minidom.parseString(TEST_SUFFIX_OBSOLETION_XML_CONTENT))
+    self.assertFalse(had_errors)
+    # Obsolete on suffixes doesn't affect the source histogram
+    self.assertNotIn('obsolete', histograms['Test.Test1'])
+    self.assertNotIn('obsolete', histograms['Test.Test2'])
+
+    self.assertNotIn('obsolete', histograms['Test.Test1_NonObsolete1'])
+    self.assertNotIn('obsolete', histograms['Test.Test1_NonObsolete2'])
+    self.assertNotIn('obsolete', histograms['Test.Test2_NonObsolete1'])
+    self.assertNotIn('obsolete', histograms['Test.Test2_NonObsolete2'])
+
+    self.assertIn('obsolete', histograms['Test.Test1_ObsoleteSuffixGroup1'])
+    self.assertIn('obsolete', histograms['Test.Test1_ObsoleteSuffixGroup2'])
+
+    # Obsolete suffixes should apply to individual suffixes and not their group.
+    self.assertIn('obsolete',
+                  histograms['Test.Test2_ObsoleteSuffixNonObsoleteGroup1'])
+    self.assertNotIn(
+        'obsolete', histograms['Test.Test2_NonObsoleteSuffixNonObsoleteGroup2'])
+    self.assertEqual(
+        'This suffix is obsolete',
+        histograms['Test.Test2_ObsoleteSuffixNonObsoleteGroup1']['obsolete'])
+
+    # Obsolete suffix reasons should overwrite the suffix group's obsoletion
+    # reason.
+    self.assertIn('obsolete',
+                  histograms['Test.Test2_ObsoleteSuffixObsoleteGroup1'])
+    self.assertIn('obsolete',
+                  histograms['Test.Test2_NonObsoleteSuffixObsoleteGroup2'])
+    self.assertEqual(
+        'This suffix is obsolete',
+        histograms['Test.Test2_ObsoleteSuffixObsoleteGroup1']['obsolete'])
+    self.assertEqual(
+        'This suffix group is obsolete',
+        histograms['Test.Test2_NonObsoleteSuffixObsoleteGroup2']['obsolete'])
+
+  def testBaseHistograms(self):
+    histograms, had_errors = extract_histograms.ExtractHistogramsFromDom(
+        xml.dom.minidom.parseString(TEST_BASE_HISTOGRAM_XML_CONTENT))
+    self.assertFalse(had_errors)
+    # Base histograms are implicitly marked as obsolete.
+    self.assertIn('obsolete', histograms['Test.Base'])
+    self.assertIn('obsolete', histograms['Test.Base.Obsolete'])
+
+    # Other histograms shouldn't be implicitly marked as obsolete.
+    self.assertNotIn('obsolete', histograms['Test.NotBase.Explicit'])
+    self.assertNotIn('obsolete', histograms['Test.NotBase.Implicit'])
+
+    # Suffixes applied to base histograms shouldn't be marked as obsolete...
+    self.assertNotIn('obsolete', histograms['Test.Base.NonBaseSuffix'])
+    # ... unless the suffix is marked as obsolete,
+    self.assertIn('obsolete', histograms['Test.Base.ObsoleteSuffix'])
+    # ... or the suffix is a base suffix,
+    self.assertIn('obsolete', histograms['Test.Base.BaseSuffix'])
+    # ... or the base histogram is marked as obsolete,
+    self.assertIn('obsolete', histograms['Test.Base.Obsolete.BaseSuffix'])
+    self.assertIn('obsolete', histograms['Test.Base.Obsolete.NonBaseSuffix'])
+    self.assertIn('obsolete', histograms['Test.Base.Obsolete.ObsoleteSuffix'])
+
+    # It should be possible to have multiple levels of suffixes for base
+    # histograms.
+    self.assertNotIn('obsolete', histograms['Test.Base.BaseSuffix.One'])
+    self.assertNotIn('obsolete', histograms['Test.Base.BaseSuffix.Two'])
+    self.assertNotIn('obsolete', histograms['Test.Base.NonBaseSuffix.One'])
+    self.assertNotIn('obsolete', histograms['Test.Base.NonBaseSuffix.Two'])
+
+  def testExpiryFormat(self):
+    chrome_histogram_pattern = """<histogram-configuration>
+
+<histograms>
+
+<histogram name="Histogram.Name"{}>
+  <owner>SomeOne@google.com</owner>
+  <summary>Summary</summary>
+</histogram>
+
+</histograms>
+
+</histogram-configuration>
+"""
+    chrome_histogram_correct_expiry_date = chrome_histogram_pattern.format(
+        ' expires_after="2211-11-22"')
+    _, had_errors = extract_histograms.ExtractHistogramsFromDom(
+        xml.dom.minidom.parseString(chrome_histogram_correct_expiry_date))
+    self.assertFalse(had_errors)
+
+    chrome_histogram_wrong_expiry_date_format = chrome_histogram_pattern.format(
+        ' expires_after="2211/11/22"')
+    _, had_errors = extract_histograms.ExtractHistogramsFromDom(
+        xml.dom.minidom.parseString(chrome_histogram_wrong_expiry_date_format))
+    self.assertTrue(had_errors)
+
+    chrome_histogram_wrong_expiry_date_value = chrome_histogram_pattern.format(
+        ' expires_after="2211-22-11"')
+    _, had_errors = extract_histograms.ExtractHistogramsFromDom(
+        xml.dom.minidom.parseString(chrome_histogram_wrong_expiry_date_value))
+    self.assertTrue(had_errors)
+
+    chrome_histogram_correct_expiry_milestone = chrome_histogram_pattern.format(
+        ' expires_after="M22"')
+    _, had_errors = extract_histograms.ExtractHistogramsFromDom(
+        xml.dom.minidom.parseString(chrome_histogram_correct_expiry_milestone))
+    self.assertFalse(had_errors)
+
+    chrome_histogram_wrong_expiry_milestone = chrome_histogram_pattern.format(
+        ' expires_after="22"')
+    _, had_errors = extract_histograms.ExtractHistogramsFromDom(
+        xml.dom.minidom.parseString(chrome_histogram_wrong_expiry_milestone))
+    self.assertTrue(had_errors)
+
+    chrome_histogram_wrong_expiry_milestone = chrome_histogram_pattern.format(
+        ' expires_after="MM22"')
+    _, had_errors = extract_histograms.ExtractHistogramsFromDom(
+        xml.dom.minidom.parseString(chrome_histogram_wrong_expiry_milestone))
+    self.assertTrue(had_errors)
+
+    chrome_histogram_no_expiry = chrome_histogram_pattern.format('')
+    _, had_errors = extract_histograms.ExtractHistogramsFromDom(
+        xml.dom.minidom.parseString(chrome_histogram_no_expiry))
+    self.assertFalse(had_errors)
+
+  def testExpiryDateExtraction(self):
+    chrome_histogram_pattern = """<histogram-configuration>
+
+<histograms>
+
+<histogram name="Histogram.Name"{}>
+  <owner>SomeOne@google.com</owner>
+  <summary>Summary</summary>
+</histogram>
+
+</histograms>
+
+</histogram-configuration>
+"""
+    date_str = '2211-11-22'
+    chrome_histogram_correct_expiry_date = chrome_histogram_pattern.format(
+        ' expires_after="{}"'.format(date_str))
+    histograms, _ = extract_histograms.ExtractHistogramsFromDom(
+        xml.dom.minidom.parseString(chrome_histogram_correct_expiry_date))
+    histogram_content = histograms['Histogram.Name']
+    self.assertIn('expires_after', histogram_content)
+    self.assertEqual(date_str, histogram_content['expires_after'])
+
+    milestone_str = 'M22'
+    chrome_histogram_correct_expiry_milestone = chrome_histogram_pattern.format(
+        ' expires_after="{}"'.format(milestone_str))
+    histograms, _ = extract_histograms.ExtractHistogramsFromDom(
+        xml.dom.minidom.parseString(chrome_histogram_correct_expiry_milestone))
+    histogram_content = histograms['Histogram.Name']
+    self.assertIn('expires_after', histogram_content)
+    self.assertEqual(milestone_str, histogram_content['expires_after'])
+
+    chrome_histogram_no_expiry = chrome_histogram_pattern.format('')
+    histograms, _ = extract_histograms.ExtractHistogramsFromDom(
+        xml.dom.minidom.parseString(chrome_histogram_no_expiry))
+    histogram_content = histograms['Histogram.Name']
+    self.assertNotIn('expires_after', histogram_content)
+
+  def testMultiParagraphSummary(self):
+    multiple_paragraph_pattern = xml.dom.minidom.parseString("""
+<histogram-configuration>
+<histograms>
+  <histogram name="MultiParagraphTest.Test1">
+    <owner>chrome-metrics-team@google.com</owner>
+    <summary>
+      Sample description
+      Sample description.
+    </summary>
+  </histogram>
+
+  <histogram name="MultiParagraphTest.Test2">
+    <owner>chrome-metrics-team@google.com</owner>
+    <summary>
+      Multi-paragraph sample description UI&gt;Browser.
+      Words.
+
+      Still multi-paragraph sample description.
+
+      <!--Comments are allowed.-->
+
+      Here.
+    </summary>
+  </histogram>
+</histograms>
+</histogram-configuration>
+""")
+    histograms, _ = extract_histograms._ExtractHistogramsFromXmlTree(
+        multiple_paragraph_pattern, {})
+    self.assertEqual(histograms['MultiParagraphTest.Test1']['summary'],
+                     'Sample description Sample description.')
+    self.assertEqual(
+        histograms['MultiParagraphTest.Test2']['summary'],
+        'Multi-paragraph sample description UI>Browser. Words.\n\n'
+        'Still multi-paragraph sample description.\n\nHere.')
+
+
  def testNewHistogramWithoutSummary(self):
    histogram_without_summary = xml.dom.minidom.parseString("""
 <histogram-configuration>