Commit b0c10683 authored by Caitlin Fischer's avatar Caitlin Fischer Committed by Commit Bot

Improves function that gets text from a node's children.

Paragraphs are now respected when a node with a child node whose
text is split into paragraphs is encountered.

Example of such a node:
<tag>
  Paragraph 1.
  Still paragraph 1.

  Paragraph 2.
</tag>

Also, adds additional tests.

Bug: 993948

Change-Id: I07c257e52a33026f5607946aa36b5e9ee5480056
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1754124
Commit-Queue: Alexei Svitkine <asvitkine@chromium.org>
Reviewed-by: default avatarAlexei Svitkine <asvitkine@chromium.org>
Cr-Commit-Position: refs/heads/master@{#687404}
parent 524cbc9a
......@@ -18,11 +18,14 @@ XML below will generate the following five histograms:
<histograms>
<histogram name="HistogramTime" units="milliseconds">
<owner>person@chromium.org</owner>
<owner>some-team@chromium.org</owner>
<summary>A brief description.</summary>
<details>This is a more thorough description of this histogram.</details>
</histogram>
<histogram name="HistogramEnum" enum="MyEnumType">
<owner>person@chromium.org</owner>
<summary>This histogram sports an enum value type.</summary>
</histogram>
......@@ -40,7 +43,7 @@ XML below will generate the following five histograms:
<histogram_suffixes_list>
<histogram_suffixes name="BrowserType">
<histogram_suffixes name="BrowserType" separator="_">
<suffix name="Chrome"/>
<suffix name="IE"/>
<suffix name="Firefox"/>
......@@ -50,13 +53,11 @@ XML below will generate the following five histograms:
</histogram_suffixes_list>
</histogram-configuration>
"""
import HTMLParser
import bisect
import copy
import datetime
import HTMLParser
import logging
import re
import xml.dom.minidom
......@@ -79,39 +80,69 @@ class Error(Exception):
pass
def _JoinChildNodes(tag):
"""Joins child nodes into a single text.
def _GetTextFromChildNodes(node):
"""Returns a string concatenation of the text of the given node's children.
Applicable to leafs like 'summary' and 'detail'. Removes any comment in the
node.
Comments are ignored, consecutive lines of text are joined with a single
space, and paragraphs are maintained so that long text is more readable on
dashboards.
Args:
tag: parent node
Returns:
a string with concatenated nodes' text representation.
node: The DOM Element whose children's text is to be extracted, processed,
and returned.
"""
return ''.join(c.toxml()
for c in tag.childNodes
if c.nodeType != xml.dom.minidom.Node.COMMENT_NODE).strip()
paragraph_break = '\n\n'
text_parts = []
for child in node.childNodes:
if child.nodeType != xml.dom.minidom.Node.COMMENT_NODE:
child_text = child.toxml()
if not child_text:
continue
def NormalizeString(s):
r"""Replaces all whitespace sequences with a single space.
# If the given node has the below XML representation, then the text
# added to the list is 'Some words.\n\nWords.'
# <tag>
# Some
# words.
#
# <!--Child comment node.-->
#
# Words.
# </tag>
The function properly handles multi-line strings and XML escaped characters.
# In the case of the first child text node, raw_paragraphs would store
# ['\n Some\n words.', ' '], and in the case of the second,
# raw_paragraphs would store ['', ' Words.\n'].
raw_paragraphs = child_text.split(paragraph_break)
# In the case of the first child text node, processed_paragraphs would
# store ['Some words.', ''], and in the case of the second,
# processed_paragraphs would store ['Words.'].
processed_paragraphs = [NormalizeString(text)
for text in raw_paragraphs
if text]
text_parts.append(paragraph_break.join(processed_paragraphs))
return ''.join(text_parts).strip()
def NormalizeString(text):
r"""Replaces all white space sequences with a single space.
Also, unescapes any HTML escaped characters, e.g. &quot; or &gt;.
Args:
s: The string to normalize, (' \\n a b c\\n d ').
text: The string to normalize, '\n\n a \n b&gt;c '.
Returns:
The normalized string (a b c d).
The normalized string 'a b>c'.
"""
singleline_value = ' '.join(s.split())
line = ' '.join(text.split())
# Unescape using default ASCII encoding. Unescapes any HTML escaped character
# like &quot; etc.
return HTMLParser.HTMLParser().unescape(singleline_value)
return HTMLParser.HTMLParser().unescape(line)
def _NormalizeAllAttributeValues(node):
......@@ -222,7 +253,7 @@ def ExtractEnumsFromXmlTree(tree):
have_errors = True
continue
value_dict['label'] = int_tag.getAttribute('label')
value_dict['summary'] = _JoinChildNodes(int_tag)
value_dict['summary'] = _GetTextFromChildNodes(int_tag)
enum_dict['values'][int_value] = value_dict
enum_int_values = sorted(enum_dict['values'].keys())
......@@ -247,7 +278,7 @@ def ExtractEnumsFromXmlTree(tree):
summary_nodes = enum.getElementsByTagName('summary')
if summary_nodes:
enum_dict['summary'] = NormalizeString(_JoinChildNodes(summary_nodes[0]))
enum_dict['summary'] = _GetTextFromChildNodes(summary_nodes[0])
enums[name] = enum_dict
......@@ -273,7 +304,7 @@ def _ExtractOwners(histogram):
has_owner = False
for owner_node in histogram.getElementsByTagName('owner'):
owner_text = NormalizeString(_JoinChildNodes(owner_node))
owner_text = _GetTextFromChildNodes(owner_node)
is_email = email_pattern.match(owner_text)
if owner_text and (is_email or OWNER_PLACEHOLDER in owner_text):
......@@ -351,16 +382,16 @@ def _ExtractHistogramsFromXmlTree(tree, enums):
# Find <summary> tag.
summary_nodes = histogram.getElementsByTagName('summary')
if summary_nodes:
histogram_entry['summary'] = NormalizeString(
_JoinChildNodes(summary_nodes[0]))
histogram_entry['summary'] = _GetTextFromChildNodes(summary_nodes[0])
else:
histogram_entry['summary'] = 'TBD'
# Find <obsolete> tag.
obsolete_nodes = histogram.getElementsByTagName('obsolete')
if obsolete_nodes:
reason = _JoinChildNodes(obsolete_nodes[0])
reason = _GetTextFromChildNodes(obsolete_nodes[0])
histogram_entry['obsolete'] = reason
# Non-obsolete histograms should provide a <summary>.
......@@ -380,8 +411,7 @@ def _ExtractHistogramsFromXmlTree(tree, enums):
# Find <details> tag.
details_nodes = histogram.getElementsByTagName('details')
if details_nodes:
histogram_entry['details'] = NormalizeString(
_JoinChildNodes(details_nodes[0]))
histogram_entry['details'] = _GetTextFromChildNodes(details_nodes[0])
# Handle enum types.
if histogram.hasAttribute('enum'):
......@@ -408,7 +438,7 @@ def _GetObsoleteReason(node):
for child in node.childNodes:
if child.localName == 'obsolete':
# There can be at most 1 obsolete element per node.
return _JoinChildNodes(child)
return _GetTextFromChildNodes(child)
return None
......
......@@ -8,9 +8,311 @@ import xml.dom.minidom
import extract_histograms
TEST_SUFFIX_OBSOLETION_XML_CONTENT = """
<histogram-configuration>
<histograms>
<histogram name="Test.Test1">
<owner>chrome-metrics-team@google.com</owner>
<summary>
Sample description.
</summary>
</histogram>
<histogram name="Test.Test2">
<owner>chrome-metrics-team@google.com</owner>
<summary>
Sample description.
</summary>
</histogram>
</histograms>
<histogram_suffixes_list>
<histogram_suffixes name="NonObsoleteSuffix" separator="_">
<suffix name="NonObsolete1" label="First non-obsolete suffix"/>
<suffix name="NonObsolete2" label="Second non-obsolete suffix"/>
<affected-histogram name="Test.Test1"/>
<affected-histogram name="Test.Test2"/>
</histogram_suffixes>
<histogram_suffixes name="ObsoleteSuffixGroup" separator="_">
<obsolete>This suffix group is obsolete</obsolete>
<suffix name="ObsoleteSuffixGroup1" label="First obsolete suffix"/>
<suffix name="ObsoleteSuffixGroup2" label="Second obsolete suffix"/>
<affected-histogram name="Test.Test1"/>
</histogram_suffixes>
<histogram_suffixes name="ObsoleteSuffixNonObsoleteGroup" separator="_">
<suffix name="ObsoleteSuffixNonObsoleteGroup1" label="Obsolete suffix">
<obsolete>This suffix is obsolete</obsolete>
</suffix>
<suffix name="NonObsoleteSuffixNonObsoleteGroup2" label="Non obsolete suffix"/>
<affected-histogram name="Test.Test2"/>
</histogram_suffixes>
<histogram_suffixes name="ObsoleteSuffixObsoleteGroup" separator="_">
<obsolete>This suffix group is obsolete</obsolete>
<suffix name="ObsoleteSuffixObsoleteGroup1" label="First obsolete suffix">
<obsolete>This suffix is obsolete</obsolete>
</suffix>
<suffix name="NonObsoleteSuffixObsoleteGroup2" label="Second obsolete suffix"/>
<affected-histogram name="Test.Test2"/>
</histogram_suffixes>
</histogram_suffixes_list>
</histogram-configuration>
"""
TEST_BASE_HISTOGRAM_XML_CONTENT = """
<histogram-configuration>
<histograms>
<histogram base="true" name="Test.Base" expires_after="2211-11-22">
<owner>chrome-metrics-team@google.com</owner>
<summary>
Base histogram.
</summary>
</histogram>
<histogram base="true" name="Test.Base.Obsolete">
<owner>chrome-metrics-team@google.com</owner>
<summary>
Obsolete base histogram.
</summary>
<obsolete>
The whole related set of histograms is obsolete!
</obsolete>
</histogram>
<histogram base="false" name="Test.NotBase.Explicit">
<owner>chrome-metrics-team@google.com</owner>
<summary>
Not a base histogram: base attribute explicitly set to "false".
</summary>
</histogram>
<histogram name="Test.NotBase.Implicit" expires_after="M100">
<owner>chrome-metrics-team@google.com</owner>
<summary>
Not a base histogram: no base attribute specified.
</summary>
</histogram>
</histograms>
<histogram_suffixes_list>
<histogram_suffixes name="Suffixes" separator=".">
<suffix base="true" name="BaseSuffix" label="A base suffix"/>
<suffix name="NonBaseSuffix" label="A non-base suffix"/>
<suffix name="ObsoleteSuffix" label="An obsolete suffix">
<obsolete>This suffix is obsolete!</obsolete>
</suffix>
<affected-histogram name="Test.Base"/>
<affected-histogram name="Test.Base.Obsolete"/>
</histogram_suffixes>
<histogram_suffixes name="SuffixesPlusPlus" separator=".">
<suffix name="One" label="One suffix"/>
<suffix name="Two" label="Another suffix"/>
<affected-histogram name="Test.Base.BaseSuffix"/>
<affected-histogram name="Test.Base.NonBaseSuffix"/>
</histogram_suffixes>
</histogram_suffixes_list>
</histogram-configuration>
"""
class ExtractHistogramsTest(unittest.TestCase):
def testSuffixObsoletion(self):
histograms, had_errors = extract_histograms.ExtractHistogramsFromDom(
xml.dom.minidom.parseString(TEST_SUFFIX_OBSOLETION_XML_CONTENT))
self.assertFalse(had_errors)
# Obsolete on suffixes doesn't affect the source histogram
self.assertNotIn('obsolete', histograms['Test.Test1'])
self.assertNotIn('obsolete', histograms['Test.Test2'])
self.assertNotIn('obsolete', histograms['Test.Test1_NonObsolete1'])
self.assertNotIn('obsolete', histograms['Test.Test1_NonObsolete2'])
self.assertNotIn('obsolete', histograms['Test.Test2_NonObsolete1'])
self.assertNotIn('obsolete', histograms['Test.Test2_NonObsolete2'])
self.assertIn('obsolete', histograms['Test.Test1_ObsoleteSuffixGroup1'])
self.assertIn('obsolete', histograms['Test.Test1_ObsoleteSuffixGroup2'])
# Obsolete suffixes should apply to individual suffixes and not their group.
self.assertIn('obsolete',
histograms['Test.Test2_ObsoleteSuffixNonObsoleteGroup1'])
self.assertNotIn(
'obsolete', histograms['Test.Test2_NonObsoleteSuffixNonObsoleteGroup2'])
self.assertEqual(
'This suffix is obsolete',
histograms['Test.Test2_ObsoleteSuffixNonObsoleteGroup1']['obsolete'])
# Obsolete suffix reasons should overwrite the suffix group's obsoletion
# reason.
self.assertIn('obsolete',
histograms['Test.Test2_ObsoleteSuffixObsoleteGroup1'])
self.assertIn('obsolete',
histograms['Test.Test2_NonObsoleteSuffixObsoleteGroup2'])
self.assertEqual(
'This suffix is obsolete',
histograms['Test.Test2_ObsoleteSuffixObsoleteGroup1']['obsolete'])
self.assertEqual(
'This suffix group is obsolete',
histograms['Test.Test2_NonObsoleteSuffixObsoleteGroup2']['obsolete'])
def testBaseHistograms(self):
histograms, had_errors = extract_histograms.ExtractHistogramsFromDom(
xml.dom.minidom.parseString(TEST_BASE_HISTOGRAM_XML_CONTENT))
self.assertFalse(had_errors)
# Base histograms are implicitly marked as obsolete.
self.assertIn('obsolete', histograms['Test.Base'])
self.assertIn('obsolete', histograms['Test.Base.Obsolete'])
# Other histograms shouldn't be implicitly marked as obsolete.
self.assertNotIn('obsolete', histograms['Test.NotBase.Explicit'])
self.assertNotIn('obsolete', histograms['Test.NotBase.Implicit'])
# Suffixes applied to base histograms shouldn't be marked as obsolete...
self.assertNotIn('obsolete', histograms['Test.Base.NonBaseSuffix'])
# ... unless the suffix is marked as obsolete,
self.assertIn('obsolete', histograms['Test.Base.ObsoleteSuffix'])
# ... or the suffix is a base suffix,
self.assertIn('obsolete', histograms['Test.Base.BaseSuffix'])
# ... or the base histogram is marked as obsolete,
self.assertIn('obsolete', histograms['Test.Base.Obsolete.BaseSuffix'])
self.assertIn('obsolete', histograms['Test.Base.Obsolete.NonBaseSuffix'])
self.assertIn('obsolete', histograms['Test.Base.Obsolete.ObsoleteSuffix'])
# It should be possible to have multiple levels of suffixes for base
# histograms.
self.assertNotIn('obsolete', histograms['Test.Base.BaseSuffix.One'])
self.assertNotIn('obsolete', histograms['Test.Base.BaseSuffix.Two'])
self.assertNotIn('obsolete', histograms['Test.Base.NonBaseSuffix.One'])
self.assertNotIn('obsolete', histograms['Test.Base.NonBaseSuffix.Two'])
def testExpiryFormat(self):
chrome_histogram_pattern = """<histogram-configuration>
<histograms>
<histogram name="Histogram.Name"{}>
<owner>SomeOne@google.com</owner>
<summary>Summary</summary>
</histogram>
</histograms>
</histogram-configuration>
"""
chrome_histogram_correct_expiry_date = chrome_histogram_pattern.format(
' expires_after="2211-11-22"')
_, had_errors = extract_histograms.ExtractHistogramsFromDom(
xml.dom.minidom.parseString(chrome_histogram_correct_expiry_date))
self.assertFalse(had_errors)
chrome_histogram_wrong_expiry_date_format = chrome_histogram_pattern.format(
' expires_after="2211/11/22"')
_, had_errors = extract_histograms.ExtractHistogramsFromDom(
xml.dom.minidom.parseString(chrome_histogram_wrong_expiry_date_format))
self.assertTrue(had_errors)
chrome_histogram_wrong_expiry_date_value = chrome_histogram_pattern.format(
' expires_after="2211-22-11"')
_, had_errors = extract_histograms.ExtractHistogramsFromDom(
xml.dom.minidom.parseString(chrome_histogram_wrong_expiry_date_value))
self.assertTrue(had_errors)
chrome_histogram_correct_expiry_milestone = chrome_histogram_pattern.format(
' expires_after="M22"')
_, had_errors = extract_histograms.ExtractHistogramsFromDom(
xml.dom.minidom.parseString(chrome_histogram_correct_expiry_milestone))
self.assertFalse(had_errors)
chrome_histogram_wrong_expiry_milestone = chrome_histogram_pattern.format(
' expires_after="22"')
_, had_errors = extract_histograms.ExtractHistogramsFromDom(
xml.dom.minidom.parseString(chrome_histogram_wrong_expiry_milestone))
self.assertTrue(had_errors)
chrome_histogram_wrong_expiry_milestone = chrome_histogram_pattern.format(
' expires_after="MM22"')
_, had_errors = extract_histograms.ExtractHistogramsFromDom(
xml.dom.minidom.parseString(chrome_histogram_wrong_expiry_milestone))
self.assertTrue(had_errors)
chrome_histogram_no_expiry = chrome_histogram_pattern.format('')
_, had_errors = extract_histograms.ExtractHistogramsFromDom(
xml.dom.minidom.parseString(chrome_histogram_no_expiry))
self.assertFalse(had_errors)
def testExpiryDateExtraction(self):
chrome_histogram_pattern = """<histogram-configuration>
<histograms>
<histogram name="Histogram.Name"{}>
<owner>SomeOne@google.com</owner>
<summary>Summary</summary>
</histogram>
</histograms>
</histogram-configuration>
"""
date_str = '2211-11-22'
chrome_histogram_correct_expiry_date = chrome_histogram_pattern.format(
' expires_after="{}"'.format(date_str))
histograms, _ = extract_histograms.ExtractHistogramsFromDom(
xml.dom.minidom.parseString(chrome_histogram_correct_expiry_date))
histogram_content = histograms['Histogram.Name']
self.assertIn('expires_after', histogram_content)
self.assertEqual(date_str, histogram_content['expires_after'])
milestone_str = 'M22'
chrome_histogram_correct_expiry_milestone = chrome_histogram_pattern.format(
' expires_after="{}"'.format(milestone_str))
histograms, _ = extract_histograms.ExtractHistogramsFromDom(
xml.dom.minidom.parseString(chrome_histogram_correct_expiry_milestone))
histogram_content = histograms['Histogram.Name']
self.assertIn('expires_after', histogram_content)
self.assertEqual(milestone_str, histogram_content['expires_after'])
chrome_histogram_no_expiry = chrome_histogram_pattern.format('')
histograms, _ = extract_histograms.ExtractHistogramsFromDom(
xml.dom.minidom.parseString(chrome_histogram_no_expiry))
histogram_content = histograms['Histogram.Name']
self.assertNotIn('expires_after', histogram_content)
def testMultiParagraphSummary(self):
multiple_paragraph_pattern = xml.dom.minidom.parseString("""
<histogram-configuration>
<histograms>
<histogram name="MultiParagraphTest.Test1">
<owner>chrome-metrics-team@google.com</owner>
<summary>
Sample description
Sample description.
</summary>
</histogram>
<histogram name="MultiParagraphTest.Test2">
<owner>chrome-metrics-team@google.com</owner>
<summary>
Multi-paragraph sample description UI&gt;Browser.
Words.
Still multi-paragraph sample description.
<!--Comments are allowed.-->
Here.
</summary>
</histogram>
</histograms>
</histogram-configuration>
""")
histograms, _ = extract_histograms._ExtractHistogramsFromXmlTree(
multiple_paragraph_pattern, {})
self.assertEqual(histograms['MultiParagraphTest.Test1']['summary'],
'Sample description Sample description.')
self.assertEqual(
histograms['MultiParagraphTest.Test2']['summary'],
'Multi-paragraph sample description UI>Browser. Words.\n\n'
'Still multi-paragraph sample description.\n\nHere.')
def testNewHistogramWithoutSummary(self):
histogram_without_summary = xml.dom.minidom.parseString("""
<histogram-configuration>
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment