Orderfile: Allow symbols of type O in symbol_extractor

Also rewrite matching to be more assertive on particular sub-parts of the string. Hopefully this will allow resolving particular cases before the perf/memory regressions creep in.

The new parsing finds ~2000 extra symbols compared to the previous version.

For testing I compared the extracted symbol infos with the output of 'arm-linux-androideabi-nm --print-size --size-sort --reverse-sort' on the instrumented binary. The differences are:
* 'linker_script_end_of_text' is ignored by the symbol extractor (the 'linker_script_start_of_text' is output as a special case because we rely on it later)
* the symbol extractor seems to rightfully ignore local labels and pointers deep into functions, like: 'PRE_LOOP_STAGES_7_OR_8' or 'jsimd_idct_islow_neon_consts'
* v8 builtins like 'Builtins_RecordWrite' have the size 0 and unfortunately are not distinguishable from the labels above

Bug: 893981
Change-Id: Ie2ac9a76508832cb90fdedda1ceb46f6c2b24dba
Reviewed-on: https://chromium-review.googlesource.com/c/1326486
Reviewed-by: Matthew Cary <mattcary@chromium.org>
Commit-Queue: Egor Pasko <pasko@chromium.org>
Cr-Commit-Position: refs/heads/master@{#606475}

Orderfile: Allow symbols of type O in symbol_extractor
Also rewrite matching to be more assertive on particular sub-parts of the string. Hopefully this will allow resolving particular cases before the perf/memory regressions creep in.

The new parsing finds ~2000 extra symbols compared to the previous version.

For testing I compared the extracted symbol infos with the output of 'arm-linux-androideabi-nm --print-size --size-sort --reverse-sort' on the instrumented binary. The differences are:
* 'linker_script_end_of_text' is ignored by the symbol extractor (the 'linker_script_start_of_text' is output as a special case because we rely on it later)
* the symbol extractor seems to rightfully ignore local labels and pointers deep into functions, like: 'PRE_LOOP_STAGES_7_OR_8' or 'jsimd_idct_islow_neon_consts'
* v8 builtins like 'Builtins_RecordWrite' have the size 0 and unfortunately are not distinguishable from the labels above

Bug: 893981
Change-Id: Ie2ac9a76508832cb90fdedda1ceb46f6c2b24dba
Reviewed-on: https://chromium-review.googlesource.com/c/1326486
Reviewed-by: Matthew Cary <mattcary@chromium.org>
Commit-Queue: Egor Pasko <pasko@chromium.org>
Cr-Commit-Position: refs/heads/master@{#606475}
74a4cfc0 · Egor Pasko · Commit Bot · af762e38 · 74a4cfc0 · 74a4cfc0
Commit 74a4cfc0 authored Nov 08, 2018 by Egor Pasko Committed by Commit Bot Nov 08, 2018
5 changed files
--- a/tools/cygprofile/cygprofile_utils.py
+++ b/tools/cygprofile/cygprofile_utils.py
@@ -9,6 +9,10 @@ import logging
 import os
 import re

+
+START_OF_TEXT_SYMBOL = 'linker_script_start_of_text'
+
+
 class WarningCollector(object):
  """Collects warnings, but limits the number printed to a set value."""
  def __init__(self, max_warnings, level=logging.WARNING):

--- a/tools/cygprofile/process_profiles.py
+++ b/tools/cygprofile/process_profiles.py
@@ -15,6 +15,7 @@ _SRC_PATH = os.path.abspath(os.path.join(
    os.path.dirname(__file__), os.pardir, os.pardir))
 path = os.path.join(_SRC_PATH, 'tools', 'cygprofile')
 sys.path.append(path)
+import cygprofile_utils
 import symbol_extractor


@@ -49,7 +50,6 @@ class SymbolOffsetProcessor(object):
  higher-level operations can be done in different orders without the caller
  managing all the state.
  """
-  START_OF_TEXT_SYMBOL = 'linker_script_start_of_text'

  def __init__(self, binary_filename):
    self._binary_filename = binary_filename
@@ -257,7 +257,7 @@ class SymbolOffsetProcessor(object):
    """
    if self._offset_to_symbol_info is None:
      start_syms = [s for s in self.SymbolInfos()
-                    if s.name == self.START_OF_TEXT_SYMBOL]
+                    if s.name == cygprofile_utils.START_OF_TEXT_SYMBOL]
      assert len(start_syms) == 1, 'Can\'t find unique start of text symbol'
      start_of_text = start_syms[0].offset
      max_offset = max(s.offset + s.size for s in self.SymbolInfos())

--- a/tools/cygprofile/process_profiles_unittest.py
+++ b/tools/cygprofile/process_profiles_unittest.py
@@ -16,7 +16,7 @@ from test_utils import (ProfileFile,
                        TestProfileManager)

 class ProcessProfilesTestCase(unittest.TestCase):
-  START_SYMBOL = process_profiles.SymbolOffsetProcessor.START_OF_TEXT_SYMBOL
+  START_SYMBOL = 'linker_script_start_of_text'

  def setUp(self):
    self.symbol_0 = SimpleTestSymbol(self.START_SYMBOL, 0, 0)

--- a/tools/cygprofile/symbol_extractor.py
+++ b/tools/cygprofile/symbol_extractor.py
@@ -35,6 +35,56 @@ def SetArchitecture(arch):
  _arch = arch


+# Regular expression to match lines printed by 'objdump -t -w'. An example of
+# such line looks like this:
+# 018db2de l     F .text  00000060              .hidden _ZN8SkBitmapC2ERKS_
+#
+# The regex intentionally matches more than just the valid inputs, so that
+# unexpected lines are not silently ignored as non-matching. Instead, a few
+# assertions early in _FromObjdumpLine() check the validity of the parts
+# matched as groups.
+_OBJDUMP_LINE_RE = re.compile(r'''
+  # The offset of the function, as hex.
+  (?P<offset>^[0-9a-f]+)
+
+  # The space character.
+  [ ]
+
+  # The 7 groups of flag characters, one character each.
+  (
+    (?P<assert_scope>.)           # Global, local, unique local, etc.
+    (?P<assert_weak_or_strong>.)
+    (?P<assert_4spaces>.{4})      # Constructor, warning, indirect ref,
+                                  # debugger symbol.
+    (?P<symbol_type>.)            # Function, object, file or normal.
+  )
+
+  [ ]
+
+  # The section name should start with ".text", can be ".text.foo". With LLD,
+  # and especially LTO the traces of input sections are not preserved. Support
+  # ".text.foo" for a little longer time because it is easy.
+  (?P<section>.text[^0-9a-f]*)
+
+  (?P<assert_tab> \s+)
+
+  # The size of the symbol, as hex.
+  (?P<size>[0-9a-f]+)
+
+  # Normally separated out by 14 spaces, but some bits in ELF may theoretically
+  # affect this length.
+  (?P<assert_14spaces>[ ]+)
+
+  # Hidden symbols should be treated as usual.
+  (.hidden [ ])?
+
+  # The symbol name.
+  (?P<name>.*)
+
+  $
+  ''', re.VERBOSE)
+
+
 def _FromObjdumpLine(line):
  """Create a SymbolInfo by parsing a properly formatted objdump output line.

@@ -44,29 +94,57 @@ def _FromObjdumpLine(line):
  Returns:
    An instance of SymbolInfo if the line represents a symbol, None otherwise.
  """
-  # All of the symbol lines we care about are in the form
-  # 0000000000  g    F   .text.foo     000000000 [.hidden] foo
-  # where g (global) might also be l (local) or w (weak).
-  parts = line.split()
-  if len(parts) < 6 or parts[2] != 'F':
+  m = _OBJDUMP_LINE_RE.match(line)
+  if not m:
    return None

-  assert len(parts) == 6 or (len(parts) == 7 and parts[5] == '.hidden')
-  accepted_scopes = set(['g', 'l', 'w'])
-  assert parts[1] in accepted_scopes
+  assert m.group('assert_scope') in set(['g', 'l']), line
+  assert m.group('assert_weak_or_strong') in set(['w', ' ']), line
+  assert m.group('assert_tab') == '\t', line
+  assert m.group('assert_4spaces') == ' ' * 4, line
+  assert m.group('assert_14spaces') == ' ' * 14, line
+  name = m.group('name')
+  offset = int(m.group('offset'), 16)
+
+  # Output the label that contains the earliest offset. It is needed later for
+  # translating offsets from the profile dumps.
+  if name == cygprofile_utils.START_OF_TEXT_SYMBOL:
+    return SymbolInfo(name=name, offset=offset, section='.text', size=0)
+
+  # Check symbol type for validity and ignore some types.
+  # From objdump manual page: The symbol is the name of a function (F) or a file
+  # (f) or an object (O) or just a normal symbol (a space). The 'normal' symbols
+  # seen so far have been function-local labels.
+  symbol_type = m.group('symbol_type')
+  if symbol_type == ' ':
+    # Ignore local goto labels. Unfortunately, v8 builtins (like 'Builtins_.*')
+    # are indistinguishable from labels of size 0 other than by name.
+    return None
+  # Guard against file symbols, since they are normally not seen in the
+  # binaries we parse.
+  assert symbol_type != 'f', line
+
+  # Extract the size from the ELF field. This value sometimes does not reflect
+  # the real size of the function. One reason for that is the '.size' directive
+  # in the assembler. As a result, a few functions in .S files have the size 0.
+  # They are not instrumented (yet), but maintaining their order in the
+  # orderfile may be important in some cases.
+  size = int(m.group('size'), 16)

-  offset = int(parts[0], 16)
-  section = parts[3]
-  size = int(parts[4], 16)
-  name = parts[-1].rstrip('\n')
  # Forbid ARM mapping symbols and other unexpected symbol names, but allow $
  # characters in a non-initial position, which can appear as a component of a
  # mangled name, e.g. Clang can mangle a lambda function to:
  # 02cd61e0 l     F .text  000000c0 _ZZL11get_globalsvENK3$_1clEv
  # The equivalent objdump line from GCC is:
  # 0325c58c l     F .text  000000d0 _ZZL11get_globalsvENKUlvE_clEv
-  assert re.match('^[a-zA-Z0-9_.][a-zA-Z0-9_.$]*$', name)
-  return SymbolInfo(name=name, offset=offset, section=section, size=size)
+  #
+  # Also disallow .internal and .protected symbols (as well as other flags),
+  # those have not appeared in the binaries we parse. Rejecting these extra
+  # prefixes is done by disallowing spaces in symbol names.
+  assert re.match('^[a-zA-Z0-9_.][a-zA-Z0-9_.$]*$', name), name
+
+  return SymbolInfo(name=name, offset=offset, section=m.group('section'),
+      size=size)


 def _SymbolInfosFromStream(objdump_lines):
@@ -81,7 +159,7 @@ def _SymbolInfosFromStream(objdump_lines):
  name_to_offsets = collections.defaultdict(list)
  symbol_infos = []
  for line in objdump_lines:
-    symbol_info = _FromObjdumpLine(line)
+    symbol_info = _FromObjdumpLine(line.rstrip('\n'))
    if symbol_info is not None:
      name_to_offsets[symbol_info.name].append(symbol_info.offset)
      symbol_infos.append(symbol_info)

--- a/tools/cygprofile/symbol_extractor_unittest.py
+++ b/tools/cygprofile/symbol_extractor_unittest.py
@@ -6,37 +6,67 @@
 import symbol_extractor
 import unittest

+
+# The run of spaces that objdump prints before each symbol name.
+SPACES = ' ' * 14
+
+
 class TestSymbolInfo(unittest.TestCase):
  def testIgnoresBlankLine(self):
    symbol_info = symbol_extractor._FromObjdumpLine('')
    self.assertIsNone(symbol_info)

  def testIgnoresMalformedLine(self):
-    # This line is too short.
-    line = ('00c1b228      F .text  00000060 _ZN20trace_event')
-    symbol_info = symbol_extractor._FromObjdumpLine(line)
-    self.assertIsNone(symbol_info)
-    # This line has the wrong marker.
-    line = '00c1b228 l     f .text  00000060 _ZN20trace_event'
+    # This line is too short: only 6 flags.
+    line = ('00c1b228      F .text\t00000060' + SPACES + '_ZN20trace_event')
    symbol_info = symbol_extractor._FromObjdumpLine(line)
    self.assertIsNone(symbol_info)

+  def testWrongSymbolType(self):
+    # This line has unsupported 'f' as symbol type.
+    line = '00c1b228 l     f .text\t00000060' + SPACES + '_ZN20trace_event'
+    self.assertRaises(AssertionError, symbol_extractor._FromObjdumpLine, line)
+
  def testAssertionErrorOnInvalidLines(self):
    # This line has an invalid scope.
-    line = ('00c1b228 z     F .text  00000060 _ZN20trace_event')
+    line = ('00c1b228 z     F .text\t00000060' + SPACES + '_ZN20trace_event')
    self.assertRaises(AssertionError, symbol_extractor._FromObjdumpLine, line)
-    # This line has too many fields.
-    line = ('00c1b228 l     F .text  00000060 _ZN20trace_event too many')
+    # This line has the symbol name with spaces in it.
+    line = ('00c1b228 l     F .text\t00000060' + SPACES +
+        '_ZN20trace_event too many')
    self.assertRaises(AssertionError, symbol_extractor._FromObjdumpLine, line)
-    # This line has invalid characters in the symbol.
-    line = ('00c1b228 l     F .text  00000060 _ZN20trace_?bad')
+    # This line has invalid characters in the symbol name.
+    line = ('00c1b228 l     F .text\t00000060' + SPACES + '_ZN20trace_?bad')
    self.assertRaises(AssertionError, symbol_extractor._FromObjdumpLine, line)
    # This line has an invalid character at the start of the symbol name.
-    line = ('00c1b228 l     F .text  00000060 $_ZN20trace_bad')
+    line = ('00c1b228 l     F .text\t00000060' + SPACES + '$_ZN20trace_bad')
    self.assertRaises(AssertionError, symbol_extractor._FromObjdumpLine, line)

+  def testSymbolTypeObject(self):
+    # Builds with ThinLTO produce symbols of type 'O'.
+    line = ('009faf60 l     O .text\t00000500' + SPACES + 'AES_Td')
+    symbol_info = symbol_extractor._FromObjdumpLine(line)
+    self.assertIsNotNone(symbol_info)
+    self.assertEquals(0x009faf60, symbol_info.offset)
+    self.assertEquals('.text', symbol_info.section)
+    self.assertEquals(0x500, symbol_info.size)
+    self.assertEquals('AES_Td', symbol_info.name)
+
+  def testSymbolFromLocalLabel(self):
+    line = ('00f64b80 l       .text\t00000000' + SPACES + 'Builtins_Abort')
+    symbol_info = symbol_extractor._FromObjdumpLine(line)
+    self.assertIsNone(symbol_info)
+
+  def testStartOfText(self):
+    line = ('00918000 l       .text\t00000000' + SPACES +
+        '.hidden linker_script_start_of_text')
+    symbol_info = symbol_extractor._FromObjdumpLine(line)
+    self.assertIsNotNone(symbol_info)
+    self.assertEquals(0x00918000, symbol_info.offset)
+    self.assertEquals('linker_script_start_of_text', symbol_info.name)
+
  def testSymbolInfo(self):
-    line = ('00c1c05c l     F .text  0000002c '
+    line = ('00c1c05c l     F .text\t0000002c' + SPACES +
            '_GLOBAL__sub_I_chrome_main_delegate.cc')
    test_name = '_GLOBAL__sub_I_chrome_main_delegate.cc'
    test_offset = 0x00c1c05c
@@ -50,7 +80,7 @@ class TestSymbolInfo(unittest.TestCase):
    self.assertEquals(test_section, symbol_info.section)

  def testHiddenSymbol(self):
-    line = ('00c1c05c l     F .text  0000002c '
+    line = ('00c1c05c l     F .text\t0000002c' + SPACES +
            '.hidden _GLOBAL__sub_I_chrome_main_delegate.cc')
    test_name = '_GLOBAL__sub_I_chrome_main_delegate.cc'
    test_offset = 0x00c1c05c
@@ -66,7 +96,8 @@ class TestSymbolInfo(unittest.TestCase):
  def testDollarInSymbolName(self):
    # A $ character elsewhere in the symbol name is fine.
    # This is an example of a lambda function name from Clang.
-    line = ('00c1b228 l     F .text  00000060 _ZZL11get_globalsvENK3$_1clEv')
+    line = ('00c1b228 l     F .text\t00000060' + SPACES +
+       '_ZZL11get_globalsvENK3$_1clEv')
    symbol_info = symbol_extractor._FromObjdumpLine(line)
    self.assertIsNotNone(symbol_info)
    self.assertEquals(0xc1b228, symbol_info.offset)
@@ -79,10 +110,10 @@ class TestSymbolInfosFromStream(unittest.TestCase):
  def testSymbolInfosFromStream(self):
    lines = ['Garbage',
             '',
-             '00c1c05c l     F .text  0000002c first',
+             '00c1c05c l     F .text\t0000002c' + SPACES + 'first',
             ''
             'more garbage',
-             '00155 g     F .text  00000012 second']
+             '00155 g     F .text\t00000012' + SPACES + 'second']
    symbol_infos = symbol_extractor._SymbolInfosFromStream(lines)
    self.assertEquals(len(symbol_infos), 2)
    first = symbol_extractor.SymbolInfo('first', 0x00c1c05c, 0x2c, '.text')