Commit 74a4cfc0 authored by Egor Pasko, committed by Commit Bot

Orderfile: Allow symbols of type O in symbol_extractor

Also rewrite matching to be more assertive on particular sub-parts of
the string. Hopefully this would allow resolving particular cases before
the perf/memory regressions creep in.

The new parsing finds ~2000 more symbols than the previous
version.

For testing I compared the extracted symbol infos with the output of
'arm-linux-androideabi-nm --print-size --size-sort --reverse-sort' on the
instrumented binary. The differences are:
* 'linker_script_end_of_text' is ignored by the symbol extractor (the
  'linker_script_start_of_text' is output as a special case because we
  rely on it later)
* the symbol extractor seems to rightly ignore local labels and
  pointers deep into functions, like: 'PRE_LOOP_STAGES_7_OR_8' or
  'jsimd_idct_islow_neon_consts'
* v8 builtins like 'Builtins_RecordWrite' have the size 0 and
  unfortunately are not distinguishable from labels above

Bug: 893981
Change-Id: Ie2ac9a76508832cb90fdedda1ceb46f6c2b24dba
Reviewed-on: https://chromium-review.googlesource.com/c/1326486
Reviewed-by: Matthew Cary <mattcary@chromium.org>
Commit-Queue: Egor Pasko <pasko@chromium.org>
Cr-Commit-Position: refs/heads/master@{#606475}
parent af762e38
...@@ -9,6 +9,10 @@ import logging ...@@ -9,6 +9,10 @@ import logging
import os import os
import re import re
START_OF_TEXT_SYMBOL = 'linker_script_start_of_text'
class WarningCollector(object): class WarningCollector(object):
"""Collects warnings, but limits the number printed to a set value.""" """Collects warnings, but limits the number printed to a set value."""
def __init__(self, max_warnings, level=logging.WARNING): def __init__(self, max_warnings, level=logging.WARNING):
......
...@@ -15,6 +15,7 @@ _SRC_PATH = os.path.abspath(os.path.join( ...@@ -15,6 +15,7 @@ _SRC_PATH = os.path.abspath(os.path.join(
os.path.dirname(__file__), os.pardir, os.pardir)) os.path.dirname(__file__), os.pardir, os.pardir))
path = os.path.join(_SRC_PATH, 'tools', 'cygprofile') path = os.path.join(_SRC_PATH, 'tools', 'cygprofile')
sys.path.append(path) sys.path.append(path)
import cygprofile_utils
import symbol_extractor import symbol_extractor
...@@ -49,7 +50,6 @@ class SymbolOffsetProcessor(object): ...@@ -49,7 +50,6 @@ class SymbolOffsetProcessor(object):
higher-level operations can be done in different orders without the caller higher-level operations can be done in different orders without the caller
managing all the state. managing all the state.
""" """
START_OF_TEXT_SYMBOL = 'linker_script_start_of_text'
def __init__(self, binary_filename): def __init__(self, binary_filename):
self._binary_filename = binary_filename self._binary_filename = binary_filename
...@@ -257,7 +257,7 @@ class SymbolOffsetProcessor(object): ...@@ -257,7 +257,7 @@ class SymbolOffsetProcessor(object):
""" """
if self._offset_to_symbol_info is None: if self._offset_to_symbol_info is None:
start_syms = [s for s in self.SymbolInfos() start_syms = [s for s in self.SymbolInfos()
if s.name == self.START_OF_TEXT_SYMBOL] if s.name == cygprofile_utils.START_OF_TEXT_SYMBOL]
assert len(start_syms) == 1, 'Can\'t find unique start of text symbol' assert len(start_syms) == 1, 'Can\'t find unique start of text symbol'
start_of_text = start_syms[0].offset start_of_text = start_syms[0].offset
max_offset = max(s.offset + s.size for s in self.SymbolInfos()) max_offset = max(s.offset + s.size for s in self.SymbolInfos())
......
...@@ -16,7 +16,7 @@ from test_utils import (ProfileFile, ...@@ -16,7 +16,7 @@ from test_utils import (ProfileFile,
TestProfileManager) TestProfileManager)
class ProcessProfilesTestCase(unittest.TestCase): class ProcessProfilesTestCase(unittest.TestCase):
START_SYMBOL = process_profiles.SymbolOffsetProcessor.START_OF_TEXT_SYMBOL START_SYMBOL = 'linker_script_start_of_text'
def setUp(self): def setUp(self):
self.symbol_0 = SimpleTestSymbol(self.START_SYMBOL, 0, 0) self.symbol_0 = SimpleTestSymbol(self.START_SYMBOL, 0, 0)
......
...@@ -35,6 +35,56 @@ def SetArchitecture(arch): ...@@ -35,6 +35,56 @@ def SetArchitecture(arch):
_arch = arch _arch = arch
# Regular expression to match lines printed by 'objdump -t -w'. An example of
# such line looks like this:
# 018db2de l     F .text<TAB>00000060              .hidden _ZN8SkBitmapC2ERKS_
#
# The regex intentionally allows matching more than valid inputs. This gives
# more protection against potentially incorrectly silently ignoring unmatched
# input lines. Instead a few assertions early in _FromObjdumpLine() check the
# validity of the parts matched as 'assert_*' groups.
#
# The pattern is compiled with re.VERBOSE: whitespace outside of character
# classes is ignored, so a literal space is always spelled '[ ]'.
_OBJDUMP_LINE_RE = re.compile(r'''
  # The offset of the function, as hex.
  (?P<offset>^[0-9a-f]+)

  # The space character.
  [ ]

  # The 7 flag characters.
  (
    (?P<assert_scope>.)           # Global, local, unique local, etc.
    (?P<assert_weak_or_strong>.)
    (?P<assert_4spaces>.{4})      # Constructor, warning, indirect ref,
                                  # debugger symbol.
    (?P<symbol_type>.)            # Function, object, file or normal.
  )

  [ ]

  # The section name should start with ".text", can be ".text.foo". With LLD,
  # and especially LTO the traces of input sections are not preserved. Support
  # ".text.foo" for a little longer time because it is easy.
  #
  # The dot is escaped so that only a literal ".text" prefix is accepted, and
  # the suffix is "any run of non-whitespace": excluding hex digits here would
  # reject names such as ".text.foo" or ".text.unlikely", which contain the
  # letters a-f.
  (?P<section>\.text\S*)

  # Whitespace between the section name and the size; checked to be a single
  # tab by the caller.
  (?P<assert_tab>\s+)

  # The size of the symbol, as hex.
  (?P<size>[0-9a-f]+)

  # Normally separated out by 14 spaces, but some bits in ELF may theoretically
  # affect this length.
  (?P<assert_14spaces>[ ]+)

  # Hidden symbols should be treated as usual: skip the literal ".hidden "
  # visibility marker so that it does not end up in the symbol name.
  (\.hidden[ ])?

  # The symbol name.
  (?P<name>.*)

  $
  ''', re.VERBOSE)
def _FromObjdumpLine(line): def _FromObjdumpLine(line):
"""Create a SymbolInfo by parsing a properly formatted objdump output line. """Create a SymbolInfo by parsing a properly formatted objdump output line.
...@@ -44,29 +94,57 @@ def _FromObjdumpLine(line): ...@@ -44,29 +94,57 @@ def _FromObjdumpLine(line):
Returns: Returns:
An instance of SymbolInfo if the line represents a symbol, None otherwise. An instance of SymbolInfo if the line represents a symbol, None otherwise.
""" """
# All of the symbol lines we care about are in the form m = _OBJDUMP_LINE_RE.match(line)
# 0000000000 g F .text.foo 000000000 [.hidden] foo if not m:
# where g (global) might also be l (local) or w (weak).
parts = line.split()
if len(parts) < 6 or parts[2] != 'F':
return None return None
assert len(parts) == 6 or (len(parts) == 7 and parts[5] == '.hidden') assert m.group('assert_scope') in set(['g', 'l']), line
accepted_scopes = set(['g', 'l', 'w']) assert m.group('assert_weak_or_strong') in set(['w', ' ']), line
assert parts[1] in accepted_scopes assert m.group('assert_tab') == '\t', line
assert m.group('assert_4spaces') == ' ' * 4, line
assert m.group('assert_14spaces') == ' ' * 14, line
name = m.group('name')
offset = int(m.group('offset'), 16)
# Output the label that contains the earliest offset. It is needed later for
# translating offsets from the profile dumps.
if name == cygprofile_utils.START_OF_TEXT_SYMBOL:
return SymbolInfo(name=name, offset=offset, section='.text', size=0)
# Check symbol type for validity and ignore some types.
# From objdump manual page: The symbol is the name of a function (F) or a file
# (f) or an object (O) or just a normal symbol (a space). The 'normal' symbols
# seen so far have been function-local labels.
symbol_type = m.group('symbol_type')
if symbol_type == ' ':
# Ignore local goto labels. Unfortunately, v8 builtins (like 'Builtins_.*')
# are indistinguishable from labels of size 0 other than by name.
return None
# Guard against file symbols, since they are normally not seen in the
# binaries we parse.
assert symbol_type != 'f', line
# Extract the size from the ELF field. This value sometimes does not reflect
# the real size of the function. One reason for that is the '.size' directive
# in the assembler. As a result, a few functions in .S files have the size 0.
# They are not instrumented (yet), but maintaining their order in the
# orderfile may be important in some cases.
size = int(m.group('size'), 16)
offset = int(parts[0], 16)
section = parts[3]
size = int(parts[4], 16)
name = parts[-1].rstrip('\n')
# Forbid ARM mapping symbols and other unexpected symbol names, but allow $ # Forbid ARM mapping symbols and other unexpected symbol names, but allow $
# characters in a non-initial position, which can appear as a component of a # characters in a non-initial position, which can appear as a component of a
# mangled name, e.g. Clang can mangle a lambda function to: # mangled name, e.g. Clang can mangle a lambda function to:
# 02cd61e0 l F .text 000000c0 _ZZL11get_globalsvENK3$_1clEv # 02cd61e0 l F .text 000000c0 _ZZL11get_globalsvENK3$_1clEv
# The equivalent objdump line from GCC is: # The equivalent objdump line from GCC is:
# 0325c58c l F .text 000000d0 _ZZL11get_globalsvENKUlvE_clEv # 0325c58c l F .text 000000d0 _ZZL11get_globalsvENKUlvE_clEv
assert re.match('^[a-zA-Z0-9_.][a-zA-Z0-9_.$]*$', name) #
return SymbolInfo(name=name, offset=offset, section=section, size=size) # Also disallow .internal and .protected symbols (as well as other flags),
# those have not appeared in the binaries we parse. Rejecting these extra
# prefixes is done by disallowing spaces in symbol names.
assert re.match('^[a-zA-Z0-9_.][a-zA-Z0-9_.$]*$', name), name
return SymbolInfo(name=name, offset=offset, section=m.group('section'),
size=size)
def _SymbolInfosFromStream(objdump_lines): def _SymbolInfosFromStream(objdump_lines):
...@@ -81,7 +159,7 @@ def _SymbolInfosFromStream(objdump_lines): ...@@ -81,7 +159,7 @@ def _SymbolInfosFromStream(objdump_lines):
name_to_offsets = collections.defaultdict(list) name_to_offsets = collections.defaultdict(list)
symbol_infos = [] symbol_infos = []
for line in objdump_lines: for line in objdump_lines:
symbol_info = _FromObjdumpLine(line) symbol_info = _FromObjdumpLine(line.rstrip('\n'))
if symbol_info is not None: if symbol_info is not None:
name_to_offsets[symbol_info.name].append(symbol_info.offset) name_to_offsets[symbol_info.name].append(symbol_info.offset)
symbol_infos.append(symbol_info) symbol_infos.append(symbol_info)
......
...@@ -6,37 +6,67 @@ ...@@ -6,37 +6,67 @@
import symbol_extractor import symbol_extractor
import unittest import unittest
# The run of spaces that objdump prints between the size and the symbol name.
SPACES = ' ' * 14
class TestSymbolInfo(unittest.TestCase): class TestSymbolInfo(unittest.TestCase):
def testIgnoresBlankLine(self): def testIgnoresBlankLine(self):
symbol_info = symbol_extractor._FromObjdumpLine('') symbol_info = symbol_extractor._FromObjdumpLine('')
self.assertIsNone(symbol_info) self.assertIsNone(symbol_info)
def testIgnoresMalformedLine(self): def testIgnoresMalformedLine(self):
# This line is too short. # This line is too short: only 6 flags.
line = ('00c1b228 F .text 00000060 _ZN20trace_event') line = ('00c1b228 F .text\t00000060' + SPACES + '_ZN20trace_event')
symbol_info = symbol_extractor._FromObjdumpLine(line)
self.assertIsNone(symbol_info)
# This line has the wrong marker.
line = '00c1b228 l f .text 00000060 _ZN20trace_event'
symbol_info = symbol_extractor._FromObjdumpLine(line) symbol_info = symbol_extractor._FromObjdumpLine(line)
self.assertIsNone(symbol_info) self.assertIsNone(symbol_info)
def testWrongSymbolType(self):
# This line has unsupported 'f' as symbol type.
line = '00c1b228 l f .text\t00000060' + SPACES + '_ZN20trace_event'
self.assertRaises(AssertionError, symbol_extractor._FromObjdumpLine, line)
def testAssertionErrorOnInvalidLines(self): def testAssertionErrorOnInvalidLines(self):
# This line has an invalid scope. # This line has an invalid scope.
line = ('00c1b228 z F .text 00000060 _ZN20trace_event') line = ('00c1b228 z F .text\t00000060' + SPACES + '_ZN20trace_event')
self.assertRaises(AssertionError, symbol_extractor._FromObjdumpLine, line) self.assertRaises(AssertionError, symbol_extractor._FromObjdumpLine, line)
# This line has too many fields. # This line has the symbol name with spaces in it.
line = ('00c1b228 l F .text 00000060 _ZN20trace_event too many') line = ('00c1b228 l F .text\t00000060' + SPACES +
'_ZN20trace_event too many')
self.assertRaises(AssertionError, symbol_extractor._FromObjdumpLine, line) self.assertRaises(AssertionError, symbol_extractor._FromObjdumpLine, line)
# This line has invalid characters in the symbol. # This line has invalid characters in the symbol name.
line = ('00c1b228 l F .text 00000060 _ZN20trace_?bad') line = ('00c1b228 l F .text\t00000060' + SPACES + '_ZN20trace_?bad')
self.assertRaises(AssertionError, symbol_extractor._FromObjdumpLine, line) self.assertRaises(AssertionError, symbol_extractor._FromObjdumpLine, line)
# This line has an invalid character at the start of the symbol name. # This line has an invalid character at the start of the symbol name.
line = ('00c1b228 l F .text 00000060 $_ZN20trace_bad') line = ('00c1b228 l F .text\t00000060' + SPACES + '$_ZN20trace_bad')
self.assertRaises(AssertionError, symbol_extractor._FromObjdumpLine, line) self.assertRaises(AssertionError, symbol_extractor._FromObjdumpLine, line)
def testSymbolTypeObject(self):
# Builds with ThinLTO produce symbols of type 'O'.
line = ('009faf60 l O .text\t00000500' + SPACES + 'AES_Td')
symbol_info = symbol_extractor._FromObjdumpLine(line)
self.assertIsNotNone(symbol_info)
self.assertEquals(0x009faf60, symbol_info.offset)
self.assertEquals('.text', symbol_info.section)
self.assertEquals(0x500, symbol_info.size)
self.assertEquals('AES_Td', symbol_info.name)
def testSymbolFromLocalLabel(self):
line = ('00f64b80 l .text\t00000000' + SPACES + 'Builtins_Abort')
symbol_info = symbol_extractor._FromObjdumpLine(line)
self.assertIsNone(symbol_info)
def testStartOfText(self):
line = ('00918000 l .text\t00000000' + SPACES +
'.hidden linker_script_start_of_text')
symbol_info = symbol_extractor._FromObjdumpLine(line)
self.assertIsNotNone(symbol_info)
self.assertEquals(0x00918000, symbol_info.offset)
self.assertEquals('linker_script_start_of_text', symbol_info.name)
def testSymbolInfo(self): def testSymbolInfo(self):
line = ('00c1c05c l F .text 0000002c ' line = ('00c1c05c l F .text\t0000002c' + SPACES +
'_GLOBAL__sub_I_chrome_main_delegate.cc') '_GLOBAL__sub_I_chrome_main_delegate.cc')
test_name = '_GLOBAL__sub_I_chrome_main_delegate.cc' test_name = '_GLOBAL__sub_I_chrome_main_delegate.cc'
test_offset = 0x00c1c05c test_offset = 0x00c1c05c
...@@ -50,7 +80,7 @@ class TestSymbolInfo(unittest.TestCase): ...@@ -50,7 +80,7 @@ class TestSymbolInfo(unittest.TestCase):
self.assertEquals(test_section, symbol_info.section) self.assertEquals(test_section, symbol_info.section)
def testHiddenSymbol(self): def testHiddenSymbol(self):
line = ('00c1c05c l F .text 0000002c ' line = ('00c1c05c l F .text\t0000002c' + SPACES +
'.hidden _GLOBAL__sub_I_chrome_main_delegate.cc') '.hidden _GLOBAL__sub_I_chrome_main_delegate.cc')
test_name = '_GLOBAL__sub_I_chrome_main_delegate.cc' test_name = '_GLOBAL__sub_I_chrome_main_delegate.cc'
test_offset = 0x00c1c05c test_offset = 0x00c1c05c
...@@ -66,7 +96,8 @@ class TestSymbolInfo(unittest.TestCase): ...@@ -66,7 +96,8 @@ class TestSymbolInfo(unittest.TestCase):
def testDollarInSymbolName(self): def testDollarInSymbolName(self):
# A $ character elsewhere in the symbol name is fine. # A $ character elsewhere in the symbol name is fine.
# This is an example of a lambda function name from Clang. # This is an example of a lambda function name from Clang.
line = ('00c1b228 l F .text 00000060 _ZZL11get_globalsvENK3$_1clEv') line = ('00c1b228 l F .text\t00000060' + SPACES +
'_ZZL11get_globalsvENK3$_1clEv')
symbol_info = symbol_extractor._FromObjdumpLine(line) symbol_info = symbol_extractor._FromObjdumpLine(line)
self.assertIsNotNone(symbol_info) self.assertIsNotNone(symbol_info)
self.assertEquals(0xc1b228, symbol_info.offset) self.assertEquals(0xc1b228, symbol_info.offset)
...@@ -79,10 +110,10 @@ class TestSymbolInfosFromStream(unittest.TestCase): ...@@ -79,10 +110,10 @@ class TestSymbolInfosFromStream(unittest.TestCase):
def testSymbolInfosFromStream(self): def testSymbolInfosFromStream(self):
lines = ['Garbage', lines = ['Garbage',
'', '',
'00c1c05c l F .text 0000002c first', '00c1c05c l F .text\t0000002c' + SPACES + 'first',
'' ''
'more garbage', 'more garbage',
'00155 g F .text 00000012 second'] '00155 g F .text\t00000012' + SPACES + 'second']
symbol_infos = symbol_extractor._SymbolInfosFromStream(lines) symbol_infos = symbol_extractor._SymbolInfosFromStream(lines)
self.assertEquals(len(symbol_infos), 2) self.assertEquals(len(symbol_infos), 2)
first = symbol_extractor.SymbolInfo('first', 0x00c1c05c, 0x2c, '.text') first = symbol_extractor.SymbolInfo('first', 0x00c1c05c, 0x2c, '.text')
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment