Commit 753ce836 authored by lizeb, committed by Commit bot

Refactor the symbolize step of the orderfile generation.

This builds on the previous refactorings (linked in the bug), and unifies the
parsing of object files. It also removes the fuzzy matching of offsets that
was previously done in patch_orderfile.py.

It is also faster for three reasons:
- Elimination of an O(N^2) search.
- Parallelization of object file parsing.
- No binary search: symbols are grouped into a dict keyed by offset, so each
  profiled address is resolved with a single lookup (see the sketch below).
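
A minimal sketch of that lookup, assuming the usual SymbolInfo fields (name,
offset, size, section); GroupByOffset below is a stand-in for
symbol_extractor.GroupSymbolInfosByOffset, not code from this change:

    import collections

    SymbolInfo = collections.namedtuple(
        'SymbolInfo', ['name', 'offset', 'size', 'section'])

    def GroupByOffset(symbol_infos):
      # One pass builds {offset: [SymbolInfo]}; symbols aliased to the same
      # offset stay grouped in a list.
      offset_map = collections.defaultdict(list)
      for info in symbol_infos:
        offset_map[info.offset].append(info)
      return offset_map

    # Example: two symbols aliased at 0x16, as in the unit test below.
    infos = [SymbolInfo('Symbol3', 0x16, 0x13, '.text'),
             SymbolInfo('Symbol32', 0x16, 0x13, '.text')]
    assert GroupByOffset(infos)[0x16][0].name == 'Symbol3'

Each profiled offset then resolves with a single dict lookup, falling back to
offset - 1 for odd (Thumb) addresses.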

BUG=452879

Review URL: https://codereview.chromium.org/874683004

Cr-Commit-Position: refs/heads/master@{#314137}
parent 23c3f94f
#!/usr/bin/python
# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Symbolizes a log file produced by cyprofile instrumentation.
Given a log file and the binary being profiled, creates an orderfile.
"""

import logging
import multiprocessing
import optparse
import os
import tempfile
import string
import sys

import symbol_extractor


def _ParseLogLines(log_file_lines):
"""Parses a merged cyglog produced by mergetraces.py.
Args:
log_file_lines: array of lines in log file produced by profiled run
lib_name: library or executable containing symbols
Below is an example of a small log file:
5086e000-52e92000 r-xp 00000000 b3:02 51276 libchromeview.so
secs usecs pid:threadid func
START
1314897086 795828 3587:1074648168 0x509e105c
1314897086 795874 3587:1074648168 0x509e0eb4
1314897086 796326 3587:1074648168 0x509e0e3c
1314897086 796552 3587:1074648168 0x509e07bc
END
Returns:
An ordered list of callee offsets.
"""
call_lines = []
vm_start = 0
line = log_file_lines[0]
assert 'r-xp' in line
end_index = line.find('-')
vm_start = int(line[:end_index], 16)
for line in log_file_lines[3:]:
fields = line.split()
if len(fields) == 4:
call_lines.append(fields)
else:
assert fields[0] == 'END'
# Convert strings to int in fields.
call_info = []
for call_line in call_lines:
addr = int(call_line[3], 16)
if vm_start < addr:
addr -= vm_start
call_info.append(addr)
return call_info
def _GroupLibrarySymbolInfosByOffset(lib_filename):
"""Returns a dict {offset: [SymbolInfo]} from a library."""
symbol_infos = symbol_extractor.SymbolInfosFromBinary(lib_filename)
return symbol_extractor.GroupSymbolInfosByOffset(symbol_infos)
class SymbolNotFoundException(Exception):
def __init__(self, value):
super(SymbolNotFoundException, self).__init__(value)
self.value = value
def __str__(self):
return repr(self.value)
def _FindSymbolInfosAtOffset(offset_to_symbol_infos, offset):
"""Finds all SymbolInfo at a given offset.
Args:
offset_to_symbol_infos: {offset: [SymbolInfo]}
offset: offset to look the symbols at
Returns:
The list of SymbolInfo at the given offset
Raises:
SymbolNotFoundException if the offset doesn't match any symbol.
"""
if offset in offset_to_symbol_infos:
return offset_to_symbol_infos[offset]
elif offset % 2 and (offset - 1) in offset_to_symbol_infos:
# On ARM, odd addresses are used to signal thumb instruction. They are
# generated by setting the LSB to 1 (see
# http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0471e/Babfjhia.html).
# TODO(lizeb): Make sure this hack doesn't propagate to other archs.
return offset_to_symbol_infos[offset - 1]
else:
raise SymbolNotFoundException(offset)
class WarningCollector(object):
"""Collect warnings, but limit the number printed to a set value."""
def __init__(self, max_warnings):
self._warnings = 0
self._max_warnings = max_warnings
def Write(self, message):
if self._warnings < self._max_warnings:
logging.warning(message)
self._warnings += 1
def WriteEnd(self, message):
if self._warnings > self._max_warnings:
logging.warning('%d more warnings for: %s' % (
self._warnings - self._max_warnings, message))
def _GetObjectFileNames(obj_dir):
"""Returns the list of object files in a directory."""
obj_files = []
for (dirpath, _, filenames) in os.walk(obj_dir):
for file_name in filenames:
if file_name.endswith('.o'):
obj_files.append(os.path.join(dirpath, file_name))
return obj_files
def _AllSymbolInfos(object_filenames):
"""Returns a list of SymbolInfo from an iterable of filenames."""
pool = multiprocessing.Pool()
# Hopefully the object files are in the page cache at this step, so IO should
# not be a problem (hence no concurrency limit on the pool).
symbol_infos_nested = pool.map(
symbol_extractor.SymbolInfosFromBinary, object_filenames)
result = []
for symbol_infos in symbol_infos_nested:
result += symbol_infos
return result
def _GetSymbolToSectionMapFromObjectFiles(obj_dir):
""" Creates a mapping from symbol to linker section name by scanning all
the object files.
"""
object_files = _GetObjectFileNames(obj_dir)
symbol_to_section_map = {}
symbol_warnings = WarningCollector(300)
symbol_infos = _AllSymbolInfos(object_files)
for symbol_info in symbol_infos:
symbol = symbol_info.name
if symbol.startswith('.LTHUNK'):
continue
section = symbol_info.section
if ((symbol in symbol_to_section_map) and
(symbol_to_section_map[symbol] != symbol_info.section)):
symbol_warnings.Write('Symbol ' + symbol +
' in conflicting sections ' + section +
' and ' + symbol_to_section_map[symbol])
elif not section.startswith('.text'):
symbol_warnings.Write('Symbol ' + symbol +
' in incorrect section ' + section)
else:
symbol_to_section_map[symbol] = section
symbol_warnings.WriteEnd('bad sections')
return symbol_to_section_map
def _WarnAboutDuplicates(offsets):
"""Warns about duplicate offsets.
Args:
offsets: list of offsets to check for duplicates
Returns:
True if there are no duplicates, False otherwise.
"""
seen_offsets = set()
ok = True
for offset in offsets:
if offset not in seen_offsets:
seen_offsets.add(offset)
else:
ok = False
logging.warning('Duplicate offset: ' + hex(offset))
return ok
def _OutputOrderfile(offsets, offset_to_symbol_infos, symbol_to_section_map,
output_file):
"""Outputs the orderfile to output_file.
Args:
offsets: Iterable of offsets to match to section names
offset_to_symbol_infos: {offset: [SymbolInfo]}
symbol_to_section_map: {name: section}
output_file: file-like object to write the results to
"""
success = True
unknown_symbol_warnings = WarningCollector(300)
symbol_not_found_warnings = WarningCollector(300)
for offset in offsets:
try:
symbol_infos = _FindSymbolInfosAtOffset(offset_to_symbol_infos, offset)
for symbol_info in symbol_infos:
if symbol_info.name in symbol_to_section_map:
output_file.write(symbol_to_section_map[symbol_info.name] + '\n')
else:
unknown_symbol_warnings.Write(
'No known section for symbol ' + symbol_info.name)
except SymbolNotFoundException:
symbol_not_found_warnings.Write(
'Did not find function in binary. offset: ' + hex(offset))
success = False
unknown_symbol_warnings.WriteEnd('no known section for symbol.')
symbol_not_found_warnings.WriteEnd('symbol not found in the binary.')
return success
def main():
  if len(sys.argv) != 4:
    logging.error('Usage: cyglog_to_orderfile.py <merged_cyglog> '
                  '<library> <output_filename>')
    return 1
  (log_filename, lib_filename, output_filename) = sys.argv[1:]
  obj_dir = os.path.abspath(os.path.join(
      os.path.dirname(lib_filename), '../obj'))
  log_file_lines = map(string.rstrip, open(log_filename).readlines())
  offsets = _ParseLogLines(log_file_lines)
  _WarnAboutDuplicates(offsets)
  offset_to_symbol_infos = _GroupLibrarySymbolInfosByOffset(lib_filename)
  symbol_to_section_map = _GetSymbolToSectionMapFromObjectFiles(obj_dir)
  success = False
  temp_filename = None
  output_file = None
  try:
    (fd, temp_filename) = tempfile.mkstemp(dir=os.path.dirname(output_filename))
    output_file = os.fdopen(fd, 'w')
    ok = _OutputOrderfile(
        offsets, offset_to_symbol_infos, symbol_to_section_map, output_file)
    output_file.close()
    os.rename(temp_filename, output_filename)
    temp_filename = None
    success = ok
  finally:
    if output_file:
      output_file.close()
    if temp_filename:
      os.remove(temp_filename)
  return 0 if success else 1


if __name__ == '__main__':
  logging.basicConfig(level=logging.INFO)
  sys.exit(main())
#!/usr/bin/python
# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import unittest

import cyglog_to_orderfile
import symbol_extractor


class TestCyglogToOrderfile(unittest.TestCase):
  def testParseLogLines(self):
lines = """5086e000-52e92000 r-xp 00000000 b3:02 51276 libchromeview.so
secs usecs pid:threadid func
START
1314897086 795828 3587:1074648168 0x509e105c
1314897086 795874 3587:1074648168 0x509e0eb4
END""".split('\n')
offsets = cyglog_to_orderfile._ParseLogLines(lines)
self.assertListEqual(
offsets, [0x509e105c - 0x5086e000, 0x509e0eb4 - 0x5086e000])
def testFindSymbolInfosAtOffsetExactMatch(self):
    offset_map = {0x10: [symbol_extractor.SymbolInfo(
        name='Symbol', offset=0x10, size=0x13, section='.text')]}
    functions = cyglog_to_orderfile._FindSymbolInfosAtOffset(offset_map, 0x10)
    self.assertEquals(len(functions), 1)
    self.assertEquals(functions[0], offset_map[0x10][0])

  def testFindSymbolInfosAtOffsetInexactMatch(self):
    offset_map = {0x10: [symbol_extractor.SymbolInfo(
        name='Symbol', offset=0x10, size=0x13, section='.text')]}
    functions = cyglog_to_orderfile._FindSymbolInfosAtOffset(offset_map, 0x11)
    self.assertEquals(len(functions), 1)
    self.assertEquals(functions[0], offset_map[0x10][0])

  def testFindSymbolInfosAtOffsetNoMatch(self):
    offset_map = {0x10: [symbol_extractor.SymbolInfo(
        name='Symbol', offset=0x10, size=0x13, section='.text')]}
    self.assertRaises(
        cyglog_to_orderfile.SymbolNotFoundException,
        cyglog_to_orderfile._FindSymbolInfosAtOffset, offset_map, 0x12)

  def testWarnAboutDuplicates(self):
    offsets = [0x1, 0x2, 0x3]
    self.assertTrue(cyglog_to_orderfile._WarnAboutDuplicates(offsets))
    offsets.append(0x1)
    self.assertFalse(cyglog_to_orderfile._WarnAboutDuplicates(offsets))

  def testOutputOrderfile(self):
    class FakeOutputFile(object):
      def __init__(self):
        self.writes = []

      def write(self, data):
        self.writes.append(data)

    # One symbol not matched, one with an odd address, one regularly matched,
    # and two symbols aliased to the same address.
    offsets = [0x12, 0x17]
    offset_to_symbol_infos = {
        0x10: [symbol_extractor.SymbolInfo(
            name='Symbol', offset=0x10, size=0x13, section='dummy')],
        0x12: [symbol_extractor.SymbolInfo(
            name='Symbol2', offset=0x12, size=0x13, section='dummy')],
        0x16: [symbol_extractor.SymbolInfo(
            name='Symbol3', offset=0x16, size=0x13, section='dummy'),
               symbol_extractor.SymbolInfo(
                   name='Symbol32', offset=0x16, size=0x13, section='dummy')]}
    symbol_to_section_map = {
        'Symbol': '.text.Symbol',
        'Symbol2': '.text.Symbol2',
        'Symbol3': '.text.Symbol3',
        'Symbol32': '.text.Symbol32'}
    fake_output = FakeOutputFile()
    cyglog_to_orderfile._OutputOrderfile(
        offsets, offset_to_symbol_infos, symbol_to_section_map, fake_output)
    expected = """.text.Symbol2
.text.Symbol3
.text.Symbol32
"""
    self.assertEquals(expected, "".join(fake_output.writes))


if __name__ == '__main__':
  unittest.main()