Commit 5fbc1731 authored by Samuel Huang, committed by Commit Bot

[Supersize] Split nm.py into {nm.py, obj_analyzer.py, string_extract.py}.

This is a code movement CL with minimal changes. Details:
- Add obj_analyzer.py: New home for BulkObjectFileAnalyzer, along with
  helper functions. Also inherits main() from nm.py for testing.
- Add string_extract.py: New home for {LookupElfRodataInfo(),
  ReadFileChunks(), and ResolveStringPieces()}. Add top-level comments.
- nm.py: Move content into the new files. Also expose
  RunNmOnIntermediates(), to be called from obj_analyzer.py.
- Update archive.py and console.py to adapt to new code locations.

Bug: 723798
Change-Id: I1d1670f04549a416f06de1da03c1a2b03c378461
Reviewed-on: https://chromium-review.googlesource.com/1136943
Commit-Queue: Samuel Huang <huangs@chromium.org>
Reviewed-by: agrieve <agrieve@chromium.org>
Cr-Commit-Position: refs/heads/master@{#575062}
parent af58eea6
@@ -30,6 +30,7 @@ import linker_map_parser
import models
import ninja_parser
import nm
+import obj_analyzer
import path_util
sys.path.insert(1, os.path.join(path_util.SRC_ROOT, 'tools', 'grit'))
@@ -766,7 +767,7 @@ def _ParseElfInfo(map_path, elf_path, tool_prefix, track_string_literals,
  # Rather than record all paths for each symbol, set the paths to be the
  # common ancestor of all paths.
  if outdir_context:
-    bulk_analyzer = nm.BulkObjectFileAnalyzer(
+    bulk_analyzer = obj_analyzer.BulkObjectFileAnalyzer(
        tool_prefix, outdir_context.output_directory)
    bulk_analyzer.AnalyzePaths(outdir_context.elf_object_paths)
@@ -23,8 +23,8 @@ import diff
import file_format
import match_util
import models
-import nm
import path_util
+import string_extract
# Number of lines before using less for Print().
@@ -122,7 +122,8 @@ class _Session(object):
    elf_path = self._ElfPathForSymbol(
        size_info, tool_prefix, elf_path)
-    address, offset, _ = nm.LookupElfRodataInfo(elf_path, tool_prefix)
+    address, offset, _ = string_extract.LookupElfRodataInfo(
+        elf_path, tool_prefix)
    adjust = offset - address
    ret = []
    with open(elf_path, 'rb') as f:
# Copyright 2018 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Utilities to extract string literals from object files.
LookupElfRodataInfo():
Runs readelf to extract and return .rodata section spec of an ELF file.
ReadFileChunks():
Reads raw data from a file, given a list of ranges in the file.
ResolveStringPieces():
BulkForkAndCall() target: Given {path: [string addresses]} and
[raw_string_data for each string_section]:
- Reads {path: [src_strings]}.
- For each path, searches for src_strings in at most 1 raw_string_data over
each string_section. If found, translates to string_range and annotates it
to the string_section.
- Returns [{path: [string_ranges]} for each string_section].
"""

import collections
import itertools
import logging
import os
import subprocess

import ar
import concurrent
import models
import path_util


def LookupElfRodataInfo(elf_path, tool_prefix):
  """Returns (address, offset, size) for the .rodata section."""
  args = [path_util.GetReadElfPath(tool_prefix), '-S', '--wide', elf_path]
  output = subprocess.check_output(args)
  lines = output.splitlines()
  for line in lines:
    # [Nr] Name     Type      Addr     Off     Size   ES Flg Lk Inf Al
    # [07] .rodata  PROGBITS  025e7000 237c000 5ec4f6 00   A  0   0 256
    if '.rodata ' in line:
      fields = line[line.index(models.SECTION_RODATA):].split()
      return int(fields[2], 16), int(fields[3], 16), int(fields[4], 16)
  raise AssertionError('No .rodata for command: ' + repr(args))
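
# Example of the returned tuple's use (hypothetical path; mirrors the
# console.py change above):
#   address, offset, _ = LookupElfRodataInfo('libmonochrome.so', tool_prefix)
#   adjust = offset - address  # Maps .rodata addresses to file offsets.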


def ReadFileChunks(path, positions):
  """Returns a list of strings corresponding to |positions|.

  Args:
    positions: List of (offset, size).
  """
  ret = []
  if not positions:
    return ret
  with open(path, 'rb') as f:
    for offset, size in positions:
      f.seek(offset)
      ret.append(f.read(size))
  return ret
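
# Example (hypothetical path and ranges):
#   ReadFileChunks('obj/foo.o', [(0xb4, 4), (0x530, 2)])
#   # -> [<4 raw bytes at offset 0xb4>, <2 raw bytes at offset 0x530>]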


def _ExtractArchivePath(path):
  # E.g. foo/bar.a(baz.o)
  if path.endswith(')'):
    start_idx = path.index('(')
    return path[:start_idx]
  return None
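
# Examples:
#   _ExtractArchivePath('foo/bar.a(baz.o)')  # -> 'foo/bar.a'
#   _ExtractArchivePath('foo/baz.o')         # -> None (not inside an archive)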


def _LookupStringSectionPositions(target, tool_prefix, output_directory):
  """Returns a dict of object_path -> [(offset, size)...] of .rodata sections.

  Args:
    target: An archive path string (e.g., "foo.a") or a list of object paths.
  """
  is_archive = isinstance(target, basestring)
  args = [path_util.GetReadElfPath(tool_prefix), '-S', '--wide']
  if is_archive:
    args.append(target)
  else:
    # Assign path for when len(target) == 1, (no File: line exists).
    path = target[0]
    args.extend(target)
  output = subprocess.check_output(args, cwd=output_directory)
  lines = output.splitlines()
  section_positions_by_path = {}
  cur_offsets = []
  for line in lines:
    # File: base/third_party/libevent/libevent.a(buffer.o)
    # [Nr] Name            Type      Addr     Off    Size   ES Flg Lk Inf Al
    # [11] .rodata.str1.1  PROGBITS  00000000 0000b4 000004 01 AMS  0   0  1
    # [11] .rodata.str4.4  PROGBITS  00000000 0000b4 000004 01 AMS  0   0  4
    # [11] .rodata.str8.8  PROGBITS  00000000 0000b4 000004 01 AMS  0   0  8
    # [80] .rodata..L.str  PROGBITS  00000000 000530 000002 00   A  0   0  1
    # The various string sections differ by alignment.
    # The presence of a wchar_t literal (L"asdf") seems to make a str4 section.
    # When multiple sections exist, nm gives us no indication as to which
    # section each string corresponds to.
    if line.startswith('File: '):
      if cur_offsets:
        section_positions_by_path[path] = cur_offsets
        cur_offsets = []
      path = line[6:]
    elif '.rodata.' in line:
      progbits_idx = line.find('PROGBITS ')
      if progbits_idx != -1:
        fields = line[progbits_idx:].split()
        position = (int(fields[2], 16), int(fields[3], 16))
        # The heuristics in _IterStringLiterals rely on str1 coming first.
        if fields[-1] == '1':
          cur_offsets.insert(0, position)
        else:
          cur_offsets.append(position)
  if cur_offsets:
    section_positions_by_path[path] = cur_offsets
  return section_positions_by_path
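
# Example return value (paths and numbers illustrative), with each str1
# section's position sorted to the front of its list:
#   {'libevent.a(buffer.o)': [(0xb4, 4)],
#    'obj/foo.o': [(0xb4, 4), (0x530, 2)]}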


def _ReadStringSections(target, output_directory, positions_by_path):
  """Returns a dict of object_path -> [string...] of .rodata chunks.

  Args:
    target: An archive path string (e.g., "foo.a") or a list of object paths.
    positions_by_path: A dict of object_path -> [(offset, size)...]
  """
  is_archive = isinstance(target, basestring)
  string_sections_by_path = {}
  if is_archive:
    for subpath, chunk in ar.IterArchiveChunks(
        os.path.join(output_directory, target)):
      path = '{}({})'.format(target, subpath)
      positions = positions_by_path.get(path)
      # No positions if file has no string literals.
      if positions:
        string_sections_by_path[path] = (
            [chunk[offset:offset + size] for offset, size in positions])
  else:
    for path in target:
      positions = positions_by_path.get(path)
      # We already log a warning about this in _IterStringLiterals().
      if positions:
        string_sections_by_path[path] = ReadFileChunks(
            os.path.join(output_directory, path), positions)
  return string_sections_by_path
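
# Example (illustrative): given positions {'obj/foo.o': [(0xb4, 4)]}, returns
# {'obj/foo.o': [<the 4 raw bytes at offset 0xb4 of obj/foo.o>]}.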


def _IterStringLiterals(path, addresses, obj_sections):
  """Yields all string literals (including \0) for the given object path.

  Args:
    path: Object file path.
    addresses: List of string offsets encoded as hex strings.
    obj_sections: List of contents of .rodata.str sections read from the given
        object file.
  """
  next_offsets = sorted(int(a, 16) for a in addresses)
  if not obj_sections:
    # Happens when there is an address for a symbol which is not actually a
    # string literal, or when string_sections_by_path is missing an entry.
    logging.warning('Object has %d strings but no string sections: %s',
                    len(addresses), path)
    return
  for section_data in obj_sections:
    cur_offsets = next_offsets
    # Always assume first element is 0. I'm not entirely sure why this is
    # necessary, but strings get missed without it.
    next_offsets = [0]
    prev_offset = 0
    # TODO(agrieve): Switch to using nm --print-size in order to capture the
    #     address+size of each string rather than just the address.
    for offset in cur_offsets[1:]:
      if offset >= len(section_data):
        # Remaining offsets are for next section.
        next_offsets.append(offset)
        continue
      # Figure out which offsets apply to this section via heuristic of them
      # all ending with a null character.
      if offset == prev_offset or section_data[offset - 1] != '\0':
        next_offsets.append(offset)
        continue
      yield section_data[prev_offset:offset]
      prev_offset = offset
    if prev_offset < len(section_data):
      yield section_data[prev_offset:]
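
# Worked example (illustrative): for a single section 'Hi\0Bye\0' and sorted
# offsets [0, 3], the loop sees that the byte before offset 3 is '\0' (the
# null-terminator heuristic), yields 'Hi\0', then yields the trailing chunk
# 'Bye\0' once the loop ends.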


# This is a target for BulkForkAndCall().
def ResolveStringPieces(encoded_string_addresses_by_path, string_data,
                        tool_prefix, output_directory):
  string_addresses_by_path = concurrent.DecodeDictOfLists(
      encoded_string_addresses_by_path)
  # Assign |target| as archive path, or a list of object paths.
  any_path = next(string_addresses_by_path.iterkeys())
  target = _ExtractArchivePath(any_path)
  if not target:
    target = string_addresses_by_path.keys()

  # Run readelf to find location of .rodata within the .o files.
  section_positions_by_path = _LookupStringSectionPositions(
      target, tool_prefix, output_directory)
  # Load the .rodata sections (from object files) as strings.
  string_sections_by_path = _ReadStringSections(
      target, output_directory, section_positions_by_path)

  # List of elf_positions_by_path: one dict per string section.
  ret = [collections.defaultdict(list) for _ in string_data]
  # Brute-force search of strings within ** merge strings sections.
  # This is by far the slowest part of AnalyzeStringLiterals().
  # TODO(agrieve): Pre-process string_data into a dict of literal->address (at
  #     least for ascii strings).
  for path, object_addresses in string_addresses_by_path.iteritems():
    for value in _IterStringLiterals(
        path, object_addresses, string_sections_by_path.get(path)):
      first_match = -1
      first_match_dict = None
      for target_dict, data in itertools.izip(ret, string_data):
        # Set offset so that it will be 0 when len(value) is added to it below.
        offset = -len(value)
        while True:
          offset = data.find(value, offset + len(value))
          if offset == -1:
            break
          # Preferring exact matches (those following \0) over substring
          # matches significantly increases accuracy (although it shows that
          # the linker isn't being optimal).
          if offset == 0 or data[offset - 1] == '\0':
            break
          if first_match == -1:
            first_match = offset
            first_match_dict = target_dict
        if offset != -1:
          break
      if offset == -1:
        # Exact match not found, so take suffix match if it exists.
        offset = first_match
        target_dict = first_match_dict
      # Missing strings happen when optimization makes them unused.
      if offset != -1:
        # Encode tuple as a string for easier marshalling.
        target_dict[path].append(
            str(offset) + ':' + str(len(value)))
  return [concurrent.EncodeDictOfLists(x) for x in ret]
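
# Input/output sketch (values illustrative): the encoded input decodes to
# {'obj/foo.o': ['1c', '20', ...]} (hex string addresses per object path);
# the return value holds, for each ** merge strings section, an encoded
# {path: ['offset:length', ...]} dict of resolved string ranges.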
@@ -57,4 +57,6 @@ libsupersize/match_util.py
libsupersize/models.py
libsupersize/ninja_parser.py
libsupersize/nm.py
+libsupersize/obj_analyzer.py
libsupersize/path_util.py
+libsupersize/string_extract.py