Commit 5fbc1731 authored by Samuel Huang, committed by Commit Bot

[Supersize] Split nm.py into {nm.py, obj_analyzer.py, string_extract.py}.

This is a code movement CL with minimal changes. Details:
- Add obj_analyzer.py: New home for BulkObjectFileAnalyzer, along with
  helper functions. Also inherits main() from nm.py for testing.
- Add string_extract.py: New home for {LookupElfRodataInfo(),
  ReadFileChunks(), and ResolveStringPieces()}. Add top-level comments.
- nm.py: Move content into the new files. Also expose
  RunNmOnIntermediates(), to be called from obj_analyzer.py.
- Update archive.py and console.py to adapt to new code locations.

Bug: 723798
Change-Id: I1d1670f04549a416f06de1da03c1a2b03c378461
Reviewed-on: https://chromium-review.googlesource.com/1136943
Commit-Queue: Samuel Huang <huangs@chromium.org>
Reviewed-by: agrieve <agrieve@chromium.org>
Cr-Commit-Position: refs/heads/master@{#575062}
parent af58eea6
@@ -30,6 +30,7 @@ import linker_map_parser
import models
import ninja_parser
import nm
+import obj_analyzer
import path_util
sys.path.insert(1, os.path.join(path_util.SRC_ROOT, 'tools', 'grit'))
@@ -766,7 +767,7 @@ def _ParseElfInfo(map_path, elf_path, tool_prefix, track_string_literals,
  # Rather than record all paths for each symbol, set the paths to be the
  # common ancestor of all paths.
  if outdir_context:
-    bulk_analyzer = nm.BulkObjectFileAnalyzer(
+    bulk_analyzer = obj_analyzer.BulkObjectFileAnalyzer(
        tool_prefix, outdir_context.output_directory)
    bulk_analyzer.AnalyzePaths(outdir_context.elf_object_paths)
@@ -23,8 +23,8 @@ import diff
import file_format
import match_util
import models
-import nm
import path_util
+import string_extract
# Number of lines before using less for Print().
@@ -122,7 +122,8 @@ class _Session(object):
    elf_path = self._ElfPathForSymbol(
        size_info, tool_prefix, elf_path)
-    address, offset, _ = nm.LookupElfRodataInfo(elf_path, tool_prefix)
+    address, offset, _ = string_extract.LookupElfRodataInfo(
+        elf_path, tool_prefix)
    adjust = offset - address
    ret = []
    with open(elf_path, 'rb') as f:
# Copyright 2018 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Utilities to extract string literals from object files.
LookupElfRodataInfo():
Runs readelf to extract and return .rodata section spec of an ELF file.
ReadFileChunks():
Reads raw data from a file, given a list of ranges in the file.
ResolveStringPieces():
BulkForkAndCall() target: Given {path: [string addresses]} and
[raw_string_data for each string_section]:
- Reads {path: [src_strings]}.
- For each path, searches for src_strings in at most 1 raw_string_data over
each string_section. If found, translates to string_range and annotates it
to the string_section.
- Returns [{path: [string_ranges]} for each string_section].
"""

import collections
import itertools
import logging
import os
import subprocess

import ar
import concurrent
import models
import path_util


def LookupElfRodataInfo(elf_path, tool_prefix):
  """Returns (address, offset, size) for the .rodata section."""
  args = [path_util.GetReadElfPath(tool_prefix), '-S', '--wide', elf_path]
  output = subprocess.check_output(args)
  lines = output.splitlines()
  for line in lines:
    # [Nr] Name     Type      Addr     Off     Size   ES Flg Lk Inf Al
    # [07] .rodata  PROGBITS  025e7000 237c000 5ec4f6 00   A  0   0 256
    if '.rodata ' in line:
      fields = line[line.index(models.SECTION_RODATA):].split()
      return int(fields[2], 16), int(fields[3], 16), int(fields[4], 16)
  raise AssertionError('No .rodata for command: ' + repr(args))
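
# Example of the returned tuple's use (hypothetical path; mirrors the
# console.py change above):
#   address, offset, _ = LookupElfRodataInfo('libmonochrome.so', tool_prefix)
#   adjust = offset - address  # Maps .rodata addresses to file offsets.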


def ReadFileChunks(path, positions):
  """Returns a list of strings corresponding to |positions|.

  Args:
    positions: List of (offset, size).
  """
  ret = []
  if not positions:
    return ret
  with open(path, 'rb') as f:
    for offset, size in positions:
      f.seek(offset)
      ret.append(f.read(size))
  return ret
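
# Example (hypothetical path and ranges):
#   ReadFileChunks('obj/foo.o', [(0xb4, 4), (0x530, 2)])
#   # -> [<4 raw bytes at offset 0xb4>, <2 raw bytes at offset 0x530>]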


def _ExtractArchivePath(path):
  # E.g. foo/bar.a(baz.o)
  if path.endswith(')'):
    start_idx = path.index('(')
    return path[:start_idx]
  return None
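
# Examples:
#   _ExtractArchivePath('foo/bar.a(baz.o)')  # -> 'foo/bar.a'
#   _ExtractArchivePath('foo/baz.o')         # -> None (not inside an archive)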


def _LookupStringSectionPositions(target, tool_prefix, output_directory):
  """Returns a dict of object_path -> [(offset, size)...] of .rodata sections.

  Args:
    target: An archive path string (e.g., "foo.a") or a list of object paths.
  """
  is_archive = isinstance(target, basestring)
  args = [path_util.GetReadElfPath(tool_prefix), '-S', '--wide']
  if is_archive:
    args.append(target)
  else:
    # Assign path for when len(target) == 1, (no File: line exists).
    path = target[0]
    args.extend(target)
  output = subprocess.check_output(args, cwd=output_directory)
  lines = output.splitlines()
  section_positions_by_path = {}
  cur_offsets = []
  for line in lines:
    # File: base/third_party/libevent/libevent.a(buffer.o)
    # [Nr] Name            Type      Addr     Off    Size   ES Flg Lk Inf Al
    # [11] .rodata.str1.1  PROGBITS  00000000 0000b4 000004 01 AMS  0   0  1
    # [11] .rodata.str4.4  PROGBITS  00000000 0000b4 000004 01 AMS  0   0  4
    # [11] .rodata.str8.8  PROGBITS  00000000 0000b4 000004 01 AMS  0   0  8
    # [80] .rodata..L.str  PROGBITS  00000000 000530 000002 00   A  0   0  1
    # The various string sections differ by alignment.
    # The presence of a wchar_t literal (L"asdf") seems to make a str4 section.
    # When multiple sections exist, nm gives us no indication as to which
    # section each string corresponds to.
    if line.startswith('File: '):
      if cur_offsets:
        section_positions_by_path[path] = cur_offsets
        cur_offsets = []
      path = line[6:]
    elif '.rodata.' in line:
      progbits_idx = line.find('PROGBITS ')
      if progbits_idx != -1:
        fields = line[progbits_idx:].split()
        position = (int(fields[2], 16), int(fields[3], 16))
        # The heuristics in _IterStringLiterals rely on str1 coming first.
        if fields[-1] == '1':
          cur_offsets.insert(0, position)
        else:
          cur_offsets.append(position)
  if cur_offsets:
    section_positions_by_path[path] = cur_offsets
  return section_positions_by_path
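
# Example return value (paths and numbers illustrative), with each str1
# section's position sorted to the front of its list:
#   {'libevent.a(buffer.o)': [(0xb4, 4)],
#    'obj/foo.o': [(0xb4, 4), (0x530, 2)]}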


def _ReadStringSections(target, output_directory, positions_by_path):
  """Returns a dict of object_path -> [string...] of .rodata chunks.

  Args:
    target: An archive path string (e.g., "foo.a") or a list of object paths.
    positions_by_path: A dict of object_path -> [(offset, size)...]
  """
  is_archive = isinstance(target, basestring)
  string_sections_by_path = {}
  if is_archive:
    for subpath, chunk in ar.IterArchiveChunks(
        os.path.join(output_directory, target)):
      path = '{}({})'.format(target, subpath)
      positions = positions_by_path.get(path)
      # No positions if file has no string literals.
      if positions:
        string_sections_by_path[path] = (
            [chunk[offset:offset + size] for offset, size in positions])
  else:
    for path in target:
      positions = positions_by_path.get(path)
      # We already log a warning about this in _IterStringLiterals().
      if positions:
        string_sections_by_path[path] = ReadFileChunks(
            os.path.join(output_directory, path), positions)
  return string_sections_by_path
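
# Example (illustrative): given positions {'obj/foo.o': [(0xb4, 4)]}, returns
# {'obj/foo.o': [<the 4 raw bytes at offset 0xb4 of obj/foo.o>]}.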


def _IterStringLiterals(path, addresses, obj_sections):
  """Yields all string literals (including \0) for the given object path.

  Args:
    path: Object file path.
    addresses: List of string offsets encoded as hex strings.
    obj_sections: List of contents of .rodata.str sections read from the given
        object file.
  """
  next_offsets = sorted(int(a, 16) for a in addresses)
  if not obj_sections:
    # Happens when there is an address for a symbol which is not actually a
    # string literal, or when string_sections_by_path is missing an entry.
    logging.warning('Object has %d strings but no string sections: %s',
                    len(addresses), path)
    return
  for section_data in obj_sections:
    cur_offsets = next_offsets
    # Always assume first element is 0. I'm not entirely sure why this is
    # necessary, but strings get missed without it.
    next_offsets = [0]
    prev_offset = 0
    # TODO(agrieve): Switch to using nm --print-size in order to capture the
    #     address+size of each string rather than just the address.
    for offset in cur_offsets[1:]:
      if offset >= len(section_data):
        # Remaining offsets are for next section.
        next_offsets.append(offset)
        continue
      # Figure out which offsets apply to this section via heuristic of them
      # all ending with a null character.
      if offset == prev_offset or section_data[offset - 1] != '\0':
        next_offsets.append(offset)
        continue
      yield section_data[prev_offset:offset]
      prev_offset = offset
    if prev_offset < len(section_data):
      yield section_data[prev_offset:]
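
# Worked example (illustrative): for a single section 'Hi\0Bye\0' and sorted
# offsets [0, 3], the loop sees that the byte before offset 3 is '\0' (the
# null-terminator heuristic), yields 'Hi\0', then yields the trailing chunk
# 'Bye\0' once the loop ends.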


# This is a target for BulkForkAndCall().
def ResolveStringPieces(encoded_string_addresses_by_path, string_data,
                        tool_prefix, output_directory):
  string_addresses_by_path = concurrent.DecodeDictOfLists(
      encoded_string_addresses_by_path)
  # Assign |target| as archive path, or a list of object paths.
  any_path = next(string_addresses_by_path.iterkeys())
  target = _ExtractArchivePath(any_path)
  if not target:
    target = string_addresses_by_path.keys()

  # Run readelf to find location of .rodata within the .o files.
  section_positions_by_path = _LookupStringSectionPositions(
      target, tool_prefix, output_directory)
  # Load the .rodata sections (from object files) as strings.
  string_sections_by_path = _ReadStringSections(
      target, output_directory, section_positions_by_path)

  # List of elf_positions_by_path: one dict per string section.
  ret = [collections.defaultdict(list) for _ in string_data]
  # Brute-force search of strings within ** merge strings sections.
  # This is by far the slowest part of AnalyzeStringLiterals().
  # TODO(agrieve): Pre-process string_data into a dict of literal->address (at
  #     least for ascii strings).
  for path, object_addresses in string_addresses_by_path.iteritems():
    for value in _IterStringLiterals(
        path, object_addresses, string_sections_by_path.get(path)):
      first_match = -1
      first_match_dict = None
      for target_dict, data in itertools.izip(ret, string_data):
        # Set offset so that it will be 0 when len(value) is added to it below.
        offset = -len(value)
        while True:
          offset = data.find(value, offset + len(value))
          if offset == -1:
            break
          # Preferring exact matches (those following \0) over substring
          # matches significantly increases accuracy (although it shows that
          # the linker isn't being optimal).
          if offset == 0 or data[offset - 1] == '\0':
            break
          if first_match == -1:
            first_match = offset
            first_match_dict = target_dict
        if offset != -1:
          break
      if offset == -1:
        # Exact match not found, so take suffix match if it exists.
        offset = first_match
        target_dict = first_match_dict
      # Missing strings happen when optimization makes them unused.
      if offset != -1:
        # Encode tuple as a string for easier marshalling.
        target_dict[path].append(
            str(offset) + ':' + str(len(value)))
  return [concurrent.EncodeDictOfLists(x) for x in ret]
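
# Input/output sketch (values illustrative): the encoded input decodes to
# {'obj/foo.o': ['1c', '20', ...]} (hex string addresses per object path);
# the return value holds, for each ** merge strings section, an encoded
# {path: ['offset:length', ...]} dict of resolved string ranges.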
@@ -57,4 +57,6 @@ libsupersize/match_util.py
libsupersize/models.py
libsupersize/ninja_parser.py
libsupersize/nm.py
+libsupersize/obj_analyzer.py
libsupersize/path_util.py
+libsupersize/string_extract.py