Commit b08bfffd authored by Samuel Huang, committed by Commit Bot

[Supersize] Add LLD-LTO string literal support.

This is the last in a series of CLs to add LLD-LTO string literal
support to Supersize by parsing BC .o files when LLD-LTO is enabled.
Details:
- Add ResolveStringPieces() (alongside ResolveStringPiecesIndirect()) to
  process strings extracted from BC files.
- Check each .o file processed to decide whether it's a BC file (a
  minimal detection sketch is shown below).
- nm is still used to process all .o files (including BC files) to
  extract symbols.
- Any BC .o files found are processed by the new code that calls
  llvm-bcanalyzer. This provides flexibility since some .o files may
  still be ELF.
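
A BC file can be recognized by the LLVM bitcode magic bytes 'BC' 0xC0
0xDE. A minimal detection sketch (illustration only; the actual
bcanalyzer.IsBitcodeFile() may additionally handle the bitcode wrapper
format):

  def is_bitcode_file(path):
    # LLVM bitcode files start with the magic bytes 'BC\xc0\xde'.
    with open(path, 'rb') as f:
      return f.read(4) == b'BC\xc0\xde'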

Bug: 723798
Change-Id: I5b7d1db9f91a2cd6749fe4c57595ac46eee664a8
Reviewed-on: https://chromium-review.googlesource.com/1130006
Reviewed-by: Samuel Huang <huangs@chromium.org>
Reviewed-by: agrieve <agrieve@chromium.org>
Commit-Queue: Samuel Huang <huangs@chromium.org>
Cr-Commit-Position: refs/heads/master@{#577622}
parent 6735d97d
@@ -50,6 +50,7 @@ import sys
 import threading
 import traceback
+import bcanalyzer
 import concurrent
 import demangle
 import nm
@@ -81,9 +82,10 @@ def _MakeToolPrefixAbsolute(tool_prefix):
 class _PathsByType:
-  def __init__(self, arch, obj):
+  def __init__(self, arch, obj, bc):
     self.arch = arch
     self.obj = obj
+    self.bc = bc

 class _BulkObjectFileAnalyzerWorker(object):
@@ -93,6 +95,7 @@ class _BulkObjectFileAnalyzerWorker(object):
     self._list_of_encoded_elf_string_ranges_by_path = None
     self._paths_by_name = collections.defaultdict(list)
     self._encoded_string_addresses_by_path_chunks = []
+    self._encoded_strings_by_path_chunks = []

   def _ClassifyPaths(self, paths):
     """Classifies |paths| (.o and .a files) by file type into separate lists.
@@ -102,12 +105,19 @@ class _BulkObjectFileAnalyzerWorker(object):
     """
     arch_paths = []
     obj_paths = []
+    bc_paths = []
     for path in paths:
       if path.endswith('.a'):
+        # .a files are typically system libraries containing .o files that are
+        # ELF files (and never BC files).
         arch_paths.append(path)
+      elif bcanalyzer.IsBitcodeFile(os.path.join(self._output_directory, path)):
+        # Chromium build tools create BC files with .o extension. As a result,
+        # IsBitcodeFile() is needed to distinguish BC files from ELF .o files.
+        bc_paths.append(path)
       else:
         obj_paths.append(path)
-    return _PathsByType(arch=arch_paths, obj=obj_paths)
+    return _PathsByType(arch=arch_paths, obj=obj_paths, bc=bc_paths)

   def _MakeBatches(self, paths, size=None):
     if size is None:
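
The body of _MakeBatches() is truncated in this diff. A hypothetical
sketch consistent with the call sites, assuming size=None means one path
per batch (implied by the ".a not being grouped" comment in the next
hunk):

  def make_batches(paths, size=None):
    # Assumed semantics: size=None keeps each path in its own batch.
    if size is None:
      return [[path] for path in paths]
    # Otherwise, group paths into batches of at most |size|.
    return [paths[i:i + size] for i in range(0, len(paths), size)]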
@@ -124,12 +134,13 @@ class _BulkObjectFileAnalyzerWorker(object):
         output_directory=self._output_directory)

   def _RunNm(self, paths_by_type):
-    """Calls nm to get symbols and string addresses."""
+    """Calls nm to get symbols and (for non-BC files) string addresses."""
     # Downstream functions rely upon .a not being grouped.
     batches = self._MakeBatches(paths_by_type.arch, None)
-    # Combine object files and Bitcode files for nm
+    # Combine object files and Bitcode files for nm.
     BATCH_SIZE = 50  # Arbitrarily chosen.
-    batches.extend(self._MakeBatches(paths_by_type.obj, BATCH_SIZE))
+    batches.extend(
+        self._MakeBatches(paths_by_type.obj + paths_by_type.bc, BATCH_SIZE))
     results = self._DoBulkFork(nm.RunNmOnIntermediates, batches)
     # Names are still mangled.
@@ -143,12 +154,24 @@ class _BulkObjectFileAnalyzerWorker(object):
       if encoded_strs != concurrent.EMPTY_ENCODED_DICT:
         self._encoded_string_addresses_by_path_chunks.append(encoded_strs)

+  def _RunLlvmBcAnalyzer(self, paths_by_type):
+    """Calls llvm-bcanalyzer to extract string data (for LLD-LTO)."""
+    BATCH_SIZE = 50  # Arbitrarily chosen.
+    batches = self._MakeBatches(paths_by_type.bc, BATCH_SIZE)
+    results = self._DoBulkFork(
+        bcanalyzer.RunBcAnalyzerOnIntermediates, batches)
+    for encoded_strs in results:
+      if encoded_strs != concurrent.EMPTY_ENCODED_DICT:
+        self._encoded_strings_by_path_chunks.append(encoded_strs)
+
   def AnalyzePaths(self, paths):
     logging.debug('worker: AnalyzePaths() started.')
     paths_by_type = self._ClassifyPaths(paths)
-    logging.info('File counts: {\'arch\': %d, \'obj\': %d}',
-                 len(paths_by_type.arch), len(paths_by_type.obj))
+    logging.info('File counts: {\'arch\': %d, \'obj\': %d, \'bc\': %d}',
+                 len(paths_by_type.arch), len(paths_by_type.obj),
+                 len(paths_by_type.bc))
     self._RunNm(paths_by_type)
+    self._RunLlvmBcAnalyzer(paths_by_type)
     logging.debug('worker: AnalyzePaths() completed.')

   def SortPaths(self):
@@ -179,6 +202,14 @@ class _BulkObjectFileAnalyzerWorker(object):
         output_directory=self._output_directory)
     return list(results)

+  def _GetEncodedRangesFromStrings(self, string_data):
+    params = ((chunk,) for chunk in self._encoded_strings_by_path_chunks)
+    # Order of the jobs doesn't matter since each job owns independent paths,
+    # and our output is a dict where paths are the key.
+    results = concurrent.BulkForkAndCall(
+        string_extract.ResolveStringPieces, params, string_data=string_data)
+    return list(results)
+
   def AnalyzeStringLiterals(self, elf_path, elf_string_ranges):
     logging.debug('worker: AnalyzeStringLiterals() started.')
     string_data = self._ReadElfStringData(elf_path, elf_string_ranges)
@@ -186,6 +217,7 @@ class _BulkObjectFileAnalyzerWorker(object):
     # [source_idx][batch_idx][section_idx] -> Encoded {path: [string_ranges]}.
     encoded_ranges_sources = [
         self._GetEncodedRangesFromStringAddresses(string_data),
+        self._GetEncodedRangesFromStrings(string_data),
     ]
     # [section_idx] -> {path: [string_ranges]}.
     self._list_of_encoded_elf_string_ranges_by_path = []
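
The merging of the two sources into one {path: [string_ranges]} dict per
section happens in code truncated below this hunk. A hypothetical sketch
of such a merge, assuming both sources have already been decoded into
plain dicts:

  import collections

  def merge_sources(decoded_dicts_for_section):
    # Each job owns independent paths, so keys rarely collide; merging
    # defensively concatenates the range lists of any duplicates.
    merged = collections.defaultdict(list)
    for d in decoded_dicts_for_section:
      for path, ranges in d.items():
        merged[path].extend(ranges)
    return dict(merged)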
@@ -18,8 +18,17 @@ ResolveStringPiecesIndirect():
   each string_section. If found, translates to string_range and annotates it
   to the string_section.
 - Returns [{path: [string_ranges]} for each string_section].
+
+ResolveStringPieces():
+  BulkForkAndCall() target: Given {path: [strings]} and
+  [raw_string_data for each string_section]:
+  - For each path, searches for src_strings in at most 1 raw_string_data over
+    each string_section. If found, translates to string_range and annotates it
+    to the string_section.
+  - Returns [{path: [string_ranges]} for each string_section].
 """

+import ast
 import collections
 import itertools
 import logging
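
To make the new docstring concrete, here is a minimal sketch of the
per-section search it describes, assuming raw_string_data is the raw
bytes of one string section (an illustration, not the actual
_AnnotateStringData() implementation):

  def find_string_range(raw_string_data, src_string):
    # Locate |src_string| in the section; return (offset, size) or None.
    offset = raw_string_data.find(src_string)
    if offset == -1:
      return None
    return (offset, len(src_string))

E.g. find_string_range(b'abc\x00hello\x00', b'hello') returns (4, 5).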
@@ -266,3 +275,18 @@ def ResolveStringPiecesIndirect(encoded_string_addresses_by_path, string_data,
   ret = _AnnotateStringData(string_data, GeneratePathAndValues())
   return [concurrent.EncodeDictOfLists(x) for x in ret]
+
+
+# This is a target for BulkForkAndCall().
+def ResolveStringPieces(encoded_strings_by_path, string_data):
+  # ast.literal_eval() undoes repr() applied to strings.
+  strings_by_path = concurrent.DecodeDictOfLists(
+      encoded_strings_by_path, value_transform=ast.literal_eval)
+
+  def GeneratePathAndValues():
+    for path, strings in strings_by_path.iteritems():
+      for value in strings:
+        yield path, value
+
+  ret = _AnnotateStringData(string_data, GeneratePathAndValues())
+  return [concurrent.EncodeDictOfLists(x) for x in ret]
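
The repr() / ast.literal_eval() round trip relied on above can be
verified in isolation; it is what lets strings with embedded NUL bytes
and newlines survive the encoded dict-of-lists transport:

  import ast

  s = 'string\x00with\nawkward bytes'
  encoded = repr(s)  # A printable, single-line representation.
  assert ast.literal_eval(encoded) == s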
@@ -42,6 +42,7 @@
 libsupersize/apkanalyzer.py
 libsupersize/ar.py
 libsupersize/archive.py
+libsupersize/bcanalyzer.py
 libsupersize/canned_queries.py
 libsupersize/concurrent.py
 libsupersize/console.py