Commit 18d614e2 authored by Andrew Grieve, committed by Commit Bot

Add size breakdown to //tools/grit/pak_util.py print

Example output for: pak_util.py print locales/zh-CN.pak
version: 5
encoding: utf-8
num_resources: 3120
num_aliases: 413
size_header: 12
size_id_table: 16248
size_alias_table: 1652
size_data: 99490
total_size: 117402
Entry(id=423, canonical_id=423, size=271, sha1=bd30bee710): ...
...
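
The numbers are self-consistent with the v5 layout parsed below. num_resources
counts aliased entries, so the id table holds 3120 - 413 = 2707 canonical
entries plus one end-of-table sentinel at 6 bytes each, and each alias entry
is 4 bytes:

  size_id_table    = (2707 + 1) * 6 = 16248
  size_alias_table = 413 * 4 = 1652
  total_size       = 12 + 16248 + 1652 + 99490 = 117402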

Bug: 749521
Change-Id: Id0c3e788c83d9e97d67b7eb6a4090ca68cb97431
Reviewed-on: https://chromium-review.googlesource.com/794340
Commit-Queue: agrieve <agrieve@chromium.org>
Reviewed-by: Robert Flack <flackr@chromium.org>
Cr-Commit-Position: refs/heads/master@{#520138}
parent 603dfcf4
@@ -33,8 +33,42 @@ class CorruptDataPack(Exception):
   pass


-DataPackContents = collections.namedtuple(
-    'DataPackContents', 'resources encoding')
+class DataPackSizes(object):
+  def __init__(self, header, id_table, alias_table, data):
+    self.header = header
+    self.id_table = id_table
+    self.alias_table = alias_table
+    self.data = data
+
+  @property
+  def total(self):
+    return sum(v for v in self.__dict__.itervalues())
+
+  def __iter__(self):
+    yield ('header', self.header)
+    yield ('id_table', self.id_table)
+    yield ('alias_table', self.alias_table)
+    yield ('data', self.data)
+
+  def __eq__(self, other):
+    return self.__dict__ == other.__dict__
+
+  def __repr__(self):
+    return self.__class__.__name__ + repr(self.__dict__)
+
+
+class DataPackContents(object):
+  def __init__(self, resources, encoding, version, aliases, sizes):
+    # Map of resource_id -> str.
+    self.resources = resources
+    # Encoding (int).
+    self.encoding = encoding
+    # Version (int).
+    self.version = version
+    # Map of resource_id -> canonical_resource_id.
+    self.aliases = aliases
+    # DataPackSizes instance.
+    self.sizes = sizes


 def Format(root, lang='en', output_dir='.'):
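
Since DataPackSizes iterates as (name, value) pairs and exposes a computed
total, a caller can format the whole breakdown in one line. A minimal sketch
of the intended use, mirroring the _PrintMain change further down (numbers
taken from the example output above):

  sizes = DataPackSizes(12, 16248, 1652, 99490)
  breakdown = ', '.join('{}: {}'.format(*x) for x in sizes)
  print 'total_size: {} ({})'.format(sizes.total, breakdown)
  # total_size: 117402 (header: 12, id_table: 16248, alias_table: 1652, data: 99490)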
@@ -59,45 +93,51 @@ def ReadDataPack(input_file):

 def ReadDataPackFromString(data):
   """Reads a data pack file and returns a dictionary."""
-  original_data = data
-
+  # Read the header.
   version = struct.unpack('<I', data[:4])[0]
   if version == 4:
     resource_count, encoding = struct.unpack('<IB', data[4:9])
     alias_count = 0
-    data = data[9:]
+    header_size = 9
   elif version == 5:
     encoding, resource_count, alias_count = struct.unpack('<BxxxHH', data[4:12])
-    data = data[12:]
+    header_size = 12
   else:
     raise WrongFileVersion('Found version: ' + str(version))

   resources = {}
   kIndexEntrySize = 2 + 4  # Each entry is a uint16 and a uint32.
   def entry_at_index(idx):
-    offset = idx * kIndexEntrySize
+    offset = header_size + idx * kIndexEntrySize
     return struct.unpack('<HI', data[offset:offset + kIndexEntrySize])

   prev_resource_id, prev_offset = entry_at_index(0)
   for i in xrange(1, resource_count + 1):
     resource_id, offset = entry_at_index(i)
-    resources[prev_resource_id] = original_data[prev_offset:offset]
+    resources[prev_resource_id] = data[prev_offset:offset]
     prev_resource_id, prev_offset = resource_id, offset
+  id_table_size = (resource_count + 1) * kIndexEntrySize

   # Read the alias table.
-  alias_data = data[(resource_count + 1) * kIndexEntrySize:]
   kAliasEntrySize = 2 + 2  # uint16, uint16
   def alias_at_index(idx):
-    offset = idx * kAliasEntrySize
-    return struct.unpack('<HH', alias_data[offset:offset + kAliasEntrySize])
+    offset = header_size + id_table_size + idx * kAliasEntrySize
+    return struct.unpack('<HH', data[offset:offset + kAliasEntrySize])

   aliases = {}
   for i in xrange(alias_count):
     resource_id, index = alias_at_index(i)
     aliased_id = entry_at_index(index)[0]
     aliases[resource_id] = aliased_id
     resources[resource_id] = resources[aliased_id]

-  return DataPackContents(resources, encoding)
+  alias_table_size = kAliasEntrySize * alias_count
+  sizes = DataPackSizes(
+      header_size, id_table_size, alias_table_size,
+      len(data) - header_size - id_table_size - alias_table_size)
+  assert sizes.total == len(data), 'original={} computed={}'.format(
+      len(data), sizes.total)
+  return DataPackContents(resources, encoding, version, aliases, sizes)


 def WriteDataPackToString(resources, encoding):
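
For reference, the byte layout these offsets walk, as implied by the struct
calls above (v5 shown; v4 differs only in its 9-byte header and the absence
of an alias table):

  bytes 0-3:   uint32 version
  bytes 4-7:   uint8 encoding + 3 bytes padding
  bytes 8-11:  uint16 resource_count, uint16 alias_count
  then:        (resource_count + 1) id-table entries of <uint16 id, uint32 offset>,
               where offsets count from the start of the file and the extra
               entry marks the end of the last resource
  then:        alias_count entries of <uint16 id, uint16 id-table index>
  then:        resource data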
@@ -181,8 +221,9 @@ def RePack(output_file, input_files, whitelist_file=None,
   if whitelist_file:
     whitelist = util.ReadFile(whitelist_file, util.RAW_TEXT).strip().split('\n')
     whitelist = set(map(int, whitelist))
+  inputs = [(p.resources, p.encoding) for p in input_data_packs]
   resources, encoding = RePackFromDataPackStrings(
-      input_data_packs, whitelist, suppress_removed_key_output)
+      inputs, whitelist, suppress_removed_key_output)
   WriteDataPack(resources, output_file, encoding)
   with open(output_file + '.info', 'w') as output_info_file:
     for filename in input_info_files:
@@ -192,17 +233,16 @@ def RePack(output_file, input_files, whitelist_file=None,

 def RePackFromDataPackStrings(inputs, whitelist,
                               suppress_removed_key_output=False):
-  """Returns a data pack string that combines the resources from inputs.
+  """Combines all inputs into one.

   Args:
-    inputs: a list of data pack strings that need to be combined.
+    inputs: a list of (resources_by_id, encoding) tuples to be combined.
     whitelist: a list of resource IDs that should be kept in the output string
                or None to include all resources.
     suppress_removed_key_output: Do not print removed keys.

   Returns:
-    DataPackContents: a tuple containing the new combined data pack and its
-    encoding.
+    Returns (resources_by_id, encoding).

   Raises:
     KeyError: if there are duplicate keys or resource encoding is
@@ -210,36 +250,36 @@ def RePackFromDataPackStrings(inputs, whitelist,
   """
   resources = {}
   encoding = None
-  for content in inputs:
+  for input_resources, input_encoding in inputs:
     # Make sure we have no dups.
-    duplicate_keys = set(content.resources.keys()) & set(resources.keys())
+    duplicate_keys = set(input_resources.keys()) & set(resources.keys())
     if duplicate_keys:
       raise exceptions.KeyError('Duplicate keys: ' + str(list(duplicate_keys)))
     # Make sure encoding is consistent.
     if encoding in (None, BINARY):
-      encoding = content.encoding
-    elif content.encoding not in (BINARY, encoding):
+      encoding = input_encoding
+    elif input_encoding not in (BINARY, encoding):
       raise exceptions.KeyError('Inconsistent encodings: ' + str(encoding) +
-                                ' vs ' + str(content.encoding))
+                                ' vs ' + str(input_encoding))

     if whitelist:
-      whitelisted_resources = dict([(key, content.resources[key])
-                                    for key in content.resources.keys()
+      whitelisted_resources = dict([(key, input_resources[key])
+                                    for key in input_resources.keys()
                                     if key in whitelist])
       resources.update(whitelisted_resources)
-      removed_keys = [key for key in content.resources.keys()
+      removed_keys = [key for key in input_resources.keys()
                       if key not in whitelist]
       if not suppress_removed_key_output:
         for key in removed_keys:
           print 'RePackFromDataPackStrings Removed Key:', key
     else:
-      resources.update(content.resources)
+      resources.update(input_resources)

   # Encoding is 0 for BINARY, 1 for UTF8 and 2 for UTF16
   if encoding is None:
     encoding = BINARY
-  return DataPackContents(resources, encoding)
+  return resources, encoding


 def main():
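With the tuple-based signature, RePackFromDataPackStrings no longer needs
DataPackContents instances. A minimal sketch of the new contract (resource
ids and strings invented for illustration):

  resources, encoding = RePackFromDataPackStrings(
      [({1: 'foo'}, UTF8), ({2: 'bar'}, UTF8)], None)
  # resources == {1: 'foo', 2: 'bar'}; encoding == UTF8
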
@@ -28,16 +28,15 @@ class FormatDataPackUnittest(unittest.TestCase):
         '\x0a\x00\x3f\x00\x00\x00'  # index entry 10
         '\x00\x00\x3f\x00\x00\x00'  # extra entry for the size of last
         'this is id 4this is id 6')  # data
-    expected_resources = {
-        1: '',
-        4: 'this is id 4',
-        6: 'this is id 6',
-        10: '',
-    }
-    expected_data_pack = data_pack.DataPackContents(
-        expected_resources, data_pack.UTF8)
+    expected_data_pack = data_pack.DataPackContents(
+        {
+            1: '',
+            4: 'this is id 4',
+            6: 'this is id 6',
+            10: '',
+        }, data_pack.UTF8, 4, {}, data_pack.DataPackSizes(9, 30, 0, 24))
     loaded = data_pack.ReadDataPackFromString(expected_data)
-    self.assertEquals(loaded, expected_data_pack)
+    self.assertDictEqual(expected_data_pack.__dict__, loaded.__dict__)

   def testReadWriteDataPackV5(self):
     expected_data = (
@@ -51,19 +50,24 @@ class FormatDataPackUnittest(unittest.TestCase):
         '\x00\x00\x40\x00\x00\x00'  # extra entry for the size of last
         '\x0a\x00\x01\x00'  # alias table
         'this is id 4this is id 6')  # data
-    expected_resources = {
+    input_resources = {
         1: '',
         4: 'this is id 4',
         6: 'this is id 6',
         10: 'this is id 4',
     }
-    data = data_pack.WriteDataPackToString(expected_resources, data_pack.UTF8)
+    data = data_pack.WriteDataPackToString(input_resources, data_pack.UTF8)
     self.assertEquals(data, expected_data)

     expected_data_pack = data_pack.DataPackContents(
-        expected_resources, data_pack.UTF8)
+        {
+            1: '',
+            4: input_resources[4],
+            6: input_resources[6],
+            10: input_resources[4],
+        }, data_pack.UTF8, 5, {10: 4}, data_pack.DataPackSizes(12, 24, 4, 24))
     loaded = data_pack.ReadDataPackFromString(expected_data)
-    self.assertEquals(loaded, expected_data_pack)
+    self.assertDictEqual(expected_data_pack.__dict__, loaded.__dict__)

   def testRePackUnittest(self):
     expected_with_whitelist = {
@@ -79,8 +83,7 @@ class FormatDataPackUnittest(unittest.TestCase):
               {40: 'Never', 50: 'gonna run around and', 60: 'desert you'},
               {65: 'Close', 70: 'Awww, snap!'}]
     whitelist = [1, 10, 20, 30, 40, 50, 60]
-    inputs = [data_pack.DataPackContents(input, data_pack.UTF8) for input
-              in inputs]
+    inputs = [(i, data_pack.UTF8) for i in inputs]

     # RePack using whitelist
     output, _ = data_pack.RePackFromDataPackStrings(
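The DataPackSizes values in the two tests above check out by hand against
the layout: for the v4 pack, header = 9, id_table = (4 + 1) * 6 = 30, no
alias table, and data = 2 * len('this is id 4') = 24; for the v5 pack,
header = 12, id_table = (3 + 1) * 6 = 24 (id 10 lives in the alias table
rather than the id table), alias_table = 1 * 4 = 4, and data = 24.
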
@@ -33,7 +33,7 @@ def _ExtractMain(args):

 def _PrintMain(args):
   pak = data_pack.ReadDataPack(args.pak_file)
-  id_map = {id(v): k for k, v in sorted(pak.resources.items(), reverse=True)}
+  output = args.output
   encoding = 'binary'
   if pak.encoding == 1:
     encoding = 'utf-8'
@@ -41,14 +41,21 @@ def _PrintMain(args):
     encoding = 'utf-16'
   else:
     encoding = '?' + str(pak.encoding)
-  print 'Encoding:', encoding
-  try_decode = encoding.startswith('utf')
+  output.write('version: {}\n'.format(pak.version))
+  output.write('encoding: {}\n'.format(encoding))
+  output.write('num_resources: {}\n'.format(len(pak.resources)))
+  output.write('num_aliases: {}\n'.format(len(pak.aliases)))
+  breakdown = ', '.join('{}: {}'.format(*x) for x in pak.sizes)
+  output.write('total_size: {} ({})\n'.format(pak.sizes.total, breakdown))

+  try_decode = args.decode and encoding.startswith('utf')
   # Print IDs in ascending order, since that's the order in which they appear in
   # the file (order is lost by Python dict).
   for resource_id in sorted(pak.resources):
     data = pak.resources[resource_id]
-    desc = '<binary>'
+    canonical_id = pak.aliases.get(resource_id, resource_id)
+    desc = '<data>'
     if try_decode:
       try:
         desc = unicode(data, encoding)
@@ -58,19 +65,14 @@ def _PrintMain(args):
       except UnicodeDecodeError:
         pass
     sha1 = hashlib.sha1(data).hexdigest()[:10]
-    canonical_id = id_map[id(data)]
-    if resource_id == canonical_id:
-      line = u'Entry(id={}, len={}, sha1={}): {}'.format(
-          resource_id, len(data), sha1, desc)
-    else:
-      line = u'Entry(id={}, alias_of={}): {}'.format(
-          resource_id, canonical_id, desc)
-    print line.encode('utf-8')
+    output.write(
+        u'Entry(id={}, canonical_id={}, size={}, sha1={}): {}\n'.format(
+            resource_id, canonical_id, len(data), sha1, desc).encode('utf-8'))


 def _ListMain(args):
-  resources, _ = data_pack.ReadDataPack(args.pak_file)
-  for resource_id in sorted(resources.keys()):
+  pak = data_pack.ReadDataPack(args.pak_file)
+  for resource_id in sorted(pak.resources):
     args.output.write('%d\n' % resource_id)
@@ -99,8 +101,13 @@ def main():
   sub_parser = sub_parsers.add_parser('print',
       help='Prints all pak IDs and contents. Useful for diffing.')
   sub_parser.add_argument('pak_file')
+  sub_parser.add_argument('--output', type=argparse.FileType('w'),
+                          default=sys.stdout,
+                          help='The resource list path to write (default stdout)')
+  sub_parser.add_argument('--no-decode', dest='decode', action='store_false',
+                          default=True, help='Do not print entry data.')
   sub_parser.set_defaults(func=_PrintMain)

   sub_parser = sub_parsers.add_parser('list-id',
       help='Outputs all resource IDs to a file.')
   sub_parser.add_argument('pak_file')
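The new flags compose with the print command from the commit message, e.g.
to write the listing to a file without decoding entry data (the output
filename here is just an example):

  pak_util.py print --output=zh-CN.txt --no-decode locales/zh-CN.pak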