Commit 81fed303 authored by Becca Hughes, committed by Commit Bot

[Media Feeds] Reduce space and increase speed

Right now the generated schema_org code contains a
lot of string literals, which adds size overhead
and makes name lookups slower than they need to be.

This adds a "schema_org_name_generator" build step
that generates unsigned int hashes for all names in
the schema. It has to be a separate C++ tool because
the hash function (base::PersistentHash) is not
available in Python.
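
For reference, the generated lookups now take roughly this shape.
This is a minimal sketch with hypothetical entity names and hash
constants; the real case values are emitted into the Jinja2 templates
from schema_org_name_data.json.

// A minimal sketch (hypothetical names and hash values) of the lookup
// pattern the generated code switches to. Instead of comparing the
// incoming name against every known string, it hashes the name once
// with base::PersistentHash() and jumps to the matching case; the
// case constants are produced at build time by the same hash function.
#include <cstdint>
#include <string>

#include "base/hash/hash.h"

namespace {

// Hypothetical values; the real constants come from
// schema_org_name_data.json.
constexpr uint32_t kMediaObjectHash = 0x1a2b3c4d;
constexpr uint32_t kVideoObjectHash = 0x5e6f7a8b;

bool IsKnownEntityName(const std::string& entity_name) {
  switch (base::PersistentHash(entity_name)) {
    case kMediaObjectHash:  // "MediaObject"
    case kVideoObjectHash:  // "VideoObject"
      return true;
  }
  return false;
}

}  // namespace

A single integer switch replaces the flat_set of string literals that
IsValidEntityName used to build, so lookups no longer construct or
search a container of strings at runtime.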

Change-Id: I567a48b388d1e9f5e20e6f5e153402e736c4d0f0
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2144773
Commit-Queue: Becca Hughes <beccahughes@chromium.org>
Reviewed-by: Tommy Steimel <steimel@chromium.org>
Cr-Commit-Position: refs/heads/master@{#758083}
parent 3728af9c
......@@ -2,6 +2,8 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import("//build/compiled_action.gni")
source_set("unit_tests") {
testonly = true
sources = [
......@@ -22,6 +24,26 @@ source_set("unit_tests") {
]
}
executable("schema_org_name_generator") {
sources = [ "schema_org_name_generator.cc" ]
deps = [
"//base",
"//url",
]
}
compiled_action("schema_org_name_data") {
tool = ":schema_org_name_generator"
args = [
rebase_path("//third_party/schema_org/schema.jsonld", root_build_dir),
rebase_path("//third_party/schema_org/overrides.jsonld", root_build_dir),
rebase_path("$target_gen_dir/schema_org_name_data.json", root_build_dir),
]
outputs = [ "$target_gen_dir/schema_org_name_data.json" ]
}
action("generate_schema_org_code") {
script = "//components/schema_org/generate_schema_org_code.py"
sources = [
......@@ -41,6 +63,9 @@ action("generate_schema_org_code") {
"--overrides-file",
rebase_path("//third_party/schema_org/overrides.jsonld",
root_build_dir),
"--name-file",
rebase_path("$target_gen_dir/schema_org_name_data.json",
root_build_dir),
"--output-dir",
rebase_path("$target_gen_dir", root_build_dir),
"--templates",
......@@ -55,6 +80,7 @@ action("generate_schema_org_code") {
"$target_gen_dir/schema_org_enums.h",
"$target_gen_dir/schema_org_enums.cc",
]
deps = [ ":schema_org_name_data" ]
}
static_library("schema_org_properties") {
......
......@@ -50,9 +50,20 @@ def is_enum_type(class_obj):
return parent_class['@id'] == schema_org_id('Enumeration')
def find_enum_options(obj_id, schema):
def make_entity(thing, names):
return {
"name": object_name_from_id(thing['@id']),
"name_hash": names[object_name_from_id(thing['@id'])]
}
def make_entity_from_name(name, names):
return {"name": name, "name_hash": names[name]}
def find_enum_options(obj_id, schema, names):
return [
object_name_from_id(obj['@id']) for obj in schema['@graph']
make_entity(obj, names) for obj in schema['@graph']
if obj['@type'] == obj_id
]
......@@ -85,10 +96,11 @@ def get_root_type(the_class, schema):
return class_obj
def parse_property(prop, schema):
def parse_property(prop, schema, names):
"""Parse out details about the property, including what type it can be."""
parsed_prop = {
'name': object_name_from_id(prop['@id']),
'name_hash': names[object_name_from_id(prop['@id'])],
'thing_types': [],
'enum_types': []
}
......@@ -142,7 +154,7 @@ def lookup_parents(thing, schema, lookup_table):
obj_name = object_name_from_id(thing['@id'])
if obj_name in lookup_table:
return lookup_table[obj_name]
lookup_table[obj_name] = []
lookup_table[obj_name] = set()
if 'rdfs:subClassOf' in thing:
parent_classes = thing['rdfs:subClassOf']
......@@ -160,48 +172,70 @@ def lookup_parents(thing, schema, lookup_table):
]
# flatten the list
found_parents = [item for sublist in found_parents for item in sublist]
lookup_table[obj_name].extend(found_parents)
lookup_table[obj_name].update(found_parents)
lookup_table[obj_name].append(obj_name)
lookup_table[obj_name].add(obj_name)
return lookup_table[obj_name]
def get_template_vars(schema_file_path, overrides_file_path):
"""Read the needed template variables from the schema file."""
template_vars = {
'entities': [],
'properties': [],
'enums': [],
'entity_parent_lookup': {}
}
def get_template_vars_from_file(schema_file_path, overrides_file_path,
name_file_path):
with open(schema_file_path) as schema_file:
schema = json.loads(schema_file.read())
with open(name_file_path) as names_file:
names = json.loads(names_file.read())
if overrides_file_path:
with open(overrides_file_path) as overrides_file:
overrides = json.loads(overrides_file.read())
for thing in overrides['@graph']:
merge_with_schema(schema, overrides, thing)
return get_template_vars(schema, names)
def get_template_vars(schema, names):
"""Read the needed template variables from the schema file."""
template_vars = {
'entities': [],
'properties': [],
'enums': [],
'entity_parent_lookup': []
}
entity_parent_lookup = {}
for thing in schema['@graph']:
if thing['@type'] == 'rdfs:Class':
template_vars['entities'].append(object_name_from_id(thing['@id']))
lookup_parents(thing, schema,
template_vars['entity_parent_lookup'])
template_vars['entities'].append(make_entity(thing, names))
lookup_parents(thing, schema, entity_parent_lookup)
if is_enum_type(thing):
template_vars['enums'].append({
'name':
object_name_from_id(thing['@id']),
'id':
thing['@id'],
'id_hash':
names[thing['@id']],
'options':
find_enum_options(thing['@id'], schema)
find_enum_options(thing['@id'], schema, names)
})
elif thing['@type'] == 'rdf:Property':
template_vars['properties'].append(parse_property(thing, schema))
template_vars['properties'].append(
parse_property(thing, schema, names))
for entity, parents in entity_parent_lookup.iteritems():
template_vars['entity_parent_lookup'].append({
'name':
entity,
'name_hash':
names[entity],
'parents':
[make_entity_from_name(parent, names) for parent in parents]
})
template_vars['entities'].sort()
template_vars['entities'].sort(key=lambda p: p['name_hash'])
template_vars['properties'].sort(key=lambda p: p['name'])
return template_vars
......@@ -225,13 +259,17 @@ def main():
'--overrides-file',
help='JSON-LD schema file with overrides to support changes not in the '
'latest schema.org version. Optional.')
parser.add_argument(
'--name-file',
help='JSON file of hashed schema.org names to speed up lookups.')
parser.add_argument(
'--output-dir',
help='Output directory in which to place generated code files.')
parser.add_argument('--templates', nargs='+')
args = parser.parse_args()
template_vars = get_template_vars(args.schema_file, args.overrides_file)
template_vars = get_template_vars_from_file(
args.schema_file, args.overrides_file, args.name_file)
for template_file in args.templates:
generate_file(
os.path.join(args.output_dir,
......
......@@ -23,34 +23,46 @@ import jinja2
class GenerateSchemaOrgCodeTest(unittest.TestCase):
def test_get_template_vars(self):
file_content = """
{
"@graph": [
{
schema = {
"@graph": [{
"@id": "http://schema.org/MediaObject",
"@type": "rdfs:Class"
},
{
"@id": "http://schema.org/propertyName",
"@type": "rdf:Property"
}]
}
]
names = {
"http://schema.org/MediaObject": 1234,
"MediaObject": 1235,
"http://schema.org/propertyName": 2345,
"propertyName": 2346
}
"""
with mock.patch('__builtin__.open',
mock.mock_open(read_data=file_content)) as m_open:
self.assertEqual(
generate_schema_org_code.get_template_vars(m_open, m_open), {
'entities': ['MediaObject'],
generate_schema_org_code.get_template_vars(schema, names), {
'entities': [{
'name': 'MediaObject',
'name_hash': 1235
}],
'properties': [{
'name': 'propertyName',
'name_hash': 2346,
'thing_types': [],
'enum_types': []
}],
'enums': [],
'entity_parent_lookup': {
'MediaObject': ['MediaObject']
}
'entity_parent_lookup':
[{
'name': 'MediaObject',
'name_hash': 1235,
'parents': [{
'name': 'MediaObject',
'name_hash': 1235
}]
}]
})
def test_lookup_parents(self):
......@@ -65,9 +77,9 @@ class GenerateSchemaOrgCodeTest(unittest.TestCase):
}
brand = {'@id': schema_org_id('Brand'), 'rdfs:subClassOf': intangible}
schema = {'@graph': [thing, intangible, structured_value, brand]}
self.assertListEqual(
self.assertSetEqual(
generate_schema_org_code.lookup_parents(brand, schema, {}),
['Thing', 'Intangible', 'Brand'])
set(['Thing', 'Intangible', 'Brand']))
def test_get_root_type_thing(self):
thing = {'@id': schema_org_id('Thing')}
......@@ -146,9 +158,13 @@ class GenerateSchemaOrgCodeTest(unittest.TestCase):
]
}
names = {"http://schema.org/Identifier": 1234, "Identifier": 1235}
self.assertEqual(
generate_schema_org_code.parse_property(identifier, schema), {
generate_schema_org_code.parse_property(identifier, schema, names),
{
'name': 'Identifier',
'name_hash': 1235,
'has_number': True,
'thing_types': [property_value['@id']],
'enum_types': []
......
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <map>
#include <set>
#include <string>
#include <vector>
#include "base/command_line.h"
#include "base/files/file_util.h"
#include "base/hash/hash.h"
#include "base/json/json_reader.h"
#include "base/json/json_writer.h"
#include "base/logging.h"
#include "base/path_service.h"
#include "base/strings/stringprintf.h"
#include "base/strings/utf_string_conversions.h"
#include "build/build_config.h"
#include "url/gurl.h"
namespace {
// Print the command line help.
void PrintHelp() {
LOG(ERROR) << "schema_org_name_generator <schema-file> ... <schema-file>"
<< " <output-file>";
}
} // namespace
int main(int argc, char* argv[]) {
base::CommandLine::Init(argc, argv);
const base::CommandLine& command_line =
*base::CommandLine::ForCurrentProcess();
logging::LoggingSettings settings;
settings.logging_dest =
logging::LOG_TO_SYSTEM_DEBUG_LOG | logging::LOG_TO_STDERR;
logging::InitLogging(settings);
#if defined(OS_WIN)
std::vector<std::string> args;
base::CommandLine::StringVector wide_args = command_line.GetArgs();
for (const auto& arg : wide_args) {
args.push_back(base::UTF16ToUTF8(arg));
}
#else
base::CommandLine::StringVector args = command_line.GetArgs();
#endif
if (args.size() < 2U) {
PrintHelp();
return 1;
}
// Read all the args and convert to file paths.
std::vector<base::FilePath> paths;
for (auto& arg : args) {
paths.push_back(base::FilePath::FromUTF8Unsafe(arg));
}
// Check we have at least two paths.
if (paths.size() < 2U) {
PrintHelp();
return 1;
}
// Get the last path which is the output file.
base::FilePath output_path = paths.back();
paths.pop_back();
base::DictionaryValue output_map;
std::set<std::string> names_to_generate;
for (auto& path : paths) {
path = base::MakeAbsoluteFilePath(path);
if (!base::PathExists(path)) {
LOG(ERROR) << "Input JSON file doesn't exist.";
return 1;
}
std::string json_input;
if (!base::ReadFileToString(path, &json_input)) {
LOG(ERROR) << "Could not read input JSON file.";
return 1;
}
auto value = base::JSONReader::Read(json_input);
base::DictionaryValue* dict_value = nullptr;
if (!value.has_value() || !value->GetAsDictionary(&dict_value)) {
LOG(ERROR) << "Could not parse the input JSON file";
return 1;
}
const base::ListValue* graph = nullptr;
if (!dict_value->GetList("@graph", &graph)) {
LOG(ERROR) << "Could not parse the @graph in the input JSON";
return 1;
}
for (size_t i = 0; i < graph->GetSize(); ++i) {
const base::DictionaryValue* parsed = nullptr;
if (!graph->GetDictionary(i, &parsed)) {
LOG(ERROR) << "Could not parse entry " << i << " in the input JSON";
return 1;
}
std::string id;
if (!parsed->GetString("@id", &id)) {
LOG(ERROR) << "Could not extract the id from the entry";
return 1;
}
if (id.empty()) {
LOG(ERROR) << "ID was empty";
return 1;
}
names_to_generate.insert(id);
names_to_generate.insert(GURL(id).path().substr(1));
}
}
std::set<unsigned> generated_hashes;
for (auto& name : names_to_generate) {
auto hash = base::PersistentHash(name);
if (base::Contains(generated_hashes, hash)) {
LOG(ERROR) << "Hash collision: " << name;
return 1;
}
output_map.SetStringKey(name, base::StringPrintf("0x%x", hash));
generated_hashes.insert(hash);
}
std::string output;
if (!base::JSONWriter::Write(output_map, &output)) {
LOG(ERROR) << "Failed to convert output to JSON.";
return 1;
}
if (base::WriteFile(output_path, output.c_str(),
static_cast<uint32_t>(output.size())) <= 0) {
LOG(ERROR) << "Failed to write output.";
return 1;
}
return 0;
}
......@@ -6,7 +6,7 @@
// Do not edit.
#include "components/schema_org/{{ header_file }}.h"
#include "base/containers/flat_set.h"
#include "base/hash/hash.h"
#include "base/no_destructor.h"
#include "base/strings/string_piece.h"
......@@ -14,37 +14,41 @@ namespace schema_org {
namespace entity {
{% for entity in entities %}
const char k{{entity[0]|upper}}{{entity[1:]}}[] = "{{entity}}";
const char k{{entity.name[0]|upper}}{{entity.name[1:]}}[] = "{{entity.name}}";
{% endfor %}
bool IsValidEntityName(const std::string& entity_name) {
static const base::NoDestructor<base::flat_set<base::StringPiece>>
kValidEntityNames(base::flat_set<base::StringPiece>({
switch (base::PersistentHash(entity_name)) {
{%for entity in entities %}
k{{entity[0]|upper}}{{entity[1:]}},
case {{entity.name_hash}}:
// {{ entity.name }}
return true;
{% endfor %}
}));
return kValidEntityNames->find(entity_name) != kValidEntityNames->end();
};
return false;
}
bool IsDescendedFrom(const std::string& possible_parent,
const std::string& possible_child) {
static const base::NoDestructor<std::map<std::string, std::vector<std::string>>>
kParentEntities(std::map<std::string, std::vector<std::string>>({
{%for key in entity_parent_lookup %}
{ "{{key}}", {
{% for parent in entity_parent_lookup[key] %}
"{{parent}}",
const auto possible_parent_hash = base::PersistentHash(possible_parent);
switch (base::PersistentHash(possible_child)) {
{%for entity in entity_parent_lookup %}
case {{entity.name_hash}}:
// {{ entity.name }}
switch (possible_parent_hash) {
{%for parent in entity.parents %}
case {{parent.name_hash}}:
// {{ parent.name }}
return true;
{% endfor %}
}
},
break;
{% endfor %}
}));
auto parents = kParentEntities->find(possible_child);
if (parents == kParentEntities->end())
};
return false;
auto it = std::find_if(parents->second.begin(), parents->second.end(), [&possible_parent](const std::string& parent) { return parent == possible_parent; });
return it != parents->second.end();
}
......
......@@ -14,7 +14,7 @@ namespace schema_org {
namespace entity {
{% for entity in entities %}
extern const char k{{entity[0]|upper}}{{entity[1:]}}[];
extern const char k{{entity.name[0]|upper}}{{entity.name[1:]}}[];
{% endfor %}
bool IsValidEntityName(const std::string& entity_name);
......
......@@ -5,6 +5,7 @@
// Generated by running //components/schema_org/generate_schema_org_code.py.
// Do not edit.
#include "base/hash/hash.h"
#include "components/schema_org/{{ header_file }}.h"
namespace schema_org {
......@@ -19,15 +20,24 @@ base::Optional<int> CheckValidEnumString(const std::string& name,
}
auto path = value.path().substr(1);
const auto path_hash = base::PersistentHash(path);
switch (base::PersistentHash(name)) {
{% for enum in enums %}
if (name == "{{enum.id}}") {
case {{enum.id_hash}}:
// {{enum.id}}
switch (path_hash) {
{% for option in enum.options %}
if (path == "{{option}}") {
// {{option.name}}
case {{option.name_hash}}:
return {{ loop.index }};
}
{% endfor %}
}
break;
{% endfor %}
}
return base::nullopt;
}
......
......@@ -19,7 +19,7 @@ namespace enums {
{% for enum in enums %}
enum class {{enum.name}} {
{% for option in enum.options %}
k{{option}} = {{ loop.index }},
k{{option.name}} = {{ loop.index }},
{% endfor %}
};
{% endfor %}
......
......@@ -5,6 +5,7 @@
// Generated by running //components/schema_org/generate_schema_org_code.py.
// Do not edit.
#include "base/hash/hash.h"
#include "components/schema_org/{{ header_file }}.h"
namespace schema_org {
......@@ -12,7 +13,9 @@ namespace property {
PropertyConfiguration GetPropertyConfiguration(const std::string& name) {
{% for property in properties %}
if (name == "{{property.name}}") {
switch (base::PersistentHash(name)) {
case {{property.name_hash}}:
// {{property.name}}
return {
/* .text = */ {{'true' if property.has_text else 'false'}},
/* .date = */ {{'true' if property.has_date else 'false'}},
......