Commit 81fed303 authored by Becca Hughes, committed by Commit Bot

[Media Feeds] Reduce space and increase speed

Right now we have a lot of strings in the schema_org
generated code and this adds overhead and can be
a bit slower.

This adds a "schema_org_name_generator" build task
that generates unsigned int hashes for all names
in the schema. This has to be a separate C++ tool because
the hash function (base::PersistentHash) is not available in Python.

Change-Id: I567a48b388d1e9f5e20e6f5e153402e736c4d0f0
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2144773
Commit-Queue: Becca Hughes <beccahughes@chromium.org>
Reviewed-by: Tommy Steimel <steimel@chromium.org>
Cr-Commit-Position: refs/heads/master@{#758083}
parent 3728af9c
...@@ -2,6 +2,8 @@ ...@@ -2,6 +2,8 @@
# Use of this source code is governed by a BSD-style license that can be # Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file. # found in the LICENSE file.
import("//build/compiled_action.gni")
source_set("unit_tests") { source_set("unit_tests") {
testonly = true testonly = true
sources = [ sources = [
...@@ -22,6 +24,26 @@ source_set("unit_tests") { ...@@ -22,6 +24,26 @@ source_set("unit_tests") {
] ]
} }
# Build-time tool compiled from schema_org_name_generator.cc. It is used as
# the `tool` of the "schema_org_name_data" action in this file and links
# against //base and //url.
executable("schema_org_name_generator") {
sources = [ "schema_org_name_generator.cc" ]
deps = [
"//base",
"//url",
]
}
# Runs :schema_org_name_generator over the schema.org schema and the local
# overrides file. The last argument is the output path; the resulting
# schema_org_name_data.json is later passed to generate_schema_org_code.py
# via --name-file.
compiled_action("schema_org_name_data") {
tool = ":schema_org_name_generator"
args = [
# Inputs: every argument except the last is a JSON-LD file to read.
rebase_path("//third_party/schema_org/schema.jsonld", root_build_dir),
rebase_path("//third_party/schema_org/overrides.jsonld", root_build_dir),
# Output: must match the entry in `outputs` below.
rebase_path("$target_gen_dir/schema_org_name_data.json", root_build_dir),
]
outputs = [ "$target_gen_dir/schema_org_name_data.json" ]
}
action("generate_schema_org_code") { action("generate_schema_org_code") {
script = "//components/schema_org/generate_schema_org_code.py" script = "//components/schema_org/generate_schema_org_code.py"
sources = [ sources = [
...@@ -41,6 +63,9 @@ action("generate_schema_org_code") { ...@@ -41,6 +63,9 @@ action("generate_schema_org_code") {
"--overrides-file", "--overrides-file",
rebase_path("//third_party/schema_org/overrides.jsonld", rebase_path("//third_party/schema_org/overrides.jsonld",
root_build_dir), root_build_dir),
"--name-file",
rebase_path("$target_gen_dir/schema_org_name_data.json",
root_build_dir),
"--output-dir", "--output-dir",
rebase_path("$target_gen_dir", root_build_dir), rebase_path("$target_gen_dir", root_build_dir),
"--templates", "--templates",
...@@ -55,6 +80,7 @@ action("generate_schema_org_code") { ...@@ -55,6 +80,7 @@ action("generate_schema_org_code") {
"$target_gen_dir/schema_org_enums.h", "$target_gen_dir/schema_org_enums.h",
"$target_gen_dir/schema_org_enums.cc", "$target_gen_dir/schema_org_enums.cc",
] ]
deps = [ ":schema_org_name_data" ]
} }
static_library("schema_org_properties") { static_library("schema_org_properties") {
......
...@@ -50,9 +50,20 @@ def is_enum_type(class_obj): ...@@ -50,9 +50,20 @@ def is_enum_type(class_obj):
return parent_class['@id'] == schema_org_id('Enumeration') return parent_class['@id'] == schema_org_id('Enumeration')
def find_enum_options(obj_id, schema): def make_entity(thing, names):
return {
"name": object_name_from_id(thing['@id']),
"name_hash": names[object_name_from_id(thing['@id'])]
}
def make_entity_from_name(name, names):
    """Build the template entry for `name`, pairing it with its hash.

    `names` maps a name to its precomputed hash; a name that is missing
    from the mapping raises KeyError.
    """
    entity = {"name": name}
    entity["name_hash"] = names[name]
    return entity
def find_enum_options(obj_id, schema, names):
return [ return [
object_name_from_id(obj['@id']) for obj in schema['@graph'] make_entity(obj, names) for obj in schema['@graph']
if obj['@type'] == obj_id if obj['@type'] == obj_id
] ]
...@@ -85,10 +96,11 @@ def get_root_type(the_class, schema): ...@@ -85,10 +96,11 @@ def get_root_type(the_class, schema):
return class_obj return class_obj
def parse_property(prop, schema): def parse_property(prop, schema, names):
"""Parse out details about the property, including what type it can be.""" """Parse out details about the property, including what type it can be."""
parsed_prop = { parsed_prop = {
'name': object_name_from_id(prop['@id']), 'name': object_name_from_id(prop['@id']),
'name_hash': names[object_name_from_id(prop['@id'])],
'thing_types': [], 'thing_types': [],
'enum_types': [] 'enum_types': []
} }
...@@ -142,7 +154,7 @@ def lookup_parents(thing, schema, lookup_table): ...@@ -142,7 +154,7 @@ def lookup_parents(thing, schema, lookup_table):
obj_name = object_name_from_id(thing['@id']) obj_name = object_name_from_id(thing['@id'])
if obj_name in lookup_table: if obj_name in lookup_table:
return lookup_table[obj_name] return lookup_table[obj_name]
lookup_table[obj_name] = [] lookup_table[obj_name] = set()
if 'rdfs:subClassOf' in thing: if 'rdfs:subClassOf' in thing:
parent_classes = thing['rdfs:subClassOf'] parent_classes = thing['rdfs:subClassOf']
...@@ -160,48 +172,70 @@ def lookup_parents(thing, schema, lookup_table): ...@@ -160,48 +172,70 @@ def lookup_parents(thing, schema, lookup_table):
] ]
# flatten the list # flatten the list
found_parents = [item for sublist in found_parents for item in sublist] found_parents = [item for sublist in found_parents for item in sublist]
lookup_table[obj_name].extend(found_parents) lookup_table[obj_name].update(found_parents)
lookup_table[obj_name].append(obj_name) lookup_table[obj_name].add(obj_name)
return lookup_table[obj_name] return lookup_table[obj_name]
def get_template_vars(schema_file_path, overrides_file_path): def get_template_vars_from_file(schema_file_path, overrides_file_path,
"""Read the needed template variables from the schema file.""" name_file_path):
template_vars = {
'entities': [],
'properties': [],
'enums': [],
'entity_parent_lookup': {}
}
with open(schema_file_path) as schema_file: with open(schema_file_path) as schema_file:
schema = json.loads(schema_file.read()) schema = json.loads(schema_file.read())
with open(name_file_path) as names_file:
names = json.loads(names_file.read())
if overrides_file_path: if overrides_file_path:
with open(overrides_file_path) as overrides_file: with open(overrides_file_path) as overrides_file:
overrides = json.loads(overrides_file.read()) overrides = json.loads(overrides_file.read())
for thing in overrides['@graph']: for thing in overrides['@graph']:
merge_with_schema(schema, overrides, thing) merge_with_schema(schema, overrides, thing)
return get_template_vars(schema, names)
def get_template_vars(schema, names):
"""Read the needed template variables from the schema file."""
template_vars = {
'entities': [],
'properties': [],
'enums': [],
'entity_parent_lookup': []
}
entity_parent_lookup = {}
for thing in schema['@graph']: for thing in schema['@graph']:
if thing['@type'] == 'rdfs:Class': if thing['@type'] == 'rdfs:Class':
template_vars['entities'].append(object_name_from_id(thing['@id'])) template_vars['entities'].append(make_entity(thing, names))
lookup_parents(thing, schema, lookup_parents(thing, schema, entity_parent_lookup)
template_vars['entity_parent_lookup'])
if is_enum_type(thing): if is_enum_type(thing):
template_vars['enums'].append({ template_vars['enums'].append({
'name': 'name':
object_name_from_id(thing['@id']), object_name_from_id(thing['@id']),
'id': 'id':
thing['@id'], thing['@id'],
'id_hash':
names[thing['@id']],
'options': 'options':
find_enum_options(thing['@id'], schema) find_enum_options(thing['@id'], schema, names)
}) })
elif thing['@type'] == 'rdf:Property': elif thing['@type'] == 'rdf:Property':
template_vars['properties'].append(parse_property(thing, schema)) template_vars['properties'].append(
parse_property(thing, schema, names))
for entity, parents in entity_parent_lookup.iteritems():
template_vars['entity_parent_lookup'].append({
'name':
entity,
'name_hash':
names[entity],
'parents':
[make_entity_from_name(parent, names) for parent in parents]
})
template_vars['entities'].sort() template_vars['entities'].sort(key=lambda p: p['name_hash'])
template_vars['properties'].sort(key=lambda p: p['name']) template_vars['properties'].sort(key=lambda p: p['name'])
return template_vars return template_vars
...@@ -225,13 +259,17 @@ def main(): ...@@ -225,13 +259,17 @@ def main():
'--overrides-file', '--overrides-file',
help='JSON-LD schema file with overrides to support changes not in the ' help='JSON-LD schema file with overrides to support changes not in the '
'latest schema.org version. Optional.') 'latest schema.org version. Optional.')
parser.add_argument(
'--name-file',
help='JSON file of hashed schema.org names to speed up lookups.')
parser.add_argument( parser.add_argument(
'--output-dir', '--output-dir',
help='Output directory in which to place generated code files.') help='Output directory in which to place generated code files.')
parser.add_argument('--templates', nargs='+') parser.add_argument('--templates', nargs='+')
args = parser.parse_args() args = parser.parse_args()
template_vars = get_template_vars(args.schema_file, args.overrides_file) template_vars = get_template_vars_from_file(
args.schema_file, args.overrides_file, args.name_file)
for template_file in args.templates: for template_file in args.templates:
generate_file( generate_file(
os.path.join(args.output_dir, os.path.join(args.output_dir,
......
...@@ -23,34 +23,46 @@ import jinja2 ...@@ -23,34 +23,46 @@ import jinja2
class GenerateSchemaOrgCodeTest(unittest.TestCase): class GenerateSchemaOrgCodeTest(unittest.TestCase):
def test_get_template_vars(self): def test_get_template_vars(self):
file_content = """ schema = {
{ "@graph": [{
"@graph": [
{
"@id": "http://schema.org/MediaObject", "@id": "http://schema.org/MediaObject",
"@type": "rdfs:Class" "@type": "rdfs:Class"
}, },
{ {
"@id": "http://schema.org/propertyName", "@id": "http://schema.org/propertyName",
"@type": "rdf:Property" "@type": "rdf:Property"
}]
} }
]
names = {
"http://schema.org/MediaObject": 1234,
"MediaObject": 1235,
"http://schema.org/propertyName": 2345,
"propertyName": 2346
} }
"""
with mock.patch('__builtin__.open',
mock.mock_open(read_data=file_content)) as m_open:
self.assertEqual( self.assertEqual(
generate_schema_org_code.get_template_vars(m_open, m_open), { generate_schema_org_code.get_template_vars(schema, names), {
'entities': ['MediaObject'], 'entities': [{
'name': 'MediaObject',
'name_hash': 1235
}],
'properties': [{ 'properties': [{
'name': 'propertyName', 'name': 'propertyName',
'name_hash': 2346,
'thing_types': [], 'thing_types': [],
'enum_types': [] 'enum_types': []
}], }],
'enums': [], 'enums': [],
'entity_parent_lookup': { 'entity_parent_lookup':
'MediaObject': ['MediaObject'] [{
} 'name': 'MediaObject',
'name_hash': 1235,
'parents': [{
'name': 'MediaObject',
'name_hash': 1235
}]
}]
}) })
def test_lookup_parents(self): def test_lookup_parents(self):
...@@ -65,9 +77,9 @@ class GenerateSchemaOrgCodeTest(unittest.TestCase): ...@@ -65,9 +77,9 @@ class GenerateSchemaOrgCodeTest(unittest.TestCase):
} }
brand = {'@id': schema_org_id('Brand'), 'rdfs:subClassOf': intangible} brand = {'@id': schema_org_id('Brand'), 'rdfs:subClassOf': intangible}
schema = {'@graph': [thing, intangible, structured_value, brand]} schema = {'@graph': [thing, intangible, structured_value, brand]}
self.assertListEqual( self.assertSetEqual(
generate_schema_org_code.lookup_parents(brand, schema, {}), generate_schema_org_code.lookup_parents(brand, schema, {}),
['Thing', 'Intangible', 'Brand']) set(['Thing', 'Intangible', 'Brand']))
def test_get_root_type_thing(self): def test_get_root_type_thing(self):
thing = {'@id': schema_org_id('Thing')} thing = {'@id': schema_org_id('Thing')}
...@@ -146,9 +158,13 @@ class GenerateSchemaOrgCodeTest(unittest.TestCase): ...@@ -146,9 +158,13 @@ class GenerateSchemaOrgCodeTest(unittest.TestCase):
] ]
} }
names = {"http://schema.org/Identifier": 1234, "Identifier": 1235}
self.assertEqual( self.assertEqual(
generate_schema_org_code.parse_property(identifier, schema), { generate_schema_org_code.parse_property(identifier, schema, names),
{
'name': 'Identifier', 'name': 'Identifier',
'name_hash': 1235,
'has_number': True, 'has_number': True,
'thing_types': [property_value['@id']], 'thing_types': [property_value['@id']],
'enum_types': [] 'enum_types': []
......
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <map>
#include <set>
#include <string>
#include <vector>
#include "base/command_line.h"
#include "base/files/file_util.h"
#include "base/hash/hash.h"
#include "base/json/json_reader.h"
#include "base/json/json_writer.h"
#include "base/logging.h"
#include "base/path_service.h"
#include "base/strings/stringprintf.h"
#include "base/strings/utf_string_conversions.h"
#include "build/build_config.h"
#include "url/gurl.h"
namespace {
// Print the command line help.
void PrintHelp() {
LOG(ERROR) << "schema_org_name_generator <schema-file> ... <schema-file>"
<< " <output-file>";
}
} // namespace
int main(int argc, char* argv[]) {
base::CommandLine::Init(argc, argv);
const base::CommandLine& command_line =
*base::CommandLine::ForCurrentProcess();
logging::LoggingSettings settings;
settings.logging_dest =
logging::LOG_TO_SYSTEM_DEBUG_LOG | logging::LOG_TO_STDERR;
logging::InitLogging(settings);
#if defined(OS_WIN)
std::vector<std::string> args;
base::CommandLine::StringVector wide_args = command_line.GetArgs();
for (const auto& arg : wide_args) {
args.push_back(base::UTF16ToUTF8(arg));
}
#else
base::CommandLine::StringVector args = command_line.GetArgs();
#endif
if (args.size() < 2U) {
PrintHelp();
return 1;
}
// Read all the args and convert to file paths.
std::vector<base::FilePath> paths;
for (auto& arg : args) {
paths.push_back(base::FilePath::FromUTF8Unsafe(arg));
}
// Check we have at least two paths.
if (paths.size() < 2U) {
PrintHelp();
return 1;
}
// Get the last path which is the output file.
base::FilePath output_path = paths.back();
paths.pop_back();
base::DictionaryValue output_map;
std::set<std::string> names_to_generate;
for (auto& path : paths) {
path = base::MakeAbsoluteFilePath(path);
if (!base::PathExists(path)) {
LOG(ERROR) << "Input JSON file doesn't exist.";
return 1;
}
std::string json_input;
if (!base::ReadFileToString(path, &json_input)) {
LOG(ERROR) << "Could not read input JSON file.";
return 1;
}
auto value = base::JSONReader::Read(json_input);
base::DictionaryValue* dict_value = nullptr;
if (!value.has_value() || !value->GetAsDictionary(&dict_value)) {
LOG(ERROR) << "Could not parse the input JSON file";
return 1;
}
const base::ListValue* graph = nullptr;
if (!dict_value->GetList("@graph", &graph)) {
LOG(ERROR) << "Could not parse the @graph in the input JSON";
return 1;
}
for (size_t i = 0; i < graph->GetSize(); ++i) {
const base::DictionaryValue* parsed = nullptr;
if (!graph->GetDictionary(i, &parsed)) {
LOG(ERROR) << "Could not parse entry " << i << " in the input JSON";
return 1;
}
std::string id;
if (!parsed->GetString("@id", &id)) {
LOG(ERROR) << "Could not extract the id from the entry";
return 1;
}
if (id.empty()) {
LOG(ERROR) << "ID was empty";
return 1;
}
names_to_generate.insert(id);
names_to_generate.insert(GURL(id).path().substr(1));
}
}
std::set<unsigned> generated_hashes;
for (auto& name : names_to_generate) {
auto hash = base::PersistentHash(name);
if (base::Contains(generated_hashes, hash)) {
LOG(ERROR) << "Hash collision: " << name;
return 1;
}
output_map.SetStringKey(name, base::StringPrintf("0x%x", hash));
generated_hashes.insert(hash);
}
std::string output;
if (!base::JSONWriter::Write(output_map, &output)) {
LOG(ERROR) << "Failed to convert output to JSON.";
return 1;
}
if (base::WriteFile(output_path, output.c_str(),
static_cast<uint32_t>(output.size())) <= 0) {
LOG(ERROR) << "Failed to write output.";
return 1;
}
return 0;
}
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
// Do not edit. // Do not edit.
#include "components/schema_org/{{ header_file }}.h" #include "components/schema_org/{{ header_file }}.h"
#include "base/containers/flat_set.h" #include "base/hash/hash.h"
#include "base/no_destructor.h" #include "base/no_destructor.h"
#include "base/strings/string_piece.h" #include "base/strings/string_piece.h"
...@@ -14,37 +14,41 @@ namespace schema_org { ...@@ -14,37 +14,41 @@ namespace schema_org {
namespace entity { namespace entity {
{% for entity in entities %} {% for entity in entities %}
const char k{{entity[0]|upper}}{{entity[1:]}}[] = "{{entity}}"; const char k{{entity.name[0]|upper}}{{entity.name[1:]}}[] = "{{entity.name}}";
{% endfor %} {% endfor %}
bool IsValidEntityName(const std::string& entity_name) { bool IsValidEntityName(const std::string& entity_name) {
static const base::NoDestructor<base::flat_set<base::StringPiece>> switch (base::PersistentHash(entity_name)) {
kValidEntityNames(base::flat_set<base::StringPiece>({
{%for entity in entities %} {%for entity in entities %}
k{{entity[0]|upper}}{{entity[1:]}}, case {{entity.name_hash}}:
// {{ entity.name }}
return true;
{% endfor %} {% endfor %}
})); };
return kValidEntityNames->find(entity_name) != kValidEntityNames->end();
return false;
} }
bool IsDescendedFrom(const std::string& possible_parent, bool IsDescendedFrom(const std::string& possible_parent,
const std::string& possible_child) { const std::string& possible_child) {
static const base::NoDestructor<std::map<std::string, std::vector<std::string>>> const auto possible_parent_hash = base::PersistentHash(possible_parent);
kParentEntities(std::map<std::string, std::vector<std::string>>({
{%for key in entity_parent_lookup %} switch (base::PersistentHash(possible_child)) {
{ "{{key}}", { {%for entity in entity_parent_lookup %}
{% for parent in entity_parent_lookup[key] %} case {{entity.name_hash}}:
"{{parent}}", // {{ entity.name }}
switch (possible_parent_hash) {
{%for parent in entity.parents %}
case {{parent.name_hash}}:
// {{ parent.name }}
return true;
{% endfor %} {% endfor %}
} }
}, break;
{% endfor %} {% endfor %}
})); };
auto parents = kParentEntities->find(possible_child);
if (parents == kParentEntities->end())
return false; return false;
auto it = std::find_if(parents->second.begin(), parents->second.end(), [&possible_parent](const std::string& parent) { return parent == possible_parent; });
return it != parents->second.end();
} }
......
...@@ -14,7 +14,7 @@ namespace schema_org { ...@@ -14,7 +14,7 @@ namespace schema_org {
namespace entity { namespace entity {
{% for entity in entities %} {% for entity in entities %}
extern const char k{{entity[0]|upper}}{{entity[1:]}}[]; extern const char k{{entity.name[0]|upper}}{{entity.name[1:]}}[];
{% endfor %} {% endfor %}
bool IsValidEntityName(const std::string& entity_name); bool IsValidEntityName(const std::string& entity_name);
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
// Generated by running //components/schema_org/generate_schema_org_code.py. // Generated by running //components/schema_org/generate_schema_org_code.py.
// Do not edit. // Do not edit.
#include "base/hash/hash.h"
#include "components/schema_org/{{ header_file }}.h" #include "components/schema_org/{{ header_file }}.h"
namespace schema_org { namespace schema_org {
...@@ -19,15 +20,24 @@ base::Optional<int> CheckValidEnumString(const std::string& name, ...@@ -19,15 +20,24 @@ base::Optional<int> CheckValidEnumString(const std::string& name,
} }
auto path = value.path().substr(1); auto path = value.path().substr(1);
const auto path_hash = base::PersistentHash(path);
switch (base::PersistentHash(name)) {
{% for enum in enums %} {% for enum in enums %}
if (name == "{{enum.id}}") { case {{enum.id_hash}}:
// {{enum.id}}
switch (path_hash) {
{% for option in enum.options %} {% for option in enum.options %}
if (path == "{{option}}") { // {{option.name}}
case {{option.name_hash}}:
return {{ loop.index }}; return {{ loop.index }};
}
{% endfor %} {% endfor %}
} }
break;
{% endfor %} {% endfor %}
}
return base::nullopt; return base::nullopt;
} }
......
...@@ -19,7 +19,7 @@ namespace enums { ...@@ -19,7 +19,7 @@ namespace enums {
{% for enum in enums %} {% for enum in enums %}
enum class {{enum.name}} { enum class {{enum.name}} {
{% for option in enum.options %} {% for option in enum.options %}
k{{option}} = {{ loop.index }}, k{{option.name}} = {{ loop.index }},
{% endfor %} {% endfor %}
}; };
{% endfor %} {% endfor %}
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
// Generated by running //components/schema_org/generate_schema_org_code.py. // Generated by running //components/schema_org/generate_schema_org_code.py.
// Do not edit. // Do not edit.
#include "base/hash/hash.h"
#include "components/schema_org/{{ header_file }}.h" #include "components/schema_org/{{ header_file }}.h"
namespace schema_org { namespace schema_org {
...@@ -12,7 +13,9 @@ namespace property { ...@@ -12,7 +13,9 @@ namespace property {
PropertyConfiguration GetPropertyConfiguration(const std::string& name) { PropertyConfiguration GetPropertyConfiguration(const std::string& name) {
{% for property in properties %} {% for property in properties %}
if (name == "{{property.name}}") { switch (base::PersistentHash(name)) {
case {{property.name_hash}}:
// {{property.name}}
return { return {
/* .text = */ {{'true' if property.has_text else 'false'}}, /* .text = */ {{'true' if property.has_text else 'false'}},
/* .date = */ {{'true' if property.has_date else 'false'}}, /* .date = */ {{'true' if property.has_date else 'false'}},
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment