Commit 81fed303 authored by Becca Hughes, committed by Commit Bot

[Media Feeds] Reduce space and increase speed

Right now the generated schema_org code contains a
lot of string literals, which adds size overhead
and makes name lookups slower than they need to be.

This adds a "schema_org_name_generator" build step
that generates unsigned int hashes for all names in
the schema. It has to be a separate C++ tool because
the hash function (base::PersistentHash) is not
available in Python.
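
For reference, the generated lookups now take roughly this shape.
This is a minimal sketch with hypothetical entity names and hash
constants; the real case values are emitted into the Jinja2 templates
from schema_org_name_data.json.

// A minimal sketch (hypothetical names and hash values) of the lookup
// pattern the generated code switches to. Instead of comparing the
// incoming name against every known string, it hashes the name once
// with base::PersistentHash() and jumps to the matching case; the
// case constants are produced at build time by the same hash function.
#include <cstdint>
#include <string>

#include "base/hash/hash.h"

namespace {

// Hypothetical values; the real constants come from
// schema_org_name_data.json.
constexpr uint32_t kMediaObjectHash = 0x1a2b3c4d;
constexpr uint32_t kVideoObjectHash = 0x5e6f7a8b;

bool IsKnownEntityName(const std::string& entity_name) {
  switch (base::PersistentHash(entity_name)) {
    case kMediaObjectHash:  // "MediaObject"
    case kVideoObjectHash:  // "VideoObject"
      return true;
  }
  return false;
}

}  // namespace

A single integer switch replaces the flat_set of string literals that
IsValidEntityName used to build, so lookups no longer construct or
search a container of strings at runtime.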

Change-Id: I567a48b388d1e9f5e20e6f5e153402e736c4d0f0
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2144773
Commit-Queue: Becca Hughes <beccahughes@chromium.org>
Reviewed-by: Tommy Steimel <steimel@chromium.org>
Cr-Commit-Position: refs/heads/master@{#758083}
parent 3728af9c
......@@ -2,6 +2,8 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import("//build/compiled_action.gni")
source_set("unit_tests") {
testonly = true
sources = [
......@@ -22,6 +24,26 @@ source_set("unit_tests") {
]
}
executable("schema_org_name_generator") {
sources = [ "schema_org_name_generator.cc" ]
deps = [
"//base",
"//url",
]
}
compiled_action("schema_org_name_data") {
tool = ":schema_org_name_generator"
args = [
rebase_path("//third_party/schema_org/schema.jsonld", root_build_dir),
rebase_path("//third_party/schema_org/overrides.jsonld", root_build_dir),
rebase_path("$target_gen_dir/schema_org_name_data.json", root_build_dir),
]
outputs = [ "$target_gen_dir/schema_org_name_data.json" ]
}
action("generate_schema_org_code") {
script = "//components/schema_org/generate_schema_org_code.py"
sources = [
......@@ -41,6 +63,9 @@ action("generate_schema_org_code") {
"--overrides-file",
rebase_path("//third_party/schema_org/overrides.jsonld",
root_build_dir),
"--name-file",
rebase_path("$target_gen_dir/schema_org_name_data.json",
root_build_dir),
"--output-dir",
rebase_path("$target_gen_dir", root_build_dir),
"--templates",
......@@ -55,6 +80,7 @@ action("generate_schema_org_code") {
"$target_gen_dir/schema_org_enums.h",
"$target_gen_dir/schema_org_enums.cc",
]
deps = [ ":schema_org_name_data" ]
}
static_library("schema_org_properties") {
......
......@@ -50,9 +50,20 @@ def is_enum_type(class_obj):
return parent_class['@id'] == schema_org_id('Enumeration')
def find_enum_options(obj_id, schema):
def make_entity(thing, names):
return {
"name": object_name_from_id(thing['@id']),
"name_hash": names[object_name_from_id(thing['@id'])]
}
def make_entity_from_name(name, names):
return {"name": name, "name_hash": names[name]}
def find_enum_options(obj_id, schema, names):
return [
object_name_from_id(obj['@id']) for obj in schema['@graph']
make_entity(obj, names) for obj in schema['@graph']
if obj['@type'] == obj_id
]
......@@ -85,10 +96,11 @@ def get_root_type(the_class, schema):
return class_obj
def parse_property(prop, schema):
def parse_property(prop, schema, names):
"""Parse out details about the property, including what type it can be."""
parsed_prop = {
'name': object_name_from_id(prop['@id']),
'name_hash': names[object_name_from_id(prop['@id'])],
'thing_types': [],
'enum_types': []
}
......@@ -142,7 +154,7 @@ def lookup_parents(thing, schema, lookup_table):
obj_name = object_name_from_id(thing['@id'])
if obj_name in lookup_table:
return lookup_table[obj_name]
lookup_table[obj_name] = []
lookup_table[obj_name] = set()
if 'rdfs:subClassOf' in thing:
parent_classes = thing['rdfs:subClassOf']
......@@ -160,48 +172,70 @@ def lookup_parents(thing, schema, lookup_table):
]
# flatten the list
found_parents = [item for sublist in found_parents for item in sublist]
lookup_table[obj_name].extend(found_parents)
lookup_table[obj_name].update(found_parents)
lookup_table[obj_name].append(obj_name)
lookup_table[obj_name].add(obj_name)
return lookup_table[obj_name]
def get_template_vars(schema_file_path, overrides_file_path):
"""Read the needed template variables from the schema file."""
template_vars = {
'entities': [],
'properties': [],
'enums': [],
'entity_parent_lookup': {}
}
def get_template_vars_from_file(schema_file_path, overrides_file_path,
name_file_path):
with open(schema_file_path) as schema_file:
schema = json.loads(schema_file.read())
with open(name_file_path) as names_file:
names = json.loads(names_file.read())
if overrides_file_path:
with open(overrides_file_path) as overrides_file:
overrides = json.loads(overrides_file.read())
for thing in overrides['@graph']:
merge_with_schema(schema, overrides, thing)
return get_template_vars(schema, names)
def get_template_vars(schema, names):
"""Read the needed template variables from the schema file."""
template_vars = {
'entities': [],
'properties': [],
'enums': [],
'entity_parent_lookup': []
}
entity_parent_lookup = {}
for thing in schema['@graph']:
if thing['@type'] == 'rdfs:Class':
template_vars['entities'].append(object_name_from_id(thing['@id']))
lookup_parents(thing, schema,
template_vars['entity_parent_lookup'])
template_vars['entities'].append(make_entity(thing, names))
lookup_parents(thing, schema, entity_parent_lookup)
if is_enum_type(thing):
template_vars['enums'].append({
'name':
object_name_from_id(thing['@id']),
'id':
thing['@id'],
'id_hash':
names[thing['@id']],
'options':
find_enum_options(thing['@id'], schema)
find_enum_options(thing['@id'], schema, names)
})
elif thing['@type'] == 'rdf:Property':
template_vars['properties'].append(parse_property(thing, schema))
template_vars['properties'].append(
parse_property(thing, schema, names))
for entity, parents in entity_parent_lookup.iteritems():
template_vars['entity_parent_lookup'].append({
'name':
entity,
'name_hash':
names[entity],
'parents':
[make_entity_from_name(parent, names) for parent in parents]
})
template_vars['entities'].sort()
template_vars['entities'].sort(key=lambda p: p['name_hash'])
template_vars['properties'].sort(key=lambda p: p['name'])
return template_vars
......@@ -225,13 +259,17 @@ def main():
'--overrides-file',
help='JSON-LD schema file with overrides to support changes not in the '
'latest schema.org version. Optional.')
parser.add_argument(
'--name-file',
help='JSON file of hashed schema.org names to speed up lookups.')
parser.add_argument(
'--output-dir',
help='Output directory in which to place generated code files.')
parser.add_argument('--templates', nargs='+')
args = parser.parse_args()
template_vars = get_template_vars(args.schema_file, args.overrides_file)
template_vars = get_template_vars_from_file(
args.schema_file, args.overrides_file, args.name_file)
for template_file in args.templates:
generate_file(
os.path.join(args.output_dir,
......
......@@ -23,34 +23,46 @@ import jinja2
class GenerateSchemaOrgCodeTest(unittest.TestCase):
def test_get_template_vars(self):
file_content = """
{
"@graph": [
{
schema = {
"@graph": [{
"@id": "http://schema.org/MediaObject",
"@type": "rdfs:Class"
},
{
"@id": "http://schema.org/propertyName",
"@type": "rdf:Property"
}]
}
]
names = {
"http://schema.org/MediaObject": 1234,
"MediaObject": 1235,
"http://schema.org/propertyName": 2345,
"propertyName": 2346
}
"""
with mock.patch('__builtin__.open',
mock.mock_open(read_data=file_content)) as m_open:
self.assertEqual(
generate_schema_org_code.get_template_vars(m_open, m_open), {
'entities': ['MediaObject'],
generate_schema_org_code.get_template_vars(schema, names), {
'entities': [{
'name': 'MediaObject',
'name_hash': 1235
}],
'properties': [{
'name': 'propertyName',
'name_hash': 2346,
'thing_types': [],
'enum_types': []
}],
'enums': [],
'entity_parent_lookup': {
'MediaObject': ['MediaObject']
}
'entity_parent_lookup':
[{
'name': 'MediaObject',
'name_hash': 1235,
'parents': [{
'name': 'MediaObject',
'name_hash': 1235
}]
}]
})
def test_lookup_parents(self):
......@@ -65,9 +77,9 @@ class GenerateSchemaOrgCodeTest(unittest.TestCase):
}
brand = {'@id': schema_org_id('Brand'), 'rdfs:subClassOf': intangible}
schema = {'@graph': [thing, intangible, structured_value, brand]}
self.assertListEqual(
self.assertSetEqual(
generate_schema_org_code.lookup_parents(brand, schema, {}),
['Thing', 'Intangible', 'Brand'])
set(['Thing', 'Intangible', 'Brand']))
def test_get_root_type_thing(self):
thing = {'@id': schema_org_id('Thing')}
......@@ -146,9 +158,13 @@ class GenerateSchemaOrgCodeTest(unittest.TestCase):
]
}
names = {"http://schema.org/Identifier": 1234, "Identifier": 1235}
self.assertEqual(
generate_schema_org_code.parse_property(identifier, schema), {
generate_schema_org_code.parse_property(identifier, schema, names),
{
'name': 'Identifier',
'name_hash': 1235,
'has_number': True,
'thing_types': [property_value['@id']],
'enum_types': []
......
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <map>
#include <set>
#include <string>
#include <vector>
#include "base/command_line.h"
#include "base/files/file_util.h"
#include "base/hash/hash.h"
#include "base/json/json_reader.h"
#include "base/json/json_writer.h"
#include "base/logging.h"
#include "base/path_service.h"
#include "base/strings/stringprintf.h"
#include "base/strings/utf_string_conversions.h"
#include "build/build_config.h"
#include "url/gurl.h"
namespace {
// Print the command line help.
void PrintHelp() {
LOG(ERROR) << "schema_org_name_generator <schema-file> ... <schema-file>"
<< " <output-file>";
}
} // namespace
int main(int argc, char* argv[]) {
base::CommandLine::Init(argc, argv);
const base::CommandLine& command_line =
*base::CommandLine::ForCurrentProcess();
logging::LoggingSettings settings;
settings.logging_dest =
logging::LOG_TO_SYSTEM_DEBUG_LOG | logging::LOG_TO_STDERR;
logging::InitLogging(settings);
#if defined(OS_WIN)
std::vector<std::string> args;
base::CommandLine::StringVector wide_args = command_line.GetArgs();
for (const auto& arg : wide_args) {
args.push_back(base::UTF16ToUTF8(arg));
}
#else
base::CommandLine::StringVector args = command_line.GetArgs();
#endif
if (args.size() < 2U) {
PrintHelp();
return 1;
}
// Read all the args and convert to file paths.
std::vector<base::FilePath> paths;
for (auto& arg : args) {
paths.push_back(base::FilePath::FromUTF8Unsafe(arg));
}
// Check we have at least two paths.
if (paths.size() < 2U) {
PrintHelp();
return 1;
}
// Get the last path which is the output file.
base::FilePath output_path = paths.back();
paths.pop_back();
base::DictionaryValue output_map;
std::set<std::string> names_to_generate;
for (auto& path : paths) {
path = base::MakeAbsoluteFilePath(path);
if (!base::PathExists(path)) {
LOG(ERROR) << "Input JSON file doesn't exist.";
return 1;
}
std::string json_input;
if (!base::ReadFileToString(path, &json_input)) {
LOG(ERROR) << "Could not read input JSON file.";
return 1;
}
auto value = base::JSONReader::Read(json_input);
base::DictionaryValue* dict_value = nullptr;
if (!value.has_value() || !value->GetAsDictionary(&dict_value)) {
LOG(ERROR) << "Could not parse the input JSON file";
return 1;
}
const base::ListValue* graph = nullptr;
if (!dict_value->GetList("@graph", &graph)) {
LOG(ERROR) << "Could not parse the @graph in the input JSON";
return 1;
}
for (size_t i = 0; i < graph->GetSize(); ++i) {
const base::DictionaryValue* parsed = nullptr;
if (!graph->GetDictionary(i, &parsed)) {
LOG(ERROR) << "Could not parse entry " << i << " in the input JSON";
return 1;
}
std::string id;
if (!parsed->GetString("@id", &id)) {
LOG(ERROR) << "Could not extract the id from the entry";
return 1;
}
if (id.empty()) {
LOG(ERROR) << "ID was empty";
return 1;
}
names_to_generate.insert(id);
names_to_generate.insert(GURL(id).path().substr(1));
}
}
std::set<unsigned> generated_hashes;
for (auto& name : names_to_generate) {
auto hash = base::PersistentHash(name);
if (base::Contains(generated_hashes, hash)) {
LOG(ERROR) << "Hash collision: " << name;
return 1;
}
output_map.SetStringKey(name, base::StringPrintf("0x%x", hash));
generated_hashes.insert(hash);
}
std::string output;
if (!base::JSONWriter::Write(output_map, &output)) {
LOG(ERROR) << "Failed to convert output to JSON.";
return 1;
}
if (base::WriteFile(output_path, output.c_str(),
static_cast<uint32_t>(output.size())) <= 0) {
LOG(ERROR) << "Failed to write output.";
return 1;
}
return 0;
}
......@@ -6,7 +6,7 @@
// Do not edit.
#include "components/schema_org/{{ header_file }}.h"
#include "base/containers/flat_set.h"
#include "base/hash/hash.h"
#include "base/no_destructor.h"
#include "base/strings/string_piece.h"
......@@ -14,37 +14,41 @@ namespace schema_org {
namespace entity {
{% for entity in entities %}
const char k{{entity[0]|upper}}{{entity[1:]}}[] = "{{entity}}";
const char k{{entity.name[0]|upper}}{{entity.name[1:]}}[] = "{{entity.name}}";
{% endfor %}
bool IsValidEntityName(const std::string& entity_name) {
static const base::NoDestructor<base::flat_set<base::StringPiece>>
kValidEntityNames(base::flat_set<base::StringPiece>({
switch (base::PersistentHash(entity_name)) {
{%for entity in entities %}
k{{entity[0]|upper}}{{entity[1:]}},
case {{entity.name_hash}}:
// {{ entity.name }}
return true;
{% endfor %}
}));
return kValidEntityNames->find(entity_name) != kValidEntityNames->end();
};
return false;
}
bool IsDescendedFrom(const std::string& possible_parent,
const std::string& possible_child) {
static const base::NoDestructor<std::map<std::string, std::vector<std::string>>>
kParentEntities(std::map<std::string, std::vector<std::string>>({
{%for key in entity_parent_lookup %}
{ "{{key}}", {
{% for parent in entity_parent_lookup[key] %}
"{{parent}}",
const auto possible_parent_hash = base::PersistentHash(possible_parent);
switch (base::PersistentHash(possible_child)) {
{%for entity in entity_parent_lookup %}
case {{entity.name_hash}}:
// {{ entity.name }}
switch (possible_parent_hash) {
{%for parent in entity.parents %}
case {{parent.name_hash}}:
// {{ parent.name }}
return true;
{% endfor %}
}
},
break;
{% endfor %}
}));
auto parents = kParentEntities->find(possible_child);
if (parents == kParentEntities->end())
};
return false;
auto it = std::find_if(parents->second.begin(), parents->second.end(), [&possible_parent](const std::string& parent) { return parent == possible_parent; });
return it != parents->second.end();
}
......
......@@ -14,7 +14,7 @@ namespace schema_org {
namespace entity {
{% for entity in entities %}
extern const char k{{entity[0]|upper}}{{entity[1:]}}[];
extern const char k{{entity.name[0]|upper}}{{entity.name[1:]}}[];
{% endfor %}
bool IsValidEntityName(const std::string& entity_name);
......
......@@ -5,6 +5,7 @@
// Generated by running //components/schema_org/generate_schema_org_code.py.
// Do not edit.
#include "base/hash/hash.h"
#include "components/schema_org/{{ header_file }}.h"
namespace schema_org {
......@@ -19,15 +20,24 @@ base::Optional<int> CheckValidEnumString(const std::string& name,
}
auto path = value.path().substr(1);
const auto path_hash = base::PersistentHash(path);
switch (base::PersistentHash(name)) {
{% for enum in enums %}
if (name == "{{enum.id}}") {
case {{enum.id_hash}}:
// {{enum.id}}
switch (path_hash) {
{% for option in enum.options %}
if (path == "{{option}}") {
// {{option.name}}
case {{option.name_hash}}:
return {{ loop.index }};
}
{% endfor %}
}
break;
{% endfor %}
}
return base::nullopt;
}
......
......@@ -19,7 +19,7 @@ namespace enums {
{% for enum in enums %}
enum class {{enum.name}} {
{% for option in enum.options %}
k{{option}} = {{ loop.index }},
k{{option.name}} = {{ loop.index }},
{% endfor %}
};
{% endfor %}
......
......@@ -5,6 +5,7 @@
// Generated by running //components/schema_org/generate_schema_org_code.py.
// Do not edit.
#include "base/hash/hash.h"
#include "components/schema_org/{{ header_file }}.h"
namespace schema_org {
......@@ -12,7 +13,9 @@ namespace property {
PropertyConfiguration GetPropertyConfiguration(const std::string& name) {
{% for property in properties %}
if (name == "{{property.name}}") {
switch (base::PersistentHash(name)) {
case {{property.name_hash}}:
// {{property.name}}
return {
/* .text = */ {{'true' if property.has_text else 'false'}},
/* .date = */ {{'true' if property.has_date else 'false'}},
......