Commit 08a6c7b8 authored by Sam Bowen, committed by Commit Bot

Generate property configurations for schema.org metadata.

This will be used to validate that property data is the right type as
described by the schema. This amends an existing script that will use a
json+ld file checked into third_party.

Bug: 1044250
Change-Id: I781f4f50a0331655e36a03b7fc26d9c54c0ae87c
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2036892
Reviewed-by: Becca Hughes <beccahughes@chromium.org>
Commit-Queue: Sam Bowen <sgbowen@google.com>
Cr-Commit-Position: refs/heads/master@{#738644}
parent d90d3c63
...@@ -125,6 +125,7 @@ test("components_unittests") { ...@@ -125,6 +125,7 @@ test("components_unittests") {
"//components/reading_list/core:unit_tests", "//components/reading_list/core:unit_tests",
"//components/safe_search_api:unit_tests", "//components/safe_search_api:unit_tests",
"//components/scheduling_metrics:unit_tests", "//components/scheduling_metrics:unit_tests",
"//components/schema_org:unit_tests",
"//components/search:unit_tests", "//components/search:unit_tests",
"//components/search_engines:unit_tests", "//components/search_engines:unit_tests",
"//components/search_provider_logos:unit_tests", "//components/search_provider_logos:unit_tests",
......
# Copyright 2020 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# C++ unit tests that exercise the generated schema.org entity names,
# property names and property configurations.
source_set("unit_tests") {
  testonly = true

  sources = [ "generate_schema_org_code_unittest.cc" ]

  deps = [
    ":generate_schema_org_code",
    ":schema_org_properties",
    "//base",
    "//testing/gmock",
    "//testing/gtest",
  ]
}
# Runs generate_schema_org_code.py, handing it the checked-in schema.org
# JSON-LD file, the output directory and the list of templates. The six files
# named in |outputs| are what the script is expected to write.
action("generate_schema_org_code") {
  script = "//components/schema_org/generate_schema_org_code.py"

  # The templates are forwarded verbatim after "--templates" below, so only
  # template files may be listed here.
  sources = [
    "templates/schema_org_entity_names.cc.tmpl",
    "templates/schema_org_entity_names.h.tmpl",
    "templates/schema_org_property_configurations.cc.tmpl",
    "templates/schema_org_property_configurations.h.tmpl",
    "templates/schema_org_property_names.cc.tmpl",
    "templates/schema_org_property_names.h.tmpl",
  ]

  # The schema is read by the script but is not a template, so it is declared
  # as a separate input. Without this, editing the schema would not re-run
  # the action and the generated code would go stale.
  inputs = [ "//third_party/schema_org/schema.jsonld" ]

  args = [
           "--schema-file",
           rebase_path("//third_party/schema_org/schema.jsonld",
                       root_build_dir),
           "--output-dir",
           rebase_path("$target_gen_dir", root_build_dir),
           "--templates",
         ] + sources

  outputs = [
    "$target_gen_dir/schema_org_entity_names.h",
    "$target_gen_dir/schema_org_entity_names.cc",
    "$target_gen_dir/schema_org_property_configurations.h",
    "$target_gen_dir/schema_org_property_configurations.cc",
    "$target_gen_dir/schema_org_property_names.h",
    "$target_gen_dir/schema_org_property_names.cc",
  ]
}
# Compiles the C++ files written by :generate_schema_org_code into a library.
static_library("schema_org_properties") {
  # Every source here is produced by the action below, hence the dependency.
  deps = [ ":generate_schema_org_code" ]

  sources = [
    "$target_gen_dir/schema_org_entity_names.cc",
    "$target_gen_dir/schema_org_entity_names.h",
    "$target_gen_dir/schema_org_property_configurations.cc",
    "$target_gen_dir/schema_org_property_configurations.h",
    "$target_gen_dir/schema_org_property_names.cc",
    "$target_gen_dir/schema_org_property_names.h",
  ]
}
...@@ -7,10 +7,10 @@ This script generates C++ objects based on a JSON+LD schema file. Blink uses the ...@@ -7,10 +7,10 @@ This script generates C++ objects based on a JSON+LD schema file. Blink uses the
generated code to scrape schema.org data from web pages. generated code to scrape schema.org data from web pages.
""" """
import argparse
import json
import sys
import os import os
import sys
import json
import argparse
_current_dir = os.path.dirname(os.path.realpath(__file__)) _current_dir = os.path.dirname(os.path.realpath(__file__))
# jinja2 is in chromium's third_party directory # jinja2 is in chromium's third_party directory
...@@ -20,11 +20,75 @@ sys.path.insert( ...@@ -20,11 +20,75 @@ sys.path.insert(
import jinja2 import jinja2
from jinja2 import Environment, PackageLoader, select_autoescape from jinja2 import Environment, PackageLoader, select_autoescape
env = Environment(loader=PackageLoader('generate_schema_org_code', '')) env = Environment(loader=PackageLoader('generate_schema_org_code', ''))
env.trim_blocks = True
env.lstrip_blocks = True
# Prefix shared by every '@id' in the schema.org graph.
SCHEMA_ORG_PREFIX = 'http://schema.org/'


def schema_org_id(object_name):
    """Return the full schema.org ID for a bare object name.

    Args:
        object_name: Name without the prefix, e.g. 'Thing'.

    Returns:
        The name with SCHEMA_ORG_PREFIX prepended, e.g.
        'http://schema.org/Thing'.
    """
    return SCHEMA_ORG_PREFIX + object_name
def object_name_from_id(schema_org_id): def object_name_from_id(the_id):
"""Get the object name from a schema.org ID.""" """Get the object name from a schema.org ID."""
return schema_org_id[len('http://schema.org/'):] return the_id[len(SCHEMA_ORG_PREFIX):]
def get_schema_obj(obj_id, schema):
    """Search the schema graph for an object with the given ID.

    Args:
        obj_id: The '@id' value to look for.
        schema: Parsed schema; its '@graph' entry is the list of objects
            that is searched.

    Returns:
        The matching object when exactly one object in the graph has that
        ID. None when there is no match or when the ID is ambiguous.
    """
    # Graph nodes without an '@id' can never match, so skip them instead of
    # letting a KeyError abort the whole lookup.
    matches = [
        obj for obj in schema['@graph']
        if '@id' in obj and obj['@id'] == obj_id
    ]
    return matches[0] if len(matches) == 1 else None
def get_root_type(the_class, schema):
    """Get the base type the class is descended from.

    Follows 'rdfs:subClassOf' links upwards until it reaches Thing, a
    DataType, or a class with no parent.

    Args:
        the_class: Object with an '@id' naming the class to resolve.
        schema: Parsed schema whose '@graph' is searched for definitions.

    Returns:
        The root class object. If the class cannot be found in the graph,
        the_class is returned unchanged. For a class with several parents
        the result is whatever get_schema_obj finds for Thing, which is None
        when the graph has no unique Thing entry.
    """
    class_obj = get_schema_obj(the_class['@id'], schema)
    if class_obj is None:
        return the_class

    if class_obj['@id'] == schema_org_id('Thing'):
        return class_obj

    # dict.has_key() no longer exists in Python 3; `in` works everywhere.
    if ('@type' in class_obj
            and schema_org_id('DataType') in class_obj['@type']):
        return class_obj

    if 'rdfs:subClassOf' in class_obj:
        parent = class_obj['rdfs:subClassOf']
        # All classes that use multiple inheritance are Thing type.
        if isinstance(parent, list):
            return get_schema_obj(schema_org_id('Thing'), schema)
        return get_root_type(parent, schema)

    return class_obj
def parse_property(prop, schema):
    """Parse out details about the property, including what type it can be.

    Args:
        prop: Property object from the schema graph. Its '@id' supplies the
            name, and its optional rangeIncludes entry lists allowed types.
        schema: Parsed schema, used to resolve each allowed type to its root.

    Returns:
        A dict with 'name' and 'thing_types' always present. 'thing_types'
        holds the '@id' of every allowed type rooted at Thing. A 'has_text',
        'has_date', 'has_time', 'has_boolean', 'has_number' or
        'has_date_time' key is set to True for each data type found; keys
        for absent data types are not added.
    """
    parsed_prop = {'name': object_name_from_id(prop['@id']), 'thing_types': []}

    range_key = schema_org_id('rangeIncludes')
    # dict.has_key() no longer exists in Python 3; `in` works everywhere.
    if range_key not in prop:
        return parsed_prop

    range_includes = prop[range_key]
    # A single allowed type is stored as a bare object rather than a list.
    if not isinstance(range_includes, list):
        range_includes = [range_includes]

    thing_id = schema_org_id('Thing')
    flag_for_data_type = {
        schema_org_id('Text'): 'has_text',
        schema_org_id('Date'): 'has_date',
        schema_org_id('Time'): 'has_time',
        schema_org_id('Boolean'): 'has_boolean',
        schema_org_id('Number'): 'has_number',
        schema_org_id('DateTime'): 'has_date_time',
    }

    for possible_type in range_includes:
        root_id = get_root_type(possible_type, schema)['@id']
        if root_id == thing_id:
            parsed_prop['thing_types'].append(possible_type['@id'])
        elif root_id in flag_for_data_type:
            parsed_prop[flag_for_data_type[root_id]] = True

    return parsed_prop
def get_template_vars(schema_file_path): def get_template_vars(schema_file_path):
...@@ -38,11 +102,10 @@ def get_template_vars(schema_file_path): ...@@ -38,11 +102,10 @@ def get_template_vars(schema_file_path):
if thing['@type'] == 'rdfs:Class': if thing['@type'] == 'rdfs:Class':
template_vars['entities'].append(object_name_from_id(thing['@id'])) template_vars['entities'].append(object_name_from_id(thing['@id']))
elif thing['@type'] == 'rdf:Property': elif thing['@type'] == 'rdf:Property':
template_vars['properties'].append( template_vars['properties'].append(parse_property(thing, schema))
object_name_from_id(thing['@id']))
template_vars['entities'].sort() template_vars['entities'].sort()
template_vars['properties'].sort() template_vars['properties'].sort(key=lambda p: p['name'])
return template_vars return template_vars
...@@ -50,7 +113,7 @@ def get_template_vars(schema_file_path): ...@@ -50,7 +113,7 @@ def get_template_vars(schema_file_path):
def generate_file(file_name, template_file, template_vars): def generate_file(file_name, template_file, template_vars):
"""Generate and write file given a template and variables to render.""" """Generate and write file given a template and variables to render."""
template_vars['header_file'] = os.path.basename( template_vars['header_file'] = os.path.basename(
template_file[template_file.index('.')]) template_file[:template_file.index('.')])
template_vars['header_guard'] = template_vars['header_file'].upper() + '_H' template_vars['header_guard'] = template_vars['header_file'].upper() + '_H'
with open(file_name, 'w') as f: with open(file_name, 'w') as f:
f.write(env.get_template(template_file).render(template_vars)) f.write(env.get_template(template_file).render(template_vars))
......
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/schema_org/schema_org_entity_names.h"
#include "components/schema_org/schema_org_property_configurations.h"
#include "components/schema_org/schema_org_property_names.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"
namespace schema_org {

// Generated entity constants hold the bare schema.org type name.
TEST(GenerateSchemaOrgTest, EntityName) {
  EXPECT_STREQ(entity::kAboutPage, "AboutPage");
}

// Generated property constants hold the bare schema.org property name.
TEST(GenerateSchemaOrgTest, PropertyName) {
  EXPECT_STREQ(property::kAcceptedAnswer, "acceptedAnswer");
}

TEST(GenerateSchemaOrgCodeTest, GetPropertyConfigurationSetsText) {
  const property::PropertyConfiguration config =
      property::GetPropertyConfiguration(property::kAccessCode);
  EXPECT_TRUE(config.text);
}

TEST(GenerateSchemaOrgCodeTest, GetPropertyConfigurationSetsDate) {
  const property::PropertyConfiguration config =
      property::GetPropertyConfiguration(property::kBirthDate);
  EXPECT_TRUE(config.date);
}

TEST(GenerateSchemaOrgCodeTest, GetPropertyConfigurationSetsTime) {
  const property::PropertyConfiguration config =
      property::GetPropertyConfiguration(property::kCloses);
  EXPECT_TRUE(config.time);
}

TEST(GenerateSchemaOrgCodeTest, GetPropertyConfigurationSetsDateTime) {
  const property::PropertyConfiguration config =
      property::GetPropertyConfiguration(property::kCoverageStartTime);
  EXPECT_TRUE(config.date_time);
}

TEST(GenerateSchemaOrgCodeTest, GetPropertyConfigurationSetsNumber) {
  const property::PropertyConfiguration config =
      property::GetPropertyConfiguration(property::kDownvoteCount);
  EXPECT_TRUE(config.number);
}

// Thing-rooted range types are reported by their full schema.org ID.
TEST(GenerateSchemaOrgCodeTest, GetPropertyConfigurationSetsThingType) {
  const property::PropertyConfiguration config =
      property::GetPropertyConfiguration(property::kAcceptedPaymentMethod);
  EXPECT_THAT(config.thing_types,
              testing::UnorderedElementsAre("http://schema.org/LoanOrCredit",
                                            "http://schema.org/PaymentMethod"));
}

// A property may accept both a data type and a Thing-rooted type.
TEST(GenerateSchemaOrgCodeTest, GetPropertyConfigurationSetsMultipleTypes) {
  const property::PropertyConfiguration config =
      property::GetPropertyConfiguration(property::kIdentifier);
  EXPECT_TRUE(config.text);
  EXPECT_THAT(config.thing_types,
              testing::UnorderedElementsAre("http://schema.org/PropertyValue"));
}

}  // namespace schema_org
...@@ -6,11 +6,11 @@ ...@@ -6,11 +6,11 @@
import sys import sys
import unittest import unittest
import generate_schema_org_code import generate_schema_org_code
from generate_schema_org_code import schema_org_id
import os import os
SRC = os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir) SRC = os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)
sys.path.append(os.path.join(SRC, 'third_party', 'pymock')) sys.path.append(os.path.join(SRC, 'third_party', 'pymock'))
import mock import mock
_current_dir = os.path.dirname(os.path.realpath(__file__)) _current_dir = os.path.dirname(os.path.realpath(__file__))
...@@ -42,9 +42,76 @@ class GenerateSchemaOrgCodeTest(unittest.TestCase): ...@@ -42,9 +42,76 @@ class GenerateSchemaOrgCodeTest(unittest.TestCase):
self.assertEqual( self.assertEqual(
generate_schema_org_code.get_template_vars(m_open), { generate_schema_org_code.get_template_vars(m_open), {
'entities': ['MediaObject'], 'entities': ['MediaObject'],
'properties': ['propertyName'] 'properties': [{
'name': 'propertyName',
'thing_types': []
}]
}) })
def test_get_root_type_thing(self):
    """A class two levels below Thing resolves to the Thing object."""
    root = {'@id': schema_org_id('Thing')}
    middle = {'@id': schema_org_id('Intangible'), 'rdfs:subClassOf': root}
    leaf = {
        '@id': schema_org_id('StructuredValue'),
        'rdfs:subClassOf': middle,
    }
    graph = {'@graph': [root, middle, leaf]}

    resolved = generate_schema_org_code.get_root_type(leaf, graph)

    self.assertEqual(resolved, root)
def test_get_root_type_datatype(self):
    """A subclass of a DataType resolves to that DataType."""
    data_type = {
        '@id': schema_org_id('Text'),
        '@type': [schema_org_id('DataType'), 'rdfs:Class'],
    }
    derived = {'@id': schema_org_id('URL'), 'rdfs:subClassOf': data_type}
    graph = {'@graph': [derived, data_type]}

    resolved = generate_schema_org_code.get_root_type(derived, graph)

    self.assertEqual(resolved, data_type)
def test_parse_property_identifier(self):
    """A property ranging over a Thing subtype and text types reports both."""

    def subclass(name, parent):
        # Build a class object that descends from a single parent.
        return {'@id': schema_org_id(name), 'rdfs:subClassOf': parent}

    thing = {'@id': schema_org_id('Thing')}
    intangible = subclass('Intangible', thing)
    structured_value = subclass('StructuredValue', intangible)
    property_value = subclass('PropertyValue', structured_value)

    text = {
        '@id': schema_org_id('Text'),
        '@type': [schema_org_id('DataType'), 'rdfs:Class'],
    }
    url = subclass('URL', text)

    identifier = {
        '@id': schema_org_id('Identifier'),
        schema_org_id('rangeIncludes'): [property_value, url, text],
    }
    graph = {
        '@graph': [
            thing, intangible, structured_value, property_value, text, url,
            identifier
        ]
    }

    parsed = generate_schema_org_code.parse_property(identifier, graph)

    expected = {
        'name': 'Identifier',
        'has_text': True,
        'thing_types': [property_value['@id']],
    }
    self.assertEqual(parsed, expected)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -5,13 +5,13 @@ ...@@ -5,13 +5,13 @@
// Generated by running //components/schema_org/generate_schema_org_code.py. // Generated by running //components/schema_org/generate_schema_org_code.py.
// Do not edit. // Do not edit.
#include components/schema_org/{{ header_file }}.h #include "components/schema_org/{{ header_file }}.h"
namespace schema_org { namespace schema_org {
namespace entity { namespace entity {
{% for entity in entities %} {% for entity in entities %}
char k{{entity}}[] = "{{entity}}"; const char k{{entity[0]|upper}}{{entity[1:]}}[] = "{{entity}}";
{% endfor %} {% endfor %}
} // entity } // entity
......
...@@ -14,7 +14,7 @@ namespace schema_org { ...@@ -14,7 +14,7 @@ namespace schema_org {
namespace entity { namespace entity {
{% for entity in entities %} {% for entity in entities %}
extern const char k{{entity}}[]; extern const char k{{entity[0]|upper}}{{entity[1:]}}[];
{% endfor %} {% endfor %}
} // namespace entity } // namespace entity
......
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Generated by running //components/schema_org/generate_schema_org_code.py.
// Do not edit.
#include "components/schema_org/{{ header_file }}.h"
namespace schema_org {
namespace property {
// Returns the configuration for the schema.org property called |name|.
//
// The template expands to one `if (name == "...")` comparison per entry in
// |properties|, checked in order. Each flag is emitted as `true` only when the
// matching has_* template variable is truthy for that property, and
// |thing_types| is filled with the strings from the property's thing_types
// list.
PropertyConfiguration GetPropertyConfiguration(const std::string& name) {
{% for property in properties %}
if (name == "{{property.name}}") {
return {
.text = {{'true' if property.has_text else 'false'}},
.date = {{'true' if property.has_date else 'false'}},
.time = {{'true' if property.has_time else 'false'}},
.date_time = {{'true' if property.has_date_time else 'false'}},
.number = {{'true' if property.has_number else 'false'}},
.thing_types = {
{% for thing_type in property.thing_types %}
"{{thing_type}}",
{% endfor %}
}
};
}
{% endfor %}
// No property matched |name|: return an empty-brace-initialized configuration
// (every flag false, no thing types).
return { };
}
} // namespace property
} // namespace schema_org
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Generated by running //components/schema_org/generate_schema_org_code.py.
// Do not edit.
#ifndef SCHEMA_ORG_PROPERTY_{{ header_guard | upper }}
#define SCHEMA_ORG_PROPERTY_{{ header_guard | upper }}
#include <set>
#include <string>
namespace schema_org {
namespace property {
// Describes which kinds of value a schema.org property may hold. The values
// are filled in by the generated GetPropertyConfiguration() implementation.
//
// NOTE(review): the generator script also records a `has_boolean` flag for
// properties, but there is no corresponding member here - confirm whether
// that omission is intentional.
struct PropertyConfiguration {
// Set when the generator marked the property as accepting Text.
bool text;
// Set when the generator marked the property as accepting Date.
bool date;
// Set when the generator marked the property as accepting Time.
bool time;
// Set when the generator marked the property as accepting DateTime.
bool date_time;
// Set when the generator marked the property as accepting Number.
bool number;
// IDs of the accepted types that the generator resolved to Thing, e.g.
// "http://schema.org/PropertyValue".
std::set<std::string> thing_types;
};
// Looks up the configuration for the property called |name|.
PropertyConfiguration GetPropertyConfiguration(const std::string& name);
} // namespace property
} // namespace schema_org
#endif // SCHEMA_ORG_PROPERTY_{{ header_guard | upper }}
...@@ -5,13 +5,13 @@ ...@@ -5,13 +5,13 @@
// Generated by running //components/schema_org/generate_schema_org_code.py. // Generated by running //components/schema_org/generate_schema_org_code.py.
// Do not edit. // Do not edit.
#include components/schema_org/{{ header_file }}.h #include "components/schema_org/{{ header_file }}.h"
namespace schema_org { namespace schema_org {
namespace property { namespace property {
{% for property in properties %} {% for property in properties %}
char k{{property | capitalize}}[] = "{{property}}"; const char k{{property.name[0]|upper}}{{property.name[1:]}}[] = "{{property.name}}";
{% endfor %} {% endfor %}
} // property } // property
......
...@@ -14,8 +14,8 @@ namespace schema_org { ...@@ -14,8 +14,8 @@ namespace schema_org {
namespace property { namespace property {
{% for property in properties %} {% for property in properties %}
extern const char k{{property}}[]; extern const char k{{property.name[0]|upper}}{{property.name[1:]}}[];
{% endfor %} {% endfor %}
} // namespace property } // namespace property
} // namespace schema_org } // namespace schema_org
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment