Commit a49d62c7, authored by Robert Sesek, committed by Commit Bot

Use the data_decoder service in TemplateURLParser.

This requires making the TemplateURLParser be asynchronous rather than
directly returning the result. That has a ripple effect of changing the
lifetimes of some of the parameters to Parse(), such as the
SearchTermsData.

The Firefox importer also uses the TemplateURLParser (although it may
be entirely broken, per https://crbug.com/868768). The importer assumes
that all operations are synchronous, so this adds an internal helper
class to manage the now-asynchronous state for TemplateURL parsing.

Bug: 699342
Change-Id: I311d9e29dbbca34a4f5696b251a0fbaaadcc506b
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1879973
Commit-Queue: Robert Sesek <rsesek@chromium.org>
Reviewed-by: Ilya Sherman <isherman@chromium.org>
Reviewed-by: Martin Barbella <mbarbella@chromium.org>
Reviewed-by: Nico Weber <thakis@chromium.org>
Reviewed-by: Kevin Bailey <krb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#712773}
parent 2aa1c80a
......@@ -5580,6 +5580,7 @@ static_library("test_support") {
"//content/test:test_support",
"//google_apis:test_support",
"//net:test_support",
"//services/data_decoder/public/cpp:test_support",
"//services/preferences/public/cpp/tracked:test_support",
"//skia",
"//testing/gmock",
......
......@@ -21,6 +21,7 @@
#include "components/search_engines/template_url.h"
#include "components/search_engines/template_url_parser.h"
#include "components/search_engines/template_url_prepopulate_data.h"
#include "services/data_decoder/public/cpp/data_decoder.h"
#include "ui/base/l10n/l10n_util.h"
#include <iterator>
......@@ -92,53 +93,95 @@ std::unique_ptr<TemplateURL> CreateTemplateURL(const base::string16& url,
return std::make_unique<TemplateURL>(data);
}
// Parses the OpenSearch XML files in |xml_files| and populates |search_engines|
// with the resulting TemplateURLs.
void ParseSearchEnginesFromFirefoxXMLData(
const std::vector<std::string>& xml_data,
TemplateURLService::OwnedTemplateURLVector* search_engines) {
DCHECK(search_engines);
} // namespace
std::map<std::string, std::unique_ptr<TemplateURL>> search_engine_for_url;
// The first XML file represents the default search engine in Firefox 3, so we
// need to keep it on top of the list.
auto default_turl = search_engine_for_url.end();
for (auto xml_iter = xml_data.begin(); xml_iter != xml_data.end();
++xml_iter) {
std::unique_ptr<TemplateURL> template_url = TemplateURLParser::Parse(
UIThreadSearchTermsData(), xml_iter->data(), xml_iter->length(),
base::BindRepeating(&FirefoxURLParameterFilter));
if (template_url) {
auto iter = search_engine_for_url.find(template_url->url());
if (iter == search_engine_for_url.end()) {
iter = search_engine_for_url
.insert(std::make_pair(template_url->url(),
std::move(template_url)))
.first;
} else {
// We have already found a search engine with the same URL. We give
// priority to the latest one found, as GetSearchEnginesXMLFiles()
// returns a vector with first Firefox default search engines and then
// the user's ones. We want to give priority to the user ones.
iter->second = std::move(template_url);
}
if (default_turl == search_engine_for_url.end())
default_turl = iter;
// When the Bridge receives the search engines XML data via
// SetFirefoxSearchEnginesXMLData(), this class is responsible for managing the
// asynchronous TemplateURL parsing operations. The Bridge generally operates
// synchronously, so this class manages the state and notifies the bridge when
// parsing is done.
class InProcessImporterBridge::SearchEnginesParser {
public:
// Starts parsing the |search_engines_xml_data| and will notify |bridge|
// upon completion.
SearchEnginesParser(const std::vector<std::string>& search_engines_xml_data,
InProcessImporterBridge* bridge)
: bridge_(bridge), data_decoder_(new data_decoder::DataDecoder()) {
DCHECK(!search_engines_xml_data.empty());
StartParse(search_engines_xml_data);
}
// Returns true if all the data have been parsed, false if the operation
// is still ongoing.
bool is_done() const { return is_done_; }
// If InProcessImporterBridge::NotifyEnded() is called before is_done()
// returns true, NotifyEnded() sets this flag so that it can be called back
// to complete the import.
void set_notify_ended_on_completion() { notify_ended_on_completion_ = true; }
private:
void StartParse(const std::vector<std::string>& search_engines_xml_data) {
const auto& last_item = search_engines_xml_data.end() - 1;
TemplateURLParser::ParameterFilter param_filter =
base::BindRepeating(&FirefoxURLParameterFilter);
for (auto it = search_engines_xml_data.begin();
it != search_engines_xml_data.end(); ++it) {
// Because all TemplateURLParser are handled by the same data_decoder_
// instance, the results will be returned FIFO.
// The SearchEnginesParser is owned by the InProcessImporterBridge,
// which is not deleted until NotifyEnded() is called, so using Unretained
// is safe.
TemplateURLParser::ParseWithDataDecoder(
data_decoder_.get(), &search_terms_data_, *it, param_filter,
base::BindOnce(&SearchEnginesParser::OnURLParsed,
base::Unretained(this), it == last_item));
}
}
// Put the results in the |search_engines| vector.
for (auto t_iter = search_engine_for_url.begin();
t_iter != search_engine_for_url.end(); ++t_iter) {
if (t_iter == default_turl)
search_engines->insert(search_engines->begin(),
std::move(default_turl->second));
else
search_engines->push_back(std::move(t_iter->second));
void OnURLParsed(bool is_last_item, std::unique_ptr<TemplateURL> url) {
if (url)
parsed_urls_.push_back(std::move(url));
if (is_last_item)
FinishParsing();
}
}
} // namespace
// Completes the parsing operation: marks it done, releases the DataDecoder,
// hands the accumulated TemplateURLs to the bridge and, if the bridge asked
// for it via set_notify_ended_on_completion(), re-issues NotifyEnded().
void FinishParsing() {
// Must be set before NotifyEnded() is called below: that method returns
// early (and only sets the deferral flag) while is_done() is still false.
is_done_ = true;
// Shut down the DataDecoder.
data_decoder_.reset();
// |parsed_urls_| is moved from here and must not be read afterwards.
bridge_->WriteSearchEngines(std::move(parsed_urls_));
// NotifyEnded() was already requested while parsing was in flight; finish it
// now. Keep this as the last statement: the comment in StartParse() says the
// bridge (which owns this object) is not deleted until NotifyEnded() is
// called.
if (notify_ended_on_completion_)
bridge_->NotifyEnded();
}
// Storage for the URLs. These are stored in the same order as the original
// |search_engines_xml_data|.
TemplateURLService::OwnedTemplateURLVector parsed_urls_;
InProcessImporterBridge* bridge_; // Weak, owns this.
// Set to true if the last search engine has been parsed.
bool is_done_ = false;
// Set to true if the ImporterBridge has been NotifyEnded() already but was
// waiting on this class to finish the import.
bool notify_ended_on_completion_ = false;
// Parameter for TemplateURLParser.
UIThreadSearchTermsData search_terms_data_;
// The DataDecoder instance that is shared amongst all the TemplateURLs being
// parsed.
std::unique_ptr<data_decoder::DataDecoder> data_decoder_;
DISALLOW_COPY_AND_ASSIGN(SearchEnginesParser);
};
InProcessImporterBridge::InProcessImporterBridge(
ProfileWriter* writer,
......@@ -186,10 +229,10 @@ void InProcessImporterBridge::SetKeywords(
void InProcessImporterBridge::SetFirefoxSearchEnginesXMLData(
const std::vector<std::string>& search_engine_data) {
TemplateURLService::OwnedTemplateURLVector search_engines;
ParseSearchEnginesFromFirefoxXMLData(search_engine_data, &search_engines);
writer_->AddKeywords(std::move(search_engines), true);
if (!search_engine_data.empty()) {
// SearchEnginesParser will call the Bridge back when it is done.
search_engines_.reset(new SearchEnginesParser(search_engine_data, this));
}
}
void InProcessImporterBridge::SetPasswordForm(
......@@ -228,6 +271,13 @@ void InProcessImporterBridge::NotifyItemEnded(importer::ImportItem item) {
}
void InProcessImporterBridge::NotifyEnded() {
  // Search engine parsing is asynchronous. While it is still in flight, do
  // not report the end of the import yet; flag the parser instead so that it
  // calls NotifyEnded() again once it has finished.
  const bool parse_in_flight = search_engines_ && !search_engines_->is_done();
  if (parse_in_flight) {
    search_engines_->set_notify_ended_on_completion();
    return;
  }

  host_->NotifyImportEnded();
}
......@@ -236,3 +286,35 @@ base::string16 InProcessImporterBridge::GetLocalizedString(int message_id) {
}
// Defined out of line, in this file, where SearchEnginesParser (held through
// the |search_engines_| unique_ptr) is a complete type.
InProcessImporterBridge::~InProcessImporterBridge() = default;
// Deduplicates |template_urls| by URL and writes the result to |writer_|.
//
// |template_urls| is in the order the XML data was handed to
// SetFirefoxSearchEnginesXMLData(). When several entries share a URL the
// latest one wins. The engine matching the first entry's URL is placed at the
// front of the written list; the remaining engines follow in map (URL) order.
void InProcessImporterBridge::WriteSearchEngines(
    TemplateURL::OwnedTemplateURLVector template_urls) {
  // The first URL represents the default search engine in Firefox 3, so we
  // need to keep it on top of the list. Its key has to be captured *before*
  // the loop below, which moves every entry out of |template_urls| and leaves
  // null unique_ptrs behind; reading template_urls[0] afterwards would
  // dereference null.
  const bool has_default = !template_urls.empty();
  std::string default_key;
  if (has_default)
    default_key = template_urls.front()->url();

  std::map<std::string, std::unique_ptr<TemplateURL>> search_engine_for_url;
  for (auto& template_url : template_urls) {
    // Copy the key first: |template_url| is null once it has been moved.
    std::string key = template_url->url();
    // Give priority to the latest template URL that is found, as
    // GetSearchEnginesXMLFiles() returns a vector with first Firefox default
    // search engines and then the user's ones. The user ones should take
    // precedence.
    search_engine_for_url[key] = std::move(template_url);
  }

  auto default_turl = search_engine_for_url.end();
  if (has_default)
    default_turl = search_engine_for_url.find(default_key);

  // Put the results in the |search_engines| vector, default engine first.
  TemplateURLService::OwnedTemplateURLVector search_engines;
  search_engines.reserve(search_engine_for_url.size());
  for (auto it = search_engine_for_url.begin();
       it != search_engine_for_url.end(); ++it) {
    if (it == default_turl) {
      search_engines.insert(search_engines.begin(),
                            std::move(default_turl->second));
    } else {
      search_engines.push_back(std::move(it->second));
    }
  }

  writer_->AddKeywords(std::move(search_engines), true);
}
......@@ -60,10 +60,19 @@ class InProcessImporterBridge : public ImporterBridge {
// End ImporterBridge implementation.
private:
class SearchEnginesParser;
friend class SearchEnginesParser;
~InProcessImporterBridge() override;
// Called by the SearchEnginesParser when all the search engines have been
// parsed. The |template_urls| vector is in the same sort order that was
// passed to SetFirefoxSearchEnginesXMLData().
void WriteSearchEngines(TemplateURL::OwnedTemplateURLVector template_urls);
ProfileWriter* const writer_; // weak
const base::WeakPtr<ExternalProcessImporterHost> host_;
std::unique_ptr<SearchEnginesParser> search_engines_;
DISALLOW_COPY_AND_ASSIGN(InProcessImporterBridge);
};
......
......@@ -10,9 +10,11 @@
#include "base/path_service.h"
#include "base/stl_util.h"
#include "base/strings/utf_string_conversions.h"
#include "base/test/task_environment.h"
#include "chrome/common/chrome_paths.h"
#include "components/search_engines/search_terms_data.h"
#include "components/search_engines/template_url.h"
#include "services/data_decoder/public/cpp/test_support/in_process_data_decoder.h"
#include "testing/gtest/include/gtest/gtest.h"
using base::ASCIIToUTF16;
......@@ -39,15 +41,25 @@ class TemplateURLParserTest : public testing::Test {
void ParseFile(const std::string& file_name,
const TemplateURLParser::ParameterFilter& filter);
void ParseString(const std::string& data,
const TemplateURLParser::ParameterFilter& filter);
// ParseFile parses the results into this template_url.
std::unique_ptr<TemplateURL> template_url_;
private:
// Completion callback for the asynchronous parse started by ParseString().
// Stores the result in |template_url_| (null when parsing failed) and then
// runs |quit_closure| so the waiting RunLoop returns.
void OnTemplateURLParsed(base::OnceClosure quit_closure,
std::unique_ptr<TemplateURL> template_url) {
template_url_ = std::move(template_url);
// Quit only after the result has been stored, so the caller sees it.
std::move(quit_closure).Run();
}
base::FilePath osdd_dir_;
base::test::TaskEnvironment task_environment_;
data_decoder::test::InProcessDataDecoder data_decoder_;
};
TemplateURLParserTest::TemplateURLParserTest() {
}
TemplateURLParserTest::TemplateURLParserTest() {}
TemplateURLParserTest::~TemplateURLParserTest() {
}
......@@ -66,8 +78,19 @@ void TemplateURLParserTest::ParseFile(
std::string contents;
ASSERT_TRUE(base::ReadFileToString(full_path, &contents));
template_url_ = TemplateURLParser::Parse(SearchTermsData(), contents.data(),
contents.length(), filter);
ParseString(contents, filter);
}
// Parses |data| with |filter| and blocks until the asynchronous parser has
// delivered its result into |template_url_|.
void TemplateURLParserTest::ParseString(
    const std::string& data,
    const TemplateURLParser::ParameterFilter& filter) {
  base::RunLoop run_loop;
  SearchTermsData search_terms_data;

  // OnTemplateURLParsed() stores the result and then quits |run_loop|.
  auto on_parsed =
      base::BindOnce(&TemplateURLParserTest::OnTemplateURLParsed,
                     base::Unretained(this), run_loop.QuitClosure());
  TemplateURLParser::Parse(&search_terms_data, data, filter,
                           std::move(on_parsed));

  // |search_terms_data| is passed by pointer, so it has to outlive the parse;
  // spinning the loop here keeps it alive until the callback has run.
  run_loop.Run();
}
// Actual tests ---------------------------------------------------------------
......@@ -249,6 +272,5 @@ TEST_F(TemplateURLParserTest, InvalidInput) {
</Url>
</OpenSearchDescription>
)";
TemplateURLParser::Parse(SearchTermsData(), char_data, base::size(char_data),
filter);
ParseString(char_data, filter);
}
......@@ -15,6 +15,7 @@
#include "components/search_engines/template_url.h"
#include "components/search_engines/template_url_data.h"
#include "components/search_engines/template_url_service_observer.h"
#include "services/data_decoder/public/cpp/test_support/in_process_data_decoder.h"
class KeywordWebDataService;
class TemplateURLService;
......@@ -83,6 +84,7 @@ class TemplateURLServiceTestUtil : public TemplateURLServiceObserver {
base::string16 search_term_;
scoped_refptr<KeywordWebDataService> web_data_service_;
std::unique_ptr<TemplateURLService> model_;
data_decoder::test::InProcessDataDecoder data_decoder_;
DISALLOW_COPY_AND_ASSIGN(TemplateURLServiceTestUtil);
};
......
......@@ -48,6 +48,7 @@ static_library("utility") {
"//media",
"//net:net_with_v8",
"//printing/buildflags",
"//services/data_decoder:lib",
"//services/network:network_service",
"//services/service_manager/public/cpp",
"//skia",
......@@ -100,7 +101,6 @@ static_library("utility") {
"//chrome/common:mojo_bindings",
"//chrome/common/importer:interfaces",
"//components/autofill/core/common",
"//services/data_decoder:lib",
"//services/proxy_resolver:lib",
]
}
......
......@@ -70,9 +70,9 @@ static_library("search_engines") {
"//components/variations",
"//google_apis",
"//net",
"//services/data_decoder/public/cpp",
"//services/network/public/cpp",
"//sql",
"//third_party/libxml", # https://crbug.com/699342
"//third_party/metrics_proto",
"//ui/base",
"//ui/gfx",
......
......@@ -15,8 +15,8 @@ include_rules = [
"+components/variations",
"+components/webdata",
"+google_apis",
"+libxml",
"+net",
"+services/data_decoder/public",
"+services/network/public/cpp",
"+services/network/test",
"+sql",
......
......@@ -80,6 +80,7 @@ class TemplateURLFetcher::RequestDelegate {
base::string16 keyword() const { return keyword_; }
private:
void OnTemplateURLParsed(std::unique_ptr<TemplateURL> template_url);
void OnLoaded();
void AddSearchProvider();
......@@ -140,6 +141,25 @@ TemplateURLFetcher::RequestDelegate::RequestDelegate(
50000 /* max_body_size */);
}
// Receives the result of the asynchronous OpenSearch description parse.
// A null or non-replaceable TemplateURL ends the request; otherwise the
// provider is added as soon as the TemplateURLService model is loaded.
void TemplateURLFetcher::RequestDelegate::OnTemplateURLParsed(
    std::unique_ptr<TemplateURL> template_url) {
  template_url_ = std::move(template_url);

  const bool is_usable =
      template_url_ &&
      template_url_->url_ref().SupportsReplacement(
          fetcher_->template_url_service_->search_terms_data());
  if (!is_usable) {
    fetcher_->RequestCompleted(this);
    // WARNING: RequestCompleted deletes us.
    return;
  }

  // Wait for the model to be loaded before adding the provider.
  if (fetcher_->template_url_service_->loaded()) {
    AddSearchProvider();
    // WARNING: AddSearchProvider deletes us.
  }
}
void TemplateURLFetcher::RequestDelegate::OnLoaded() {
template_url_subscription_.reset();
if (!template_url_)
......@@ -158,23 +178,11 @@ void TemplateURLFetcher::RequestDelegate::OnSimpleLoaderComplete(
return;
}
template_url_ = TemplateURLParser::Parse(
fetcher_->template_url_service_->search_terms_data(),
response_body->data(), response_body->length(),
TemplateURLParser::ParameterFilter());
if (!template_url_ ||
!template_url_->url_ref().SupportsReplacement(
fetcher_->template_url_service_->search_terms_data())) {
fetcher_->RequestCompleted(this);
// WARNING: RequestCompleted deletes us.
return;
}
// Wait for the model to be loaded before adding the provider.
if (!fetcher_->template_url_service_->loaded())
return;
AddSearchProvider();
// WARNING: AddSearchProvider deletes us.
TemplateURLParser::Parse(
&fetcher_->template_url_service_->search_terms_data(),
*response_body.get(), TemplateURLParser::ParameterFilter(),
base::BindOnce(&RequestDelegate::OnTemplateURLParsed,
base::Unretained(this)));
}
void TemplateURLFetcher::RequestDelegate::AddSearchProvider() {
......
......@@ -7,29 +7,26 @@
#include <string.h>
#include <algorithm>
#include <map>
#include <memory>
#include <vector>
#include "base/bind.h"
#include "base/logging.h"
#include "base/macros.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "base/values.h"
#include "components/search_engines/search_terms_data.h"
#include "components/search_engines/template_url.h"
#include "libxml/parser.h"
#include "libxml/xmlwriter.h"
#include "services/data_decoder/public/cpp/data_decoder.h"
#include "services/data_decoder/public/cpp/safe_xml_parser.h"
#include "services/data_decoder/public/mojom/xml_parser.mojom.h"
#include "ui/gfx/favicon_size.h"
#include "url/gurl.h"
#include "url/url_constants.h"
namespace {
// NOTE: libxml uses the UTF-8 encoding. As 0-127 of UTF-8 corresponds
// to that of char, the following names are all in terms of char. This avoids
// having to convert to wide, then do comparisons.
// Defines for element names of the OSD document:
const char kURLElement[] = "Url";
const char kParamElement[] = "Param";
......@@ -56,10 +53,6 @@ const char kHTMLType[] = "text/html";
// Mime type for as you type suggestions.
const char kSuggestionType[] = "application/x-suggestions+json";
std::string XMLCharToString(const xmlChar* value) {
return std::string(reinterpret_cast<const char*>(value));
}
// Returns true if input_encoding contains a valid input encoding string. This
// doesn't verify that we have a valid encoding for the string, just that the
// string contains characters that constitute a valid input encoding.
......@@ -101,217 +94,292 @@ bool IsHTTPRef(const std::string& url) {
gurl.SchemeIs(url::kHttpsScheme));
}
} // namespace
// TemplateURLParsingContext --------------------------------------------------
// To minimize memory overhead while parsing, a SAX style parser is used.
// TemplateURLParsingContext is used to maintain the state we're in the document
// while parsing.
class TemplateURLParsingContext {
// SafeTemplateURLParser takes the output of the data_decoder service's
// XmlParser and extracts the data from the search description into a
// TemplateURL.
class SafeTemplateURLParser {
public:
// Enum of the known element types.
enum ElementType {
UNKNOWN,
OPEN_SEARCH_DESCRIPTION,
URL,
PARAM,
SHORT_NAME,
IMAGE,
INPUT_ENCODING,
ALIAS,
};
enum Method {
GET,
POST
};
// Key/value of a Param node.
typedef std::pair<std::string, std::string> Param;
using Param = std::pair<std::string, std::string>;
explicit TemplateURLParsingContext(
const TemplateURLParser::ParameterFilter& parameter_filter);
SafeTemplateURLParser(
const SearchTermsData* search_terms_data,
const TemplateURLParser::ParameterFilter& parameter_filter,
TemplateURLParser::ParseCallback callback)
: search_terms_data_(search_terms_data),
parameter_filter_(parameter_filter),
callback_(std::move(callback)) {}
static void StartElementImpl(void* ctx,
const xmlChar* name,
const xmlChar** atts);
static void EndElementImpl(void* ctx, const xmlChar* name);
static void CharactersImpl(void* ctx, const xmlChar* ch, int len);
SafeTemplateURLParser(const SafeTemplateURLParser&) = delete;
SafeTemplateURLParser& operator=(const SafeTemplateURLParser&) = delete;
// Returns a TemplateURL representing the result of parsing. This will be
// null if parsing failed or if the results were invalid for some reason (e.g.
// the resulting URL was not HTTP[S], a name wasn't supplied, a resulting
// TemplateURLRef was invalid, etc.).
std::unique_ptr<TemplateURL> GetTemplateURL(
const SearchTermsData& search_terms_data);
// Parse callback for DataDecoder::ParseXml(). This calls the callback
// passed to the constructor upon completion.
void OnXmlParseComplete(
data_decoder::DataDecoder::ValueOrError value_or_error);
private:
// Key is UTF8 encoded.
typedef std::map<std::string, ElementType> ElementNameToElementTypeMap;
void ParseURLs(const std::vector<const base::Value*>& urls);
void ParseImages(const std::vector<const base::Value*>& images);
void ParseEncodings(const std::vector<const base::Value*>& encodings);
void ParseAliases(const std::vector<const base::Value*>& aliases);
static void InitMapping();
std::unique_ptr<TemplateURL> FinalizeTemplateURL();
void ParseURL(const xmlChar** atts);
void ParseImage(const xmlChar** atts);
void ParseParam(const xmlChar** atts);
void ProcessURLParams();
// Returns the current ElementType.
ElementType GetKnownType();
static ElementNameToElementTypeMap* kElementNameToElementTypeMap;
// Returns all child elements of |elem| named |tag|, which are searched
// for using the XML qualified namespaces in |namespaces_|.
bool GetChildElementsByTag(const base::Value& elem,
const std::string& tag,
std::vector<const base::Value*>* children);
// Data that gets updated as we parse, and is converted to a TemplateURL by
// GetTemplateURL().
// FinalizeTemplateURL().
TemplateURLData data_;
std::vector<ElementType> elements_;
bool image_is_valid_for_favicon_;
// Character content for the current element.
base::string16 string_;
const TemplateURLParser::ParameterFilter& parameter_filter_;
// The list of parameters parsed in the Param nodes of a Url node.
std::vector<Param> extra_params_;
// The HTTP methods used.
Method method_;
Method suggestion_method_;
// If true, we are currently parsing a suggest URL, otherwise it is an HTML
// search. Note that we don't need a stack as URL nodes cannot be nested.
bool is_suggest_url_;
Method method_ = GET;
Method suggestion_method_ = GET;
// If true, the user has set a keyword and we should use it. Otherwise,
// we generate a keyword based on the URL.
bool has_custom_keyword_;
bool has_custom_keyword_ = false;
// Whether we should derive the image from the URL (when images are data
// URLs).
bool derive_image_from_url_;
bool derive_image_from_url_ = false;
// The XML namespaces that were declared on the root element. These are used
// to search for tags by name in GetChildElementsByTag(). Will always contain
// at least one element, if only the empty string.
std::vector<std::string> namespaces_;
DISALLOW_COPY_AND_ASSIGN(TemplateURLParsingContext);
const SearchTermsData* search_terms_data_;
TemplateURLParser::ParameterFilter parameter_filter_;
TemplateURLParser::ParseCallback callback_;
};
// static
TemplateURLParsingContext::ElementNameToElementTypeMap*
TemplateURLParsingContext::kElementNameToElementTypeMap = nullptr;
TemplateURLParsingContext::TemplateURLParsingContext(
const TemplateURLParser::ParameterFilter& parameter_filter)
: image_is_valid_for_favicon_(false),
parameter_filter_(parameter_filter),
method_(GET),
suggestion_method_(GET),
is_suggest_url_(false),
has_custom_keyword_(false),
derive_image_from_url_(false) {
if (kElementNameToElementTypeMap == nullptr)
InitMapping();
}
void SafeTemplateURLParser::OnXmlParseComplete(
data_decoder::DataDecoder::ValueOrError value_or_error) {
if (value_or_error.error) {
DLOG(ERROR) << "Failed to parse XML: " << *value_or_error.error;
std::move(callback_).Run(nullptr);
return;
}
// static
void TemplateURLParsingContext::StartElementImpl(void* ctx,
const xmlChar* name,
const xmlChar** atts) {
// Remove the namespace from |name|, ex: os:Url -> Url.
std::string node_name(XMLCharToString(name));
size_t index = node_name.find_first_of(":");
if (index != std::string::npos)
node_name.erase(0, index + 1);
TemplateURLParsingContext* context =
reinterpret_cast<TemplateURLParsingContext*>(ctx);
context->elements_.push_back(
context->kElementNameToElementTypeMap->count(node_name) ?
(*context->kElementNameToElementTypeMap)[node_name] : UNKNOWN);
switch (context->GetKnownType()) {
case TemplateURLParsingContext::URL:
context->extra_params_.clear();
context->ParseURL(atts);
break;
case TemplateURLParsingContext::IMAGE:
context->ParseImage(atts);
break;
case TemplateURLParsingContext::PARAM:
context->ParseParam(atts);
break;
default:
break;
const base::Value& root = *value_or_error.value;
// Get the namespaces used in the XML document, which will be used
// to access nodes by tag name in GetChildElementsByTag().
if (const base::Value* namespaces =
root.FindDictKey(data_decoder::mojom::XmlParser::kNamespacesKey)) {
for (const auto& item : namespaces->DictItems()) {
namespaces_.push_back(item.first);
}
}
context->string_.clear();
if (namespaces_.empty())
namespaces_.push_back(std::string());
std::string root_tag;
if (!data_decoder::GetXmlElementTagName(root, &root_tag) ||
(root_tag != kOpenSearchDescriptionElement &&
root_tag != kFirefoxSearchDescriptionElement)) {
DLOG(ERROR) << "Unexpected root tag: " << root_tag;
std::move(callback_).Run(nullptr);
return;
}
// The only required element is the URL.
std::vector<const base::Value*> urls;
if (!GetChildElementsByTag(root, kURLElement, &urls)) {
std::move(callback_).Run(nullptr);
return;
}
ParseURLs(urls);
std::vector<const base::Value*> images;
if (GetChildElementsByTag(root, kImageElement, &images))
ParseImages(images);
std::vector<const base::Value*> encodings;
if (GetChildElementsByTag(root, kInputEncodingElement, &encodings))
ParseEncodings(encodings);
std::vector<const base::Value*> aliases;
if (GetChildElementsByTag(root, kAliasElement, &aliases))
ParseAliases(aliases);
std::vector<const base::Value*> short_names;
if (GetChildElementsByTag(root, kShortNameElement, &short_names)) {
std::string name;
if (data_decoder::GetXmlElementText(*short_names.back(), &name))
data_.SetShortName(base::UTF8ToUTF16(name));
}
std::move(callback_).Run(FinalizeTemplateURL());
}
// static
void TemplateURLParsingContext::EndElementImpl(void* ctx, const xmlChar* name) {
TemplateURLParsingContext* context =
reinterpret_cast<TemplateURLParsingContext*>(ctx);
switch (context->GetKnownType()) {
case TemplateURLParsingContext::URL:
context->ProcessURLParams();
break;
case TemplateURLParsingContext::SHORT_NAME:
context->data_.SetShortName(context->string_);
break;
case TemplateURLParsingContext::IMAGE: {
GURL image_url(base::UTF16ToUTF8(context->string_));
if (image_url.SchemeIs(url::kDataScheme)) {
// TODO (jcampan): bug 1169256: when dealing with data URL, we need to
// decode the data URL in the renderer. For now, we'll just point to the
// favicon from the URL.
context->derive_image_from_url_ = true;
} else if (context->image_is_valid_for_favicon_ && image_url.is_valid() &&
(image_url.SchemeIs(url::kHttpScheme) ||
image_url.SchemeIs(url::kHttpsScheme))) {
context->data_.favicon_url = image_url;
}
context->image_is_valid_for_favicon_ = false;
break;
void SafeTemplateURLParser::ParseURLs(
const std::vector<const base::Value*>& urls) {
for (auto* url : urls) {
std::string template_url =
data_decoder::GetXmlElementAttribute(*url, kURLTemplateAttribute);
std::string type =
data_decoder::GetXmlElementAttribute(*url, kURLTypeAttribute);
bool is_post = base::LowerCaseEqualsASCII(
data_decoder::GetXmlElementAttribute(*url, kParamMethodAttribute),
"post");
bool is_html_url = (type == kHTMLType);
bool is_suggest_url = (type == kSuggestionType);
if (is_html_url && !template_url.empty()) {
data_.SetURL(template_url);
is_suggest_url = false;
if (is_post)
method_ = POST;
} else if (is_suggest_url) {
data_.suggestions_url = template_url;
if (is_post)
suggestion_method_ = POST;
}
case TemplateURLParsingContext::INPUT_ENCODING: {
std::string input_encoding = base::UTF16ToASCII(context->string_);
if (IsValidEncodingString(input_encoding))
context->data_.input_encodings.push_back(input_encoding);
break;
std::vector<Param> extra_params;
std::vector<const base::Value*> params;
GetChildElementsByTag(*url, kParamElement, &params);
for (auto* param : params) {
std::string key =
data_decoder::GetXmlElementAttribute(*param, kParamNameAttribute);
std::string value =
data_decoder::GetXmlElementAttribute(*param, kParamValueAttribute);
if (!key.empty() &&
(parameter_filter_.is_null() || parameter_filter_.Run(key, value))) {
extra_params.push_back(Param(key, value));
}
}
case TemplateURLParsingContext::ALIAS: {
if (!context->string_.empty()) {
context->data_.SetKeyword(context->string_);
context->has_custom_keyword_ = true;
if (!parameter_filter_.is_null() || !extra_params.empty()) {
GURL url(is_suggest_url ? data_.suggestions_url : data_.url());
if (!url.is_valid())
return;
// If there is a parameter filter, parse the existing URL and remove any
// unwanted parameter.
std::string new_query;
bool modified = false;
if (!parameter_filter_.is_null()) {
url::Component query = url.parsed_for_possibly_invalid_spec().query;
url::Component key, value;
const char* url_spec = url.spec().c_str();
while (url::ExtractQueryKeyValue(url_spec, &query, &key, &value)) {
std::string key_str(url_spec, key.begin, key.len);
std::string value_str(url_spec, value.begin, value.len);
if (parameter_filter_.Run(key_str, value_str)) {
AppendParamToQuery(key_str, value_str, &new_query);
} else {
modified = true;
}
}
}
if (!modified)
new_query = url.query();
// Add the extra parameters if any.
if (!extra_params.empty()) {
modified = true;
for (const auto& iter : extra_params)
AppendParamToQuery(iter.first, iter.second, &new_query);
}
if (modified) {
GURL::Replacements repl;
repl.SetQueryStr(new_query);
url = url.ReplaceComponents(repl);
if (is_suggest_url)
data_.suggestions_url = url.spec();
else if (url.is_valid())
data_.SetURL(url.spec());
}
break;
}
default:
break;
}
context->string_.clear();
context->elements_.pop_back();
}
// static
void TemplateURLParsingContext::CharactersImpl(void* ctx,
const xmlChar* ch,
int len) {
reinterpret_cast<TemplateURLParsingContext*>(ctx)->string_ +=
base::UTF8ToUTF16(
base::StringPiece(reinterpret_cast<const char*>(ch), len));
// Processes every <Image> element in |images|.
//
// An image with a data: URL only sets |derive_image_from_url_|. An image
// becomes |data_.favicon_url| when it is favicon-sized, has an icon MIME type
// and a valid http(s) URL. Later qualifying images overwrite earlier ones.
// Elements for which GetXmlElementText() fails are skipped.
void SafeTemplateURLParser::ParseImages(
    const std::vector<const base::Value*>& images) {
  for (auto* image : images) {
    std::string url_string;
    if (!data_decoder::GetXmlElementText(*image, &url_string))
      continue;

    std::string type =
        data_decoder::GetXmlElementAttribute(*image, kImageTypeAttribute);

    // The StringToInt() results are deliberately not checked; the values are
    // only compared against gfx::kFaviconSize below.
    int width = 0;
    int height = 0;
    base::StringToInt(
        data_decoder::GetXmlElementAttribute(*image, kImageWidthAttribute),
        &width);
    base::StringToInt(
        data_decoder::GetXmlElementAttribute(*image, kImageHeightAttribute),
        &height);

    // Computed afresh for each element, so there is nothing to reset at the
    // end of the iteration.
    const bool image_is_valid_for_favicon =
        (width == gfx::kFaviconSize) && (height == gfx::kFaviconSize) &&
        ((type == "image/x-icon") || (type == "image/vnd.microsoft.icon"));

    GURL image_url(url_string);
    if (image_url.SchemeIs(url::kDataScheme)) {
      // TODO(jcampan): bug 1169256: when dealing with data URL, we need to
      // decode the data URL in the renderer. For now, we'll just point to the
      // favicon from the URL.
      derive_image_from_url_ = true;
    } else if (image_is_valid_for_favicon && image_url.is_valid() &&
               (image_url.SchemeIs(url::kHttpScheme) ||
                image_url.SchemeIs(url::kHttpsScheme))) {
      data_.favicon_url = image_url;
    }
  }
}
std::unique_ptr<TemplateURL> TemplateURLParsingContext::GetTemplateURL(
const SearchTermsData& search_terms_data) {
// TODO(jcampan): Support engines that use POST; see http://crbug.com/18107
if (method_ == TemplateURLParsingContext::POST || !IsHTTPRef(data_.url()) ||
!IsHTTPRef(data_.suggestions_url))
// Appends the text of each <InputEncoding> element in |encodings| to
// |data_.input_encodings|, skipping elements without retrievable text and
// values that IsValidEncodingString() rejects.
void SafeTemplateURLParser::ParseEncodings(
    const std::vector<const base::Value*>& encodings) {
  for (const base::Value* element : encodings) {
    std::string text;
    const bool has_text = data_decoder::GetXmlElementText(*element, &text);
    if (!has_text || !IsValidEncodingString(text))
      continue;
    data_.input_encodings.push_back(text);
  }
}
// Uses the text of each <Alias> element in |aliases| as the keyword. Every
// element with retrievable text overwrites the keyword, so the last one wins,
// and |has_custom_keyword_| records that a keyword was supplied.
void SafeTemplateURLParser::ParseAliases(
    const std::vector<const base::Value*>& aliases) {
  for (const base::Value* element : aliases) {
    std::string text;
    if (!data_decoder::GetXmlElementText(*element, &text))
      continue;
    data_.SetKeyword(base::UTF8ToUTF16(text));
    has_custom_keyword_ = true;
  }
}
std::unique_ptr<TemplateURL> SafeTemplateURLParser::FinalizeTemplateURL() {
// TODO(https://crbug.com/18107): Support engines that use POST.
if (method_ == POST || !IsHTTPRef(data_.url()) ||
!IsHTTPRef(data_.suggestions_url)) {
DLOG(ERROR) << "POST URLs are not supported";
return nullptr;
if (suggestion_method_ == TemplateURLParsingContext::POST)
}
if (suggestion_method_ == POST)
data_.suggestions_url.clear();
// If the image was a data URL, use the favicon from the search URL instead.
// (see the TODO in EndElementImpl()).
// (see the TODO in ParseImages()).
GURL search_url(data_.url());
if (derive_image_from_url_ && data_.favicon_url.is_empty())
data_.favicon_url = TemplateURL::GenerateFaviconURL(search_url);
......@@ -329,189 +397,55 @@ std::unique_ptr<TemplateURL> TemplateURLParsingContext::GetTemplateURL(
std::unique_ptr<TemplateURL> template_url =
std::make_unique<TemplateURL>(data_);
if (template_url->url().empty() ||
!template_url->url_ref().IsValid(search_terms_data) ||
!template_url->url_ref().IsValid(*search_terms_data_) ||
(!template_url->suggestions_url().empty() &&
!template_url->suggestions_url_ref().IsValid(search_terms_data))) {
!template_url->suggestions_url_ref().IsValid(*search_terms_data_))) {
DLOG(ERROR) << "Template URL is not valid";
return nullptr;
}
return template_url;
}
// static
// Allocates |kElementNameToElementTypeMap| and fills it with the element-name
// to ElementType associations. The OpenSearch and the Firefox
// search-description element names both map to OPEN_SEARCH_DESCRIPTION.
void TemplateURLParsingContext::InitMapping() {
  kElementNameToElementTypeMap = new std::map<std::string, ElementType>;
  auto& types_by_name = *kElementNameToElementTypeMap;

  types_by_name[kURLElement] = URL;
  types_by_name[kParamElement] = PARAM;
  types_by_name[kShortNameElement] = SHORT_NAME;
  types_by_name[kImageElement] = IMAGE;
  types_by_name[kOpenSearchDescriptionElement] = OPEN_SEARCH_DESCRIPTION;
  types_by_name[kFirefoxSearchDescriptionElement] = OPEN_SEARCH_DESCRIPTION;
  types_by_name[kInputEncodingElement] = INPUT_ENCODING;
  types_by_name[kAliasElement] = ALIAS;
}
void TemplateURLParsingContext::ParseURL(const xmlChar** atts) {
if (!atts)
return;
std::string template_url;
bool is_post = false;
bool is_html_url = false;
bool is_suggest_url = false;
for (; *atts; atts += 2) {
std::string name(XMLCharToString(*atts));
const xmlChar* value = atts[1];
if (name == kURLTypeAttribute) {
std::string type = XMLCharToString(value);
is_html_url = (type == kHTMLType);
is_suggest_url = (type == kSuggestionType);
} else if (name == kURLTemplateAttribute) {
template_url = XMLCharToString(value);
} else if (name == kParamMethodAttribute) {
is_post = base::LowerCaseEqualsASCII(XMLCharToString(value), "post");
}
}
if (is_html_url && !template_url.empty()) {
data_.SetURL(template_url);
is_suggest_url_ = false;
if (is_post)
method_ = POST;
} else if (is_suggest_url) {
data_.suggestions_url = template_url;
is_suggest_url_ = true;
if (is_post)
suggestion_method_ = POST;
}
}
void TemplateURLParsingContext::ParseImage(const xmlChar** atts) {
if (!atts)
return;
int width = 0;
int height = 0;
std::string type;
for (; *atts; atts += 2) {
std::string name(XMLCharToString(*atts));
const xmlChar* value = atts[1];
if (name == kImageTypeAttribute) {
type = XMLCharToString(value);
} else if (name == kImageWidthAttribute) {
base::StringToInt(XMLCharToString(value), &width);
} else if (name == kImageHeightAttribute) {
base::StringToInt(XMLCharToString(value), &height);
}
}
image_is_valid_for_favicon_ = (width == gfx::kFaviconSize) &&
(height == gfx::kFaviconSize) &&
((type == "image/x-icon") || (type == "image/vnd.microsoft.icon"));
}
void TemplateURLParsingContext::ParseParam(const xmlChar** atts) {
if (!atts)
return;
std::string key, value;
for (; *atts; atts += 2) {
std::string name(XMLCharToString(*atts));
const xmlChar* val = atts[1];
if (name == kParamNameAttribute) {
key = XMLCharToString(val);
} else if (name == kParamValueAttribute) {
value = XMLCharToString(val);
}
bool SafeTemplateURLParser::GetChildElementsByTag(
const base::Value& elem,
const std::string& tag,
std::vector<const base::Value*>* children) {
bool result = false;
for (const auto& ns : namespaces_) {
std::string name = data_decoder::GetXmlQualifiedName(ns, tag);
result |=
data_decoder::GetAllXmlElementChildrenWithTag(elem, name, children);
}
if (!key.empty() &&
(parameter_filter_.is_null() || parameter_filter_.Run(key, value)))
extra_params_.push_back(Param(key, value));
return result;
}
void TemplateURLParsingContext::ProcessURLParams() {
if (parameter_filter_.is_null() && extra_params_.empty())
return;
GURL url(is_suggest_url_ ? data_.suggestions_url : data_.url());
if (!url.is_valid())
return;
// If there is a parameter filter, parse the existing URL and remove any
// unwanted parameter.
std::string new_query;
bool modified = false;
if (!parameter_filter_.is_null()) {
url::Component query = url.parsed_for_possibly_invalid_spec().query;
url::Component key, value;
const char* url_spec = url.spec().c_str();
while (url::ExtractQueryKeyValue(url_spec, &query, &key, &value)) {
std::string key_str(url_spec, key.begin, key.len);
std::string value_str(url_spec, value.begin, value.len);
if (parameter_filter_.Run(key_str, value_str)) {
AppendParamToQuery(key_str, value_str, &new_query);
} else {
modified = true;
}
}
}
if (!modified)
new_query = url.query();
// Add the extra parameters if any.
if (!extra_params_.empty()) {
modified = true;
for (std::vector<Param>::const_iterator iter(extra_params_.begin());
iter != extra_params_.end(); ++iter)
AppendParamToQuery(iter->first, iter->second, &new_query);
}
} // namespace
if (modified) {
GURL::Replacements repl;
repl.SetQueryStr(new_query);
url = url.ReplaceComponents(repl);
if (is_suggest_url_)
data_.suggestions_url = url.spec();
else if (url.is_valid())
data_.SetURL(url.spec());
}
}
// TemplateURLParser ----------------------------------------------------------
TemplateURLParsingContext::ElementType
TemplateURLParsingContext::GetKnownType() {
if (elements_.size() == 2 && elements_[0] == OPEN_SEARCH_DESCRIPTION)
return elements_[1];
// We only expect PARAM nodes under the URL node.
return (elements_.size() == 3 && elements_[0] == OPEN_SEARCH_DESCRIPTION &&
elements_[1] == URL && elements_[2] == PARAM) ? PARAM : UNKNOWN;
// static
// Hands |data| to DataDecoder::ParseXmlIsolated() and arranges for
// SafeTemplateURLParser::OnXmlParseComplete() to receive the outcome. The
// SafeTemplateURLParser built here is moved into the bound callback, which
// then owns it.
void TemplateURLParser::Parse(const SearchTermsData* search_terms_data,
                              const std::string& data,
                              const ParameterFilter& parameter_filter,
                              ParseCallback completion_callback) {
  auto parser = std::make_unique<SafeTemplateURLParser>(
      search_terms_data, parameter_filter, std::move(completion_callback));
  auto on_xml_parsed = base::BindOnce(
      &SafeTemplateURLParser::OnXmlParseComplete, std::move(parser));
  data_decoder::DataDecoder::ParseXmlIsolated(data, std::move(on_xml_parsed));
}
// TemplateURLParser ----------------------------------------------------------
// static
std::unique_ptr<TemplateURL> TemplateURLParser::Parse(
const SearchTermsData& search_terms_data,
const char* data,
size_t length,
const TemplateURLParser::ParameterFilter& param_filter) {
// xmlSubstituteEntitiesDefault(1) makes it so that &amp; isn't mapped to
// &#38; . Unfortunately xmlSubstituteEntitiesDefault affects global state.
// If this becomes problematic we'll need to provide our own entity
// type for &amp;, or strip out &#38; by hand after parsing.
int last_sub_entities_value = xmlSubstituteEntitiesDefault(1);
TemplateURLParsingContext context(param_filter);
xmlSAXHandler sax_handler;
memset(&sax_handler, 0, sizeof(sax_handler));
sax_handler.startElement = &TemplateURLParsingContext::StartElementImpl;
sax_handler.endElement = &TemplateURLParsingContext::EndElementImpl;
sax_handler.characters = &TemplateURLParsingContext::CharactersImpl;
int error = xmlSAXUserParseMemory(&sax_handler, &context, data,
static_cast<int>(length));
xmlSubstituteEntitiesDefault(last_sub_entities_value);
return error ? nullptr : context.GetTemplateURL(search_terms_data);
// Variant of Parse() that runs the XML parse on the caller-supplied
// |data_decoder| via its ParseXml() method. The SafeTemplateURLParser built
// here is moved into the bound callback, which then owns it.
void TemplateURLParser::ParseWithDataDecoder(
    data_decoder::DataDecoder* data_decoder,
    const SearchTermsData* search_terms_data,
    const std::string& data,
    const ParameterFilter& parameter_filter,
    ParseCallback completion_callback) {
  auto parser = std::make_unique<SafeTemplateURLParser>(
      search_terms_data, parameter_filter, std::move(completion_callback));
  auto on_xml_parsed = base::BindOnce(
      &SafeTemplateURLParser::OnXmlParseComplete, std::move(parser));
  data_decoder->ParseXml(data, std::move(on_xml_parsed));
}
......@@ -16,6 +16,10 @@
class SearchTermsData;
class TemplateURL;
// Forward declaration of the DataDecoder type that
// TemplateURLParser::ParseWithDataDecoder() accepts by pointer.
namespace data_decoder {
class DataDecoder;
}  // namespace data_decoder
// TemplateURLParser, as the name implies, handling reading of TemplateURLs
// from OpenSearch description documents.
class TemplateURLParser {
......@@ -27,19 +31,30 @@ class TemplateURLParser {
using ParameterFilter =
base::RepeatingCallback<bool(const std::string&, const std::string&)>;
using ParseCallback = base::OnceCallback<void(std::unique_ptr<TemplateURL>)>;
// Decodes the chunk of data representing a TemplateURL, creates the
// TemplateURL, and returns it. Returns null if the data does not describe a
// valid TemplateURL, the URLs referenced do not point to valid http/https
// resources, or for some other reason we do not support the described
// TemplateURL. |parameter_filter| can be used if you want to filter some
// parameters out of the URL. For example, when importing from another
// browser, we remove any parameter identifying that browser. If set to null,
// the URL is not modified.
static std::unique_ptr<TemplateURL> Parse(
const SearchTermsData& search_terms_data,
const char* data,
size_t length,
const ParameterFilter& parameter_filter);
// TemplateURL, and calls the |completion_callback| with the result. A null
// value is provided if the data does not describe a valid TemplateURL, the
// URLs referenced do not point to valid http/https resources, or for some
// other reason we do not support the described TemplateURL.
// |parameter_filter| can be used if you want to filter some parameters out
// of the URL. For example, when importing from another browser, we remove
// any parameter identifying that browser. If set to null, the URL is not
// modified.
static void Parse(const SearchTermsData* search_terms_data,
const std::string& data,
const ParameterFilter& parameter_filter,
ParseCallback completion_callback);
// The same as Parse(), but it allows the caller to manage the lifetime of
// the DataDecoder service. The |data_decoder| must be kept alive until the
// |completion_callback| is called.
// NOTE(review): |search_terms_data| is taken as a raw pointer and the result
// is delivered through |completion_callback| rather than returned, so it
// presumably must also remain valid until the callback runs; confirm against
// the implementation.
static void ParseWithDataDecoder(data_decoder::DataDecoder* data_decoder,
const SearchTermsData* search_terms_data,
const std::string& data,
const ParameterFilter& parameter_filter,
ParseCallback completion_callback);
private:
// No one should create one of these.
......
......@@ -31,11 +31,13 @@ source_set("cpp") {
public = [
"data_decoder.h",
"json_sanitizer.h",
"safe_xml_parser.h",
]
sources = [
"data_decoder.cc",
"json_sanitizer.cc",
"safe_xml_parser.cc",
]
configs += [ "//build/config/compiler:wexit_time_destructors" ]
......@@ -68,12 +70,10 @@ source_set("cpp") {
public += [
"decode_image.h",
"safe_bundled_exchanges_parser.h",
"safe_xml_parser.h",
]
sources += [
"decode_image.cc",
"safe_bundled_exchanges_parser.cc",
"safe_xml_parser.cc",
]
}
......
......@@ -67,6 +67,8 @@ fuzzer_test("template_url_parser_fuzzer") {
"//base",
"//base:i18n",
"//components/search_engines:search_engines",
"//services/data_decoder/public/cpp",
"//services/data_decoder/public/cpp:test_support",
"//third_party/libxml:libxml",
]
dict = "//third_party/libxml/fuzz/xml.dict"
......
......@@ -14,9 +14,12 @@
#include "base/bind.h"
#include "base/command_line.h"
#include "base/i18n/icu_util.h"
#include "base/run_loop.h"
#include "base/task/single_thread_task_executor.h"
#include "components/search_engines/search_terms_data.h"
#include "components/search_engines/template_url.h"
#include "components/search_engines/template_url_parser.h"
#include "services/data_decoder/public/cpp/test_support/in_process_data_decoder.h"
#include "testing/libfuzzer/libfuzzer_exports.h"
bool PseudoRandomFilter(std::mt19937* generator,
......@@ -45,7 +48,11 @@ void ignore(void* ctx, const char* msg, ...) {
class Env {
public:
Env() { xmlSetGenericErrorFunc(NULL, &ignore); }
Env() { xmlSetGenericErrorFunc(nullptr, &ignore); }
private:
base::SingleThreadTaskExecutor executor_;
data_decoder::test::InProcessDataDecoder data_decoder_;
};
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
......@@ -63,11 +70,22 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
// does not support 8 bit types on Windows.
std::uniform_int_distribution<uint16_t> pool(0, 1);
base::RunLoop run_loop;
SearchTermsData search_terms_data;
std::string string_data(reinterpret_cast<const char*>(params + 1), size);
TemplateURLParser::ParameterFilter filter =
base::BindRepeating(&PseudoRandomFilter, base::Unretained(&generator),
base::Unretained(&pool));
TemplateURLParser::Parse(&search_terms_data, string_data, filter,
base::BindOnce(
[](base::OnceClosure quit_closure,
std::unique_ptr<TemplateURL> ignored) {
std::move(quit_closure).Run();
},
run_loop.QuitClosure()));
run_loop.Run();
const char* char_data = reinterpret_cast<const char*>(params + 1);
TemplateURLParser::Parse(SearchTermsData(), char_data, size, filter);
return 0;
}
......@@ -141,7 +141,6 @@ static_library("libxml") {
":xml_reader",
":xml_writer",
":libxml_utils",
"//components/search_engines",
"//testing/libfuzzer/*",
"//third_party/blink/renderer/*",
"//third_party/fontconfig",
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment