Commit 2268810e authored by wychen, committed by Commit bot

Provide original URLs for next page detection in dom_distiller

The --original-url(s) option is used instead of --original-domain(s) in
the eval server.

Roll DomDistillerJS

Picked up changes:
8c015c1 Mock Window.Location.getHref() in PagingLinksFinderTest

BUG=425952

Review URL: https://codereview.chromium.org/887803002

Cr-Commit-Position: refs/heads/master@{#315908}
parent f735c7a3
...@@ -40,21 +40,21 @@ namespace dom_distiller { ...@@ -40,21 +40,21 @@ namespace dom_distiller {
namespace { namespace {
typedef base::hash_map<std::string, std::string> UrlToDomainMap; typedef base::hash_map<std::string, std::string> FileToUrlMap;
} }
// Factory for creating a Distiller that creates different DomDistillerOptions // Factory for creating a Distiller that creates different DomDistillerOptions
// for different URLs, i.e. a specific kOriginalDomain option for each URL. // for different URLs, i.e. a specific kOriginalUrl option for each URL.
class TestDistillerFactoryImpl : public DistillerFactory { class TestDistillerFactoryImpl : public DistillerFactory {
public: public:
TestDistillerFactoryImpl( TestDistillerFactoryImpl(
scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory, scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory,
const dom_distiller::proto::DomDistillerOptions& dom_distiller_options, const dom_distiller::proto::DomDistillerOptions& dom_distiller_options,
const UrlToDomainMap& url_to_domain_map) const FileToUrlMap& file_to_url_map)
: distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()), : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()),
dom_distiller_options_(dom_distiller_options), dom_distiller_options_(dom_distiller_options),
url_to_domain_map_(url_to_domain_map) { file_to_url_map_(file_to_url_map) {
} }
~TestDistillerFactoryImpl() override {} ~TestDistillerFactoryImpl() override {}
...@@ -62,8 +62,10 @@ class TestDistillerFactoryImpl : public DistillerFactory { ...@@ -62,8 +62,10 @@ class TestDistillerFactoryImpl : public DistillerFactory {
scoped_ptr<Distiller> CreateDistillerForUrl(const GURL& url) override { scoped_ptr<Distiller> CreateDistillerForUrl(const GURL& url) override {
dom_distiller::proto::DomDistillerOptions options; dom_distiller::proto::DomDistillerOptions options;
options = dom_distiller_options_; options = dom_distiller_options_;
UrlToDomainMap::const_iterator it = url_to_domain_map_.find(url.spec()); FileToUrlMap::const_iterator it = file_to_url_map_.find(url.spec());
if (it != url_to_domain_map_.end()) options.set_original_domain(it->second); if (it != file_to_url_map_.end()) {
options.set_original_url(it->second);
}
scoped_ptr<DistillerImpl> distiller(new DistillerImpl( scoped_ptr<DistillerImpl> distiller(new DistillerImpl(
*distiller_url_fetcher_factory_, options)); *distiller_url_fetcher_factory_, options));
return distiller.Pass(); return distiller.Pass();
...@@ -72,7 +74,7 @@ class TestDistillerFactoryImpl : public DistillerFactory { ...@@ -72,7 +74,7 @@ class TestDistillerFactoryImpl : public DistillerFactory {
private: private:
scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory_; scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory_;
dom_distiller::proto::DomDistillerOptions dom_distiller_options_; dom_distiller::proto::DomDistillerOptions dom_distiller_options_;
UrlToDomainMap url_to_domain_map_; FileToUrlMap file_to_url_map_;
}; };
namespace { namespace {
...@@ -99,12 +101,12 @@ const char* kExtractTextOnly = "extract-text-only"; ...@@ -99,12 +101,12 @@ const char* kExtractTextOnly = "extract-text-only";
// Indicates to include debug output. // Indicates to include debug output.
const char* kDebugLevel = "debug-level"; const char* kDebugLevel = "debug-level";
// The original domain of the page if |kUrlSwitch| is a file. // The original URL of the page if |kUrlSwitch| is a file.
const char* kOriginalDomain = "original-domain"; const char* kOriginalUrl = "original-url";
// A semi-colon-separated (i.e. ';') list of original domains corresponding to // A semi-colon-separated (i.e. ';') list of original URLs corresponding to
// "kUrlsSwitch". // "kUrlsSwitch".
const char* kOriginalDomains = "original-domains"; const char* kOriginalUrls = "original-urls";
// Maximum number of concurrent started extractor requests. // Maximum number of concurrent started extractor requests.
const int kMaxExtractorTasks = 8; const int kMaxExtractorTasks = 8;
...@@ -112,7 +114,7 @@ const int kMaxExtractorTasks = 8; ...@@ -112,7 +114,7 @@ const int kMaxExtractorTasks = 8;
scoped_ptr<DomDistillerService> CreateDomDistillerService( scoped_ptr<DomDistillerService> CreateDomDistillerService(
content::BrowserContext* context, content::BrowserContext* context,
const base::FilePath& db_path, const base::FilePath& db_path,
const UrlToDomainMap& url_to_domain_map) { const FileToUrlMap& file_to_url_map) {
scoped_refptr<base::SequencedTaskRunner> background_task_runner = scoped_refptr<base::SequencedTaskRunner> background_task_runner =
content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner( content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner(
content::BrowserThread::GetBlockingPool()->GetSequenceToken()); content::BrowserThread::GetBlockingPool()->GetSequenceToken());
...@@ -145,7 +147,7 @@ scoped_ptr<DomDistillerService> CreateDomDistillerService( ...@@ -145,7 +147,7 @@ scoped_ptr<DomDistillerService> CreateDomDistillerService(
scoped_ptr<DistillerFactory> distiller_factory( scoped_ptr<DistillerFactory> distiller_factory(
new TestDistillerFactoryImpl(distiller_url_fetcher_factory.Pass(), new TestDistillerFactoryImpl(distiller_url_fetcher_factory.Pass(),
options, options,
url_to_domain_map)); file_to_url_map));
// Setting up PrefService for DistilledPagePrefs. // Setting up PrefService for DistilledPagePrefs.
user_prefs::TestingPrefServiceSyncable* pref_service = user_prefs::TestingPrefServiceSyncable* pref_service =
...@@ -227,7 +229,7 @@ class ContentExtractionRequest : public ViewRequestDelegate { ...@@ -227,7 +229,7 @@ class ContentExtractionRequest : public ViewRequestDelegate {
static ScopedVector<ContentExtractionRequest> CreateForCommandLine( static ScopedVector<ContentExtractionRequest> CreateForCommandLine(
const base::CommandLine& command_line, const base::CommandLine& command_line,
UrlToDomainMap* url_to_domain_map) { FileToUrlMap* file_to_url_map) {
ScopedVector<ContentExtractionRequest> requests; ScopedVector<ContentExtractionRequest> requests;
if (command_line.HasSwitch(kUrlSwitch)) { if (command_line.HasSwitch(kUrlSwitch)) {
GURL url; GURL url;
...@@ -235,31 +237,32 @@ class ContentExtractionRequest : public ViewRequestDelegate { ...@@ -235,31 +237,32 @@ class ContentExtractionRequest : public ViewRequestDelegate {
url = GURL(url_string); url = GURL(url_string);
if (url.is_valid()) { if (url.is_valid()) {
requests.push_back(new ContentExtractionRequest(url)); requests.push_back(new ContentExtractionRequest(url));
if (command_line.HasSwitch(kOriginalDomain)) { if (command_line.HasSwitch(kOriginalUrl)) {
(*url_to_domain_map)[url.spec()] = (*file_to_url_map)[url.spec()] =
command_line.GetSwitchValueASCII(kOriginalDomain); command_line.GetSwitchValueASCII(kOriginalUrl);
} }
} }
} else if (command_line.HasSwitch(kUrlsSwitch)) { } else if (command_line.HasSwitch(kUrlsSwitch)) {
std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch); std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch);
std::vector<std::string> urls; std::vector<std::string> urls;
base::SplitString(urls_string, ' ', &urls); base::SplitString(urls_string, ' ', &urls);
// Check for original-domains switch, which must exactly pair up with // Check for original-urls switch, which must exactly pair up with
// |kUrlsSwitch| i.e. number of domains must be same as that of urls. // |kUrlsSwitch| i.e. number of original urls must be same as that of
std::vector<std::string> domains; // urls.
if (command_line.HasSwitch(kOriginalDomains)) { std::vector<std::string> original_urls;
std::string domains_string = if (command_line.HasSwitch(kOriginalUrls)) {
command_line.GetSwitchValueASCII( kOriginalDomains); std::string original_urls_string =
base::SplitString(domains_string, ';', &domains); command_line.GetSwitchValueASCII(kOriginalUrls);
if (domains.size() != urls.size()) domains.clear(); base::SplitString(original_urls_string, ' ', &original_urls);
if (original_urls.size() != urls.size()) original_urls.clear();
} }
for (size_t i = 0; i < urls.size(); ++i) { for (size_t i = 0; i < urls.size(); ++i) {
GURL url(urls[i]); GURL url(urls[i]);
if (url.is_valid()) { if (url.is_valid()) {
requests.push_back(new ContentExtractionRequest(url)); requests.push_back(new ContentExtractionRequest(url));
// Only regard non-empty domain. // Only regard non-empty original urls.
if (!domains.empty() && !domains[i].empty()) { if (!original_urls.empty() && !original_urls[i].empty()) {
(*url_to_domain_map)[url.spec()] = domains[i]; (*file_to_url_map)[url.spec()] = original_urls[i];
} }
} else { } else {
ADD_FAILURE() << "Bad url"; ADD_FAILURE() << "Bad url";
...@@ -320,14 +323,14 @@ class ContentExtractor : public ContentBrowserTest { ...@@ -320,14 +323,14 @@ class ContentExtractor : public ContentBrowserTest {
void Start() { void Start() {
const base::CommandLine& command_line = const base::CommandLine& command_line =
*base::CommandLine::ForCurrentProcess(); *base::CommandLine::ForCurrentProcess();
UrlToDomainMap url_to_domain_map; FileToUrlMap file_to_url_map;
requests_ = ContentExtractionRequest::CreateForCommandLine( requests_ = ContentExtractionRequest::CreateForCommandLine(
command_line, &url_to_domain_map); command_line, &file_to_url_map);
content::BrowserContext* context = content::BrowserContext* context =
shell()->web_contents()->GetBrowserContext(); shell()->web_contents()->GetBrowserContext();
service_ = CreateDomDistillerService(context, service_ = CreateDomDistillerService(context,
db_dir_.path(), db_dir_.path(),
url_to_domain_map); file_to_url_map);
PumpQueue(); PumpQueue();
} }
......
Name: dom-distiller-js Name: dom-distiller-js
URL: https://github.com/chromium/dom-distiller URL: https://github.com/chromium/dom-distiller
Version: 9deb1c6369 Version: 8c015c13ad
License: BSD License: BSD
Security Critical: yes Security Critical: yes
......
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -102,7 +102,7 @@ message DomDistillerOptions { ...@@ -102,7 +102,7 @@ message DomDistillerOptions {
// (3): (2) and extracted paging information // (3): (2) and extracted paging information
optional int32 debug_level = 2; optional int32 debug_level = 2;
// The original domain of the page if it's a file://, used for detecting next/prev page links // The original URL of the page, which is used in the heuristics in
// which expects the same domain for both current page and paging links. // detecting next/prev page links.
optional string original_domain = 3; optional string original_url = 3;
} }
...@@ -744,7 +744,7 @@ namespace dom_distiller { ...@@ -744,7 +744,7 @@ namespace dom_distiller {
if (!dict->GetString("3", &field_value)) { if (!dict->GetString("3", &field_value)) {
goto error; goto error;
} }
message->set_original_domain(field_value); message->set_original_url(field_value);
} }
return true; return true;
...@@ -760,8 +760,8 @@ namespace dom_distiller { ...@@ -760,8 +760,8 @@ namespace dom_distiller {
if (message.has_debug_level()) { if (message.has_debug_level()) {
dict->SetInteger("2", message.debug_level()); dict->SetInteger("2", message.debug_level());
} }
if (message.has_original_domain()) { if (message.has_original_url()) {
dict->SetString("3", message.original_domain()); dict->SetString("3", message.original_url());
} }
return dict.Pass(); return dict.Pass();
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment