Commit d10be7ee authored by cjhopman@chromium.org

Make content_extractor support a multiple-url request

This adds the --urls flag which accepts a space-separated list of urls
to distill.

This takes a straightforward approach to running several of these
requests concurrently (at most kMaxExtractorTasks at a time).

Once all requests have finished, their results are printed (in the order
that the urls appeared in --urls). If printing binary, each serialized
protobuf is preceded by its size (as a little-endian 32-bit integer).

TBR=ben@

Review URL: https://codereview.chromium.org/276553002

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@276819 0039d316-1c4b-4281-b951-d872f2087c98
parent ab5f281c
include_rules = [ include_rules = [
"+google", # For third_party/protobuf.
"+grit", # For generated headers. "+grit", # For generated headers.
"+jni", "+jni",
"+sync/api", "+sync/api",
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include "base/path_service.h" #include "base/path_service.h"
#include "base/run_loop.h" #include "base/run_loop.h"
#include "base/strings/string_number_conversions.h" #include "base/strings/string_number_conversions.h"
#include "base/strings/string_split.h"
#include "components/dom_distiller/content/distiller_page_web_contents.h" #include "components/dom_distiller/content/distiller_page_web_contents.h"
#include "components/dom_distiller/core/distiller.h" #include "components/dom_distiller/core/distiller.h"
#include "components/dom_distiller/core/dom_distiller_database.h" #include "components/dom_distiller/core/dom_distiller_database.h"
...@@ -22,6 +23,8 @@ ...@@ -22,6 +23,8 @@
#include "content/public/browser/browser_thread.h" #include "content/public/browser/browser_thread.h"
#include "content/public/test/content_browser_test.h" #include "content/public/test/content_browser_test.h"
#include "content/shell/browser/shell.h" #include "content/shell/browser/shell.h"
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream_impl_lite.h"
#include "net/dns/mock_host_resolver.h" #include "net/dns/mock_host_resolver.h"
#include "third_party/dom_distiller_js/dom_distiller.pb.h" #include "third_party/dom_distiller_js/dom_distiller.pb.h"
#include "ui/base/resource/resource_bundle.h" #include "ui/base/resource/resource_bundle.h"
...@@ -35,6 +38,9 @@ namespace { ...@@ -35,6 +38,9 @@ namespace {
// The url to distill. // The url to distill.
const char* kUrlSwitch = "url"; const char* kUrlSwitch = "url";
// A space-separated list of urls to distill.
const char* kUrlsSwitch = "urls";
// Indicates that DNS resolution should be disabled for this test. // Indicates that DNS resolution should be disabled for this test.
const char* kDisableDnsSwitch = "disable-dns"; const char* kDisableDnsSwitch = "disable-dns";
...@@ -51,6 +57,9 @@ const char* kExtractTextOnly = "extract-text-only"; ...@@ -51,6 +57,9 @@ const char* kExtractTextOnly = "extract-text-only";
// Indicates to include debug output. // Indicates to include debug output.
const char* kDebugLevel = "debug-level"; const char* kDebugLevel = "debug-level";
// Maximum number of extractor requests that may be in flight at once.
const int kMaxExtractorTasks = 8;
scoped_ptr<DomDistillerService> CreateDomDistillerService( scoped_ptr<DomDistillerService> CreateDomDistillerService(
content::BrowserContext* context, content::BrowserContext* context,
const base::FilePath& db_path) { const base::FilePath& db_path) {
...@@ -100,29 +109,30 @@ void AddComponentsResources() { ...@@ -100,29 +109,30 @@ void AddComponentsResources() {
pak_file, ui::SCALE_FACTOR_NONE); pak_file, ui::SCALE_FACTOR_NONE);
} }
void LogArticle(const DistilledArticleProto& article_proto) { bool WriteProtobufWithSize(
std::stringstream output; const google::protobuf::MessageLite& message,
if (CommandLine::ForCurrentProcess()->HasSwitch(kShouldOutputBinary)) { google::protobuf::io::ZeroCopyOutputStream* output_stream) {
output << article_proto.SerializeAsString(); google::protobuf::io::CodedOutputStream coded_output(output_stream);
} else {
output << "Article Title: " << article_proto.title() << std::endl; // Write the size.
output << "# of pages: " << article_proto.pages_size() << std::endl; const int size = message.ByteSize();
for (int i = 0; i < article_proto.pages_size(); ++i) { coded_output.WriteLittleEndian32(size);
const DistilledPageProto& page = article_proto.pages(i); message.SerializeWithCachedSizes(&coded_output);
output << "Page " << i << std::endl; return !coded_output.HadError();
output << "URL: " << page.url() << std::endl; }
output << "Content: " << page.html() << std::endl;
}
}
std::string data = output.str(); std::string GetReadableArticleString(
if (CommandLine::ForCurrentProcess()->HasSwitch(kOutputFile)) { const DistilledArticleProto& article_proto) {
base::FilePath filename = std::stringstream output;
CommandLine::ForCurrentProcess()->GetSwitchValuePath(kOutputFile); output << "Article Title: " << article_proto.title() << std::endl;
base::WriteFile(filename, data.c_str(), data.size()); output << "# of pages: " << article_proto.pages_size() << std::endl;
} else { for (int i = 0; i < article_proto.pages_size(); ++i) {
VLOG(0) << data; const DistilledPageProto& page = article_proto.pages(i);
output << "Page " << i << std::endl;
output << "URL: " << page.url() << std::endl;
output << "Content: " << page.html() << std::endl;
} }
return output.str();
} }
} // namespace } // namespace
...@@ -139,19 +149,34 @@ class ContentExtractionRequest : public ViewRequestDelegate { ...@@ -139,19 +149,34 @@ class ContentExtractionRequest : public ViewRequestDelegate {
return *article_proto_; return *article_proto_;
} }
static scoped_ptr<ContentExtractionRequest> CreateForCommandLine( static ScopedVector<ContentExtractionRequest> CreateForCommandLine(
const CommandLine& command_line) { const CommandLine& command_line) {
GURL url; ScopedVector<ContentExtractionRequest> requests;
if (command_line.HasSwitch(kUrlSwitch)) { if (command_line.HasSwitch(kUrlSwitch)) {
GURL url;
std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch); std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch);
url = GURL(url_string); url = GURL(url_string);
if (url.is_valid()) {
requests.push_back(new ContentExtractionRequest(url));
}
} else if (command_line.HasSwitch(kUrlsSwitch)) {
std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch);
std::vector<std::string> urls;
base::SplitString(urls_string, ' ', &urls);
for (size_t i = 0; i < urls.size(); ++i) {
GURL url(urls[i]);
if (url.is_valid()) {
requests.push_back(new ContentExtractionRequest(url));
} else {
ADD_FAILURE() << "Bad url";
}
}
} }
if (!url.is_valid()) { if (requests.empty()) {
ADD_FAILURE() << "No valid url provided"; ADD_FAILURE() << "No valid url provided";
return scoped_ptr<ContentExtractionRequest>();
} }
return scoped_ptr<ContentExtractionRequest>(
new ContentExtractionRequest(url)); return requests.Pass();
} }
private: private:
...@@ -175,6 +200,15 @@ class ContentExtractionRequest : public ViewRequestDelegate { ...@@ -175,6 +200,15 @@ class ContentExtractionRequest : public ViewRequestDelegate {
}; };
class ContentExtractor : public ContentBrowserTest { class ContentExtractor : public ContentBrowserTest {
public:
// Starts with no requests in flight and none started. The protobuf output
// stream is backed by |output_data_|, so binary and readable output end up
// in the same buffer.
ContentExtractor()
    : pending_tasks_(0),
      max_tasks_(kMaxExtractorTasks),
      next_request_(0),
      protobuf_output_stream_(
          new google::protobuf::io::StringOutputStream(&output_data_)) {}
// Change behavior of the default host resolver to avoid DNS lookup errors, so // Change behavior of the default host resolver to avoid DNS lookup errors, so
// we can make network calls. // we can make network calls.
virtual void SetUpOnMainThread() OVERRIDE { virtual void SetUpOnMainThread() OVERRIDE {
...@@ -198,10 +232,18 @@ class ContentExtractor : public ContentBrowserTest { ...@@ -198,10 +232,18 @@ class ContentExtractor : public ContentBrowserTest {
service_ = CreateDomDistillerService(context, service_ = CreateDomDistillerService(context,
db_dir_.path()); db_dir_.path());
const CommandLine& command_line = *CommandLine::ForCurrentProcess(); const CommandLine& command_line = *CommandLine::ForCurrentProcess();
request_ = ContentExtractionRequest::CreateForCommandLine(command_line); requests_ = ContentExtractionRequest::CreateForCommandLine(command_line);
request_->Start( PumpQueue();
service_.get(), }
base::Bind(&ContentExtractor::Finish, base::Unretained(this)));
void PumpQueue() {
while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) {
requests_[next_request_]->Start(
service_.get(),
base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this)));
++next_request_;
++pending_tasks_;
}
} }
private: private:
...@@ -221,18 +263,55 @@ class ContentExtractor : public ContentBrowserTest { ...@@ -221,18 +263,55 @@ class ContentExtractor : public ContentBrowserTest {
mock_host_resolver_override_.reset(); mock_host_resolver_override_.reset();
} }
// Completion callback for a single request: frees its slot, then either
// wraps up the whole run (every request started and none still pending) or
// tries to start more queued requests.
void FinishRequest() {
  --pending_tasks_;

  const bool all_started = next_request_ == requests_.size();
  if (all_started && pending_tasks_ == 0) {
    Finish();
    return;
  }

  PumpQueue();
}
void DoArticleOutput() {
for (size_t i = 0; i < requests_.size(); ++i) {
const DistilledArticleProto& article = requests_[i]->GetArticleCopy();
if (CommandLine::ForCurrentProcess()->HasSwitch(kShouldOutputBinary)) {
WriteProtobufWithSize(article, protobuf_output_stream_.get());
} else {
output_data_ += GetReadableArticleString(article) + "\n";
}
}
if (CommandLine::ForCurrentProcess()->HasSwitch(kOutputFile)) {
base::FilePath filename =
CommandLine::ForCurrentProcess()->GetSwitchValuePath(kOutputFile);
ASSERT_EQ(
(int)output_data_.size(),
base::WriteFile(filename, output_data_.c_str(), output_data_.size()));
} else {
VLOG(0) << output_data_;
}
}
void Finish() { void Finish() {
LogArticle(request_->GetArticleCopy()); DoArticleOutput();
request_.reset(); requests_.clear();
service_.reset(); service_.reset();
base::MessageLoop::current()->PostTask( base::MessageLoop::current()->PostTask(
FROM_HERE, base::MessageLoop::QuitWhenIdleClosure()); FROM_HERE, base::MessageLoop::QuitWhenIdleClosure());
} }
size_t pending_tasks_;
size_t max_tasks_;
size_t next_request_;
base::ScopedTempDir db_dir_; base::ScopedTempDir db_dir_;
scoped_ptr<net::ScopedDefaultHostResolverProc> mock_host_resolver_override_; scoped_ptr<net::ScopedDefaultHostResolverProc> mock_host_resolver_override_;
scoped_ptr<DomDistillerService> service_; scoped_ptr<DomDistillerService> service_;
scoped_ptr<ContentExtractionRequest> request_; ScopedVector<ContentExtractionRequest> requests_;
std::string output_data_;
scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_;
}; };
IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) { IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment