Commit d10be7ee authored by cjhopman@chromium.org's avatar cjhopman@chromium.org

Make content_extractor support a multiple-url request

This adds the --urls flag which accepts a space-separated list of urls
to distill.

This takes a fairly straightforward approach to running several of these
requests at the same time (at most kMaxExtractorTasks are in flight at once).

Once all requests are finished, their articles will be printed (in the order
that the urls appeared in --urls). If printing binary, each serialized protobuf
will be preceded by its size (as a fixed-width little-endian 32-bit integer).

TBR=ben@

Review URL: https://codereview.chromium.org/276553002

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@276819 0039d316-1c4b-4281-b951-d872f2087c98
parent ab5f281c
include_rules = [
"+google", # For third_party/protobuf.
"+grit", # For generated headers.
"+jni",
"+sync/api",
......
......@@ -10,6 +10,7 @@
#include "base/path_service.h"
#include "base/run_loop.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_split.h"
#include "components/dom_distiller/content/distiller_page_web_contents.h"
#include "components/dom_distiller/core/distiller.h"
#include "components/dom_distiller/core/dom_distiller_database.h"
......@@ -22,6 +23,8 @@
#include "content/public/browser/browser_thread.h"
#include "content/public/test/content_browser_test.h"
#include "content/shell/browser/shell.h"
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream_impl_lite.h"
#include "net/dns/mock_host_resolver.h"
#include "third_party/dom_distiller_js/dom_distiller.pb.h"
#include "ui/base/resource/resource_bundle.h"
......@@ -35,6 +38,9 @@ namespace {
// Switch naming a single url to distill. When present it is used in
// preference to --urls.
const char* kUrlSwitch = "url";
// Switch holding a space-separated list of urls to distill. Only consulted
// when --url is not given.
const char* kUrlsSwitch = "urls";
// Indicates that DNS resolution should be disabled for this test.
const char* kDisableDnsSwitch = "disable-dns";
......@@ -51,6 +57,9 @@ const char* kExtractTextOnly = "extract-text-only";
// Indicates that debug output should be included.
const char* kDebugLevel = "debug-level";
// Maximum number of extractor requests that may be started and not yet
// finished at any one time; the remaining requests are started as earlier
// ones complete.
const int kMaxExtractorTasks = 8;
scoped_ptr<DomDistillerService> CreateDomDistillerService(
content::BrowserContext* context,
const base::FilePath& db_path) {
......@@ -100,29 +109,30 @@ void AddComponentsResources() {
pak_file, ui::SCALE_FACTOR_NONE);
}
void LogArticle(const DistilledArticleProto& article_proto) {
std::stringstream output;
if (CommandLine::ForCurrentProcess()->HasSwitch(kShouldOutputBinary)) {
output << article_proto.SerializeAsString();
} else {
output << "Article Title: " << article_proto.title() << std::endl;
output << "# of pages: " << article_proto.pages_size() << std::endl;
for (int i = 0; i < article_proto.pages_size(); ++i) {
const DistilledPageProto& page = article_proto.pages(i);
output << "Page " << i << std::endl;
output << "URL: " << page.url() << std::endl;
output << "Content: " << page.html() << std::endl;
}
}
// Appends |message| to |output_stream| as a length-prefixed record: first the
// serialized byte size as a fixed-width little-endian 32-bit value, then the
// serialized message bytes. Returns true if the coded stream reported no
// error.
bool WriteProtobufWithSize(
    const google::protobuf::MessageLite& message,
    google::protobuf::io::ZeroCopyOutputStream* output_stream) {
  google::protobuf::io::CodedOutputStream writer(output_stream);

  // ByteSize() must run before SerializeWithCachedSizes(), which reuses the
  // sizes computed by that call.
  const int payload_bytes = message.ByteSize();
  writer.WriteLittleEndian32(payload_bytes);
  message.SerializeWithCachedSizes(&writer);

  const bool failed = writer.HadError();
  return !failed;
}
std::string data = output.str();
if (CommandLine::ForCurrentProcess()->HasSwitch(kOutputFile)) {
base::FilePath filename =
CommandLine::ForCurrentProcess()->GetSwitchValuePath(kOutputFile);
base::WriteFile(filename, data.c_str(), data.size());
} else {
VLOG(0) << data;
std::string GetReadableArticleString(
const DistilledArticleProto& article_proto) {
std::stringstream output;
output << "Article Title: " << article_proto.title() << std::endl;
output << "# of pages: " << article_proto.pages_size() << std::endl;
for (int i = 0; i < article_proto.pages_size(); ++i) {
const DistilledPageProto& page = article_proto.pages(i);
output << "Page " << i << std::endl;
output << "URL: " << page.url() << std::endl;
output << "Content: " << page.html() << std::endl;
}
return output.str();
}
} // namespace
......@@ -139,19 +149,34 @@ class ContentExtractionRequest : public ViewRequestDelegate {
return *article_proto_;
}
// Builds the extraction requests described by |command_line|.
//
// --url is checked first and supplies a single url; --urls is consulted only
// when --url is absent, and its value is split on the space character. A
// request is created for every url that GURL reports as valid. An invalid
// entry in --urls records a "Bad url" test failure. If no request could be
// created at all, a "No valid url provided" failure is recorded.
//
// NOTE(review): this span appears to show the removed and the added lines of
// the diff interleaved (two signatures, two emptiness checks, two returns) —
// confirm which lines belong to the final file before editing the code.
static scoped_ptr<ContentExtractionRequest> CreateForCommandLine(
static ScopedVector<ContentExtractionRequest> CreateForCommandLine(
const CommandLine& command_line) {
GURL url;
ScopedVector<ContentExtractionRequest> requests;
if (command_line.HasSwitch(kUrlSwitch)) {
// Single-url mode: an invalid url adds no request and is only reported by
// the emptiness check further down.
GURL url;
std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch);
url = GURL(url_string);
if (url.is_valid()) {
requests.push_back(new ContentExtractionRequest(url));
}
} else if (command_line.HasSwitch(kUrlsSwitch)) {
// Multi-url mode: requests are appended in the order the urls appear in the
// switch value.
std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch);
std::vector<std::string> urls;
base::SplitString(urls_string, ' ', &urls);
for (size_t i = 0; i < urls.size(); ++i) {
GURL url(urls[i]);
if (url.is_valid()) {
requests.push_back(new ContentExtractionRequest(url));
} else {
// Invalid entries are reported but do not stop the remaining urls from
// being processed.
ADD_FAILURE() << "Bad url";
}
}
}
if (!url.is_valid()) {
if (requests.empty()) {
ADD_FAILURE() << "No valid url provided";
return scoped_ptr<ContentExtractionRequest>();
}
return scoped_ptr<ContentExtractionRequest>(
new ContentExtractionRequest(url));
// Ownership of the created requests is handed to the caller.
return requests.Pass();
}
private:
......@@ -175,6 +200,15 @@ class ContentExtractionRequest : public ViewRequestDelegate {
};
class ContentExtractor : public ContentBrowserTest {
public:
// Starts with nothing in flight and nothing started, caps concurrency at
// kMaxExtractorTasks, and points the protobuf output stream at |output_data_|
// so that binary and text output accumulate in the same string.
// |output_data_| is default-constructed (empty) and is declared before
// |protobuf_output_stream_|, so its address is valid when the stream is built.
ContentExtractor()
    : pending_tasks_(0),
      max_tasks_(kMaxExtractorTasks),
      next_request_(0),
      protobuf_output_stream_(
          new google::protobuf::io::StringOutputStream(&output_data_)) {}
// Change behavior of the default host resolver to avoid DNS lookup errors, so
// we can make network calls.
virtual void SetUpOnMainThread() OVERRIDE {
......@@ -198,10 +232,18 @@ class ContentExtractor : public ContentBrowserTest {
service_ = CreateDomDistillerService(context,
db_dir_.path());
const CommandLine& command_line = *CommandLine::ForCurrentProcess();
request_ = ContentExtractionRequest::CreateForCommandLine(command_line);
request_->Start(
service_.get(),
base::Bind(&ContentExtractor::Finish, base::Unretained(this)));
requests_ = ContentExtractionRequest::CreateForCommandLine(command_line);
PumpQueue();
}
void PumpQueue() {
while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) {
requests_[next_request_]->Start(
service_.get(),
base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this)));
++next_request_;
++pending_tasks_;
}
}
private:
......@@ -221,18 +263,55 @@ class ContentExtractor : public ContentBrowserTest {
mock_host_resolver_override_.reset();
}
// Completion callback for a single request. Marks one pending task as done,
// then either wraps up the whole run (every request has been started and none
// is still pending) or starts further queued requests.
void FinishRequest() {
  --pending_tasks_;

  const bool all_started = next_request_ == requests_.size();
  const bool none_pending = pending_tasks_ == 0;
  if (!(all_started && none_pending)) {
    PumpQueue();
    return;
  }
  Finish();
}
void DoArticleOutput() {
for (size_t i = 0; i < requests_.size(); ++i) {
const DistilledArticleProto& article = requests_[i]->GetArticleCopy();
if (CommandLine::ForCurrentProcess()->HasSwitch(kShouldOutputBinary)) {
WriteProtobufWithSize(article, protobuf_output_stream_.get());
} else {
output_data_ += GetReadableArticleString(article) + "\n";
}
}
if (CommandLine::ForCurrentProcess()->HasSwitch(kOutputFile)) {
base::FilePath filename =
CommandLine::ForCurrentProcess()->GetSwitchValuePath(kOutputFile);
ASSERT_EQ(
(int)output_data_.size(),
base::WriteFile(filename, output_data_.c_str(), output_data_.size()));
} else {
VLOG(0) << output_data_;
}
}
// Emits the collected output, releases the requests and the distiller
// service, and then posts a task that quits the current message loop once it
// becomes idle.
//
// NOTE(review): the LogArticle()/request_.reset() pair and the
// DoArticleOutput()/requests_.clear() pair appear to be the before and after
// sides of the diff shown together — confirm which pair belongs to the final
// file.
void Finish() {
LogArticle(request_->GetArticleCopy());
request_.reset();
DoArticleOutput();
requests_.clear();
service_.reset();
// Quit via a posted task rather than stopping the loop directly from here.
base::MessageLoop::current()->PostTask(
FROM_HERE, base::MessageLoop::QuitWhenIdleClosure());
}
size_t pending_tasks_;
size_t max_tasks_;
size_t next_request_;
base::ScopedTempDir db_dir_;
scoped_ptr<net::ScopedDefaultHostResolverProc> mock_host_resolver_override_;
scoped_ptr<DomDistillerService> service_;
scoped_ptr<ContentExtractionRequest> request_;
ScopedVector<ContentExtractionRequest> requests_;
std::string output_data_;
scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_;
};
IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment