Store page no for distilled pages undergoing distillation.

In order to support distillation of previous pages and enable
meaningful incremental updates for viewers, distiller needs
to maintain page number information for pages under distillation.
This information will be used to add support for incremental updates
and distilling previous pages of an article.

BUG=288015
TEST=Covered by existing tests + added tests for failure   and 
     page limit for distiller.

Review URL: https://codereview.chromium.org/130543003

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@251546 0039d316-1c4b-4281-b951-d872f2087c98
parent 77a3bfab
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
namespace { namespace {
// Maximum number of distilled pages in an article. // Maximum number of distilled pages in an article.
const int kMaxPagesInArticle = 32; const size_t kMaxPagesInArticle = 32;
} }
namespace dom_distiller { namespace dom_distiller {
...@@ -41,64 +41,105 @@ scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() { ...@@ -41,64 +41,105 @@ scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() {
return distiller.PassAs<Distiller>(); return distiller.PassAs<Distiller>();
} }
DistillerImpl::DistilledPageData::DistilledPageData() {}
DistillerImpl::DistilledPageData::~DistilledPageData() {}
DistillerImpl::DistillerImpl( DistillerImpl::DistillerImpl(
const DistillerPageFactory& distiller_page_factory, const DistillerPageFactory& distiller_page_factory,
const DistillerURLFetcherFactory& distiller_url_fetcher_factory) const DistillerURLFetcherFactory& distiller_url_fetcher_factory)
: distiller_url_fetcher_factory_(distiller_url_fetcher_factory), : distiller_url_fetcher_factory_(distiller_url_fetcher_factory),
distillation_in_progress_(false) { max_pages_in_article_(kMaxPagesInArticle) {
page_distiller_.reset(new PageDistiller(distiller_page_factory)); page_distiller_.reset(new PageDistiller(distiller_page_factory));
} }
DistillerImpl::~DistillerImpl() { DistillerImpl::~DistillerImpl() { DCHECK(AreAllPagesFinished()); }
DCHECK(image_fetchers_.empty());
DCHECK(!distillation_in_progress_);
}
void DistillerImpl::Init() { void DistillerImpl::Init() {
DCHECK(!distillation_in_progress_); DCHECK(AreAllPagesFinished());
page_distiller_->Init(); page_distiller_->Init();
article_proto_.reset(new DistilledArticleProto()); }
void DistillerImpl::SetMaxNumPagesInArticle(size_t max_num_pages) {
max_pages_in_article_ = max_num_pages;
}
bool DistillerImpl::AreAllPagesFinished() const {
return started_pages_index_.empty() && waiting_pages_.empty();
}
size_t DistillerImpl::TotalPageCount() const {
return waiting_pages_.size() + started_pages_index_.size() +
finished_pages_index_.size();
}
void DistillerImpl::AddToDistillationQueue(int page_num, const GURL& url) {
if (!IsPageNumberInUse(page_num) && url.is_valid() &&
TotalPageCount() < max_pages_in_article_ &&
seen_urls_.find(url.spec()) == seen_urls_.end()) {
waiting_pages_[page_num] = url;
}
}
bool DistillerImpl::IsPageNumberInUse(int page_num) const {
return waiting_pages_.find(page_num) != waiting_pages_.end() ||
started_pages_index_.find(page_num) != started_pages_index_.end() ||
finished_pages_index_.find(page_num) != finished_pages_index_.end();
}
DistillerImpl::DistilledPageData* DistillerImpl::GetPageAtIndex(size_t index)
const {
DCHECK_LT(index, pages_.size());
DistilledPageData* page_data = pages_[index];
DCHECK(page_data);
return page_data;
} }
void DistillerImpl::DistillPage(const GURL& url, void DistillerImpl::DistillPage(const GURL& url,
const DistillerCallback& distillation_cb) { const DistillerCallback& distillation_cb) {
DCHECK(!distillation_in_progress_); DCHECK(AreAllPagesFinished());
distillation_cb_ = distillation_cb; distillation_cb_ = distillation_cb;
DistillPage(url);
AddToDistillationQueue(0, url);
DistillNextPage();
} }
void DistillerImpl::DistillPage(const GURL& url) { void DistillerImpl::DistillNextPage() {
DCHECK(!distillation_in_progress_); if (!waiting_pages_.empty()) {
if (url.is_valid() && article_proto_->pages_size() < kMaxPagesInArticle && std::map<int, GURL>::iterator front = waiting_pages_.begin();
processed_urls_.find(url.spec()) == processed_urls_.end()) { int page_num = front->first;
distillation_in_progress_ = true; const GURL url = front->second;
// Distill the next page.
waiting_pages_.erase(front);
DCHECK(url.is_valid()); DCHECK(url.is_valid());
DCHECK_LT(article_proto_->pages_size(), kMaxPagesInArticle); DCHECK(started_pages_index_.find(page_num) == started_pages_index_.end());
DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
seen_urls_.insert(url.spec());
pages_.push_back(new DistilledPageData());
started_pages_index_[page_num] = pages_.size() - 1;
page_distiller_->DistillPage( page_distiller_->DistillPage(
url, url,
base::Bind(&DistillerImpl::OnPageDistillationFinished, base::Bind(&DistillerImpl::OnPageDistillationFinished,
base::Unretained(this), base::Unretained(this),
page_num,
url)); url));
} else {
RunDistillerCallbackIfDone();
} }
} }
void DistillerImpl::OnPageDistillationFinished( void DistillerImpl::OnPageDistillationFinished(
int page_num,
const GURL& page_url, const GURL& page_url,
scoped_ptr<DistilledPageInfo> distilled_page, scoped_ptr<DistilledPageInfo> distilled_page,
bool distillation_successful) { bool distillation_successful) {
DCHECK(distillation_in_progress_);
DCHECK(distilled_page.get()); DCHECK(distilled_page.get());
if (!distillation_successful) { DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
RunDistillerCallbackIfDone(); if (distillation_successful) {
} else { DistilledPageData* page_data =
DistilledPageProto* current_page = article_proto_->add_pages(); GetPageAtIndex(started_pages_index_[page_num]);
// Set the title of the article as the title of the first page. DistilledPageProto* current_page = new DistilledPageProto();
if (article_proto_->pages_size() == 1) { page_data->proto.reset(current_page);
article_proto_->set_title(distilled_page->title); page_data->page_num = page_num;
} page_data->title = distilled_page->title;
current_page->set_url(page_url.spec()); current_page->set_url(page_url.spec());
current_page->set_html(distilled_page->html); current_page->set_html(distilled_page->html);
...@@ -107,59 +148,106 @@ void DistillerImpl::OnPageDistillationFinished( ...@@ -107,59 +148,106 @@ void DistillerImpl::OnPageDistillationFinished(
if (next_page_url.is_valid()) { if (next_page_url.is_valid()) {
// The pages should be in same origin. // The pages should be in same origin.
DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin()); DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin());
AddToDistillationQueue(page_num + 1, next_page_url);
} }
processed_urls_.insert(page_url.spec());
distillation_in_progress_ = false;
int page_number = article_proto_->pages_size();
for (size_t img_num = 0; img_num < distilled_page->image_urls.size(); for (size_t img_num = 0; img_num < distilled_page->image_urls.size();
++img_num) { ++img_num) {
std::string image_id = std::string image_id =
base::IntToString(page_number) + "_" + base::IntToString(img_num); base::IntToString(page_num + 1) + "_" + base::IntToString(img_num);
FetchImage(current_page, image_id, distilled_page->image_urls[img_num]); FetchImage(page_num, image_id, distilled_page->image_urls[img_num]);
} }
DistillPage(next_page_url);
AddPageIfDone(page_num);
DistillNextPage();
} else {
started_pages_index_.erase(page_num);
RunDistillerCallbackIfDone();
} }
} }
void DistillerImpl::FetchImage(DistilledPageProto* distilled_page_proto, void DistillerImpl::FetchImage(int page_num,
const std::string& image_id, const std::string& image_id,
const std::string& item) { const std::string& item) {
DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
DistillerURLFetcher* fetcher = DistillerURLFetcher* fetcher =
distiller_url_fetcher_factory_.CreateDistillerURLFetcher(); distiller_url_fetcher_factory_.CreateDistillerURLFetcher();
image_fetchers_.push_back(fetcher); page_data->image_fetchers_.push_back(fetcher);
fetcher->FetchURL(item, fetcher->FetchURL(item,
base::Bind(&DistillerImpl::OnFetchImageDone, base::Bind(&DistillerImpl::OnFetchImageDone,
base::Unretained(this), base::Unretained(this),
base::Unretained(distilled_page_proto), page_num,
base::Unretained(fetcher), base::Unretained(fetcher),
image_id)); image_id));
} }
void DistillerImpl::OnFetchImageDone(DistilledPageProto* distilled_page_proto, void DistillerImpl::OnFetchImageDone(int page_num,
DistillerURLFetcher* url_fetcher, DistillerURLFetcher* url_fetcher,
const std::string& id, const std::string& id,
const std::string& response) { const std::string& response) {
DCHECK_GT(article_proto_->pages_size(), 0); DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
DCHECK(distilled_page_proto); DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
DCHECK(page_data->proto);
DCHECK(url_fetcher); DCHECK(url_fetcher);
ScopedVector<DistillerURLFetcher>::iterator fetcher_it = ScopedVector<DistillerURLFetcher>::iterator fetcher_it =
std::find(image_fetchers_.begin(), image_fetchers_.end(), url_fetcher); std::find(page_data->image_fetchers_.begin(),
page_data->image_fetchers_.end(),
url_fetcher);
DCHECK(fetcher_it != image_fetchers_.end()); DCHECK(fetcher_it != page_data->image_fetchers_.end());
// Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone
// callback is invoked by the |url_fetcher|. // callback is invoked by the |url_fetcher|.
image_fetchers_.weak_erase(fetcher_it); page_data->image_fetchers_.weak_erase(fetcher_it);
base::MessageLoop::current()->DeleteSoon(FROM_HERE, url_fetcher); base::MessageLoop::current()->DeleteSoon(FROM_HERE, url_fetcher);
DistilledPageProto_Image* image = distilled_page_proto->add_image();
DistilledPageProto_Image* image = page_data->proto->add_image();
image->set_name(id); image->set_name(id);
image->set_data(response); image->set_data(response);
AddPageIfDone(page_num);
}
void DistillerImpl::AddPageIfDone(int page_num) {
DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
if (page_data->image_fetchers_.empty()) {
finished_pages_index_[page_num] = started_pages_index_[page_num];
started_pages_index_.erase(page_num);
RunDistillerCallbackIfDone(); RunDistillerCallbackIfDone();
}
} }
void DistillerImpl::RunDistillerCallbackIfDone() { void DistillerImpl::RunDistillerCallbackIfDone() {
if (image_fetchers_.empty() && !distillation_in_progress_) { DCHECK(!distillation_cb_.is_null());
distillation_cb_.Run(article_proto_.Pass()); if (AreAllPagesFinished()) {
bool first_page = true;
scoped_ptr<DistilledArticleProto> article_proto(
new DistilledArticleProto());
// Stitch the pages back into the article.
for (std::map<int, size_t>::iterator it = finished_pages_index_.begin();
it != finished_pages_index_.end();) {
DistilledPageData* page_data = GetPageAtIndex(it->second);
*(article_proto->add_pages()) = *(page_data->proto);
if (first_page) {
article_proto->set_title(page_data->title);
first_page = false;
}
finished_pages_index_.erase(it++);
}
pages_.clear();
DCHECK_LE(static_cast<size_t>(article_proto->pages_size()),
max_pages_in_article_);
DCHECK(pages_.empty());
DCHECK(finished_pages_index_.empty());
distillation_cb_.Run(article_proto.Pass());
distillation_cb_.Reset();
} }
} }
......
...@@ -68,36 +68,97 @@ class DistillerImpl : public Distiller { ...@@ -68,36 +68,97 @@ class DistillerImpl : public Distiller {
virtual void DistillPage(const GURL& url, virtual void DistillPage(const GURL& url,
const DistillerCallback& callback) OVERRIDE; const DistillerCallback& callback) OVERRIDE;
void SetMaxNumPagesInArticle(size_t max_num_pages);
private: private:
void OnFetchImageDone(DistilledPageProto* distilled_page_proto, // In case of multiple pages, the Distiller maintains state of multiple pages
// as page numbers relative to the page number where distillation started.
// E.g. if distillation starts at page 2 for a 3 page article. The relative
// page numbers assigned to pages will be [-1,0,1].
// Class representing the state of a page under distillation.
struct DistilledPageData {
DistilledPageData();
virtual ~DistilledPageData();
// Relative page number of the page.
int page_num;
std::string title;
ScopedVector<DistillerURLFetcher> image_fetchers_;
scoped_ptr<DistilledPageProto> proto;
private:
DISALLOW_COPY_AND_ASSIGN(DistilledPageData);
};
void OnFetchImageDone(int page_num,
DistillerURLFetcher* url_fetcher, DistillerURLFetcher* url_fetcher,
const std::string& id, const std::string& id,
const std::string& response); const std::string& response);
void OnPageDistillationFinished(const GURL& page_url, void OnPageDistillationFinished(int page_num,
const GURL& page_url,
scoped_ptr<DistilledPageInfo> distilled_page, scoped_ptr<DistilledPageInfo> distilled_page,
bool distillation_successful); bool distillation_successful);
virtual void FetchImage(DistilledPageProto* distilled_page_proto, virtual void FetchImage(int page_num,
const std::string& image_id, const std::string& image_id,
const std::string& item); const std::string& item);
// Distills the page and adds the new page to |article_proto|. // Distills the next page.
void DistillPage(const GURL& url); void DistillNextPage();
// Adds the |url| to |pages_to_be_distilled| if |page_num| is a valid relative
// page number and |url| is valid. Ignores duplicate pages and urls.
void AddToDistillationQueue(int page_num, const GURL& url);
// Check if |page_num| is a valid relative page number, i.e. page with
// |page_num| is either under distillation or has already completed
// distillation.
bool IsPageNumberInUse(int page_num) const;
bool AreAllPagesFinished() const;
// Total number of pages in the article that the distiller knows of, this
// includes pages that are pending distillation.
size_t TotalPageCount() const;
// Runs |distillation_cb_| if all distillation callbacks and image fetches are // Runs |distillation_cb_| if all distillation callbacks and image fetches are
// complete. // complete.
void RunDistillerCallbackIfDone(); void RunDistillerCallbackIfDone();
// Checks if page |distilled_page_data| has finished distillation, including
// all image fetches.
void AddPageIfDone(int page_num);
DistilledPageData* GetPageAtIndex(size_t index) const;
const DistillerURLFetcherFactory& distiller_url_fetcher_factory_; const DistillerURLFetcherFactory& distiller_url_fetcher_factory_;
scoped_ptr<PageDistiller> page_distiller_; scoped_ptr<PageDistiller> page_distiller_;
DistillerCallback distillation_cb_; DistillerCallback distillation_cb_;
ScopedVector<DistillerURLFetcher> image_fetchers_; // Set of pages that are under distillation or have finished distillation.
scoped_ptr<DistilledArticleProto> article_proto_; // |started_pages_index_| and |finished_pages_index_| maintains the mapping
bool distillation_in_progress_; // from page number to the indices in |pages_|.
// Set to keep track of which urls are already seen by the distiller. ScopedVector<DistilledPageData> pages_;
base::hash_set<std::string> processed_urls_;
// Maps page numbers of finished pages to the indices in |pages_|.
std::map<int, size_t> finished_pages_index_;
// Maps page numbers of pages under distillation to the indices in |pages_|.
// If a page is |started_pages_| that means it is still waiting for an action
// (distillation or image fetch) to finish.
base::hash_map<int, size_t> started_pages_index_;
// The list of pages that are still waiting for distillation to start.
// This is a map, to make distiller prefer distilling lower page numbers
// first.
std::map<int, GURL> waiting_pages_;
// Set to keep track of which urls are already seen by the distiller. Used to
// prevent distiller from distilling the same url twice.
base::hash_set<std::string> seen_urls_;
size_t max_pages_in_article_;
DISALLOW_COPY_AND_ASSIGN(DistillerImpl); DISALLOW_COPY_AND_ASSIGN(DistillerImpl);
}; };
......
...@@ -117,7 +117,6 @@ class MockDistillerPageFactory : public DistillerPageFactory { ...@@ -117,7 +117,6 @@ class MockDistillerPageFactory : public DistillerPageFactory {
} }
}; };
class DistillerTest : public testing::Test { class DistillerTest : public testing::Test {
public: public:
virtual ~DistillerTest() {} virtual ~DistillerTest() {}
...@@ -290,4 +289,117 @@ TEST_F(DistillerTest, DistillLinkLoop) { ...@@ -290,4 +289,117 @@ TEST_F(DistillerTest, DistillLinkLoop) {
EXPECT_EQ(article_proto_->pages_size(), 1); EXPECT_EQ(article_proto_->pages_size(), 1);
} }
TEST_F(DistillerTest, CheckMaxPageLimit) {
base::MessageLoopForUI loop;
const size_t kMaxPagesInArticle = 10;
string page_urls[kMaxPagesInArticle];
scoped_ptr<base::ListValue> list[kMaxPagesInArticle];
// Note: Next page url of the last page of article is set. So distiller will
// try to do kMaxPagesInArticle + 1 calls if the max article limit does not
// work.
string url_prefix = "http://a.com/";
for (size_t page_num = 0; page_num < kMaxPagesInArticle; ++page_num) {
page_urls[page_num] = url_prefix + base::IntToString(page_num + 1);
string content = "Content for page:" + base::IntToString(page_num);
string next_page_url = url_prefix + base::IntToString(page_num + 2);
list[page_num] = CreateDistilledValueReturnedFromJS(
kTitle, content, vector<int>(), next_page_url);
}
EXPECT_CALL(page_factory_, CreateDistillerPageMock(_))
.WillOnce(CreateMockDistillerPages(
list, page_urls, static_cast<int>(kMaxPagesInArticle)));
distiller_.reset(new DistillerImpl(page_factory_, url_fetcher_factory_));
distiller_->SetMaxNumPagesInArticle(kMaxPagesInArticle);
distiller_->Init();
distiller_->DistillPage(
GURL(page_urls[0]),
base::Bind(&DistillerTest::OnDistillPageDone, base::Unretained(this)));
base::MessageLoop::current()->RunUntilIdle();
EXPECT_EQ(kTitle, article_proto_->title());
EXPECT_EQ(kMaxPagesInArticle,
static_cast<size_t>(article_proto_->pages_size()));
// Now check if distilling an article with exactly the page limit works by
// resetting the next page url of the last page of the article.
list[kMaxPagesInArticle - 1] =
CreateDistilledValueReturnedFromJS(kTitle, "Content", vector<int>(), "");
EXPECT_CALL(page_factory_, CreateDistillerPageMock(_))
.WillOnce(CreateMockDistillerPages(
list, page_urls, static_cast<int>(kMaxPagesInArticle)));
distiller_.reset(new DistillerImpl(page_factory_, url_fetcher_factory_));
distiller_->SetMaxNumPagesInArticle(kMaxPagesInArticle);
distiller_->Init();
distiller_->DistillPage(
GURL(page_urls[0]),
base::Bind(&DistillerTest::OnDistillPageDone, base::Unretained(this)));
base::MessageLoop::current()->RunUntilIdle();
EXPECT_EQ(kTitle, article_proto_->title());
EXPECT_EQ(kMaxPagesInArticle,
static_cast<size_t>(article_proto_->pages_size()));
}
TEST_F(DistillerTest, SinglePageDistillationFailure) {
base::MessageLoopForUI loop;
// To simulate failure return a null value.
scoped_ptr<base::Value> nullValue(base::Value::CreateNullValue());
EXPECT_CALL(page_factory_, CreateDistillerPageMock(_))
.WillOnce(CreateMockDistillerPage(nullValue.get(), GURL(kURL)));
distiller_.reset(new DistillerImpl(page_factory_, url_fetcher_factory_));
distiller_->Init();
distiller_->DistillPage(
GURL(kURL),
base::Bind(&DistillerTest::OnDistillPageDone, base::Unretained(this)));
base::MessageLoop::current()->RunUntilIdle();
EXPECT_EQ("", article_proto_->title());
EXPECT_EQ(0, article_proto_->pages_size());
}
TEST_F(DistillerTest, MultiplePagesDistillationFailure) {
base::MessageLoopForUI loop;
const int kNumPages = 8;
string content[kNumPages];
string page_urls[kNumPages];
scoped_ptr<base::Value> distilled_values[kNumPages];
// The page number of the failed page.
int failed_page_num = 3;
string url_prefix = "http://a.com/";
for (int page_num = 0; page_num < kNumPages; ++page_num) {
page_urls[page_num] = url_prefix + base::IntToString(page_num);
content[page_num] = "Content for page:" + base::IntToString(page_num);
string next_page_url = url_prefix + base::IntToString(page_num + 1);
if (page_num != failed_page_num) {
distilled_values[page_num] = CreateDistilledValueReturnedFromJS(
kTitle, content[page_num], vector<int>(), next_page_url);
} else {
distilled_values[page_num].reset(base::Value::CreateNullValue());
}
}
// Expect only calls till the failed page number.
EXPECT_CALL(page_factory_, CreateDistillerPageMock(_))
.WillOnce(CreateMockDistillerPages(
distilled_values, page_urls, failed_page_num + 1));
distiller_.reset(new DistillerImpl(page_factory_, url_fetcher_factory_));
distiller_->Init();
distiller_->DistillPage(
GURL(page_urls[0]),
base::Bind(&DistillerTest::OnDistillPageDone, base::Unretained(this)));
base::MessageLoop::current()->RunUntilIdle();
EXPECT_EQ(kTitle, article_proto_->title());
EXPECT_EQ(article_proto_->pages_size(), failed_page_num);
for (int page_num = 0; page_num < failed_page_num; ++page_num) {
const DistilledPageProto& page = article_proto_->pages(page_num);
EXPECT_EQ(content[page_num], page.html());
EXPECT_EQ(page_urls[page_num], page.url());
}
}
} // namespace dom_distiller } // namespace dom_distiller
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment