Commit 940e7511, authored by nyquist, committed by Commit bot

Roll dom_distiller_js and add UMA for word count for distilled pages.

For every successfully distilled article a word count is
submitted to UMA.

The histogram ranges from 1 to 4000 words with 50 buckets.

Changes rolled in from the DOM Distiller repo:

bbf7c01 Add StatisticsInfo to DomDistillerResult proto for number of words.
fc1a5c1 treat non-breaking space as whitespace
de38c78 Expand usage of SimilarSiblingContentExpansion
444a55e reorder table tests
5ff895b add and fix missing tests to suite
970a419 Add SimilarSiblingContentExpansion
da76b1e add new table classification heuristic

BUG=417049

Review URL: https://codereview.chromium.org/608583003

Cr-Commit-Position: refs/heads/master@{#296986}
parent 535e290f
...@@ -84,35 +84,47 @@ void DistillerPage::OnDistillationDone(const GURL& page_url, ...@@ -84,35 +84,47 @@ void DistillerPage::OnDistillationDone(const GURL& page_url,
value, distiller_result.get()); value, distiller_result.get());
if (!found_content) { if (!found_content) {
DVLOG(1) << "Unable to parse DomDistillerResult."; DVLOG(1) << "Unable to parse DomDistillerResult.";
} else if (distiller_result->has_timing_info()) { } else {
const dom_distiller::proto::TimingInfo& timing = if (distiller_result->has_timing_info()) {
distiller_result->timing_info(); const dom_distiller::proto::TimingInfo& timing =
if (timing.has_markup_parsing_time()) { distiller_result->timing_info();
UMA_HISTOGRAM_TIMES( if (timing.has_markup_parsing_time()) {
"DomDistiller.Time.MarkupParsing", UMA_HISTOGRAM_TIMES(
base::TimeDelta::FromMillisecondsD(timing.markup_parsing_time())); "DomDistiller.Time.MarkupParsing",
base::TimeDelta::FromMillisecondsD(timing.markup_parsing_time()));
}
if (timing.has_document_construction_time()) {
UMA_HISTOGRAM_TIMES(
"DomDistiller.Time.DocumentConstruction",
base::TimeDelta::FromMillisecondsD(
timing.document_construction_time()));
}
if (timing.has_article_processing_time()) {
UMA_HISTOGRAM_TIMES(
"DomDistiller.Time.ArticleProcessing",
base::TimeDelta::FromMillisecondsD(
timing.article_processing_time()));
}
if (timing.has_formatting_time()) {
UMA_HISTOGRAM_TIMES(
"DomDistiller.Time.Formatting",
base::TimeDelta::FromMillisecondsD(timing.formatting_time()));
}
if (timing.has_total_time()) {
UMA_HISTOGRAM_TIMES(
"DomDistiller.Time.DistillationTotal",
base::TimeDelta::FromMillisecondsD(timing.total_time()));
}
} }
if (timing.has_document_construction_time()) { if (distiller_result->has_statistics_info()) {
UMA_HISTOGRAM_TIMES( const dom_distiller::proto::StatisticsInfo& statistics =
"DomDistiller.Time.DocumentConstruction", distiller_result->statistics_info();
base::TimeDelta::FromMillisecondsD( if (statistics.has_word_count()) {
timing.document_construction_time())); UMA_HISTOGRAM_CUSTOM_COUNTS(
} "DomDistiller.Statistics.WordCount",
if (timing.has_article_processing_time()) { statistics.word_count(),
UMA_HISTOGRAM_TIMES( 1, 4000, 50);
"DomDistiller.Time.ArticleProcessing", }
base::TimeDelta::FromMillisecondsD(
timing.article_processing_time()));
}
if (timing.has_formatting_time()) {
UMA_HISTOGRAM_TIMES(
"DomDistiller.Time.Formatting",
base::TimeDelta::FromMillisecondsD(timing.formatting_time()));
}
if (timing.has_total_time()) {
UMA_HISTOGRAM_TIMES(
"DomDistiller.Time.DistillationTotal",
base::TimeDelta::FromMillisecondsD(timing.total_time()));
} }
} }
} }
......
Name: dom-distiller-js Name: dom-distiller-js
URL: https://code.google.com/p/dom-distiller URL: https://code.google.com/p/dom-distiller
Version: 432b2dc38e Version: bbf7c0181f
License: BSD License: BSD
Security Critical: yes Security Critical: yes
......
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -60,6 +60,10 @@ message DebugInfo { ...@@ -60,6 +60,10 @@ message DebugInfo {
optional string log = 1; optional string log = 1;
} }
// Statistics about a single distillation result.
message StatisticsInfo {
// Number of words in the distilled page.
// NOTE(review): how words are counted is decided by the distiller that
// fills in this field, which is not visible here -- confirm before relying
// on exact values.
optional int32 word_count = 1;
}
message DomDistillerResult { message DomDistillerResult {
optional string title = 1; optional string title = 1;
optional DistilledContent distilled_content = 2; optional DistilledContent distilled_content = 2;
...@@ -68,6 +72,7 @@ message DomDistillerResult { ...@@ -68,6 +72,7 @@ message DomDistillerResult {
optional MarkupInfo markup_info = 5; optional MarkupInfo markup_info = 5;
optional TimingInfo timing_info = 6; optional TimingInfo timing_info = 6;
optional DebugInfo debug_info = 7; optional DebugInfo debug_info = 7;
optional StatisticsInfo statistics_info = 8;
} }
message DomDistillerOptions { message DomDistillerOptions {
......
...@@ -456,6 +456,33 @@ namespace dom_distiller { ...@@ -456,6 +456,33 @@ namespace dom_distiller {
} }
}; };
// Converts dom_distiller::proto::StatisticsInfo to and from its JSON
// dictionary form. Each field is stored under its proto field number written
// as a string key ("1" holds word_count).
class StatisticsInfo {
 public:
  // Fills |message| from |json|.
  // Returns false when |json| is not a dictionary, or when key "1" exists but
  // cannot be read as an integer. A dictionary without key "1" is accepted
  // and leaves word_count untouched.
  static bool ReadFromValue(const base::Value* json, dom_distiller::proto::StatisticsInfo* message) {
    const base::DictionaryValue* fields;
    if (!json->GetAsDictionary(&fields))
      return false;

    // word_count is optional: nothing more to parse when it is absent.
    if (!fields->HasKey("1"))
      return true;

    int word_count;
    if (!fields->GetInteger("1", &word_count))
      return false;

    message->set_word_count(word_count);
    return true;
  }

  // Builds a new dictionary value from |message|. Key "1" is written only
  // when word_count is set, so an empty message yields an empty dictionary.
  static scoped_ptr<base::Value> WriteToValue(const dom_distiller::proto::StatisticsInfo& message) {
    scoped_ptr<base::DictionaryValue> fields(new base::DictionaryValue());
    if (message.has_word_count())
      fields->SetInteger("1", message.word_count());
    return fields.PassAs<base::Value>();
  }
};
class DomDistillerResult { class DomDistillerResult {
public: public:
static bool ReadFromValue(const base::Value* json, dom_distiller::proto::DomDistillerResult* message) { static bool ReadFromValue(const base::Value* json, dom_distiller::proto::DomDistillerResult* message) {
...@@ -526,6 +553,15 @@ namespace dom_distiller { ...@@ -526,6 +553,15 @@ namespace dom_distiller {
goto error; goto error;
} }
} }
if (dict->HasKey("8")) {
const base::Value* inner_message_value;
if (!dict->Get("8", &inner_message_value)) {
goto error;
}
if (!dom_distiller::proto::json::StatisticsInfo::ReadFromValue(inner_message_value, message->mutable_statistics_info())) {
goto error;
}
}
return true; return true;
error: error:
...@@ -567,6 +603,11 @@ namespace dom_distiller { ...@@ -567,6 +603,11 @@ namespace dom_distiller {
dom_distiller::proto::json::DebugInfo::WriteToValue(message.debug_info()); dom_distiller::proto::json::DebugInfo::WriteToValue(message.debug_info());
dict->Set("7", inner_message_value.release()); dict->Set("7", inner_message_value.release());
} }
if (message.has_statistics_info()) {
scoped_ptr<base::Value> inner_message_value =
dom_distiller::proto::json::StatisticsInfo::WriteToValue(message.statistics_info());
dict->Set("8", inner_message_value.release());
}
return dict.PassAs<base::Value>(); return dict.PassAs<base::Value>();
} }
}; };
......
...@@ -5113,6 +5113,14 @@ Therefore, the affected-histogram name has to have at least one dot in it. ...@@ -5113,6 +5113,14 @@ Therefore, the affected-histogram name has to have at least one dot in it.
</summary> </summary>
</histogram> </histogram>
<!-- Emitted with UMA_HISTOGRAM_CUSTOM_COUNTS using min 1, max 4000 and 50
     buckets; it is only recorded when the distiller result carries a
     word_count. -->
<histogram name="DomDistiller.Statistics.WordCount" units="words">
<owner>nyquist@chromium.org</owner>
<summary>
Records the number of words in a distilled page. For articles with multiple
pages, each page is counted separately.
</summary>
</histogram>
<histogram name="DomDistiller.Time.ArticleProcessing" units="milliseconds"> <histogram name="DomDistiller.Time.ArticleProcessing" units="milliseconds">
<owner>yfriedman@chromium.org</owner> <owner>yfriedman@chromium.org</owner>
<summary> <summary>
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment