Commit 605ee933 authored by Xiaocheng Hu, committed by Commit Bot

Change TranslateHelper to use a textContent-like text dump

Currently, TranslateHelper captures page innerText to decide the
page language. The text capturing is done in
RenderFrame::DidMeaningfulLayout(), in which we normally have the
clean layout, which is necessary for computing innerText.

However, due to some layout bug [1], we don't always have clean
layout at the call site. As fixing the layout bug is hard, this
patch changes the text capture algorithm to return a string similar
to textContent [2] instead, which doesn't require clean layout.

The difference between the new and old text capture is subtle:

- textContent may include invisible text nodes, while innerText never
  does that. However, this patch uses a slightly modified textContent
  that doesn't include text nodes in STYLE or SCRIPT elements.
  Other invisible text may still be included.

- textContent is a simple concatenation of all text nodes. innerText
  does some "formatting" by inserting/deleting some whitespace
  characters, including:
  - Insertion of '\n' between blocks of text (e.g., between <div>)
  - Insertion of '\t' between table cells
  This patch uses a custom dump algorithm that performs neither of these
  insertions but still collapses consecutive whitespace characters;
  however, the collapsing happens regardless of style.

[1] crbug.com/803403 and crbug.com/585164. The crash happens in the
    wild, but we haven't even found a stable repro case yet.

Bug: 803403
Change-Id: I7e108d368cbcaccbbb60582323a9e9e041d95269
Reviewed-on: https://chromium-review.googlesource.com/891582
Reviewed-by: Takashi Toyoshima <toyoshim@chromium.org>
Reviewed-by: Kent Tamura <tkent@chromium.org>
Reviewed-by: Rachel Blum <groby@chromium.org>
Commit-Queue: Xiaocheng Hu <xiaochengh@chromium.org>
Cr-Commit-Position: refs/heads/master@{#536969}
parent a85265e3
...@@ -59,11 +59,11 @@ size_t RequiredLineBreaksAround(const Node& node) { ...@@ -59,11 +59,11 @@ size_t RequiredLineBreaksAround(const Node& node) {
// whitespace characters are left as-is, without any collapsing or conversion. // whitespace characters are left as-is, without any collapsing or conversion.
// For example, from HTML <p>\na\n\nb\n</p>, we get text dump "a\n\nb". // For example, from HTML <p>\na\n\nb\n</p>, we get text dump "a\n\nb".
// [*] https://developer.mozilla.org/en-US/docs/Web/API/Node/innerText // [*] https://developer.mozilla.org/en-US/docs/Web/API/Node/innerText
class TextDumper final { class InnerTextDumper final {
STACK_ALLOCATED(); STACK_ALLOCATED();
public: public:
TextDumper(StringBuilder& builder, size_t max_length) InnerTextDumper(StringBuilder& builder, size_t max_length)
: builder_(builder), max_length_(max_length) {} : builder_(builder), max_length_(max_length) {}
void DumpTextFrom(const Node& node) { void DumpTextFrom(const Node& node) {
...@@ -155,11 +155,84 @@ class TextDumper final { ...@@ -155,11 +155,84 @@ class TextDumper final {
StringBuilder& builder_; StringBuilder& builder_;
const size_t max_length_; const size_t max_length_;
DISALLOW_COPY_AND_ASSIGN(TextDumper); DISALLOW_COPY_AND_ASSIGN(InnerTextDumper);
}; };
// Returns true when |element| is a STYLE or SCRIPT element. TextContentDumper
// skips such elements, and everything below them, entirely.
bool TextContentDumperIgnoresElement(const Element& element) {
  if (IsHTMLStyleElement(element))
    return true;
  return IsHTMLScriptElement(element);
}
// Returns true when |ch| is a space, a line feed or a horizontal tab. No other
// character counts as whitespace here.
bool IsWhiteSpace(UChar ch) {
  switch (ch) {
    case ' ':
    case '\n':
    case '\t':
      return true;
    default:
      return false;
  }
}
// This class dumps textContent of a node into a StringBuilder, with the minor
// exception that text nodes in certain elements are ignored (See
// TextContentDumperIgnoresElement()), and consucetive whitespace characters are
// collapsed regardless of style.
// Note: This dumper is for TranslateHelper only. Do not use for other purposes!
class TextContentDumper {
STACK_ALLOCATED();
public:
TextContentDumper(StringBuilder& builder, size_t max_length)
: builder_(builder), max_length_(max_length) {}
void DumpTextFrom(const Element& element) { HandleElement(element, 0); }
private:
void HandleElement(const Element& element, unsigned depth) {
if (depth == text_dumper_max_depth)
return;
if (TextContentDumperIgnoresElement(element))
return;
for (const Node& child : NodeTraversal::ChildrenOf(element)) {
if (child.IsElementNode()) {
HandleElement(ToElement(child), depth + 1);
continue;
}
if (!child.IsTextNode())
continue;
HandleTextNode(ToText(child));
if (builder_.length() >= max_length_)
return;
}
}
void HandleTextNode(const Text& node) {
for (unsigned i = 0;
i < node.data().length() && builder_.length() < max_length_; ++i) {
UChar ch = node.data()[i];
if (ShouldAppendCharacter(ch))
builder_.Append(ch);
}
}
bool ShouldAppendCharacter(UChar ch) const {
if (!IsWhiteSpace(ch))
return true;
if (builder_.IsEmpty())
return true;
if (!IsWhiteSpace(builder_[builder_.length() - 1]))
return true;
return false;
}
StringBuilder& builder_;
const size_t max_length_;
DISALLOW_COPY_AND_ASSIGN(TextContentDumper);
};
// Controls which text dumper FrameContentAsPlainText() runs on each frame's
// document element:
//   kDumpTextContent - TextContentDumper.
//   kDumpInnerText   - InnerTextDumper. FrameContentAsPlainText() DCHECKs that
//                      layout is clean for this option, and skips the text of
//                      non-visible child frames.
enum TextDumpOption { kDumpTextContent, kDumpInnerText };
void FrameContentAsPlainText(size_t max_chars, void FrameContentAsPlainText(size_t max_chars,
LocalFrame* frame, LocalFrame* frame,
TextDumpOption option,
StringBuilder& output) { StringBuilder& output) {
Document* document = frame->GetDocument(); Document* document = frame->GetDocument();
if (!document) if (!document)
...@@ -168,11 +241,21 @@ void FrameContentAsPlainText(size_t max_chars, ...@@ -168,11 +241,21 @@ void FrameContentAsPlainText(size_t max_chars,
if (!frame->View() || frame->View()->ShouldThrottleRendering()) if (!frame->View() || frame->View()->ShouldThrottleRendering())
return; return;
if (option == TextDumpOption::kDumpInnerText) {
// Dumping inner text requires clean layout
DCHECK(!frame->View()->NeedsLayout()); DCHECK(!frame->View()->NeedsLayout());
DCHECK(!document->NeedsLayoutTreeUpdate()); DCHECK(!document->NeedsLayoutTreeUpdate());
}
if (document->documentElement()) if (document->documentElement()) {
TextDumper(output, max_chars).DumpTextFrom(*document->documentElement()); if (option == TextDumpOption::kDumpInnerText) {
InnerTextDumper(output, max_chars)
.DumpTextFrom(*document->documentElement());
} else {
TextContentDumper(output, max_chars)
.DumpTextFrom(*document->documentElement());
}
}
// The separator between frames when the frames are converted to plain text. // The separator between frames when the frames are converted to plain text.
const LChar kFrameSeparator[] = {'\n', '\n'}; const LChar kFrameSeparator[] = {'\n', '\n'};
...@@ -185,7 +268,8 @@ void FrameContentAsPlainText(size_t max_chars, ...@@ -185,7 +268,8 @@ void FrameContentAsPlainText(size_t max_chars,
if (!cur_child->IsLocalFrame()) if (!cur_child->IsLocalFrame())
continue; continue;
LocalFrame* cur_local_child = ToLocalFrame(cur_child); LocalFrame* cur_local_child = ToLocalFrame(cur_child);
// Ignore the text of non-visible frames. // When dumping inner text, ignore the text of non-visible frames.
if (option == TextDumpOption::kDumpInnerText) {
LayoutView* layout_view = cur_local_child->ContentLayoutObject(); LayoutView* layout_view = cur_local_child->ContentLayoutObject();
LayoutObject* owner_layout_object = cur_local_child->OwnerLayoutObject(); LayoutObject* owner_layout_object = cur_local_child->OwnerLayoutObject();
if (!layout_view || !layout_view->Size().Width() || if (!layout_view || !layout_view->Size().Width() ||
...@@ -193,9 +277,11 @@ void FrameContentAsPlainText(size_t max_chars, ...@@ -193,9 +277,11 @@ void FrameContentAsPlainText(size_t max_chars,
(layout_view->Location().X() + layout_view->Size().Width() <= 0) || (layout_view->Location().X() + layout_view->Size().Width() <= 0) ||
(layout_view->Location().Y() + layout_view->Size().Height() <= 0) || (layout_view->Location().Y() + layout_view->Size().Height() <= 0) ||
(owner_layout_object && owner_layout_object->Style() && (owner_layout_object && owner_layout_object->Style() &&
owner_layout_object->Style()->Visibility() != EVisibility::kVisible)) { owner_layout_object->Style()->Visibility() !=
EVisibility::kVisible)) {
continue; continue;
} }
}
// Make sure the frame separator won't fill up the buffer, and give up if // Make sure the frame separator won't fill up the buffer, and give up if
// it will. The danger is if the separator will make the buffer longer than // it will. The danger is if the separator will make the buffer longer than
...@@ -206,7 +292,7 @@ void FrameContentAsPlainText(size_t max_chars, ...@@ -206,7 +292,7 @@ void FrameContentAsPlainText(size_t max_chars,
return; return;
output.Append(kFrameSeparator, frame_separator_length); output.Append(kFrameSeparator, frame_separator_length);
FrameContentAsPlainText(max_chars, cur_local_child, output); FrameContentAsPlainText(max_chars, cur_local_child, option, output);
if (output.length() >= max_chars) if (output.length() >= max_chars)
return; // Filled up the buffer. return; // Filled up the buffer.
} }
...@@ -221,7 +307,7 @@ WebString WebFrameContentDumper::DeprecatedDumpFrameTreeAsText( ...@@ -221,7 +307,7 @@ WebString WebFrameContentDumper::DeprecatedDumpFrameTreeAsText(
return WebString(); return WebString();
StringBuilder text; StringBuilder text;
FrameContentAsPlainText(max_chars, ToWebLocalFrameImpl(frame)->GetFrame(), FrameContentAsPlainText(max_chars, ToWebLocalFrameImpl(frame)->GetFrame(),
text); TextDumpOption::kDumpTextContent, text);
return text.ToString(); return text.ToString();
} }
...@@ -236,7 +322,7 @@ WebString WebFrameContentDumper::DumpWebViewAsText(WebView* web_view, ...@@ -236,7 +322,7 @@ WebString WebFrameContentDumper::DumpWebViewAsText(WebView* web_view,
StringBuilder text; StringBuilder text;
FrameContentAsPlainText(max_chars, ToWebLocalFrameImpl(frame)->GetFrame(), FrameContentAsPlainText(max_chars, ToWebLocalFrameImpl(frame)->GetFrame(),
text); TextDumpOption::kDumpInnerText, text);
return text.ToString(); return text.ToString();
} }
......
...@@ -4984,6 +4984,29 @@ TEST_P(ParameterizedWebFrameTest, GetContentAsPlainText) { ...@@ -4984,6 +4984,29 @@ TEST_P(ParameterizedWebFrameTest, GetContentAsPlainText) {
EXPECT_EQ("Hello world", text.Utf8()); EXPECT_EQ("Hello world", text.Utf8());
} }
// Checks that WebFrameContentDumper::DeprecatedDumpFrameTreeAsText(), the
// entry point used by ChromeRenderFrameObserver::CapturePageText(), can dump
// page text while layout is still dirty.
TEST_P(ParameterizedWebFrameTest, CapturePageTextWithDirtyLayout) {
  FrameTestHelpers::WebViewHelper helper;
  helper.InitializeAndLoad("about:blank");
  WebLocalFrame* main_frame = helper.LocalMainFrame();
  Document* doc = main_frame->GetDocument();
  Element* body_element = doc->body();

  // Replacing the body's innerHTML dirties layout.
  body_element->SetInnerHTMLFromString("<div>Foo bar</div><div></div>baz");

  // Confirm the precondition: no layout tree update has happened yet.
  EXPECT_TRUE(doc->NeedsLayoutTreeUpdate());

  // The deprecated function is called on purpose, to mirror what
  // ChromeRenderFrameObserver::CapturePageText() does. The dump contains no
  // separator between the <div> text and the trailing text.
  WebString captured =
      WebFrameContentDumper::DeprecatedDumpFrameTreeAsText(main_frame, 12);
  EXPECT_EQ("Foo barbaz", captured.Utf8());
}
TEST_P(ParameterizedWebFrameTest, GetFullHtmlOfPage) { TEST_P(ParameterizedWebFrameTest, GetFullHtmlOfPage) {
FrameTestHelpers::WebViewHelper web_view_helper; FrameTestHelpers::WebViewHelper web_view_helper;
web_view_helper.InitializeAndLoad("about:blank"); web_view_helper.InitializeAndLoad("about:blank");
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment