Commit f4dafe02 authored by bryner@chromium.org's avatar bryner@chromium.org

Add an extractor for DOM features to be used for client side phishing detection.

PhishingDOMFeatureExtractor iterates over the page elements and computes a
number of features.  To avoid blocking the renderer for too long, the extractor
may run in several chunks of work, posting a task to continue processing if
necessary.

This CL only includes the feature extraction itself.  I will add the logic to
cap the time per iteration in a follow-up CL.

BUG=none
TEST=PhishingDOMFeatureExtractorTest

Review URL: http://codereview.chromium.org/2878046

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@54082 0039d316-1c4b-4281-b951-d872f2087c98
parent 92608249
...@@ -168,6 +168,8 @@ ...@@ -168,6 +168,8 @@
'renderer/renderer_webstoragenamespace_impl.h', 'renderer/renderer_webstoragenamespace_impl.h',
'renderer/safe_browsing/features.cc', 'renderer/safe_browsing/features.cc',
'renderer/safe_browsing/features.h', 'renderer/safe_browsing/features.h',
'renderer/safe_browsing/phishing_dom_feature_extractor.cc',
'renderer/safe_browsing/phishing_dom_feature_extractor.h',
'renderer/safe_browsing/phishing_url_feature_extractor.cc', 'renderer/safe_browsing/phishing_url_feature_extractor.cc',
'renderer/safe_browsing/phishing_url_feature_extractor.h', 'renderer/safe_browsing/phishing_url_feature_extractor.h',
'renderer/spellchecker/spellcheck.cc', 'renderer/spellchecker/spellcheck.cc',
......
...@@ -1202,6 +1202,7 @@ ...@@ -1202,6 +1202,7 @@
'renderer/renderer_about_handler_unittest.cc', 'renderer/renderer_about_handler_unittest.cc',
'renderer/renderer_main_unittest.cc', 'renderer/renderer_main_unittest.cc',
'renderer/safe_browsing/features_unittest.cc', 'renderer/safe_browsing/features_unittest.cc',
'renderer/safe_browsing/phishing_dom_feature_extractor_unittest.cc',
'renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc', 'renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc',
'renderer/spellchecker/spellcheck_unittest.cc', 'renderer/spellchecker/spellcheck_unittest.cc',
'renderer/spellchecker/spellcheck_worditerator_unittest.cc', 'renderer/spellchecker/spellcheck_worditerator_unittest.cc',
......
...@@ -15,6 +15,10 @@ FeatureMap::FeatureMap() {} ...@@ -15,6 +15,10 @@ FeatureMap::FeatureMap() {}
FeatureMap::~FeatureMap() {} FeatureMap::~FeatureMap() {}
bool FeatureMap::AddBooleanFeature(const std::string& name) { bool FeatureMap::AddBooleanFeature(const std::string& name) {
return AddRealFeature(name, 1.0);
}
bool FeatureMap::AddRealFeature(const std::string& name, double value) {
if (features_.size() >= kMaxFeatureMapSize) { if (features_.size() >= kMaxFeatureMapSize) {
// If we hit this case, it indicates that either kMaxFeatureMapSize is // If we hit this case, it indicates that either kMaxFeatureMapSize is
// too small, or there is a bug causing too many features to be added. // too small, or there is a bug causing too many features to be added.
...@@ -25,7 +29,16 @@ bool FeatureMap::AddBooleanFeature(const std::string& name) { ...@@ -25,7 +29,16 @@ bool FeatureMap::AddBooleanFeature(const std::string& name) {
UMA_HISTOGRAM_COUNTS("SBClientPhishing.TooManyFeatures", 1); UMA_HISTOGRAM_COUNTS("SBClientPhishing.TooManyFeatures", 1);
return false; return false;
} }
features_[name] = 1.0; // We only expect features in the range [0.0, 1.0], so fail if the feature is
// outside this range.
if (value < 0.0 || value > 1.0) {
LOG(ERROR) << "Not adding feature: " << name << " because the value "
<< value << " is not in the range [0.0, 1.0].";
UMA_HISTOGRAM_COUNTS("SBClientPhishing.IllegalFeatureValue", 1);
return false;
}
features_[name] = value;
return true; return true;
} }
...@@ -47,5 +60,25 @@ const char kUrlNumOtherHostTokensGTThree[] = "UrlNumOtherHostTokens>3"; ...@@ -47,5 +60,25 @@ const char kUrlNumOtherHostTokensGTThree[] = "UrlNumOtherHostTokens>3";
// URL path features // URL path features
const char kUrlPathToken[] = "UrlPathToken="; const char kUrlPathToken[] = "UrlPathToken=";
// DOM HTML form features
const char kPageHasForms[] = "PageHasForms";
const char kPageActionOtherDomainFreq[] = "PageActionOtherDomainFreq";
const char kPageHasTextInputs[] = "PageHasTextInputs";
const char kPageHasPswdInputs[] = "PageHasPswdInputs";
const char kPageHasRadioInputs[] = "PageHasRadioInputs";
const char kPageHasCheckInputs[] = "PageHasCheckInputs";
// DOM HTML link features
const char kPageExternalLinksFreq[] = "PageExternalLinksFreq";
const char kPageLinkDomain[] = "PageLinkDomain=";
const char kPageSecureLinksFreq[] = "PageSecureLinksFreq";
// DOM HTML script features
const char kPageNumScriptTagsGTOne[] = "PageNumScriptTags>1";
const char kPageNumScriptTagsGTSix[] = "PageNumScriptTags>6";
// Other DOM HTML features
const char kPageImgOtherDomainFreq[] = "PageImgOtherDomainFreq";
} // namespace features } // namespace features
} // namespace safe_browsing } // namespace safe_browsing
...@@ -44,6 +44,12 @@ class FeatureMap { ...@@ -44,6 +44,12 @@ class FeatureMap {
// kMaxFeatureMapSize. // kMaxFeatureMapSize.
bool AddBooleanFeature(const std::string& name); bool AddBooleanFeature(const std::string& name);
// Adds a real-valued feature to a FeatureMap with the given value.
// Values must always be in the range [0.0, 1.0]. Returns true on
// success, or false if the feature map exceeds kMaxFeatureMapSize
// or the value is outside of the allowed range.
bool AddRealFeature(const std::string& name, double value);
// Provides read-only access to the current set of features. // Provides read-only access to the current set of features.
const base::hash_map<std::string, double>& features() const { const base::hash_map<std::string, double>& features() const {
return features_; return features_;
...@@ -103,6 +109,55 @@ extern const char kUrlNumOtherHostTokensGTThree[]; ...@@ -103,6 +109,55 @@ extern const char kUrlNumOtherHostTokensGTThree[];
// token features, "abc" and "efg". Query parameters are not included. // token features, "abc" and "efg". Query parameters are not included.
extern const char kUrlPathToken[]; extern const char kUrlPathToken[];
////////////////////////////////////////////////////
// DOM HTML form features
////////////////////////////////////////////////////
// Set if the page has any <form> elements.
extern const char kPageHasForms[];
// The fraction of form elements whose |action| attribute points to a
// URL on a different domain from the document URL.
extern const char kPageActionOtherDomainFreq[];
// Set if the page has any <input type="text"> elements
// (includes inputs with missing or unknown types).
extern const char kPageHasTextInputs[];
// Set if the page has any <input type="password"> elements.
extern const char kPageHasPswdInputs[];
// Set if the page has any <input type="radio"> elements.
extern const char kPageHasRadioInputs[];
// Set if the page has any <input type="checkbox"> elements.
extern const char kPageHasCheckInputs[];
////////////////////////////////////////////////////
// DOM HTML link features
////////////////////////////////////////////////////
// The fraction of links in the page which point to a domain other than the
// domain of the document. See "URL host features" above for a discussion
// of how the domain is computed.
extern const char kPageExternalLinksFreq[];
// Token feature containing each external domain that is linked to.
extern const char kPageLinkDomain[];
// Fraction of links in the page that use https.
extern const char kPageSecureLinksFreq[];
////////////////////////////////////////////////////
// DOM HTML script features
////////////////////////////////////////////////////
// Set if the number of <script> elements in the page is greater than 1.
extern const char kPageNumScriptTagsGTOne[];
// Set if the number of <script> elements in the page is greater than 6.
extern const char kPageNumScriptTagsGTSix[];
////////////////////////////////////////////////////
// Other DOM HTML features
////////////////////////////////////////////////////
// The fraction of images whose src attribute points to an external domain.
extern const char kPageImgOtherDomainFreq[];
} // namespace features } // namespace features
} // namespace safe_browsing } // namespace safe_browsing
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#include "base/format_macros.h" #include "base/format_macros.h"
#include "base/string_util.h" #include "base/string_util.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h" #include "testing/gtest/include/gtest/gtest.h"
namespace safe_browsing { namespace safe_browsing {
...@@ -24,4 +25,20 @@ TEST(PhishingFeaturesTest, TooManyFeatures) { ...@@ -24,4 +25,20 @@ TEST(PhishingFeaturesTest, TooManyFeatures) {
EXPECT_EQ(FeatureMap::kMaxFeatureMapSize, features.features().size()); EXPECT_EQ(FeatureMap::kMaxFeatureMapSize, features.features().size());
} }
// Checks the range validation in FeatureMap::AddRealFeature(): values below
// 0.0 or above 1.0 are rejected, the boundary values 0.0 and 1.0 are accepted,
// and rejected features never show up in the resulting map.
TEST(PhishingFeaturesTest, IllegalFeatureValue) {
FeatureMap features;
// Out-of-range values must be refused; in-range values (including both
// endpoints of the interval) must be stored.
EXPECT_FALSE(features.AddRealFeature("toosmall", -0.1));
EXPECT_TRUE(features.AddRealFeature("zero", 0.0));
EXPECT_TRUE(features.AddRealFeature("pointfive", 0.5));
EXPECT_TRUE(features.AddRealFeature("one", 1.0));
EXPECT_FALSE(features.AddRealFeature("toolarge", 1.1));
// Build the expected map from the accepted features only and compare the
// two maps element by element.
FeatureMap expected_features;
expected_features.AddRealFeature("zero", 0.0);
expected_features.AddRealFeature("pointfive", 0.5);
expected_features.AddRealFeature("one", 1.0);
EXPECT_THAT(features.features(),
::testing::ContainerEq(expected_features.features()));
}
} // namespace safe_browsing } // namespace safe_browsing
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
#include "base/compiler_specific.h"
#include "base/hash_tables.h"
#include "base/histogram.h"
#include "base/logging.h"
#include "chrome/renderer/render_view.h"
#include "chrome/renderer/safe_browsing/features.h"
#include "net/base/registry_controlled_domain.h"
#include "third_party/WebKit/WebKit/chromium/public/WebDocument.h"
#include "third_party/WebKit/WebKit/chromium/public/WebElement.h"
#include "third_party/WebKit/WebKit/chromium/public/WebFrame.h"
#include "third_party/WebKit/WebKit/chromium/public/WebNodeCollection.h"
#include "third_party/WebKit/WebKit/chromium/public/WebString.h"
#include "third_party/WebKit/WebKit/chromium/public/WebView.h"
namespace safe_browsing {
// Running totals collected while the DOM is being walked.  The Handle*()
// methods bump these counters; InsertFeatures() turns them into the DOM
// features described in features.h once the walk is complete.
struct PhishingDOMFeatureExtractor::PageFeatureState {
  // Every counter starts at zero; |external_domains| starts out empty.
  PageFeatureState()
      : external_links(0),
        secure_links(0),
        total_links(0),
        num_forms(0),
        num_text_inputs(0),
        num_pswd_inputs(0),
        num_radio_inputs(0),
        num_check_inputs(0),
        action_other_domain(0),
        total_actions(0),
        img_other_domain(0),
        total_imgs(0),
        num_script_tags(0) {}
  ~PageFeatureState() {}

  // ---- Links (<a href=...>) ----
  // Links whose domain differs from the domain of the frame containing them.
  int external_links;
  // The distinct domains seen among the external links.
  base::hash_set<std::string> external_domains;
  // Links whose resolved URL uses the https scheme.
  int secure_links;
  // All links for which a domain could be determined.
  int total_links;

  // ---- Forms and inputs ----
  // Number of <form> elements, with or without an action attribute.
  int num_forms;
  // Inputs treated as text (missing, unknown and unlisted types included).
  int num_text_inputs;
  // <input type="password"> elements.
  int num_pswd_inputs;
  // <input type="radio"> elements.
  int num_radio_inputs;
  // <input type="checkbox"> elements.
  int num_check_inputs;
  // Form actions pointing at a domain other than the frame's domain.
  int action_other_domain;
  // Form actions for which a domain could be determined.
  int total_actions;

  // ---- Images ----
  // Image sources pointing at a domain other than the frame's domain.
  int img_other_domain;
  // Images for which a source domain could be determined.
  int total_imgs;

  // ---- Scripts ----
  // Number of <script> elements.
  int num_script_tags;
};
// Per-frame state.  A fresh FrameData is created by ResetFrameData() each time
// traversal moves to a new frame, and it is dropped again once every element
// of that frame has been visited.
struct PhishingDOMFeatureExtractor::FrameData {
// This is our reference to document.all, which is an iterator over all
// of the elements in the document. It keeps track of our current position,
// so a traversal can be resumed by asking it for the next item.
WebKit::WebNodeCollection elements;
// The domain of the document URL, stored here so that we don't need to
// recompute it every time it's needed.  IsExternalDomain() compares the
// domains of links, form actions and images against this value.
std::string domain;
};
// Binds the extractor to |render_view| (not owned) and starts out idle: no
// output map, no current frame, and empty smart-pointer members.
PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor(
    RenderView* render_view)
    : render_view_(render_view),
      features_(NULL),
      cur_frame_(NULL),
      ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {
  // The scoped_ptr members default to NULL, so together with the initializers
  // above the object is already in the "no extraction pending" state.
}
// Destroys the extractor.  No extraction may be pending at this point.
PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() {
// The RenderView should have called CancelPendingExtraction() before
// we are destroyed.  CheckNoPendingExtraction() DCHECKs this and also logs
// an error if leftover extraction state is found.
CheckNoPendingExtraction();
}
// Starts an asynchronous extraction.  |features| (owned by the caller) will
// receive the results and |done_callback| (ownership is taken) is run when
// the extraction finishes.  The work itself happens in a task posted to the
// current MessageLoop, never synchronously inside this call.
void PhishingDOMFeatureExtractor::ExtractFeatures(
    FeatureMap* features,
    DoneCallback* done_callback) {
  // Callers are expected to cancel any earlier extraction first.  Verify
  // that, but also cancel explicitly so that a build without DCHECKs still
  // begins from a clean slate.
  CheckNoPendingExtraction();
  CancelPendingExtraction();

  features_ = features;
  done_callback_.reset(done_callback);

  // The task comes from |method_factory_| so that CancelPendingExtraction()
  // can revoke it.
  Task* extraction_task = method_factory_.NewRunnableMethod(
      &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout);
  MessageLoop::current()->PostTask(FROM_HERE, extraction_task);
}
// Aborts an extraction that has been started but has not finished yet.  The
// done callback is destroyed without being run.
void PhishingDOMFeatureExtractor::CancelPendingExtraction() {
// Cancel any pending callbacks, and clear our state.  Revoking first makes
// sure no posted ExtractFeaturesWithTimeout task runs against cleared state.
method_factory_.RevokeAll();
Clear();
}
// Walks every element of every frame in the page, dispatching the element
// types we care about to the Handle*() methods, then converts the accumulated
// counts into features and runs the completion callback.  Despite the name,
// no time limit is enforced yet (see the TODO below), so the whole page is
// currently processed in a single call.
void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() {
// A null |cur_frame_| means this is the start of a new extraction rather
// than the continuation of an earlier chunk: begin at the main frame with
// fresh page-level counters.
if (!cur_frame_) {
WebKit::WebView* web_view = render_view_->webview();
if (!web_view) {
// When the WebView is going away, the render view should have called
// CancelPendingExtraction() which should have stopped any pending work,
// so this case should not happen.
NOTREACHED();
RunCallback(false);
return;
}
cur_frame_ = web_view->mainFrame();
page_feature_state_.reset(new PageFeatureState);
}
// Outer loop: one iteration per frame.  traverseNext() yields a null frame
// after the last one because wrapping is disabled, which ends the loop.
for (; cur_frame_;
cur_frame_ = cur_frame_->traverseNext(false /* don't wrap around */)) {
WebKit::WebNode cur_node;
if (cur_frame_data_.get()) {
// We're resuming traversal of a frame, so just advance to the next node.
cur_node = cur_frame_data_->elements.nextItem();
} else {
// We just moved to a new frame, so update our frame state
// and advance to the first element.
if (!ResetFrameData()) {
// Nothing in this frame, move on to the next one.
LOG(WARNING) << "No content in frame, skipping";
continue;
}
cur_node = cur_frame_data_->elements.firstItem();
}
// Inner loop: one iteration per node of the current frame.  The collection
// in |cur_frame_data_| remembers the position between nextItem() calls.
for (; !cur_node.isNull();
cur_node = cur_frame_data_->elements.nextItem()) {
if (!cur_node.isElementNode()) {
continue;
}
// Dispatch on the tag name; elements of any other type are ignored.
WebKit::WebElement element = cur_node.to<WebKit::WebElement>();
if (element.hasTagName("a")) {
HandleLink(element);
} else if (element.hasTagName("form")) {
HandleForm(element);
} else if (element.hasTagName("img")) {
HandleImage(element);
} else if (element.hasTagName("input")) {
HandleInput(element);
} else if (element.hasTagName("script")) {
HandleScript(element);
}
// TODO(bryner): stop if too much time has elapsed, and add histograms
// for the time spent processing.
}
// We're done with this frame, recalculate the FrameData when we
// advance to the next frame.
cur_frame_data_.reset();
}
// All frames have been visited: publish the features and report success.
// RunCallback() also clears the extraction state.
InsertFeatures();
RunCallback(true);
}
// Updates the link counters (total, external, https) for one <a> element.
// Anchors without an href, or whose target domain cannot be determined, do
// not contribute to any counter.
void PhishingDOMFeatureExtractor::HandleLink(
    const WebKit::WebElement& element) {
  if (!element.hasAttribute("href")) {
    DLOG(INFO) << "Skipping anchor tag with no href";
    return;
  }

  // The href may be relative, so resolve it against the owning document.
  WebKit::WebURL target = element.document().completeURL(
      element.getAttribute("href"));

  std::string target_domain;
  const bool points_offsite = IsExternalDomain(target, &target_domain);
  if (target_domain.empty()) {
    LOG(ERROR) << "Could not extract domain from link: " << target;
    return;
  }

  PageFeatureState* state = page_feature_state_.get();
  ++state->total_links;

  if (points_offsite) {
    ++state->external_links;
    // The set keeps one entry per distinct external domain.
    state->external_domains.insert(target_domain);
  }

  // Secure links are counted whether they are internal or external.
  if (GURL(target).SchemeIs("https")) {
    ++state->secure_links;
  }
}
// Updates the form counters for one <form> element: the form itself is always
// counted, and its action (when present and resolvable to a domain) is
// classified as same-domain or other-domain.
void PhishingDOMFeatureExtractor::HandleForm(
    const WebKit::WebElement& element) {
  PageFeatureState* state = page_feature_state_.get();

  // Every form counts, with or without an action attribute.
  ++state->num_forms;

  // Only forms that declare an action take part in the other-domain ratio.
  if (element.hasAttribute("action")) {
    WebKit::WebURL action_url = element.document().completeURL(
        element.getAttribute("action"));

    std::string action_domain;
    const bool posts_offsite = IsExternalDomain(action_url, &action_domain);
    if (action_domain.empty()) {
      LOG(ERROR) << "Could not extract domain from form action: "
                 << action_url;
      return;
    }

    ++state->total_actions;
    if (posts_offsite) {
      ++state->action_other_domain;
    }
  }
}
// Updates the image counters for one <img> element: images whose src resolves
// to a domain are counted in |total_imgs|, and additionally in
// |img_other_domain| when that domain differs from the frame's domain.
void PhishingDOMFeatureExtractor::HandleImage(
    const WebKit::WebElement& element) {
  if (!element.hasAttribute("src")) {
    // There is nothing to classify without a src.  Returning here matches
    // HandleLink(); previously execution fell through and tried to resolve
    // the missing attribute, which does not describe a real image location
    // and (when no domain could be derived from the result) produced a
    // misleading "Could not extract domain" error for every such image.
    DLOG(INFO) << "Skipping img tag with no src";
    return;
  }

  // Resolve the src in case it's relative, then record whether the image
  // points to a different domain.
  WebKit::WebURL full_url = element.document().completeURL(
      element.getAttribute("src"));
  std::string domain;
  bool is_external = IsExternalDomain(full_url, &domain);
  if (domain.empty()) {
    LOG(ERROR) << "Could not extract domain from image src: " << full_url;
    return;
  }

  if (is_external) {
    ++page_feature_state_->img_other_domain;
  }
  ++page_feature_state_->total_imgs;
}
// Classifies one <input> element by its type attribute and bumps the matching
// counter (password, radio, checkbox or text).  Button-like, file, hidden and
// image inputs are not counted at all.
void PhishingDOMFeatureExtractor::HandleInput(
    const WebKit::WebElement& element) {
  // The raw attribute value is used instead of
  // WebFormControlElement::formControlType() so that the result is consistent
  // with the way the phishing classification model is created.
  std::string input_type = element.getAttribute("type").utf8();
  StringToLowerASCII(&input_type);

  PageFeatureState* state = page_feature_state_.get();
  if (input_type == "password") {
    ++state->num_pswd_inputs;
    return;
  }
  if (input_type == "radio") {
    ++state->num_radio_inputs;
    return;
  }
  if (input_type == "checkbox") {
    ++state->num_check_inputs;
    return;
  }

  // Types that cannot be used to type in data.
  const bool not_a_text_field =
      input_type == "submit" || input_type == "reset" ||
      input_type == "file" || input_type == "hidden" ||
      input_type == "image" || input_type == "button";
  if (!not_a_text_field) {
    // Per the HTML spec an unspecified type defaults to text, and any
    // unrecognized type is treated as text too.  That includes the newer
    // HTML5 input types, which for now are considered text inputs since they
    // could be used to capture user input.
    ++state->num_text_inputs;
  }
}
// Counts one <script> element.  Only the number of script tags matters for
// the features, so |element| itself is not inspected.
void PhishingDOMFeatureExtractor::HandleScript(
const WebKit::WebElement& element) {
++page_feature_state_->num_script_tags;
}
// Verifies that no extraction is in progress, i.e. that there is no pending
// callback, no per-frame state and no current frame.
void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() {
// Each condition gets its own DCHECK so that a failure identifies exactly
// which piece of state was left behind.
DCHECK(!done_callback_.get());
DCHECK(!cur_frame_data_.get());
DCHECK(!cur_frame_);
// The same conditions are tested again with a plain branch so that an error
// is logged even where the DCHECKs above do not stop execution.
if (done_callback_.get() || cur_frame_data_.get() || cur_frame_) {
LOG(ERROR) << "Extraction in progress, missing call to "
<< "CancelPendingExtraction";
}
}
// Reports the outcome of the extraction to the caller by running
// |done_callback_| with |success|, then returns the extractor to its idle
// state.  A callback must be pending when this is called.
void PhishingDOMFeatureExtractor::RunCallback(bool success) {
DCHECK(done_callback_.get());
done_callback_->Run(success);
// Clear() destroys the callback and drops the pointer to the caller's
// FeatureMap, so both are used only before this point.
Clear();
}
void PhishingDOMFeatureExtractor::Clear() {
features_ = NULL;
done_callback_.reset(NULL);
cur_frame_data_.reset(NULL);
cur_frame_ = NULL;
}
bool PhishingDOMFeatureExtractor::ResetFrameData() {
DCHECK(cur_frame_);
DCHECK(!cur_frame_data_.get());
WebKit::WebDocument doc = cur_frame_->document();
if (doc.isNull()) {
return false;
}
cur_frame_data_.reset(new FrameData());
cur_frame_data_->elements = doc.all();
cur_frame_data_->domain =
net::RegistryControlledDomainService::GetDomainAndRegistry(
cur_frame_->url());
return true;
}
// Determines the domain of |url| and reports whether it differs from the
// domain of the frame currently being processed.  |*domain| receives the
// host itself for IP-address hosts and the registry-controlled domain
// otherwise; it is written even when the function returns false, except when
// the frame's own domain is unknown, in which case it is left untouched.
bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url,
                                                   std::string* domain) const {
  DCHECK(domain);
  DCHECK(cur_frame_data_.get());

  const std::string& frame_domain = cur_frame_data_->domain;
  if (frame_domain.empty()) {
    // Without a frame domain there is nothing to compare against.
    return false;
  }

  // TODO(bryner): Ensure that the url encoding is consistent with the features
  // in the model.
  *domain = url.HostIsIPAddress()
      ? url.host()
      : net::RegistryControlledDomainService::GetDomainAndRegistry(url);

  if (domain->empty()) {
    return false;
  }
  return *domain != frame_domain;
}
void PhishingDOMFeatureExtractor::InsertFeatures() {
DCHECK(page_feature_state_.get());
features_->Clear();
if (page_feature_state_->total_links > 0) {
// Add a feature for the fraction of times the page links to an external
// domain vs. an internal domain.
double link_freq = static_cast<double>(
page_feature_state_->external_links) /
page_feature_state_->total_links;
features_->AddRealFeature(features::kPageExternalLinksFreq, link_freq);
// Add a feature for each unique domain that we're linking to
for (base::hash_set<std::string>::iterator it =
page_feature_state_->external_domains.begin();
it != page_feature_state_->external_domains.end(); ++it) {
features_->AddBooleanFeature(features::kPageLinkDomain + *it);
}
// Fraction of links that use https.
double secure_freq = static_cast<double>(
page_feature_state_->secure_links) / page_feature_state_->total_links;
features_->AddRealFeature(features::kPageSecureLinksFreq, secure_freq);
}
// Record whether forms appear and whether various form elements appear.
if (page_feature_state_->num_forms > 0) {
features_->AddBooleanFeature(features::kPageHasForms);
}
if (page_feature_state_->num_text_inputs > 0) {
features_->AddBooleanFeature(features::kPageHasTextInputs);
}
if (page_feature_state_->num_pswd_inputs > 0) {
features_->AddBooleanFeature(features::kPageHasPswdInputs);
}
if (page_feature_state_->num_radio_inputs > 0) {
features_->AddBooleanFeature(features::kPageHasRadioInputs);
}
if (page_feature_state_->num_check_inputs > 0) {
features_->AddBooleanFeature(features::kPageHasCheckInputs);
}
// Record fraction of form actions that point to a different domain.
if (page_feature_state_->total_actions > 0) {
double action_freq = static_cast<double>(
page_feature_state_->action_other_domain) /
page_feature_state_->total_actions;
features_->AddRealFeature(features::kPageActionOtherDomainFreq,
action_freq);
}
// Record how many image src attributes point to a different domain.
if (page_feature_state_->total_imgs > 0) {
double img_freq = static_cast<double>(
page_feature_state_->img_other_domain) /
page_feature_state_->total_imgs;
features_->AddRealFeature(features::kPageImgOtherDomainFreq, img_freq);
}
// Record number of script tags (discretized for numerical stability.)
if (page_feature_state_->num_script_tags > 1) {
features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne);
if (page_feature_state_->num_script_tags > 6) {
features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix);
}
}
}
} // namespace safe_browsing
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// PhishingDOMFeatureExtractor handles computing DOM-based features for the
// client-side phishing detection model. These include the presence of various
// types of elements, ratios of external and secure links, and tokens for
// external domains linked to.
#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_
#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_
#include <string>
#include "base/basictypes.h"
#include "base/callback.h"
#include "base/scoped_ptr.h"
#include "base/task.h"
class GURL;
class RenderView;
namespace WebKit {
class WebElement;
class WebFrame;
}
namespace safe_browsing {
class FeatureMap;
// Computes the DOM-based phishing features for the page loaded in a
// RenderView.  An extraction is started with ExtractFeatures(), runs from
// tasks posted to the current MessageLoop, and reports completion through a
// callback.  At most one extraction may be pending at a time.
class PhishingDOMFeatureExtractor {
public:
// Callback to be run when feature extraction finishes. The callback
// argument is true if extraction was successful, false otherwise.
typedef Callback1<bool>::Type DoneCallback;
// Creates a PhishingDOMFeatureExtractor for the specified RenderView.
// The PhishingDOMFeatureExtractor should be destroyed prior to destroying
// the RenderView.
explicit PhishingDOMFeatureExtractor(RenderView* render_view);
~PhishingDOMFeatureExtractor();
// Begins extracting features into the given FeatureMap for the page
// currently loaded in this object's RenderView. To avoid blocking the
// render thread for too long, the feature extractor may run in several
// chunks of work, posting a task to the current MessageLoop to continue
// processing. Once feature extraction is complete, |done_callback|
// is run. PhishingDOMFeatureExtractor takes ownership of the callback.
// |features| remains owned by the caller; any features it already holds are
// cleared before the extracted ones are added.
void ExtractFeatures(FeatureMap* features, DoneCallback* done_callback);
// Cancels any pending feature extraction. The DoneCallback will not be run.
// Must be called if there is a feature extraction in progress when the page
// is unloaded or the PhishingDOMFeatureExtractor is destroyed.
void CancelPendingExtraction();
private:
struct FrameData;
struct PageFeatureState;
// Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout is
// intended to run until a predefined maximum amount of time has elapsed and
// then post a task to the current MessageLoop to continue extraction.
// NOTE: the time cap is not implemented yet (there is a TODO for it in the
// implementation), so the whole page is currently processed in one call.
// When extraction finishes, calls RunCallback().
void ExtractFeaturesWithTimeout();
// Handlers for the various HTML elements that we compute features for.
// Since some of the features (such as ratios) cannot be computed until
// feature extraction is finished, these handlers do not add to the feature
// map directly. Instead, they update the values in the PageFeatureState.
void HandleLink(const WebKit::WebElement& element);
void HandleForm(const WebKit::WebElement& element);
void HandleImage(const WebKit::WebElement& element);
void HandleInput(const WebKit::WebElement& element);
void HandleScript(const WebKit::WebElement& element);
// Helper to verify that there is no pending feature extraction. Dies in
// debug builds if the state is not as expected. In release builds it does
// not die, but still logs an error if leftover extraction state is found.
void CheckNoPendingExtraction();
// Runs |done_callback_| and then clears all internal state.
void RunCallback(bool success);
// Clears all internal feature extraction state.
void Clear();
// Called after advancing |cur_frame_| to update the state in
// |cur_frame_data_|. Returns true if the state was updated successfully,
// or false if the frame has no document.
bool ResetFrameData();
// Given a URL, computes its domain (the host itself for IP addresses) and
// stores it in |domain|. Returns true if that domain is non-empty and
// different from the domain of the current frame's URL, otherwise returns
// false. Note that |domain| is filled in for same-domain URLs too; it is
// only left untouched when the current frame's own domain is unknown.
bool IsExternalDomain(const GURL& url, std::string* domain) const;
// Called once all frames have been processed to compute features from the
// PageFeatureState and add them to |features_|. See features.h for a
// description of which features are computed.
void InsertFeatures();
// Non-owned pointer to the view that we will extract features from.
RenderView* render_view_;
// The output parameters from the most recent call to ExtractFeatures().
FeatureMap* features_; // The caller keeps ownership of this.
scoped_ptr<DoneCallback> done_callback_;
// Non-owned pointer to the current frame that we are processing.
// NULL when no extraction is in progress.
WebKit::WebFrame* cur_frame_;
// Stores extra state for |cur_frame_| that will be persisted until we
// advance to the next frame.
scoped_ptr<FrameData> cur_frame_data_;
// Stores the intermediate data used to create features. This data is
// accumulated across all frames in the RenderView.
scoped_ptr<PageFeatureState> page_feature_state_;
// Used to create ExtractFeaturesWithTimeout tasks.
// These tasks are revoked if extraction is cancelled.
ScopedRunnableMethodFactory<PhishingDOMFeatureExtractor> method_factory_;
DISALLOW_COPY_AND_ASSIGN(PhishingDOMFeatureExtractor);
};
} // namespace safe_browsing
#endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
#include <string.h> // for memcpy()
#include <map>
#include <string>
#include "base/callback.h"
#include "base/command_line.h"
#include "base/message_loop.h"
#include "base/process.h"
#include "base/string_util.h"
#include "chrome/common/main_function_params.h"
#include "chrome/common/render_messages.h"
#include "chrome/common/sandbox_init_wrapper.h"
#include "chrome/renderer/mock_render_process.h"
#include "chrome/renderer/render_thread.h"
#include "chrome/renderer/render_view.h"
#include "chrome/renderer/render_view_visitor.h"
#include "chrome/renderer/renderer_main_platform_delegate.h"
#include "chrome/renderer/safe_browsing/features.h"
#include "googleurl/src/gurl.h"
#include "ipc/ipc_channel.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "third_party/WebKit/WebKit/chromium/public/WebFrame.h"
#include "third_party/WebKit/WebKit/chromium/public/WebURLRequest.h"
#include "third_party/WebKit/WebKit/chromium/public/WebView.h"
#include "webkit/glue/webkit_glue.h"
using ::testing::ContainerEq;
namespace safe_browsing {
class PhishingDOMFeatureExtractorTest : public ::testing::Test,
public IPC::Channel::Listener,
public RenderViewVisitor {
public:
// IPC::Channel::Listener implementation.  Routes the renderer-to-browser
// messages the test cares about (view ready, load finished, resource
// request) to the matching On*() handler below.
virtual void OnMessageReceived(const IPC::Message& message) {
IPC_BEGIN_MESSAGE_MAP(PhishingDOMFeatureExtractorTest, message)
IPC_MESSAGE_HANDLER(ViewHostMsg_RenderViewReady, OnRenderViewReady)
IPC_MESSAGE_HANDLER(ViewHostMsg_DidStopLoading, OnDidStopLoading)
IPC_MESSAGE_HANDLER(ViewHostMsg_RequestResource, OnRequestResource)
IPC_END_MESSAGE_MAP()
}
// RenderViewVisitor implementation.  Remembers the visited view in |view_|.
// NOTE(review): returning false presumably stops RenderView::ForEach() after
// the first view -- confirm against the RenderViewVisitor contract.
virtual bool Visit(RenderView* render_view) {
view_ = render_view;
return false;
}
protected:
// Brings up enough of a renderer to load pages into a RenderView, creates
// that view over IPC, and attaches a PhishingDOMFeatureExtractor to it.
virtual void SetUp() {
// Set up the renderer. This code is largely adapted from
// render_view_test.cc and renderer_main.cc. Note that we use a
// MockRenderProcess (because we don't need to use IPC for painting),
// but we use a real RenderThread so that we can use the ResourceDispatcher
// to fetch network resources. These are then served canned content
// in OnRequestResource().
sandbox_init_wrapper_.reset(new SandboxInitWrapper);
command_line_.reset(new CommandLine(CommandLine::ARGUMENTS_ONLY));
params_.reset(new MainFunctionParams(*command_line_,
*sandbox_init_wrapper_, NULL));
platform_.reset(new RendererMainPlatformDelegate(*params_));
platform_->PlatformInitialize();
// We use a new IPC channel name for each test that runs.
// This is necessary because the renderer-side IPC channel is not
// shut down when the RenderThread goes away, so attempting to reuse
// the channel name gives an error (see ChildThread::~ChildThread()).
std::string thread_name = StringPrintf(
"phishing_dom_feature_Extractor_unittest.%d",
next_thread_id_++);
// The test object plays the browser side of the channel: it is the listener
// for the server end, and the RenderThread created below uses the same
// channel name.
channel_.reset(new IPC::Channel(thread_name,
IPC::Channel::MODE_SERVER, this));
ASSERT_TRUE(channel_->Connect());
// TearDown() calls collectGarbage() on the main frame;
// NOTE(review): this flag presumably is what makes that call available --
// confirm.
webkit_glue::SetJavaScriptFlags(L"--expose-gc");
mock_process_.reset(new MockRenderProcess);
render_thread_ = new RenderThread(thread_name);
mock_process_->set_main_thread(render_thread_);
// Tell the renderer to create a view, then wait until it's ready.
// We can't call View::Create() directly here or else we won't get
// RenderProcess's lazy initialization of WebKit.
view_ = NULL;
ViewMsg_New_Params params;
params.parent_window = 0;
params.view_id = kViewId;
params.session_storage_namespace_id = kInvalidSessionStorageNamespaceId;
ASSERT_TRUE(channel_->Send(new ViewMsg_New(params)));
// Runs until OnRenderViewReady() has stored the new view in |view_| and
// quit the loop.
msg_loop_.Run();
extractor_.reset(new PhishingDOMFeatureExtractor(view_));
}
// Closes the RenderView, waits until it has actually gone away, and then
// releases the renderer objects created in SetUp().
virtual void TearDown() {
// Try very hard to collect garbage before shutting down.
GetMainFrame()->collectGarbage();
GetMainFrame()->collectGarbage();
ASSERT_TRUE(channel_->Send(new ViewMsg_Close(kViewId)));
// Keep pumping the message loop until RenderView::ForEach() no longer
// reports a view (Visit() stores any view it is shown in |view_|).
do {
msg_loop_.RunAllPending();
view_ = NULL;
RenderView::ForEach(this);
} while (view_);
mock_process_.reset();
// Let any tasks posted during process shutdown run before the platform
// delegate is uninitialized.
msg_loop_.RunAllPending();
platform_->PlatformUninitialize();
platform_.reset();
command_line_.reset();
sandbox_init_wrapper_.reset();
}
// Returns the main WebFrame for our RenderView.  Requires |view_| to be set,
// i.e. may only be used between SetUp() and the view's destruction.
WebKit::WebFrame* GetMainFrame() {
return view_->webview()->mainFrame();
}
// Loads |url| into the RenderView, waiting for the load to finish.
void LoadURL(const std::string& url) {
GetMainFrame()->loadRequest(WebKit::WebURLRequest(GURL(url)));
msg_loop_.Run();
}
// Runs the extractor on the RenderView and blocks until it reports
// completion.  Returns the success flag that was passed to the completion
// callback.
bool ExtractFeatures(FeatureMap* features) {
  // Start from "failed" so a stale value from an earlier run is never
  // returned.
  success_ = false;

  // Ownership of the callback passes to the extractor.
  PhishingDOMFeatureExtractor::DoneCallback* on_done =
      NewCallback(this, &PhishingDOMFeatureExtractorTest::ExtractionDone);
  extractor_->ExtractFeatures(features, on_done);

  // ExtractionDone() quits this loop.
  msg_loop_.Run();
  return success_;
}
// Completion callback for feature extraction.  Records the result for
// ExtractFeatures() to return and quits the message loop it is waiting in.
void ExtractionDone(bool success) {
success_ = success;
msg_loop_.Quit();
}
// IPC message handlers below

// The renderer reports that the page load has stopped.  Quit the message
// loop so that whoever is blocked in it (see LoadURL()) can carry on.
void OnDidStopLoading() {
  msg_loop_.Quit();
}
// Notification that the renderer wants to load a resource.
// If the requested url is in responses_, we send the renderer a 200
// and the supplied content, otherwise we send it a 404 error.
//
// |message| supplies the routing id used for the replies, |request_id|
// identifies the request being answered, and |request_data| carries the
// requested URL.  The reply is sent as three messages: the response head,
// the body (through a shared memory handle), and the completion notice.
void OnRequestResource(const IPC::Message& message,
                       int request_id,
                       const ViewHostMsg_Resource_Request& request_data) {
  // The raw header blocks use embedded NULs as line separators and end with
  // a double NUL.  They must therefore be copied with an explicit length:
  // assigning the literal through a const char* would stop at the first NUL
  // and silently drop the Content-Type line.
  static const char kNotFoundHeaders[] =
      "HTTP/1.1 404 Not Found\0Content-Type:text/html\0\0";
  static const char kOkHeaders[] =
      "HTTP/1.1 200 OK\0Content-Type:text/html\0\0";

  std::string headers, body;
  std::map<std::string, std::string>::const_iterator it =
      responses_.find(request_data.url.spec());
  if (it == responses_.end()) {
    // sizeof() counts the literal's implicit terminator; leave it out.
    headers.assign(kNotFoundHeaders, sizeof(kNotFoundHeaders) - 1);
    body = "content not found";
  } else {
    headers.assign(kOkHeaders, sizeof(kOkHeaders) - 1);
    body = it->second;
  }

  // 1. Response head.
  ResourceResponseHead response_head;
  response_head.headers = new net::HttpResponseHeaders(headers);
  response_head.mime_type = "text/html";
  ASSERT_TRUE(channel_->Send(new ViewMsg_Resource_ReceivedResponse(
      message.routing_id(), request_id, response_head)));

  // 2. Body: copy it into shared memory and pass a handle for the current
  // process along with the message.
  base::SharedMemory shared_memory;
  ASSERT_TRUE(shared_memory.Create(std::wstring(), false,
                                   false, body.size()));
  ASSERT_TRUE(shared_memory.Map(body.size()));
  memcpy(shared_memory.memory(), body.data(), body.size());
  base::SharedMemoryHandle handle;
  ASSERT_TRUE(shared_memory.GiveToProcess(base::Process::Current().handle(),
                                          &handle));
  ASSERT_TRUE(channel_->Send(new ViewMsg_Resource_DataReceived(
      message.routing_id(), request_id, handle, body.size())));

  // 3. Completion notice with a default-constructed status.
  ASSERT_TRUE(channel_->Send(new ViewMsg_Resource_RequestComplete(
      message.routing_id(),
      request_id,
      URLRequestStatus(),
      std::string())));
}
// The renderer reports that the view we asked for is ready.  Pick up a
// pointer to it and quit the message loop so the waiting code can continue.
void OnRenderViewReady() {
  // No view may have been recorded yet; RenderView::ForEach() visits the
  // existing views with this fixture as the visitor, after which |view_|
  // must be set.
  ASSERT_TRUE(!view_);
  RenderView::ForEach(this);
  ASSERT_TRUE(view_);
  msg_loop_.Quit();
}
// ---- Fixture state ----
// The static counter is defined (and zero-initialized) right after the class.
static int next_thread_id_; // incrementing counter for thread ids
// Used for both the ViewMsg_New request and the matching ViewMsg_Close.
static const int32 kViewId = 5; // arbitrary id for our testing view
// Loop the helpers block in with Run(); the handlers and the extraction
// callback call Quit() on it to hand control back to the test.
MessageLoopForIO msg_loop_;
// channel that the renderer uses to talk to the browser.
// For this test, we will handle the browser end of the channel.
scoped_ptr<IPC::Channel> channel_;
RenderThread* render_thread_; // owned by mock_process_
scoped_ptr<MockRenderProcess> mock_process_;
// Filled in by OnRenderViewReady() and cleared again during TearDown().
RenderView* view_; // not owned, deletes itself on close
// Renderer startup objects.  platform_, command_line_ and
// sandbox_init_wrapper_ are released explicitly in TearDown().
// NOTE(review): params_ is not reset in TearDown() - confirm this is
// intentional.
scoped_ptr<RendererMainPlatformDelegate> platform_;
scoped_ptr<MainFunctionParams> params_;
scoped_ptr<CommandLine> command_line_;
scoped_ptr<SandboxInitWrapper> sandbox_init_wrapper_;
// The object under test; created at the end of SetUp() from |view_|.
scoped_ptr<PhishingDOMFeatureExtractor> extractor_;
// Map of URL -> response body for network requests from the renderer.
// Any URLs not in this map are served a 404 error.
std::map<std::string, std::string> responses_;
// Written by ExtractionDone(), reset to false at the start of each run.
bool success_; // holds the success value from ExtractFeatures
};
// Out-of-class definition of the fixture's static thread id counter; it
// starts at zero.
int PhishingDOMFeatureExtractorTest::next_thread_id_ = 0;
// Checks the form-related features: presence of forms, the fraction of form
// actions that point to another domain, and the kinds of <input> elements.
TEST_F(PhishingDOMFeatureExtractorTest, FormFeatures) {
  // Every scenario is served from the same URL; only the canned response
  // body changes between loads.
  const char kPageUrl[] = "http://host.com/";
  FeatureMap expected;
  FeatureMap extracted;

  // Scenario 1: five forms plus a text and a checkbox input.  The expected
  // 0.25 corresponds to one external action (other.com) out of the four
  // forms that specify an action.
  responses_[kPageUrl] =
      "<html><head><body>"
      "<form action=\"query\"><input type=text><input type=checkbox></form>"
      "<form action=\"http://cgi.host.com/submit\"></form>"
      "<form action=\"http://other.com/\"></form>"
      "<form action=\"query\"></form>"
      "<form></form></body></html>";
  expected.AddBooleanFeature(features::kPageHasForms);
  expected.AddRealFeature(features::kPageActionOtherDomainFreq, 0.25);
  expected.AddBooleanFeature(features::kPageHasTextInputs);
  expected.AddBooleanFeature(features::kPageHasCheckInputs);
  LoadURL(kPageUrl);
  ASSERT_TRUE(ExtractFeatures(&extracted));
  EXPECT_THAT(extracted.features(), ContainerEq(expected.features()));

  // Scenario 2: radio and password inputs, no form element at all.
  responses_[kPageUrl] =
      "<html><head><body>"
      "<input type=\"radio\"><input type=password></body></html>";
  expected.Clear();
  expected.AddBooleanFeature(features::kPageHasRadioInputs);
  expected.AddBooleanFeature(features::kPageHasPswdInputs);
  extracted.Clear();
  LoadURL(kPageUrl);
  ASSERT_TRUE(ExtractFeatures(&extracted));
  EXPECT_THAT(extracted.features(), ContainerEq(expected.features()));

  // Scenario 3: an <input> without a type attribute is expected to be
  // reported as a text input.
  responses_[kPageUrl] =
      "<html><head><body><input></body></html>";
  expected.Clear();
  expected.AddBooleanFeature(features::kPageHasTextInputs);
  extracted.Clear();
  LoadURL(kPageUrl);
  ASSERT_TRUE(ExtractFeatures(&extracted));
  EXPECT_THAT(extracted.features(), ContainerEq(expected.features()));

  // Scenario 4: an unrecognized type value is expected to be reported as a
  // text input as well.
  responses_[kPageUrl] =
      "<html><head><body><input type=\"invalid\"></body></html>";
  expected.Clear();
  expected.AddBooleanFeature(features::kPageHasTextInputs);
  extracted.Clear();
  LoadURL(kPageUrl);
  ASSERT_TRUE(ExtractFeatures(&extracted));
  EXPECT_THAT(extracted.features(), ContainerEq(expected.features()));
}
// Checks the link-related features: the fraction of links that leave the
// page's domain, the fraction that are secure, and the per-domain feature
// emitted for each external link domain.
TEST_F(PhishingDOMFeatureExtractorTest, LinkFeatures) {
  FeatureMap expected_features;
  FeatureMap features;

  // Plain http page with two href links (the named anchor has no href).
  // One of the two leaves the domain and none is secure.
  responses_["http://www.host.com/"] =
      "<html><head><body>"
      "<a href=\"http://www2.host.com/abc\">link</a>"
      "<a name=page_anchor></a>"
      "<a href=\"http://www.chromium.org/\">chromium</a>"
      "</body></html>";
  expected_features.AddRealFeature(features::kPageExternalLinksFreq, 0.5);
  expected_features.AddRealFeature(features::kPageSecureLinksFreq, 0.0);
  expected_features.AddBooleanFeature(features::kPageLinkDomain +
                                      std::string("chromium.org"));
  LoadURL("http://www.host.com/");
  ASSERT_TRUE(ExtractFeatures(&features));
  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));

  // https page with four links.  The relative link and the explicit https
  // link are the two expected to count as secure (0.5); only chromium.org
  // is on another domain (0.25).
  responses_.clear();
  responses_["https://www.host.com/"] =
      "<html><head><body>"
      "<a href=\"login\">this is secure</a>"
      "<a href=\"http://host.com\">not secure</a>"
      "<a href=\"https://www2.host.com/login\">also secure</a>"
      "<a href=\"http://chromium.org/\">also not secure</a>"
      "</body></html>";
  expected_features.Clear();
  expected_features.AddRealFeature(features::kPageExternalLinksFreq, 0.25);
  expected_features.AddRealFeature(features::kPageSecureLinksFreq, 0.5);
  expected_features.AddBooleanFeature(features::kPageLinkDomain +
                                      std::string("chromium.org"));
  features.Clear();
  LoadURL("https://www.host.com/");
  ASSERT_TRUE(ExtractFeatures(&features));
  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
}
// Checks the script-count threshold features and the fraction of images
// that are loaded from another domain.
TEST_F(PhishingDOMFeatureExtractorTest, ScriptAndImageFeatures) {
  const char kPageUrl[] = "http://host.com/";
  FeatureMap expected;
  FeatureMap extracted;

  // Two script tags: only the "more than one" feature is expected.
  responses_[kPageUrl] =
      "<html><head><script></script><script></script></head></html>";
  expected.AddBooleanFeature(features::kPageNumScriptTagsGTOne);
  LoadURL(kPageUrl);
  ASSERT_TRUE(ExtractFeatures(&extracted));
  EXPECT_THAT(extracted.features(), ContainerEq(expected.features()));

  // Seven script tags: both thresholds are expected.  Of the two images,
  // one is relative and one comes from host2.com, hence 0.5.
  responses_[kPageUrl] =
      "<html><head><script></script><script></script><script></script>"
      "<script></script><script></script><script></script><script></script>"
      "</head><body><img src=\"blah.gif\">"
      "<img src=\"http://host2.com/blah.gif\"></body></html>";
  expected.Clear();
  expected.AddBooleanFeature(features::kPageNumScriptTagsGTOne);
  expected.AddBooleanFeature(features::kPageNumScriptTagsGTSix);
  expected.AddRealFeature(features::kPageImgOtherDomainFreq, 0.5);
  extracted.Clear();
  LoadURL(kPageUrl);
  ASSERT_TRUE(ExtractFeatures(&extracted));
  EXPECT_THAT(extracted.features(), ContainerEq(expected.features()));
}
// Loads a page with two iframes, one of which nests a third document, and
// checks the features extracted from the whole frame tree.
TEST_F(PhishingDOMFeatureExtractorTest, SubFrames) {
  // Test that features are aggregated across all frames.

  // Top-level page: a text input, one relative link and two iframes.
  responses_["http://host.com/"] =
      "<html><body><input type=text><a href=\"info.html\">link</a>"
      "<iframe src=\"http://host2.com/\"></iframe>"
      "<iframe src=\"http://host3.com/\"></iframe>"
      "</body></html>";

  // First iframe: one script, two forms, one link and a nested iframe.
  responses_["http://host2.com/"] =
      "<html><head><script></script><body>"
      "<form action=\"http://host4.com/\"><input type=checkbox></form>"
      "<form action=\"http://host2.com/submit\"></form>"
      "<a href=\"http://www.host2.com/home\">link</a>"
      "<iframe src=\"nested.html\"></iframe>"
      "</body></html>";

  // Nested iframe inside host2.com: a password input and two links.
  responses_["http://host2.com/nested.html"] =
      "<html><body><input type=password>"
      "<a href=\"https://host4.com/\">link</a>"
      "<a href=\"relative\">another</a>"
      "</body></html>";

  // Second iframe: one script and one image served from host.com.
  responses_["http://host3.com/"] =
      "<html><head><script></script><body>"
      "<img src=\"http://host.com/123.png\">"
      "</body></html>";

  FeatureMap expected_features;
  expected_features.AddBooleanFeature(features::kPageHasForms);
  // Form action domains are compared to the URL of the document they're in,
  // not the URL of the toplevel page.  So http://host2.com/ has two form
  // actions, one of which is external.
  expected_features.AddRealFeature(features::kPageActionOtherDomainFreq, 0.5);
  expected_features.AddBooleanFeature(features::kPageHasTextInputs);
  expected_features.AddBooleanFeature(features::kPageHasPswdInputs);
  expected_features.AddBooleanFeature(features::kPageHasCheckInputs);
  // Four links across all documents; only the https://host4.com/ link in the
  // nested frame is expected to count as external and as secure.
  expected_features.AddRealFeature(features::kPageExternalLinksFreq, 0.25);
  expected_features.AddBooleanFeature(features::kPageLinkDomain +
                                      std::string("host4.com"));
  expected_features.AddRealFeature(features::kPageSecureLinksFreq, 0.25);
  // One script in each of host2.com and host3.com, two in total.
  expected_features.AddBooleanFeature(features::kPageNumScriptTagsGTOne);
  // The only image sits in the host3.com document and comes from host.com.
  expected_features.AddRealFeature(features::kPageImgOtherDomainFreq, 1.0);

  FeatureMap features;
  LoadURL("http://host.com/");
  ASSERT_TRUE(ExtractFeatures(&features));
  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
}
// TODO(bryner): Test extraction with multiple passes, including the case where
// the node we stopped on is removed from the document.
} // namespace safe_browsing
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment