Commit f4dafe02 authored by bryner@chromium.org's avatar bryner@chromium.org

Add an extractor for DOM features to be used for client side phishing detection.

PhishingDOMFeatureExtractor iterates over the page elements and computes a
number of features.  To avoid blocking the renderer for too long, the extractor
may run in several chunks of work, posting a task to continue processing if
necessary.

This CL only includes the feature extraction itself.  I will add the logic to
cap the time per iteration in a follow-up CL.

BUG=none
TEST=PhishingDOMFeatureExtractorTest

Review URL: http://codereview.chromium.org/2878046

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@54082 0039d316-1c4b-4281-b951-d872f2087c98
parent 92608249
......@@ -168,6 +168,8 @@
'renderer/renderer_webstoragenamespace_impl.h',
'renderer/safe_browsing/features.cc',
'renderer/safe_browsing/features.h',
'renderer/safe_browsing/phishing_dom_feature_extractor.cc',
'renderer/safe_browsing/phishing_dom_feature_extractor.h',
'renderer/safe_browsing/phishing_url_feature_extractor.cc',
'renderer/safe_browsing/phishing_url_feature_extractor.h',
'renderer/spellchecker/spellcheck.cc',
......
......@@ -1202,6 +1202,7 @@
'renderer/renderer_about_handler_unittest.cc',
'renderer/renderer_main_unittest.cc',
'renderer/safe_browsing/features_unittest.cc',
'renderer/safe_browsing/phishing_dom_feature_extractor_unittest.cc',
'renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc',
'renderer/spellchecker/spellcheck_unittest.cc',
'renderer/spellchecker/spellcheck_worditerator_unittest.cc',
......
......@@ -15,6 +15,10 @@ FeatureMap::FeatureMap() {}
FeatureMap::~FeatureMap() {}
bool FeatureMap::AddBooleanFeature(const std::string& name) {
  // A boolean feature is stored as a real-valued feature whose value is 1.0,
  // so all size and range checking is delegated to AddRealFeature().
  const double kBooleanFeatureValue = 1.0;
  return AddRealFeature(name, kBooleanFeatureValue);
}
bool FeatureMap::AddRealFeature(const std::string& name, double value) {
if (features_.size() >= kMaxFeatureMapSize) {
// If we hit this case, it indicates that either kMaxFeatureMapSize is
// too small, or there is a bug causing too many features to be added.
......@@ -25,7 +29,16 @@ bool FeatureMap::AddBooleanFeature(const std::string& name) {
UMA_HISTOGRAM_COUNTS("SBClientPhishing.TooManyFeatures", 1);
return false;
}
features_[name] = 1.0;
// We only expect features in the range [0.0, 1.0], so fail if the feature is
// outside this range.
if (value < 0.0 || value > 1.0) {
LOG(ERROR) << "Not adding feature: " << name << " because the value "
<< value << " is not in the range [0.0, 1.0].";
UMA_HISTOGRAM_COUNTS("SBClientPhishing.IllegalFeatureValue", 1);
return false;
}
features_[name] = value;
return true;
}
......@@ -47,5 +60,25 @@ const char kUrlNumOtherHostTokensGTThree[] = "UrlNumOtherHostTokens>3";
// URL path features
const char kUrlPathToken[] = "UrlPathToken=";
// DOM HTML form features
// Set if the page has any <form> elements.
const char kPageHasForms[] = "PageHasForms";
// Fraction of forms whose |action| points to a different domain than the
// document URL.
const char kPageActionOtherDomainFreq[] = "PageActionOtherDomainFreq";
// Set if the page has any text inputs (including inputs with a missing or
// unknown type).
const char kPageHasTextInputs[] = "PageHasTextInputs";
// Set if the page has any password inputs.
const char kPageHasPswdInputs[] = "PageHasPswdInputs";
// Set if the page has any radio inputs.
const char kPageHasRadioInputs[] = "PageHasRadioInputs";
// Set if the page has any checkbox inputs.
const char kPageHasCheckInputs[] = "PageHasCheckInputs";
// DOM HTML link features
// Fraction of links that point to a domain other than the document's domain.
const char kPageExternalLinksFreq[] = "PageExternalLinksFreq";
// Token feature prefix; one feature is emitted per external domain linked to.
const char kPageLinkDomain[] = "PageLinkDomain=";
// Fraction of links in the page that use https.
const char kPageSecureLinksFreq[] = "PageSecureLinksFreq";
// DOM HTML script features
// Set if the page has more than one <script> element.
const char kPageNumScriptTagsGTOne[] = "PageNumScriptTags>1";
// Set if the page has more than six <script> elements.
const char kPageNumScriptTagsGTSix[] = "PageNumScriptTags>6";
// Other DOM HTML features
// Fraction of images whose src attribute points to an external domain.
const char kPageImgOtherDomainFreq[] = "PageImgOtherDomainFreq";
} // namespace features
} // namespace safe_browsing
......@@ -44,6 +44,12 @@ class FeatureMap {
// kMaxFeatureMapSize.
bool AddBooleanFeature(const std::string& name);
// Adds a real-valued feature to a FeatureMap with the given value.
// Values must always be in the range [0.0, 1.0]. Returns true on
// success, or false if the feature map exceeds kMaxFeatureMapSize
// or the value is outside of the allowed range.
bool AddRealFeature(const std::string& name, double value);
// Provides read-only access to the current set of features.
const base::hash_map<std::string, double>& features() const {
return features_;
......@@ -103,6 +109,55 @@ extern const char kUrlNumOtherHostTokensGTThree[];
// token features, "abc" and "efg". Query parameters are not included.
extern const char kUrlPathToken[];
////////////////////////////////////////////////////
// DOM HTML form features
////////////////////////////////////////////////////
// Set if the page has any <form> elements.
extern const char kPageHasForms[];
// The fraction of form elements whose |action| attribute points to a
// URL on a different domain from the document URL.
extern const char kPageActionOtherDomainFreq[];
// Set if the page has any <input type="text"> elements
// (includes inputs with missing or unknown types).
extern const char kPageHasTextInputs[];
// Set if the page has any <input type="password"> elements.
extern const char kPageHasPswdInputs[];
// Set if the page has any <input type="radio"> elements.
extern const char kPageHasRadioInputs[];
// Set if the page has any <input type="checkbox"> elements.
extern const char kPageHasCheckInputs[];
////////////////////////////////////////////////////
// DOM HTML link features
////////////////////////////////////////////////////
// The fraction of links in the page which point to a domain other than the
// domain of the document. See "URL host features" above for a discussion
// of how the domain is computed.
extern const char kPageExternalLinksFreq[];
// Token feature containing each external domain that is linked to.
extern const char kPageLinkDomain[];
// Fraction of links in the page that use https.
extern const char kPageSecureLinksFreq[];
////////////////////////////////////////////////////
// DOM HTML script features
////////////////////////////////////////////////////
// Set if the number of <script> elements in the page is greater than 1.
extern const char kPageNumScriptTagsGTOne[];
// Set if the number of <script> elements in the page is greater than 6.
extern const char kPageNumScriptTagsGTSix[];
////////////////////////////////////////////////////
// Other DOM HTML features
////////////////////////////////////////////////////
// The fraction of images whose src attribute points to an external domain.
extern const char kPageImgOtherDomainFreq[];
} // namespace features
} // namespace safe_browsing
......
......@@ -6,6 +6,7 @@
#include "base/format_macros.h"
#include "base/string_util.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"
namespace safe_browsing {
......@@ -24,4 +25,20 @@ TEST(PhishingFeaturesTest, TooManyFeatures) {
EXPECT_EQ(FeatureMap::kMaxFeatureMapSize, features.features().size());
}
TEST(PhishingFeaturesTest, IllegalFeatureValue) {
  // Only values inside [0.0, 1.0], endpoints included, are expected to be
  // stored in the map.
  FeatureMap expected;
  expected.AddRealFeature("zero", 0.0);
  expected.AddRealFeature("pointfive", 0.5);
  expected.AddRealFeature("one", 1.0);

  // Values just outside the allowed range must be rejected, while the
  // boundary values and an interior value must be accepted.
  FeatureMap actual;
  EXPECT_FALSE(actual.AddRealFeature("toosmall", -0.1));
  EXPECT_TRUE(actual.AddRealFeature("zero", 0.0));
  EXPECT_TRUE(actual.AddRealFeature("pointfive", 0.5));
  EXPECT_TRUE(actual.AddRealFeature("one", 1.0));
  EXPECT_FALSE(actual.AddRealFeature("toolarge", 1.1));

  // Rejected features must not have left any entry behind.
  EXPECT_THAT(actual.features(),
              ::testing::ContainerEq(expected.features()));
}
} // namespace safe_browsing
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// PhishingDOMFeatureExtractor handles computing DOM-based features for the
// client-side phishing detection model. These include the presence of various
// types of elements, ratios of external and secure links, and tokens for
// external domains linked to.
#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_
#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_
#include <string>
#include "base/basictypes.h"
#include "base/callback.h"
#include "base/scoped_ptr.h"
#include "base/task.h"
class GURL;
class RenderView;
namespace WebKit {
class WebElement;
class WebFrame;
}
namespace safe_browsing {
class FeatureMap;
// Computes DOM-based features for the page loaded in a RenderView and writes
// them into a caller-supplied FeatureMap. Extraction may be spread over
// several tasks on the current MessageLoop; completion is reported through a
// DoneCallback.
class PhishingDOMFeatureExtractor {
public:
// Callback to be run when feature extraction finishes. The callback
// argument is true if extraction was successful, false otherwise.
typedef Callback1<bool>::Type DoneCallback;
// Creates a PhishingDOMFeatureExtractor for the specified RenderView.
// The PhishingDOMFeatureExtractor should be destroyed prior to destroying
// the RenderView; |render_view| is held as a non-owned pointer.
explicit PhishingDOMFeatureExtractor(RenderView* render_view);
~PhishingDOMFeatureExtractor();
// Begins extracting features into the given FeatureMap for the page
// currently loaded in this object's RenderView. To avoid blocking the
// render thread for too long, the feature extractor may run in several
// chunks of work, posting a task to the current MessageLoop to continue
// processing. Once feature extraction is complete, |done_callback|
// is run. PhishingDOMFeatureExtractor takes ownership of the callback.
// The caller keeps ownership of |features|.
// NOTE(review): |features| presumably must stay valid until the callback
// runs or the extraction is cancelled -- confirm against the .cc file.
void ExtractFeatures(FeatureMap* features, DoneCallback* done_callback);
// Cancels any pending feature extraction. The DoneCallback will not be run.
// Must be called if there is a feature extraction in progress when the page
// is unloaded or the PhishingDOMFeatureExtractor is destroyed.
void CancelPendingExtraction();
private:
// Per-frame and per-page bookkeeping; only forward-declared in this header.
struct FrameData;
struct PageFeatureState;
// Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs
// until a predefined maximum amount of time has elapsed, then posts a task
// to the current MessageLoop to continue extraction. When extraction
// finishes, calls RunCallback().
// NOTE(review): the time cap may not be implemented yet in this revision;
// confirm against the .cc file.
void ExtractFeaturesWithTimeout();
// Handlers for the various HTML elements that we compute features for.
// Since some of the features (such as ratios) cannot be computed until
// feature extraction is finished, these handlers do not add to the feature
// map directly. Instead, they update the values in the PageFeatureState.
void HandleLink(const WebKit::WebElement& element);
void HandleForm(const WebKit::WebElement& element);
void HandleImage(const WebKit::WebElement& element);
void HandleInput(const WebKit::WebElement& element);
void HandleScript(const WebKit::WebElement& element);
// Helper to verify that there is no pending feature extraction. Dies in
// debug builds if the state is not as expected. This is a no-op in release
// builds.
void CheckNoPendingExtraction();
// Runs |done_callback_| with |success| and then clears all internal state.
void RunCallback(bool success);
// Clears all internal feature extraction state.
void Clear();
// Called after advancing |cur_frame_| to update the state in
// |cur_frame_data_|. Returns true if the state was updated successfully.
bool ResetFrameData();
// Given a URL, checks whether the domain is different from the domain of
// the current frame's URL. If so, stores the domain in |domain| and returns
// true, otherwise returns false.
bool IsExternalDomain(const GURL& url, std::string* domain) const;
// Called once all frames have been processed to compute features from the
// PageFeatureState and add them to |features_|. See features.h for a
// description of which features are computed.
void InsertFeatures();
// Non-owned pointer to the view that we will extract features from.
RenderView* render_view_;
// The output parameters from the most recent call to ExtractFeatures().
FeatureMap* features_; // The caller keeps ownership of this.
scoped_ptr<DoneCallback> done_callback_;
// Non-owned pointer to the current frame that we are processing.
WebKit::WebFrame* cur_frame_;
// Stores extra state for |cur_frame_| that will be persisted until we
// advance to the next frame.
scoped_ptr<FrameData> cur_frame_data_;
// Stores the intermediate data used to create features. This data is
// accumulated across all frames in the RenderView.
scoped_ptr<PageFeatureState> page_feature_state_;
// Used to create ExtractFeaturesWithTimeout tasks.
// These tasks are revoked if extraction is cancelled.
ScopedRunnableMethodFactory<PhishingDOMFeatureExtractor> method_factory_;
// Copying and assignment are disabled for this class.
DISALLOW_COPY_AND_ASSIGN(PhishingDOMFeatureExtractor);
};
} // namespace safe_browsing
#endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment