Commit 98aaff50 authored by zysxqn@google.com's avatar zysxqn@google.com

Extracting page shingle hashes for similarity detection.

BUG=

Review URL: https://codereview.chromium.org/268673007

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@269976 0039d316-1c4b-4281-b951-d872f2087c98
parent e221f9f6
......@@ -88,4 +88,10 @@ message ClientSideModel {
// Murmur hash seed that was used to hash the page words.
optional fixed32 murmur_hash_seed = 8;
// Maximum number of unique shingle hashes per page.
optional int32 max_shingles_per_page = 9 [default = 200];
// The number of words in a shingle.
optional int32 shingle_size = 10 [default = 4];
}
......@@ -68,6 +68,9 @@ message ClientPhishingRequest {
optional string OBSOLETE_referrer_url = 9;
// Field 11 is only used on the server.
// List of shingle hashes we extracted.
repeated uint32 shingle_hashes = 12 [packed = true];
}
message ClientPhishingResponse {
......
......@@ -63,6 +63,8 @@ void PhishingClassifier::set_phishing_scorer(const Scorer* scorer) {
&scorer_->page_words(),
scorer_->max_words_per_term(),
scorer_->murmurhash3_seed(),
scorer_->max_shingles_per_page(),
scorer_->shingle_size(),
clock_.get()));
} else {
// We're disabling client-side phishing detection, so tear down all
......@@ -154,12 +156,14 @@ void PhishingClassifier::CancelPendingClassification() {
}
void PhishingClassifier::DOMExtractionFinished(bool success) {
shingle_hashes_.reset(new std::set<uint32>);
if (success) {
// Term feature extraction can take awhile, so it runs asynchronously
// in several chunks of work and invokes the callback when finished.
term_extractor_->ExtractFeatures(
page_text_,
features_.get(),
shingle_hashes_.get(),
base::Bind(&PhishingClassifier::TermExtractionFinished,
base::Unretained(this)));
} else {
......@@ -197,6 +201,10 @@ void PhishingClassifier::TermExtractionFinished(bool success) {
feature->set_name(it->first);
feature->set_value(it->second);
}
for (std::set<uint32>::const_iterator it = shingle_hashes_->begin();
it != shingle_hashes_->end(); ++it) {
verdict.add_shingle_hashes(*it);
}
float score = static_cast<float>(scorer_->ComputeScore(hashed_features));
verdict.set_client_score(score);
verdict.set_is_phishing(score >= kPhishyThreshold);
......@@ -236,6 +244,7 @@ void PhishingClassifier::Clear() {
page_text_ = NULL;
done_callback_.Reset();
features_.reset(NULL);
shingle_hashes_.reset(NULL);
}
} // namespace safe_browsing
......@@ -18,6 +18,8 @@
#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_
#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_
#include <set>
#include "base/basictypes.h"
#include "base/callback.h"
#include "base/memory/scoped_ptr.h"
......@@ -135,6 +137,7 @@ class PhishingClassifier {
// State for any in-progress extraction.
scoped_ptr<FeatureMap> features_;
scoped_ptr<std::set<uint32> > shingle_hashes_;
const base::string16* page_text_; // owned by the caller
DoneCallback done_callback_;
......
......@@ -93,6 +93,8 @@ class PhishingClassifierTest : public InProcessBrowserTest {
model.set_murmur_hash_seed(2777808611U);
model.add_page_word(MurmurHash3String("login", model.murmur_hash_seed()));
model.set_max_words_per_term(1);
model.set_max_shingles_per_page(100);
model.set_shingle_size(3);
clock_ = new MockFeatureExtractorClock;
scorer_.reset(Scorer::Create(model.SerializeAsString()));
......
......@@ -38,14 +38,19 @@ const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5;
// actual phishing page.
const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;
// The maximum size of the negative word cache.
const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000;
// All of the state pertaining to the current feature extraction.
struct PhishingTermFeatureExtractor::ExtractionState {
// Stores up to max_words_per_term_ previous words separated by spaces.
std::string previous_words;
// Stores the current shingle after a new word is processed and added in.
std::string current_shingle;
// Stores the sizes of the words in current_shingle. Note: the size includes
// the space after each word. In other words, the sum of all sizes in this
// list is equal to the length of current_shingle.
std::list<size_t> shingle_word_sizes;
// Stores the sizes of the words in previous_words. Note: the size includes
// the space after each word. In other words, the sum of all sizes in this
// list is equal to the length of previous_words.
......@@ -81,12 +86,15 @@ PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
const base::hash_set<uint32>* page_word_hashes,
size_t max_words_per_term,
uint32 murmurhash3_seed,
size_t max_shingles_per_page,
size_t shingle_size,
FeatureExtractorClock* clock)
: page_term_hashes_(page_term_hashes),
page_word_hashes_(page_word_hashes),
max_words_per_term_(max_words_per_term),
murmurhash3_seed_(murmurhash3_seed),
negative_word_cache_(kMaxNegativeWordCacheSize),
max_shingles_per_page_(max_shingles_per_page),
shingle_size_(shingle_size),
clock_(clock),
weak_factory_(this) {
Clear();
......@@ -101,6 +109,7 @@ PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
void PhishingTermFeatureExtractor::ExtractFeatures(
const base::string16* page_text,
FeatureMap* features,
std::set<uint32>* shingle_hashes,
const DoneCallback& done_callback) {
// The RenderView should have called CancelPendingExtraction() before
// starting a new extraction, so DCHECK this.
......@@ -111,6 +120,7 @@ void PhishingTermFeatureExtractor::ExtractFeatures(
page_text_ = page_text;
features_ = features;
shingle_hashes_ = shingle_hashes,
done_callback_ = done_callback;
state_.reset(new ExtractionState(*page_text_, clock_->Now()));
......@@ -184,18 +194,24 @@ void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
void PhishingTermFeatureExtractor::HandleWord(
const base::StringPiece16& word) {
// Quickest out if we have seen this word before and know that it's not
// part of any term. This avoids the lowercasing and UTF conversion, both of
// which are relatively expensive.
if (negative_word_cache_.Get(word) != negative_word_cache_.end()) {
// We know we're no longer in a possible n-gram, so clear the previous word
// state.
state_->previous_words.clear();
state_->previous_word_sizes.clear();
return;
// First, extract shingle hashes.
const std::string& word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));
state_->current_shingle.append(word_lower + " ");
state_->shingle_word_sizes.push_back(word_lower.size() + 1);
if (state_->shingle_word_sizes.size() == shingle_size_) {
shingle_hashes_->insert(
MurmurHash3String(state_->current_shingle, murmurhash3_seed_));
state_->current_shingle.erase(0, state_->shingle_word_sizes.front());
state_->shingle_word_sizes.pop_front();
}
// Check if the size of shingle hashes is over the limit.
if (shingle_hashes_->size() > max_shingles_per_page_) {
// Pop the largest one.
std::set<uint32>::iterator it = shingle_hashes_->end();
shingle_hashes_->erase(--it);
}
std::string word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));
// Next, extract page terms.
uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);
// Quick out if the word is not part of any term, which is the common case.
......@@ -203,8 +219,6 @@ void PhishingTermFeatureExtractor::HandleWord(
// Word doesn't exist in our terms so we can clear the n-gram state.
state_->previous_words.clear();
state_->previous_word_sizes.clear();
// Insert into negative cache so that we don't try this again.
negative_word_cache_.Put(word, true);
return;
}
......@@ -276,9 +290,9 @@ void PhishingTermFeatureExtractor::RunCallback(bool success) {
// Resets all per-extraction state so the extractor can be reused for a new
// page. The page text, feature map, and shingle-hash set are owned by the
// caller, so the pointers are simply dropped here, not freed.
void PhishingTermFeatureExtractor::Clear() {
page_text_ = NULL;
features_ = NULL;
shingle_hashes_ = NULL;
done_callback_.Reset();
state_.reset(NULL);
// NOTE(review): the negative-word cache (and its member declaration) is being
// removed elsewhere in this change — confirm this reset line is meant to stay.
negative_word_cache_.Clear();
}
} // namespace safe_browsing
......@@ -16,12 +16,12 @@
#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
#include <set>
#include <string>
#include "base/basictypes.h"
#include "base/callback.h"
#include "base/containers/hash_tables.h"
#include "base/containers/mru_cache.h"
#include "base/memory/scoped_ptr.h"
#include "base/memory/weak_ptr.h"
#include "base/strings/string16.h"
......@@ -47,6 +47,11 @@ class PhishingTermFeatureExtractor {
// must ensure that they are valid until the PhishingTermFeatureExtractor is
// destroyed.
//
// In addition to extracting page terms, we also extract a text shingling
// sketch, which consists of hashes of N-word shingles (word N-grams) found
// in the page. |shingle_size| defines N, and |max_shingles_per_page| defines
// the maximum number of unique shingle hashes we extract per page.
//
// |clock| is used for timing feature extractor operations, and may be mocked
// for testing. The caller keeps ownership of the clock.
PhishingTermFeatureExtractor(
......@@ -54,6 +59,8 @@ class PhishingTermFeatureExtractor {
const base::hash_set<uint32>* page_word_hashes,
size_t max_words_per_term,
uint32 murmurhash3_seed,
size_t max_shingles_per_page,
size_t shingle_size,
FeatureExtractorClock* clock);
~PhishingTermFeatureExtractor();
......@@ -67,11 +74,12 @@ class PhishingTermFeatureExtractor {
// |done_callback| is run on the current thread.
// PhishingTermFeatureExtractor takes ownership of the callback.
//
// |page_text| and |features| are owned by the caller, and must not be
// destroyed until either |done_callback| is run or
// |page_text|, |features|, and |shingle_hashes| are owned by the caller,
// and must not be destroyed until either |done_callback| is run or
// CancelPendingExtraction() is called.
void ExtractFeatures(const base::string16* page_text,
FeatureMap* features,
std::set<uint32>* shingle_hashes,
const DoneCallback& done_callback);
// Cancels any pending feature extraction. The DoneCallback will not be run.
......@@ -95,10 +103,6 @@ class PhishingTermFeatureExtractor {
// before giving up on the current page.
static const int kMaxTotalTimeMs;
// The size of the cache that we use to determine if we can avoid lower
// casing, hashing, and UTF conversion.
static const int kMaxNegativeWordCacheSize;
// Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs
// until a predefined maximum amount of time has elapsed, then posts a task
// to the current MessageLoop to continue extraction. When extraction
......@@ -135,12 +139,11 @@ class PhishingTermFeatureExtractor {
// The seed for murmurhash3.
const uint32 murmurhash3_seed_;
// This cache is used to see if we need to check the word at all, as
// converting to UTF8, lowercasing, and hashing are all relatively expensive
// operations. Though this is called an MRU cache, it seems to behave like
// an LRU cache (i.e. it evicts the oldest accesses first).
typedef base::HashingMRUCache<base::StringPiece16, bool> WordCache;
WordCache negative_word_cache_;
// The maximum number of unique shingle hashes we extract in a page.
const size_t max_shingles_per_page_;
// The number of words in a shingle.
const size_t shingle_size_;
// Non-owned pointer to our clock.
FeatureExtractorClock* clock_;
......@@ -148,6 +151,7 @@ class PhishingTermFeatureExtractor {
// The output parameters from the most recent call to ExtractFeatures().
const base::string16* page_text_; // The caller keeps ownership of this.
FeatureMap* features_; // The caller keeps ownership of this.
std::set<uint32>* shingle_hashes_;
DoneCallback done_callback_;
// Stores the current state of term extraction from |page_text_|.
......
......@@ -26,6 +26,9 @@
using base::ASCIIToUTF16;
using ::testing::Return;
static const uint32 kMurmurHash3Seed = 2777808611U;
namespace safe_browsing {
class PhishingTermFeatureExtractorTest : public ::testing::Test {
......@@ -63,27 +66,35 @@ class PhishingTermFeatureExtractorTest : public ::testing::Test {
words.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
words.insert("\xe5\x86\x8d\xe8\xa7\x81");
static const uint32 kMurmurHash3Seed = 2777808611U;
for (base::hash_set<std::string>::iterator it = words.begin();
it != words.end(); ++it) {
word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed));
}
ResetExtractor(3 /* max shingles per page */);
}
// Rebuilds |extractor_| with the given |max_shingles_per_page| cap while
// keeping the other construction parameters fixed (shared term/word hash
// sets, 3 words per term, the test murmur seed, shingle size of 4), so a
// test can vary only the shingle limit.
void ResetExtractor(size_t max_shingles_per_page) {
extractor_.reset(new PhishingTermFeatureExtractor(
&term_hashes_,
&word_hashes_,
3 /* max_words_per_term */,
kMurmurHash3Seed,
max_shingles_per_page,
4 /* shingle_size */,
&clock_));
}
// Runs the TermFeatureExtractor on |page_text|, waiting for the
// completion callback. Returns the success boolean from the callback.
bool ExtractFeatures(const base::string16* page_text, FeatureMap* features) {
bool ExtractFeatures(const base::string16* page_text,
FeatureMap* features,
std::set<uint32>* shingle_hashes) {
success_ = false;
extractor_->ExtractFeatures(
page_text,
features,
shingle_hashes,
base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
base::Unretained(this)));
msg_loop_.Run();
......@@ -91,10 +102,12 @@ class PhishingTermFeatureExtractorTest : public ::testing::Test {
}
void PartialExtractFeatures(const base::string16* page_text,
FeatureMap* features) {
FeatureMap* features,
std::set<uint32>* shingle_hashes) {
extractor_->ExtractFeatures(
page_text,
features,
shingle_hashes,
base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
base::Unretained(this)));
msg_loop_.PostTask(
......@@ -129,10 +142,13 @@ TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
base::string16 page_text = ASCIIToUTF16("blah");
FeatureMap expected_features; // initially empty
std::set<uint32> expected_shingle_hashes;
FeatureMap features;
ASSERT_TRUE(ExtractFeatures(&page_text, &features));
std::set<uint32> shingle_hashes;
ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
ExpectFeatureMapsAreEqual(features, expected_features);
EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
page_text = ASCIIToUTF16("one one");
expected_features.Clear();
......@@ -140,29 +156,51 @@ TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
std::string("one"));
expected_features.AddBooleanFeature(features::kPageTerm +
std::string("one one"));
expected_shingle_hashes.clear();
features.Clear();
ASSERT_TRUE(ExtractFeatures(&page_text, &features));
shingle_hashes.clear();
ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
ExpectFeatureMapsAreEqual(features, expected_features);
EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
page_text = ASCIIToUTF16("bla bla multi word test bla");
expected_features.Clear();
expected_features.AddBooleanFeature(features::kPageTerm +
std::string("multi word test"));
expected_shingle_hashes.clear();
expected_shingle_hashes.insert(MurmurHash3String("bla bla multi word ",
kMurmurHash3Seed));
expected_shingle_hashes.insert(MurmurHash3String("bla multi word test ",
kMurmurHash3Seed));
expected_shingle_hashes.insert(MurmurHash3String("multi word test bla ",
kMurmurHash3Seed));
features.Clear();
ASSERT_TRUE(ExtractFeatures(&page_text, &features));
shingle_hashes.clear();
ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
ExpectFeatureMapsAreEqual(features, expected_features);
EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
// This text has all of the words for one of the terms, but they are
// not in the correct order.
page_text = ASCIIToUTF16("bla bla test word multi bla");
expected_features.Clear();
expected_shingle_hashes.clear();
expected_shingle_hashes.insert(MurmurHash3String("bla bla test word ",
kMurmurHash3Seed));
expected_shingle_hashes.insert(MurmurHash3String("bla test word multi ",
kMurmurHash3Seed));
expected_shingle_hashes.insert(MurmurHash3String("test word multi bla ",
kMurmurHash3Seed));
features.Clear();
ASSERT_TRUE(ExtractFeatures(&page_text, &features));
shingle_hashes.clear();
ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
ExpectFeatureMapsAreEqual(features, expected_features);
EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
// Test various separators.
page_text = ASCIIToUTF16("Capitalization plus non-space\n"
"separator... punctuation!");
expected_features.Clear();
......@@ -174,36 +212,77 @@ TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
std::string("separator"));
expected_features.AddBooleanFeature(features::kPageTerm +
std::string("punctuation"));
expected_shingle_hashes.clear();
expected_shingle_hashes.insert(
MurmurHash3String("capitalization plus non space ", kMurmurHash3Seed));
expected_shingle_hashes.insert(MurmurHash3String("plus non space separator ",
kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("non space separator punctuation ", kMurmurHash3Seed));
features.Clear();
shingle_hashes.clear();
ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
ExpectFeatureMapsAreEqual(features, expected_features);
EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
// Test a page with too many words; we should keep only the 3 smallest hashes.
page_text = ASCIIToUTF16("This page has way too many words.");
expected_features.Clear();
expected_shingle_hashes.clear();
expected_shingle_hashes.insert(MurmurHash3String("this page has way ",
kMurmurHash3Seed));
expected_shingle_hashes.insert(MurmurHash3String("page has way too ",
kMurmurHash3Seed));
expected_shingle_hashes.insert(MurmurHash3String("has way too many ",
kMurmurHash3Seed));
expected_shingle_hashes.insert(MurmurHash3String("way too many words ",
kMurmurHash3Seed));
std::set<uint32>::iterator it = expected_shingle_hashes.end();
expected_shingle_hashes.erase(--it);
features.Clear();
ASSERT_TRUE(ExtractFeatures(&page_text, &features));
shingle_hashes.clear();
ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
ExpectFeatureMapsAreEqual(features, expected_features);
EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
// Test with empty page text.
page_text = base::string16();
expected_features.Clear();
expected_shingle_hashes.clear();
features.Clear();
ASSERT_TRUE(ExtractFeatures(&page_text, &features));
shingle_hashes.clear();
ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
ExpectFeatureMapsAreEqual(features, expected_features);
EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
// Chinese translation of the phrase "hello goodbye". This tests that
// we can correctly separate terms in languages that don't use spaces.
// Chinese translation of the phrase "hello goodbye hello goodbye". This tests
// that we can correctly separate terms in languages that don't use spaces.
page_text =
base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");
base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"
"\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");
expected_features.Clear();
expected_features.AddBooleanFeature(
features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd"));
expected_features.AddBooleanFeature(
features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81"));
expected_shingle_hashes.clear();
expected_shingle_hashes.insert(MurmurHash3String(
"\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 "
"\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 ", kMurmurHash3Seed));
features.Clear();
ASSERT_TRUE(ExtractFeatures(&page_text, &features));
shingle_hashes.clear();
ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
ExpectFeatureMapsAreEqual(features, expected_features);
EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
}
TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
// For this test, we'll cause the feature extraction to run multiple
// iterations by incrementing the clock.
ResetExtractor(200 /* max shingles per page */);
// This page has a total of 30 words. For the features to be computed
// correctly, the extractor has to process the entire string of text.
......@@ -245,10 +324,67 @@ TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
std::string("one"));
expected_features.AddBooleanFeature(features::kPageTerm +
std::string("two"));
std::set<uint32> expected_shingle_hashes;
expected_shingle_hashes.insert(
MurmurHash3String("one 0 1 2 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("0 1 2 3 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("1 2 3 4 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("2 3 4 5 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("3 4 5 6 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("4 5 6 7 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("5 6 7 8 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("6 7 8 9 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("7 8 9 10 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("8 9 10 11 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("9 10 11 12 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("10 11 12 13 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("11 12 13 14 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("12 13 14 15 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("13 14 15 16 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("14 15 16 17 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("15 16 17 18 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("16 17 18 19 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("17 18 19 20 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("18 19 20 21 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("19 20 21 22 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("20 21 22 23 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("21 22 23 24 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("22 23 24 25 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("23 24 25 26 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("24 25 26 27 ", kMurmurHash3Seed));
expected_shingle_hashes.insert(
MurmurHash3String("25 26 27 two ", kMurmurHash3Seed));
FeatureMap features;
ASSERT_TRUE(ExtractFeatures(&page_text, &features));
std::set<uint32> shingle_hashes;
ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
ExpectFeatureMapsAreEqual(features, expected_features);
EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
// Make sure none of the mock expectations carry over to the next test.
::testing::Mock::VerifyAndClearExpectations(&clock_);
......@@ -271,7 +407,8 @@ TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
.WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620)));
features.Clear();
EXPECT_FALSE(ExtractFeatures(&page_text, &features));
shingle_hashes.clear();
EXPECT_FALSE(ExtractFeatures(&page_text, &features, &shingle_hashes));
}
TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) {
......@@ -294,8 +431,9 @@ TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) {
.WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14)));
FeatureMap features;
std::set<uint32> shingle_hashes;
// Extract first 10 words then stop.
PartialExtractFeatures(page_text.get(), &features);
PartialExtractFeatures(page_text.get(), &features, &shingle_hashes);
page_text.reset(new base::string16());
for (int i = 30; i < 58; ++i) {
......@@ -303,12 +441,13 @@ TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) {
}
page_text->append(ASCIIToUTF16("multi word test "));
features.Clear();
shingle_hashes.clear();
// This part doesn't exercise the extraction timing.
EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
// Now extract normally and make sure nothing breaks.
EXPECT_TRUE(ExtractFeatures(page_text.get(), &features));
EXPECT_TRUE(ExtractFeatures(page_text.get(), &features, &shingle_hashes));
FeatureMap expected_features;
expected_features.AddBooleanFeature(features::kPageTerm +
......
......@@ -103,6 +103,14 @@ uint32 Scorer::murmurhash3_seed() const {
return model_.murmur_hash_seed();
}
// Returns the per-page cap on unique shingle hashes, as configured by the
// loaded client-side model.
size_t Scorer::max_shingles_per_page() const {
  const size_t shingle_cap = model_.max_shingles_per_page();
  return shingle_cap;
}
// Returns how many consecutive words make up one shingle, as configured by
// the loaded client-side model.
size_t Scorer::shingle_size() const {
  const size_t words_per_shingle = model_.shingle_size();
  return words_per_shingle;
}
double Scorer::ComputeRuleScore(const ClientSideModel::Rule& rule,
const FeatureMap& features) const {
const base::hash_map<std::string, double>& feature_map = features.features();
......
......@@ -57,6 +57,12 @@ class Scorer {
// Returns the murmurhash3 seed for the loaded model.
uint32 murmurhash3_seed() const;
// Returns the maximum number of unique shingle hashes per page.
size_t max_shingles_per_page() const;
// Returns the number of words in a shingle.
size_t shingle_size() const;
protected:
// Most clients should use the factory method. This constructor is public
// to allow for mock implementations.
......@@ -79,6 +85,6 @@ class Scorer {
DISALLOW_COPY_AND_ASSIGN(Scorer);
};
} // namepsace safe_browsing
} // namespace safe_browsing
#endif // CHROME_RENDERER_SAFE_BROWSING_SCORER_H_
......@@ -55,6 +55,8 @@ class PhishingScorerTest : public ::testing::Test {
model_.set_max_words_per_term(2);
model_.set_murmur_hash_seed(12345U);
model_.set_max_shingles_per_page(10);
model_.set_shingle_size(3);
}
ClientSideModel model_;
......@@ -96,6 +98,8 @@ TEST_F(PhishingScorerTest, PageWords) {
::testing::ContainerEq(expected_page_words));
EXPECT_EQ(2U, scorer->max_words_per_term());
EXPECT_EQ(12345U, scorer->murmurhash3_seed());
EXPECT_EQ(10U, scorer->max_shingles_per_page());
EXPECT_EQ(3U, scorer->shingle_size());
}
TEST_F(PhishingScorerTest, ComputeScore) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment