Commit c1dcc8f0 authored by Sophie Chang's avatar Sophie Chang Committed by Chromium LUCI CQ

Add field in OptimizationFilter for exclusion_regexps

These regexps will explicitly be not matched against an opt filter

Bug: 1158883
Change-Id: I25b0659bcfbe5580598a08719bd673f8fed46f29
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2631447Reviewed-by: default avatarWei-Yin Chen (陳威尹) <wychen@chromium.org>
Reviewed-by: default avatarMichael Crouse <mcrouse@chromium.org>
Commit-Queue: Sophie Chang <sophiechang@chromium.org>
Cr-Commit-Position: refs/heads/master@{#843959}
parent 97a91f46
...@@ -81,16 +81,15 @@ std::unique_ptr<BloomFilter> ProcessBloomFilter( ...@@ -81,16 +81,15 @@ std::unique_ptr<BloomFilter> ProcessBloomFilter(
} }
// Attempts to construct a valid RegexpList from the given // Attempts to construct a valid RegexpList from the given
// |optimization_filter|. If given, |out_status| will be populated with the // |regexps_list|. If given, |out_status| will be populated with the
// status of the operation. If a valid RegexpList cannot be constructed, nullptr // status of the operation. If a valid RegexpList cannot be constructed, nullptr
// is returned. // is returned.
std::unique_ptr<RegexpList> ProcessRegexps( std::unique_ptr<RegexpList> ProcessRegexps(
const proto::OptimizationFilter& optimization_filter, const google::protobuf::RepeatedPtrField<std::string>& regexps_list,
OptimizationFilterStatus* out_status) { OptimizationFilterStatus* out_status) {
std::unique_ptr<RegexpList> regexps = std::make_unique<RegexpList>(); std::unique_ptr<RegexpList> regexps = std::make_unique<RegexpList>();
for (int i = 0; i < optimization_filter.regexps_size(); ++i) { for (int i = 0; i < regexps_list.size(); ++i) {
regexps->emplace_back( regexps->emplace_back(std::make_unique<re2::RE2>(regexps_list.at(i)));
std::make_unique<re2::RE2>(optimization_filter.regexps(i)));
if (!regexps->at(i)->ok()) { if (!regexps->at(i)->ok()) {
PopulateOptimizationFilterStatusIfSet( PopulateOptimizationFilterStatusIfSet(
OptimizationFilterStatus::kInvalidRegexp, out_status); OptimizationFilterStatus::kInvalidRegexp, out_status);
...@@ -164,13 +163,21 @@ std::unique_ptr<OptimizationFilter> ProcessOptimizationFilter( ...@@ -164,13 +163,21 @@ std::unique_ptr<OptimizationFilter> ProcessOptimizationFilter(
std::unique_ptr<RegexpList> regexps; std::unique_ptr<RegexpList> regexps;
if (optimization_filter.regexps_size() > 0) { if (optimization_filter.regexps_size() > 0) {
regexps = ProcessRegexps(optimization_filter, out_status); regexps = ProcessRegexps(optimization_filter.regexps(), out_status);
if (!regexps) if (!regexps)
return nullptr; return nullptr;
} }
std::unique_ptr<RegexpList> exclusion_regexps;
if (optimization_filter.exclusion_regexps_size() > 0) {
exclusion_regexps =
ProcessRegexps(optimization_filter.exclusion_regexps(), out_status);
if (!exclusion_regexps)
return nullptr;
}
return std::make_unique<OptimizationFilter>( return std::make_unique<OptimizationFilter>(
std::move(bloom_filter), std::move(regexps), std::move(bloom_filter), std::move(regexps), std::move(exclusion_regexps),
optimization_filter.skip_host_suffix_checking()); optimization_filter.skip_host_suffix_checking());
} }
......
...@@ -208,6 +208,19 @@ TEST_F(HintsComponentUtilTest, ProcessOptimizationFilterWithRegexps) { ...@@ -208,6 +208,19 @@ TEST_F(HintsComponentUtilTest, ProcessOptimizationFilterWithRegexps) {
EXPECT_TRUE(optimization_filter->Matches(GURL("https://test.com"))); EXPECT_TRUE(optimization_filter->Matches(GURL("https://test.com")));
} }
TEST_F(HintsComponentUtilTest, ProcessOptimizationFilterWithExclusionRegexps) {
proto::OptimizationFilter optimization_filter_proto;
optimization_filter_proto.add_exclusion_regexps("test");
OptimizationFilterStatus status;
std::unique_ptr<OptimizationFilter> optimization_filter =
ProcessOptimizationFilter(optimization_filter_proto, &status);
EXPECT_EQ(status, OptimizationFilterStatus::kCreatedServerFilter);
ASSERT_TRUE(optimization_filter);
EXPECT_FALSE(optimization_filter->Matches(GURL("https://test.com")));
}
TEST_F(HintsComponentUtilTest, ProcessOptimizationFilterWithInvalidRegexps) { TEST_F(HintsComponentUtilTest, ProcessOptimizationFilterWithInvalidRegexps) {
proto::OptimizationFilter optimization_filter_proto; proto::OptimizationFilter optimization_filter_proto;
optimization_filter_proto.add_regexps("test["); optimization_filter_proto.add_regexps("test[");
...@@ -246,6 +259,33 @@ TEST_F(HintsComponentUtilTest, ...@@ -246,6 +259,33 @@ TEST_F(HintsComponentUtilTest,
EXPECT_EQ(nullptr, optimization_filter); EXPECT_EQ(nullptr, optimization_filter);
} }
TEST_F(
HintsComponentUtilTest,
ProcessOptimizationFilterInvalidExclusionRegexpsOverridesBloomFilterStatus) {
int num_hash_functions = 7;
int num_bits = 1234;
proto::OptimizationFilter optimization_filter_proto;
optimization_filter_proto.add_exclusion_regexps("test[");
BloomFilter bloom_filter(num_hash_functions, num_bits);
bloom_filter.Add("host.com");
proto::BloomFilter* bloom_filter_proto =
optimization_filter_proto.mutable_bloom_filter();
bloom_filter_proto->set_num_hash_functions(num_hash_functions);
bloom_filter_proto->set_num_bits(num_bits);
std::string blocklist_data(
reinterpret_cast<const char*>(&bloom_filter.bytes()[0]),
bloom_filter.bytes().size());
bloom_filter_proto->set_data(blocklist_data);
OptimizationFilterStatus status;
std::unique_ptr<OptimizationFilter> optimization_filter =
ProcessOptimizationFilter(optimization_filter_proto, &status);
EXPECT_EQ(status, OptimizationFilterStatus::kInvalidRegexp);
EXPECT_EQ(nullptr, optimization_filter);
}
TEST_F(HintsComponentUtilTest, ProcessOptimizationFilterWithTooLargeFilter) { TEST_F(HintsComponentUtilTest, ProcessOptimizationFilterWithTooLargeFilter) {
int too_many_bits = features::MaxServerBloomFilterByteSize() * 8 + 1; int too_many_bits = features::MaxServerBloomFilterByteSize() * 8 + 1;
......
...@@ -10,18 +10,42 @@ ...@@ -10,18 +10,42 @@
namespace optimization_guide { namespace optimization_guide {
namespace {
// Maximum number of suffixes to check per url. // Maximum number of suffixes to check per url.
const int kMaxSuffixCount = 5; const int kMaxSuffixCount = 5;
// Realistic minimum length of a host suffix. // Realistic minimum length of a host suffix.
const int kMinHostSuffix = 6; // eg., abc.tv const int kMinHostSuffix = 6; // eg., abc.tv
bool MatchesRegexp(const GURL& url, const RegexpList& regexps) {
if (!url.is_valid())
return false;
std::string clean_url = base::ToLowerASCII(url.GetAsReferrer().spec());
for (auto& regexp : regexps) {
if (!regexp->ok()) {
continue;
}
if (re2::RE2::PartialMatch(clean_url, *regexp)) {
return true;
}
}
return false;
}
} // namespace
OptimizationFilter::OptimizationFilter( OptimizationFilter::OptimizationFilter(
std::unique_ptr<BloomFilter> bloom_filter, std::unique_ptr<BloomFilter> bloom_filter,
std::unique_ptr<RegexpList> regexps, std::unique_ptr<RegexpList> regexps,
std::unique_ptr<RegexpList> exclusion_regexps,
bool skip_host_suffix_checking) bool skip_host_suffix_checking)
: bloom_filter_(std::move(bloom_filter)), : bloom_filter_(std::move(bloom_filter)),
regexps_(std::move(regexps)), regexps_(std::move(regexps)),
exclusion_regexps_(std::move(exclusion_regexps)),
skip_host_suffix_checking_(skip_host_suffix_checking) { skip_host_suffix_checking_(skip_host_suffix_checking) {
// May be created on one thread but used on another. The first call to // May be created on one thread but used on another. The first call to
// CalledOnValidSequence() will re-bind it. // CalledOnValidSequence() will re-bind it.
...@@ -32,12 +56,12 @@ OptimizationFilter::~OptimizationFilter() = default; ...@@ -32,12 +56,12 @@ OptimizationFilter::~OptimizationFilter() = default;
bool OptimizationFilter::Matches(const GURL& url) const { bool OptimizationFilter::Matches(const GURL& url) const {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
return ContainsHostSuffix(url) || MatchesRegexp(url); if (exclusion_regexps_ && MatchesRegexp(url, *exclusion_regexps_))
return false;
return ContainsHostSuffix(url) || (regexps_ && MatchesRegexp(url, *regexps_));
} }
bool OptimizationFilter::ContainsHostSuffix(const GURL& url) const { bool OptimizationFilter::ContainsHostSuffix(const GURL& url) const {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
if (!bloom_filter_) if (!bloom_filter_)
return false; return false;
...@@ -67,26 +91,4 @@ bool OptimizationFilter::ContainsHostSuffix(const GURL& url) const { ...@@ -67,26 +91,4 @@ bool OptimizationFilter::ContainsHostSuffix(const GURL& url) const {
return false; return false;
} }
bool OptimizationFilter::MatchesRegexp(const GURL& url) const {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
if (!regexps_)
return false;
if (!url.is_valid())
return false;
std::string clean_url = base::ToLowerASCII(url.GetAsReferrer().spec());
for (auto& regexp : *regexps_) {
if (!regexp->ok()) {
continue;
}
if (re2::RE2::PartialMatch(clean_url, *regexp)) {
return true;
}
}
return false;
}
} // namespace optimization_guide } // namespace optimization_guide
...@@ -19,16 +19,14 @@ namespace optimization_guide { ...@@ -19,16 +19,14 @@ namespace optimization_guide {
typedef std::vector<std::unique_ptr<re2::RE2>> RegexpList; typedef std::vector<std::unique_ptr<re2::RE2>> RegexpList;
// OptimizationFilter represents a filter with two underlying implementations: a // OptimizationFilter represents a filter with two underlying implementations: a
// Bloom filter and a set of regexps. This class has a 1:1 mapping with an // Bloom filter and sets of regexps. This class has a 1:1 mapping with an
// OptimizationFilter protobuf message where this is the logical implementation // OptimizationFilter protobuf message where this is the logical implementation
// of the proto data. // of the proto data.
//
// TODO(dougarnett): consider moving this and BloomFilter under
// components/blacklist/.
class OptimizationFilter { class OptimizationFilter {
public: public:
explicit OptimizationFilter(std::unique_ptr<BloomFilter> bloom_filter, explicit OptimizationFilter(std::unique_ptr<BloomFilter> bloom_filter,
std::unique_ptr<RegexpList> regexps, std::unique_ptr<RegexpList> regexps,
std::unique_ptr<RegexpList> exclusion_regexps,
bool skip_host_suffix_checking); bool skip_host_suffix_checking);
~OptimizationFilter(); ~OptimizationFilter();
...@@ -50,13 +48,12 @@ class OptimizationFilter { ...@@ -50,13 +48,12 @@ class OptimizationFilter {
// This method will return true if any of those suffixes are present. // This method will return true if any of those suffixes are present.
bool ContainsHostSuffix(const GURL& url) const; bool ContainsHostSuffix(const GURL& url) const;
// Returns whether this filter contains a regexp that matches the given url.
bool MatchesRegexp(const GURL& url) const;
std::unique_ptr<BloomFilter> bloom_filter_; std::unique_ptr<BloomFilter> bloom_filter_;
std::unique_ptr<RegexpList> regexps_; std::unique_ptr<RegexpList> regexps_;
std::unique_ptr<RegexpList> exclusion_regexps_;
bool skip_host_suffix_checking_ = false; bool skip_host_suffix_checking_ = false;
SEQUENCE_CHECKER(sequence_checker_); SEQUENCE_CHECKER(sequence_checker_);
......
...@@ -31,7 +31,8 @@ std::unique_ptr<RegexpList> CreateRegexps( ...@@ -31,7 +31,8 @@ std::unique_ptr<RegexpList> CreateRegexps(
TEST(OptimizationFilterTest, TestMatchesBloomFilter) { TEST(OptimizationFilterTest, TestMatchesBloomFilter) {
std::unique_ptr<BloomFilter> bloom_filter(CreateBloomFilter()); std::unique_ptr<BloomFilter> bloom_filter(CreateBloomFilter());
bloom_filter->Add("fooco.co.uk"); bloom_filter->Add("fooco.co.uk");
OptimizationFilter opt_filter(std::move(bloom_filter), nullptr, OptimizationFilter opt_filter(std::move(bloom_filter), /*regexps=*/nullptr,
/*exclusion_regexps=*/nullptr,
/*skip_host_suffix_checking=*/false); /*skip_host_suffix_checking=*/false);
EXPECT_TRUE(opt_filter.Matches(GURL("http://shopping.fooco.co.uk"))); EXPECT_TRUE(opt_filter.Matches(GURL("http://shopping.fooco.co.uk")));
EXPECT_TRUE( EXPECT_TRUE(
...@@ -40,10 +41,22 @@ TEST(OptimizationFilterTest, TestMatchesBloomFilter) { ...@@ -40,10 +41,22 @@ TEST(OptimizationFilterTest, TestMatchesBloomFilter) {
EXPECT_FALSE(opt_filter.Matches(GURL("https://nonfooco.co.uk"))); EXPECT_FALSE(opt_filter.Matches(GURL("https://nonfooco.co.uk")));
} }
TEST(OptimizationFilterTest, TestMatchesBloomFilterChecksRegexpFirst) {
std::unique_ptr<RegexpList> exclusion_regexps(CreateRegexps({"shopping"}));
std::unique_ptr<BloomFilter> bloom_filter(CreateBloomFilter());
bloom_filter->Add("google.com");
OptimizationFilter opt_filter(std::move(bloom_filter), /*regexps=*/nullptr,
std::move(exclusion_regexps),
/*skip_host_suffix_checking=*/false);
EXPECT_FALSE(opt_filter.Matches(GURL("http://shopping.google.com")));
EXPECT_TRUE(opt_filter.Matches(GURL("http://www.google.com")));
}
TEST(OptimizationFilterTest, TestMatchesBloomFilterSkipHostSuffixChecking) { TEST(OptimizationFilterTest, TestMatchesBloomFilterSkipHostSuffixChecking) {
std::unique_ptr<BloomFilter> bloom_filter(CreateBloomFilter()); std::unique_ptr<BloomFilter> bloom_filter(CreateBloomFilter());
bloom_filter->Add("fooco.co.uk"); bloom_filter->Add("fooco.co.uk");
OptimizationFilter opt_filter(std::move(bloom_filter), nullptr, OptimizationFilter opt_filter(std::move(bloom_filter), /*regexps=*/nullptr,
/*exclusion_regexps=*/nullptr,
/*skip_host_suffix_checking=*/true); /*skip_host_suffix_checking=*/true);
EXPECT_TRUE(opt_filter.Matches(GURL("https://fooco.co.uk/somepath"))); EXPECT_TRUE(opt_filter.Matches(GURL("https://fooco.co.uk/somepath")));
EXPECT_TRUE(opt_filter.Matches(GURL("https://fooco.co.uk"))); EXPECT_TRUE(opt_filter.Matches(GURL("https://fooco.co.uk")));
...@@ -53,7 +66,8 @@ TEST(OptimizationFilterTest, TestMatchesBloomFilterSkipHostSuffixChecking) { ...@@ -53,7 +66,8 @@ TEST(OptimizationFilterTest, TestMatchesBloomFilterSkipHostSuffixChecking) {
TEST(OptimizationFilterTest, TestMatchesRegexp) { TEST(OptimizationFilterTest, TestMatchesRegexp) {
std::unique_ptr<RegexpList> regexps(CreateRegexps({"test"})); std::unique_ptr<RegexpList> regexps(CreateRegexps({"test"}));
OptimizationFilter opt_filter(nullptr, std::move(regexps), OptimizationFilter opt_filter(/*bloom_filter=*/nullptr, std::move(regexps),
/*exclusion_regexps=*/nullptr,
/*skip_host_suffix_checking=*/false); /*skip_host_suffix_checking=*/false);
EXPECT_TRUE(opt_filter.Matches(GURL("http://test.com"))); EXPECT_TRUE(opt_filter.Matches(GURL("http://test.com")));
EXPECT_TRUE(opt_filter.Matches(GURL("https://shopping.com/test"))); EXPECT_TRUE(opt_filter.Matches(GURL("https://shopping.com/test")));
...@@ -63,7 +77,8 @@ TEST(OptimizationFilterTest, TestMatchesRegexp) { ...@@ -63,7 +77,8 @@ TEST(OptimizationFilterTest, TestMatchesRegexp) {
TEST(OptimizationFilterTest, TestMatchesRegexpFragment) { TEST(OptimizationFilterTest, TestMatchesRegexpFragment) {
std::unique_ptr<RegexpList> regexps(CreateRegexps({"test"})); std::unique_ptr<RegexpList> regexps(CreateRegexps({"test"}));
OptimizationFilter opt_filter(nullptr, std::move(regexps), OptimizationFilter opt_filter(/*bloom_filter=*/nullptr, std::move(regexps),
/*exclusion_regexps=*/nullptr,
/*skip_host_suffix_checking=*/false); /*skip_host_suffix_checking=*/false);
// Fragments are not matched. // Fragments are not matched.
EXPECT_FALSE(opt_filter.Matches(GURL("https://shopping.com/#test"))); EXPECT_FALSE(opt_filter.Matches(GURL("https://shopping.com/#test")));
...@@ -71,7 +86,8 @@ TEST(OptimizationFilterTest, TestMatchesRegexpFragment) { ...@@ -71,7 +86,8 @@ TEST(OptimizationFilterTest, TestMatchesRegexpFragment) {
TEST(OptimizationFilterTest, TestMatchesRegexpInvalid) { TEST(OptimizationFilterTest, TestMatchesRegexpInvalid) {
std::unique_ptr<RegexpList> regexps(CreateRegexps({"test[", "shop"})); std::unique_ptr<RegexpList> regexps(CreateRegexps({"test[", "shop"}));
OptimizationFilter opt_filter(nullptr, std::move(regexps), OptimizationFilter opt_filter(/*bloom_filter=*/nullptr, std::move(regexps),
/*exclusion_regexps=*/nullptr,
/*skip_host_suffix_checking=*/false); /*skip_host_suffix_checking=*/false);
// Invalid regexps are not used // Invalid regexps are not used
EXPECT_FALSE(opt_filter.Matches(GURL("https://test.com/"))); EXPECT_FALSE(opt_filter.Matches(GURL("https://test.com/")));
...@@ -80,7 +96,8 @@ TEST(OptimizationFilterTest, TestMatchesRegexpInvalid) { ...@@ -80,7 +96,8 @@ TEST(OptimizationFilterTest, TestMatchesRegexpInvalid) {
TEST(OptimizationFilterTest, TestMatchesRegexpInvalidGURL) { TEST(OptimizationFilterTest, TestMatchesRegexpInvalidGURL) {
std::unique_ptr<RegexpList> regexps(CreateRegexps({"test"})); std::unique_ptr<RegexpList> regexps(CreateRegexps({"test"}));
OptimizationFilter opt_filter(nullptr, std::move(regexps), OptimizationFilter opt_filter(/*bloom_filter=*/nullptr, std::move(regexps),
/*exclusion_regexps=*/nullptr,
/*skip_host_suffix_checking=*/false); /*skip_host_suffix_checking=*/false);
// Invalid urls are not matched. // Invalid urls are not matched.
EXPECT_FALSE(opt_filter.Matches(GURL("test"))); EXPECT_FALSE(opt_filter.Matches(GURL("test")));
...@@ -90,7 +107,8 @@ TEST(OptimizationFilterTest, TestMatchesMaxSuffix) { ...@@ -90,7 +107,8 @@ TEST(OptimizationFilterTest, TestMatchesMaxSuffix) {
std::unique_ptr<BloomFilter> bloom_filter(CreateBloomFilter()); std::unique_ptr<BloomFilter> bloom_filter(CreateBloomFilter());
bloom_filter->Add("one.two.three.four.co.uk"); bloom_filter->Add("one.two.three.four.co.uk");
bloom_filter->Add("one.two.three.four.five.co.uk"); bloom_filter->Add("one.two.three.four.five.co.uk");
OptimizationFilter opt_filter(std::move(bloom_filter), nullptr, OptimizationFilter opt_filter(std::move(bloom_filter), /*regexps=*/nullptr,
/*exclusion_regexps=*/nullptr,
/*skip_host_suffix_checking=*/false); /*skip_host_suffix_checking=*/false);
EXPECT_TRUE(opt_filter.Matches(GURL("http://host.one.two.three.four.co.uk"))); EXPECT_TRUE(opt_filter.Matches(GURL("http://host.one.two.three.four.co.uk")));
EXPECT_FALSE( EXPECT_FALSE(
...@@ -104,7 +122,8 @@ TEST(OptimizationFilterTest, TestMatchesMinSuffix) { ...@@ -104,7 +122,8 @@ TEST(OptimizationFilterTest, TestMatchesMinSuffix) {
std::unique_ptr<BloomFilter> bloom_filter(CreateBloomFilter()); std::unique_ptr<BloomFilter> bloom_filter(CreateBloomFilter());
bloom_filter->Add("abc.tv"); bloom_filter->Add("abc.tv");
bloom_filter->Add("xy.tv"); bloom_filter->Add("xy.tv");
OptimizationFilter opt_filter(std::move(bloom_filter), nullptr, OptimizationFilter opt_filter(std::move(bloom_filter), /*regexps=*/nullptr,
/*exclusion_regexps=*/nullptr,
/*skip_host_suffix_checking=*/false); /*skip_host_suffix_checking=*/false);
EXPECT_TRUE(opt_filter.Matches(GURL("https://abc.tv"))); EXPECT_TRUE(opt_filter.Matches(GURL("https://abc.tv")));
EXPECT_TRUE(opt_filter.Matches(GURL("https://host.abc.tv"))); EXPECT_TRUE(opt_filter.Matches(GURL("https://host.abc.tv")));
......
...@@ -245,14 +245,22 @@ message BloomFilter { ...@@ -245,14 +245,22 @@ message BloomFilter {
} }
// A scalable filter for an optimization type. // A scalable filter for an optimization type.
//
// Next ID: 6
message OptimizationFilter { message OptimizationFilter {
// The type of optimization this filter applies to. // The type of optimization this filter applies to.
optional OptimizationType optimization_type = 1; optional OptimizationType optimization_type = 1;
// The filter data represented as a Bloom filter. // The filter data represented as a Bloom filter.
optional BloomFilter bloom_filter = 2; optional BloomFilter bloom_filter = 2;
// Additional filters represented as regexps matched against the main page // Additional filters represented as regexps matched against the main page
// URL. // URL. If matched, the URL will be considered as contained in the filter.
repeated string regexps = 3; repeated string regexps = 3;
// Additional filters represented as regexps matched against the main page
// URL. If matched, the URL will be considered as not contained in the filter.
//
// This set of regexps should be checked prior to any matching against the
// contained bloom_filter or regexps fields.
repeated string exclusion_regexps = 5;
// Whether additional host suffixes (other than the host) should be checked // Whether additional host suffixes (other than the host) should be checked
// against the filter. // against the filter.
optional bool skip_host_suffix_checking = 4; optional bool skip_host_suffix_checking = 4;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment