Commit c1dcc8f0 authored by Sophie Chang's avatar Sophie Chang Committed by Chromium LUCI CQ

Add field in OptimizationFilter for exclusion_regexps

These regexps will explicitly be not matched against an opt filter

Bug: 1158883
Change-Id: I25b0659bcfbe5580598a08719bd673f8fed46f29
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2631447Reviewed-by: default avatarWei-Yin Chen (陳威尹) <wychen@chromium.org>
Reviewed-by: default avatarMichael Crouse <mcrouse@chromium.org>
Commit-Queue: Sophie Chang <sophiechang@chromium.org>
Cr-Commit-Position: refs/heads/master@{#843959}
parent 97a91f46
......@@ -81,16 +81,15 @@ std::unique_ptr<BloomFilter> ProcessBloomFilter(
}
// Attempts to construct a valid RegexpList from the given
// |optimization_filter|. If given, |out_status| will be populated with the
// |regexps_list|. If given, |out_status| will be populated with the
// status of the operation. If a valid RegexpList cannot be constructed, nullptr
// is returned.
std::unique_ptr<RegexpList> ProcessRegexps(
const proto::OptimizationFilter& optimization_filter,
const google::protobuf::RepeatedPtrField<std::string>& regexps_list,
OptimizationFilterStatus* out_status) {
std::unique_ptr<RegexpList> regexps = std::make_unique<RegexpList>();
for (int i = 0; i < optimization_filter.regexps_size(); ++i) {
regexps->emplace_back(
std::make_unique<re2::RE2>(optimization_filter.regexps(i)));
for (int i = 0; i < regexps_list.size(); ++i) {
regexps->emplace_back(std::make_unique<re2::RE2>(regexps_list.at(i)));
if (!regexps->at(i)->ok()) {
PopulateOptimizationFilterStatusIfSet(
OptimizationFilterStatus::kInvalidRegexp, out_status);
......@@ -164,13 +163,21 @@ std::unique_ptr<OptimizationFilter> ProcessOptimizationFilter(
std::unique_ptr<RegexpList> regexps;
if (optimization_filter.regexps_size() > 0) {
regexps = ProcessRegexps(optimization_filter, out_status);
regexps = ProcessRegexps(optimization_filter.regexps(), out_status);
if (!regexps)
return nullptr;
}
std::unique_ptr<RegexpList> exclusion_regexps;
if (optimization_filter.exclusion_regexps_size() > 0) {
exclusion_regexps =
ProcessRegexps(optimization_filter.exclusion_regexps(), out_status);
if (!exclusion_regexps)
return nullptr;
}
return std::make_unique<OptimizationFilter>(
std::move(bloom_filter), std::move(regexps),
std::move(bloom_filter), std::move(regexps), std::move(exclusion_regexps),
optimization_filter.skip_host_suffix_checking());
}
......
......@@ -208,6 +208,19 @@ TEST_F(HintsComponentUtilTest, ProcessOptimizationFilterWithRegexps) {
EXPECT_TRUE(optimization_filter->Matches(GURL("https://test.com")));
}
TEST_F(HintsComponentUtilTest, ProcessOptimizationFilterWithExclusionRegexps) {
proto::OptimizationFilter optimization_filter_proto;
optimization_filter_proto.add_exclusion_regexps("test");
OptimizationFilterStatus status;
std::unique_ptr<OptimizationFilter> optimization_filter =
ProcessOptimizationFilter(optimization_filter_proto, &status);
EXPECT_EQ(status, OptimizationFilterStatus::kCreatedServerFilter);
ASSERT_TRUE(optimization_filter);
EXPECT_FALSE(optimization_filter->Matches(GURL("https://test.com")));
}
TEST_F(HintsComponentUtilTest, ProcessOptimizationFilterWithInvalidRegexps) {
proto::OptimizationFilter optimization_filter_proto;
optimization_filter_proto.add_regexps("test[");
......@@ -246,6 +259,33 @@ TEST_F(HintsComponentUtilTest,
EXPECT_EQ(nullptr, optimization_filter);
}
TEST_F(
HintsComponentUtilTest,
ProcessOptimizationFilterInvalidExclusionRegexpsOverridesBloomFilterStatus) {
int num_hash_functions = 7;
int num_bits = 1234;
proto::OptimizationFilter optimization_filter_proto;
optimization_filter_proto.add_exclusion_regexps("test[");
BloomFilter bloom_filter(num_hash_functions, num_bits);
bloom_filter.Add("host.com");
proto::BloomFilter* bloom_filter_proto =
optimization_filter_proto.mutable_bloom_filter();
bloom_filter_proto->set_num_hash_functions(num_hash_functions);
bloom_filter_proto->set_num_bits(num_bits);
std::string blocklist_data(
reinterpret_cast<const char*>(&bloom_filter.bytes()[0]),
bloom_filter.bytes().size());
bloom_filter_proto->set_data(blocklist_data);
OptimizationFilterStatus status;
std::unique_ptr<OptimizationFilter> optimization_filter =
ProcessOptimizationFilter(optimization_filter_proto, &status);
EXPECT_EQ(status, OptimizationFilterStatus::kInvalidRegexp);
EXPECT_EQ(nullptr, optimization_filter);
}
TEST_F(HintsComponentUtilTest, ProcessOptimizationFilterWithTooLargeFilter) {
int too_many_bits = features::MaxServerBloomFilterByteSize() * 8 + 1;
......
......@@ -10,18 +10,42 @@
namespace optimization_guide {
namespace {
// Maximum number of suffixes to check per url.
const int kMaxSuffixCount = 5;
// Realistic minimum length of a host suffix.
const int kMinHostSuffix = 6; // eg., abc.tv
bool MatchesRegexp(const GURL& url, const RegexpList& regexps) {
if (!url.is_valid())
return false;
std::string clean_url = base::ToLowerASCII(url.GetAsReferrer().spec());
for (auto& regexp : regexps) {
if (!regexp->ok()) {
continue;
}
if (re2::RE2::PartialMatch(clean_url, *regexp)) {
return true;
}
}
return false;
}
} // namespace
OptimizationFilter::OptimizationFilter(
std::unique_ptr<BloomFilter> bloom_filter,
std::unique_ptr<RegexpList> regexps,
std::unique_ptr<RegexpList> exclusion_regexps,
bool skip_host_suffix_checking)
: bloom_filter_(std::move(bloom_filter)),
regexps_(std::move(regexps)),
exclusion_regexps_(std::move(exclusion_regexps)),
skip_host_suffix_checking_(skip_host_suffix_checking) {
// May be created on one thread but used on another. The first call to
// CalledOnValidSequence() will re-bind it.
......@@ -32,12 +56,12 @@ OptimizationFilter::~OptimizationFilter() = default;
bool OptimizationFilter::Matches(const GURL& url) const {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
return ContainsHostSuffix(url) || MatchesRegexp(url);
if (exclusion_regexps_ && MatchesRegexp(url, *exclusion_regexps_))
return false;
return ContainsHostSuffix(url) || (regexps_ && MatchesRegexp(url, *regexps_));
}
bool OptimizationFilter::ContainsHostSuffix(const GURL& url) const {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
if (!bloom_filter_)
return false;
......@@ -67,26 +91,4 @@ bool OptimizationFilter::ContainsHostSuffix(const GURL& url) const {
return false;
}
bool OptimizationFilter::MatchesRegexp(const GURL& url) const {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
if (!regexps_)
return false;
if (!url.is_valid())
return false;
std::string clean_url = base::ToLowerASCII(url.GetAsReferrer().spec());
for (auto& regexp : *regexps_) {
if (!regexp->ok()) {
continue;
}
if (re2::RE2::PartialMatch(clean_url, *regexp)) {
return true;
}
}
return false;
}
} // namespace optimization_guide
......@@ -19,16 +19,14 @@ namespace optimization_guide {
typedef std::vector<std::unique_ptr<re2::RE2>> RegexpList;
// OptimizationFilter represents a filter with two underlying implementations: a
// Bloom filter and a set of regexps. This class has a 1:1 mapping with an
// Bloom filter and sets of regexps. This class has a 1:1 mapping with an
// OptimizationFilter protobuf message where this is the logical implementation
// of the proto data.
//
// TODO(dougarnett): consider moving this and BloomFilter under
// components/blacklist/.
class OptimizationFilter {
public:
explicit OptimizationFilter(std::unique_ptr<BloomFilter> bloom_filter,
std::unique_ptr<RegexpList> regexps,
std::unique_ptr<RegexpList> exclusion_regexps,
bool skip_host_suffix_checking);
~OptimizationFilter();
......@@ -50,13 +48,12 @@ class OptimizationFilter {
// This method will return true if any of those suffixes are present.
bool ContainsHostSuffix(const GURL& url) const;
// Returns whether this filter contains a regexp that matches the given url.
bool MatchesRegexp(const GURL& url) const;
std::unique_ptr<BloomFilter> bloom_filter_;
std::unique_ptr<RegexpList> regexps_;
std::unique_ptr<RegexpList> exclusion_regexps_;
bool skip_host_suffix_checking_ = false;
SEQUENCE_CHECKER(sequence_checker_);
......
......@@ -31,7 +31,8 @@ std::unique_ptr<RegexpList> CreateRegexps(
TEST(OptimizationFilterTest, TestMatchesBloomFilter) {
std::unique_ptr<BloomFilter> bloom_filter(CreateBloomFilter());
bloom_filter->Add("fooco.co.uk");
OptimizationFilter opt_filter(std::move(bloom_filter), nullptr,
OptimizationFilter opt_filter(std::move(bloom_filter), /*regexps=*/nullptr,
/*exclusion_regexps=*/nullptr,
/*skip_host_suffix_checking=*/false);
EXPECT_TRUE(opt_filter.Matches(GURL("http://shopping.fooco.co.uk")));
EXPECT_TRUE(
......@@ -40,10 +41,22 @@ TEST(OptimizationFilterTest, TestMatchesBloomFilter) {
EXPECT_FALSE(opt_filter.Matches(GURL("https://nonfooco.co.uk")));
}
TEST(OptimizationFilterTest, TestMatchesBloomFilterChecksRegexpFirst) {
std::unique_ptr<RegexpList> exclusion_regexps(CreateRegexps({"shopping"}));
std::unique_ptr<BloomFilter> bloom_filter(CreateBloomFilter());
bloom_filter->Add("google.com");
OptimizationFilter opt_filter(std::move(bloom_filter), /*regexps=*/nullptr,
std::move(exclusion_regexps),
/*skip_host_suffix_checking=*/false);
EXPECT_FALSE(opt_filter.Matches(GURL("http://shopping.google.com")));
EXPECT_TRUE(opt_filter.Matches(GURL("http://www.google.com")));
}
TEST(OptimizationFilterTest, TestMatchesBloomFilterSkipHostSuffixChecking) {
std::unique_ptr<BloomFilter> bloom_filter(CreateBloomFilter());
bloom_filter->Add("fooco.co.uk");
OptimizationFilter opt_filter(std::move(bloom_filter), nullptr,
OptimizationFilter opt_filter(std::move(bloom_filter), /*regexps=*/nullptr,
/*exclusion_regexps=*/nullptr,
/*skip_host_suffix_checking=*/true);
EXPECT_TRUE(opt_filter.Matches(GURL("https://fooco.co.uk/somepath")));
EXPECT_TRUE(opt_filter.Matches(GURL("https://fooco.co.uk")));
......@@ -53,7 +66,8 @@ TEST(OptimizationFilterTest, TestMatchesBloomFilterSkipHostSuffixChecking) {
TEST(OptimizationFilterTest, TestMatchesRegexp) {
std::unique_ptr<RegexpList> regexps(CreateRegexps({"test"}));
OptimizationFilter opt_filter(nullptr, std::move(regexps),
OptimizationFilter opt_filter(/*bloom_filter=*/nullptr, std::move(regexps),
/*exclusion_regexps=*/nullptr,
/*skip_host_suffix_checking=*/false);
EXPECT_TRUE(opt_filter.Matches(GURL("http://test.com")));
EXPECT_TRUE(opt_filter.Matches(GURL("https://shopping.com/test")));
......@@ -63,7 +77,8 @@ TEST(OptimizationFilterTest, TestMatchesRegexp) {
TEST(OptimizationFilterTest, TestMatchesRegexpFragment) {
std::unique_ptr<RegexpList> regexps(CreateRegexps({"test"}));
OptimizationFilter opt_filter(nullptr, std::move(regexps),
OptimizationFilter opt_filter(/*bloom_filter=*/nullptr, std::move(regexps),
/*exclusion_regexps=*/nullptr,
/*skip_host_suffix_checking=*/false);
// Fragments are not matched.
EXPECT_FALSE(opt_filter.Matches(GURL("https://shopping.com/#test")));
......@@ -71,7 +86,8 @@ TEST(OptimizationFilterTest, TestMatchesRegexpFragment) {
TEST(OptimizationFilterTest, TestMatchesRegexpInvalid) {
std::unique_ptr<RegexpList> regexps(CreateRegexps({"test[", "shop"}));
OptimizationFilter opt_filter(nullptr, std::move(regexps),
OptimizationFilter opt_filter(/*bloom_filter=*/nullptr, std::move(regexps),
/*exclusion_regexps=*/nullptr,
/*skip_host_suffix_checking=*/false);
// Invalid regexps are not used
EXPECT_FALSE(opt_filter.Matches(GURL("https://test.com/")));
......@@ -80,7 +96,8 @@ TEST(OptimizationFilterTest, TestMatchesRegexpInvalid) {
TEST(OptimizationFilterTest, TestMatchesRegexpInvalidGURL) {
std::unique_ptr<RegexpList> regexps(CreateRegexps({"test"}));
OptimizationFilter opt_filter(nullptr, std::move(regexps),
OptimizationFilter opt_filter(/*bloom_filter=*/nullptr, std::move(regexps),
/*exclusion_regexps=*/nullptr,
/*skip_host_suffix_checking=*/false);
// Invalid urls are not matched.
EXPECT_FALSE(opt_filter.Matches(GURL("test")));
......@@ -90,7 +107,8 @@ TEST(OptimizationFilterTest, TestMatchesMaxSuffix) {
std::unique_ptr<BloomFilter> bloom_filter(CreateBloomFilter());
bloom_filter->Add("one.two.three.four.co.uk");
bloom_filter->Add("one.two.three.four.five.co.uk");
OptimizationFilter opt_filter(std::move(bloom_filter), nullptr,
OptimizationFilter opt_filter(std::move(bloom_filter), /*regexps=*/nullptr,
/*exclusion_regexps=*/nullptr,
/*skip_host_suffix_checking=*/false);
EXPECT_TRUE(opt_filter.Matches(GURL("http://host.one.two.three.four.co.uk")));
EXPECT_FALSE(
......@@ -104,7 +122,8 @@ TEST(OptimizationFilterTest, TestMatchesMinSuffix) {
std::unique_ptr<BloomFilter> bloom_filter(CreateBloomFilter());
bloom_filter->Add("abc.tv");
bloom_filter->Add("xy.tv");
OptimizationFilter opt_filter(std::move(bloom_filter), nullptr,
OptimizationFilter opt_filter(std::move(bloom_filter), /*regexps=*/nullptr,
/*exclusion_regexps=*/nullptr,
/*skip_host_suffix_checking=*/false);
EXPECT_TRUE(opt_filter.Matches(GURL("https://abc.tv")));
EXPECT_TRUE(opt_filter.Matches(GURL("https://host.abc.tv")));
......
......@@ -245,14 +245,22 @@ message BloomFilter {
}
// A scalable filter for an optimization type.
//
// Next ID: 6
message OptimizationFilter {
// The type of optimization this filter applies to.
optional OptimizationType optimization_type = 1;
// The filter data represented as a Bloom filter.
optional BloomFilter bloom_filter = 2;
// Additional filters represented as regexps matched against the main page
// URL.
// URL. If matched, the URL will be considered as contained in the filter.
repeated string regexps = 3;
// Additional filters represented as regexps matched against the main page
// URL. If matched, the URL will be considered as not contained in the filter.
//
// This set of regexps should be checked prior to any matching against the
// contained bloom_filter or regexps fields.
repeated string exclusion_regexps = 5;
// Whether additional host suffixes (other than the host) should be checked
// against the filter.
optional bool skip_host_suffix_checking = 4;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment