Commit 5a379a99 authored by rajendrant, committed by Commit Bot

Add support for robots rules parsing

This CL adds the robots rules that will be retrieved from Litepages, and
some support classes to parse and apply the rules.

Bug: 1144836
Change-Id: I4ae00f0b08ccbb0ddfe98007c18f041f64d8022b
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2518970
Commit-Queue: rajendrant <rajendrant@chromium.org>
Reviewed-by: Tarun Bansal <tbansal@chromium.org>
Reviewed-by: Michael Crouse <mcrouse@chromium.org>
Cr-Commit-Position: refs/heads/master@{#826670}
parent b6a49e16
......@@ -97,6 +97,8 @@ static_library("renderer") {
"previews/resource_loading_hints_agent.h",
"subresource_redirect/public_image_hints_url_loader_throttle.cc",
"subresource_redirect/public_image_hints_url_loader_throttle.h",
"subresource_redirect/robots_rules_decider.cc",
"subresource_redirect/robots_rules_decider.h",
"subresource_redirect/subresource_redirect_hints_agent.cc",
"subresource_redirect/subresource_redirect_hints_agent.h",
"subresource_redirect/subresource_redirect_params.cc",
......@@ -151,6 +153,7 @@ static_library("renderer") {
"//components/content_settings/renderer",
"//components/contextual_search/content:renderer",
"//components/data_reduction_proxy/core/common",
"//components/data_reduction_proxy/proto:subresource_redirect_proto",
"//components/dom_distiller/content/renderer",
"//components/network_hints/renderer",
"//components/no_state_prefetch/common",
......
include_rules = [
"+components/base32",
"+services/metrics/public/cpp",
"+components/data_reduction_proxy/proto/robots_rules.pb.h",
]
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/renderer/subresource_redirect/robots_rules_decider.h"

#include <string>
#include <vector>

#include "base/callback.h"
#include "base/logging.h"
#include "base/metrics/histogram_macros.h"
#include "base/strings/string_util.h"
#include "base/time/time.h"
#include "base/timer/elapsed_timer.h"
#include "chrome/renderer/subresource_redirect/subresource_redirect_params.h"
#include "components/data_reduction_proxy/proto/robots_rules.pb.h"
namespace subresource_redirect {
namespace {
// Returns true if URL path matches the specified pattern. Pattern is anchored
// at the beginning of path. '$' is special only at the end of pattern.
// Algorithm taken from
// https://github.com/google/robotstxt/blob/f465f0ede81099dd8bc4aeb2966b3a892bd488b3/robots.cc#L74
// Returns true if URL path matches the specified pattern. Pattern is anchored
// at the beginning of path. '*' matches any sequence of characters, and '$'
// is special only at the end of pattern, where it anchors the match to the
// end of path. Algorithm taken from
// https://github.com/google/robotstxt/blob/f465f0ede81099dd8bc4aeb2966b3a892bd488b3/robots.cc#L74
bool IsMatchingRobotsRule(const std::string& path, const std::string& pattern) {
  // Fast path return when pattern is a simple string and not a regex: a plain
  // anchored prefix comparison is sufficient.
  if (pattern.find('*') == std::string::npos &&
      pattern.find('$') == std::string::npos) {
    return path.compare(0, pattern.size(), pattern) == 0;
  }

  // The pos vector holds a sorted list of indexes of 'path', with length
  // 'numpos'. At the start and end of each iteration of the main loop below,
  // pos holds a list of the prefixes of the 'path' which can match the
  // current prefix of 'pattern'. If this list is ever empty, return false. If
  // we reach the end of 'pattern' with at least one element in pos, return
  // true.
  //
  // NOTE: the original code used a variable-length array
  // (`size_t pos[path.length() + 1]`), which is a non-standard compiler
  // extension in C++ and risks stack overflow for long paths; std::vector is
  // used instead.
  size_t numpos = 1;
  std::vector<size_t> pos(path.length() + 1);
  pos[0] = 0;

  for (auto pat = pattern.begin(); pat != pattern.end(); ++pat) {
    if (*pat == '$' && pat + 1 == pattern.end()) {
      // '$' at the end of the pattern: match only if some candidate prefix
      // has consumed the entire path. Candidates are sorted ascending, so the
      // largest one is last.
      return (pos[numpos - 1] == path.length());
    }
    if (*pat == '*') {
      // '*' can consume any number of characters: every offset from the
      // smallest current candidate through the end of path becomes viable.
      numpos = path.length() - pos[0] + 1;
      for (size_t i = 1; i < numpos; i++) {
        pos[i] = pos[i - 1] + 1;
      }
    } else {
      // Literal character match (includes '$' when not at end of pattern):
      // keep only candidates whose next path character equals *pat, each
      // advanced by one.
      size_t newnumpos = 0;
      for (size_t i = 0; i < numpos; i++) {
        if (pos[i] < path.length() && path[pos[i]] == *pat) {
          pos[newnumpos++] = pos[i] + 1;
        }
      }
      numpos = newnumpos;
      if (numpos == 0)
        return false;
    }
  }
  return true;
}
// Records the final outcome of the robots rules retrieval (success, timeout,
// or parse error). Called from UpdateRobotsRules() and
// OnRulesReceiveTimeout().
void RecordRobotsRulesReceiveResultHistogram(
    RobotsRulesDecider::SubresourceRedirectRobotsRulesReceiveResult result) {
  UMA_HISTOGRAM_ENUMERATION(
      "SubresourceRedirect.RobotRulesDecider.ReceiveResult", result);
}
// Records how long applying the robots rules to a single URL path took.
// Called from RobotsRulesDecider::IsAllowed() after the rule scan finishes.
void RecordRobotsRulesApplyDurationHistogram(base::TimeDelta duration) {
  UMA_HISTOGRAM_TIMES("SubresourceRedirect.RobotRulesDecider.ApplyDuration",
                      duration);
}
} // namespace
// Returns true if |path| matches this rule's |pattern_|, using the robots.txt
// pattern-matching algorithm (see IsMatchingRobotsRule above).
bool RobotsRulesDecider::RobotsRule::Match(const std::string& path) const {
  return IsMatchingRobotsRule(path, pattern_);
}
RobotsRulesDecider::RobotsRulesDecider() {
  // Start the rules-receive timeout as soon as the decider is created: if
  // UpdateRobotsRules() is not called before the timer fires,
  // OnRulesReceiveTimeout() fails all pending checks with kTimedout.
  //
  // Using base::Unretained(this) is safe here, since the timer
  // |rules_receive_timeout_timer_| is owned by |this| and destroyed before
  // |this|.
  rules_receive_timeout_timer_.Start(
      FROM_HERE, GetRobotsRulesReceiveTimeout(),
      base::BindOnce(&RobotsRulesDecider::OnRulesReceiveTimeout,
                     base::Unretained(this)));
}
RobotsRulesDecider::~RobotsRulesDecider() {
  // If the rules never arrived, consider this a timeout and fire the timer
  // now, so that any callbacks still queued in |pending_check_requests_| are
  // run (with kTimedout) instead of being silently dropped.
  if (rules_receive_timeout_timer_.IsRunning())
    rules_receive_timeout_timer_.FireNow();
}
// Parses the serialized proto::RobotsRules in |rules|, replaces the current
// rule set, and answers every queued CheckRobotsRules() request.
void RobotsRulesDecider::UpdateRobotsRules(const std::string& rules) {
  // Drop any previously held rules first: if the parse below fails,
  // |robots_rules_| stays unset, which IsAllowed() treats as disallow-all.
  robots_rules_.reset();
  rules_receive_timeout_timer_.Stop();

  proto::RobotsRules robots_rules;
  bool is_parse_success = robots_rules.ParseFromString(rules);
  RecordRobotsRulesReceiveResultHistogram(
      is_parse_success
          ? SubresourceRedirectRobotsRulesReceiveResult::kSuccess
          : SubresourceRedirectRobotsRulesReceiveResult::kParseError);
  if (is_parse_success) {
    robots_rules_ = std::vector<RobotsRule>();
    robots_rules_->reserve(robots_rules.image_ordered_rules_size());
    // Only image rules are consumed here; a rule carrying neither an allowed
    // nor a disallowed pattern is skipped.
    for (const auto& rule : robots_rules.image_ordered_rules()) {
      if (rule.has_allowed_pattern()) {
        robots_rules_->emplace_back(true, rule.allowed_pattern());
      } else if (rule.has_disallowed_pattern()) {
        robots_rules_->emplace_back(false, rule.disallowed_pattern());
      }
    }
  }
  if (robots_rules_) {
    UMA_HISTOGRAM_COUNTS_1000("SubresourceRedirect.RobotRulesDecider.Count",
                              robots_rules_->size());
  }

  // Respond to the pending requests, even if robots proto parse failed.
  for (auto& request : pending_check_requests_) {
    std::move(request.first)
        .Run(IsAllowed(request.second) ? CheckResult::kAllowed
                                       : CheckResult::kDisallowed);
  }
  pending_check_requests_.clear();
}
void RobotsRulesDecider::CheckRobotsRules(const std::string& url_path,
CheckResultCallback callback) {
if (rules_receive_timeout_timer_.IsRunning()) {
// Rules have not been received yet.
pending_check_requests_.emplace_back(
std::make_pair(std::move(callback), url_path));
return;
}
std::move(callback).Run(IsAllowed(url_path) ? CheckResult::kAllowed
: CheckResult::kDisallowed);
}
bool RobotsRulesDecider::IsAllowed(const std::string& url_path) const {
// Rules not received. Could be rule parse error or timeout.
if (!robots_rules_)
return false;
base::ElapsedTimer rules_apply_timer;
for (const auto& rule : *robots_rules_) {
if (rule.Match(url_path)) {
RecordRobotsRulesApplyDurationHistogram(rules_apply_timer.Elapsed());
return rule.is_allow_rule_;
}
}
RecordRobotsRulesApplyDurationHistogram(rules_apply_timer.Elapsed());
// Treat as allowed when none of the allow/disallow rules match.
return true;
}
// Invoked when the robots rules were not received within the timeout window.
void RobotsRulesDecider::OnRulesReceiveTimeout() {
  DCHECK(!rules_receive_timeout_timer_.IsRunning());
  // Fail every queued check with kTimedout. |robots_rules_| remains unset, so
  // any later checks are disallowed by IsAllowed() as well.
  for (auto& request : pending_check_requests_)
    std::move(request.first).Run(CheckResult::kTimedout);
  pending_check_requests_.clear();
  RecordRobotsRulesReceiveResultHistogram(
      SubresourceRedirectRobotsRulesReceiveResult::kTimeout);
}
} // namespace subresource_redirect
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CHROME_RENDERER_SUBRESOURCE_REDIRECT_ROBOTS_RULES_DECIDER_H_
#define CHROME_RENDERER_SUBRESOURCE_REDIRECT_ROBOTS_RULES_DECIDER_H_
#include <map>
#include <vector>
#include "base/callback_forward.h"
#include "base/macros.h"
#include "base/optional.h"
#include "base/timer/timer.h"
#include "url/gurl.h"
namespace subresource_redirect {
// Holds the robots rules for a single origin, and enables checking whether an
// URL path is allowed or disallowed. Also supports a timeout to receive the
// robots rules after which it will be treated as a full disallow. The check
// result is delivered via callback asynchronously.
class RobotsRulesDecider {
 public:
  // The final result of robots rule retrieval.
  // This should be kept in sync with
  // SubresourceRedirectRobotsRulesReceiveResult in enums.xml.
  enum SubresourceRedirectRobotsRulesReceiveResult {
    kSuccess,     // Received and parsed successfully.
    kTimeout,     // Timed out waiting for the rules to arrive.
    kParseError,  // The received binary proto failed to parse.
    kMaxValue = kParseError
  };

  // Result of a single robots-rules check for one URL path.
  enum CheckResult {
    kAllowed,     // The resource URL passed the robots rules check.
    kDisallowed,  // The resource URL failed the robots rules check.
    kTimedout,    // Timeout in retrieving the robots rules.
  };

  // Callback to notify the check robot rules result.
  using CheckResultCallback = base::OnceCallback<void(CheckResult)>;

  RobotsRulesDecider();
  ~RobotsRulesDecider();

  RobotsRulesDecider(const RobotsRulesDecider&) = delete;
  RobotsRulesDecider& operator=(const RobotsRulesDecider&) = delete;

  // Update the robots rules. This causes any pending check requests to be
  // processed immediately and called with the result.
  void UpdateRobotsRules(const std::string& rules);

  // Check whether the URL path is allowed or disallowed by robots rules.
  // |callback| will be called with the result. The callback could be immediate
  // if rules are available. Otherwise the callback will be added to
  // |pending_check_requests_| and called when a decision can be made like when
  // rules are retrieved, or rule fetch timeout, etc.
  void CheckRobotsRules(const std::string& url_path,
                        CheckResultCallback callback);

 private:
  // Contains one robots.txt rule: an allow or disallow path-pattern.
  struct RobotsRule {
    RobotsRule(bool is_allow_rule, const std::string& pattern)
        : is_allow_rule_(is_allow_rule), pattern_(pattern) {}

    // Returns true if |path| matches |pattern_|.
    bool Match(const std::string& path) const;

    const bool is_allow_rule_;
    const std::string pattern_;
  };

  // Returns if allowed or disallowed by robots rules.
  bool IsAllowed(const std::string& url_path) const;

  // Called on rules receive timeout. All pending checks for robots rules are
  // notified that the timeout expired and the requests known to |this| are
  // cleared.
  void OnRulesReceiveTimeout();

  // The list of robots rules. base::nullopt means the rules were not received
  // yet or rule parsing failed (treated as disallow-all); an empty vector
  // means the rules parsed successfully with zero entries (treated as
  // allow-all).
  base::Optional<std::vector<RobotsRule>> robots_rules_;

  // Contains the requests that are pending for robots rules to be received.
  // Holds the URL path and the callback.
  std::vector<std::pair<CheckResultCallback, std::string>>
      pending_check_requests_;

  // To trigger the timeout for the robots rules to be received.
  base::OneShotTimer rules_receive_timeout_timer_;
};
} // namespace subresource_redirect
#endif // CHROME_RENDERER_SUBRESOURCE_REDIRECT_ROBOTS_RULES_DECIDER_H_
......@@ -56,4 +56,11 @@ int64_t GetHintsReceiveTimeout() {
kHintsReceiveDefaultTimeoutSeconds);
}
// Returns how long to wait for the robots rules to be received before
// treating the fetch as timed out. Configurable via the
// "robots_rules_receive_timeout" param of the kSubresourceRedirect feature.
// NOTE(review): the default is 10 *milliseconds*, which looks very short for
// rules delivered over the network — confirm the intended unit/value.
base::TimeDelta GetRobotsRulesReceiveTimeout() {
  return base::TimeDelta::FromMilliseconds(
      base::GetFieldTrialParamByFeatureAsInt(
          blink::features::kSubresourceRedirect, "robots_rules_receive_timeout",
          10));
}
} // namespace subresource_redirect
......@@ -31,6 +31,10 @@ base::TimeDelta GetCompressionRedirectTimeout();
// Returns the public image hints receive timeout value from field trial.
int64_t GetHintsReceiveTimeout();
// Returns the timeout to wait for the robots rules to be received, after which
// the subresource should be fetched directly from the origin.
base::TimeDelta GetRobotsRulesReceiveTimeout();
} // namespace subresource_redirect
#endif // CHROME_RENDERER_SUBRESOURCE_REDIRECT_SUBRESOURCE_REDIRECT_PARAMS_H_
......@@ -3778,6 +3778,7 @@ test("unit_tests") {
"../renderer/media/flash_embed_rewrite_unittest.cc",
"../renderer/net/net_error_helper_core_unittest.cc",
"../renderer/plugins/plugin_uma_unittest.cc",
"../renderer/subresource_redirect/robots_rules_decider_unittest.cc",
"../renderer/subresource_redirect/subresource_redirect_util_unittest.cc",
"../renderer/v8_unwinder_unittest.cc",
"../test/base/chrome_render_view_test.cc",
......
......@@ -10,3 +10,7 @@ proto_library("data_reduction_proxy_proto") {
"data_store.proto",
]
}
proto_library("subresource_redirect_proto") {
  sources = [
    "robots_rules.proto",
  ]
}
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
syntax = "proto2";
option optimize_for = LITE_RUNTIME;
package subresource_redirect.proto;
// Robots rules for one origin, to be applied in order: the first rule whose
// path-pattern matches decides whether the resource is allowed or disallowed.
message RobotsRules {
  // A single page-pattern rule, either allowed or disallowed.
  message Rule {
    oneof rule_field {
      // An allowed path-pattern.
      string allowed_pattern = 1;

      // A disallowed path-pattern.
      string disallowed_pattern = 2;
    }
  }

  // Rules for image resources, ordered by length (longest first).
  repeated Rule image_ordered_rules = 1;

  // Rules for video resources, ordered by length (longest first).
  repeated Rule video_ordered_rules = 2;
}
\ No newline at end of file
......@@ -70664,6 +70664,12 @@ https://www.dmtf.org/sites/default/files/standards/documents/DSP0134_2.7.1.pdf
<int value="11" label="AbortedBecauseSentinelFileWasPresent"/>
</enum>
<enum name="SubresourceRedirectRobotsRulesReceiveResult">
<int value="0" label="Success"/>
<int value="1" label="Timeout"/>
<int value="2" label="Parse error"/>
</enum>
<enum name="SuccessTimeoutStarted">
<int value="0" label="Success"/>
<int value="1" label="Timeout"/>
......@@ -572,6 +572,40 @@ reviews. Googlers can read more about this at go/gwsq-gerrit.
</summary>
</histogram>
<histogram name="SubresourceRedirect.RobotRulesDecider.ApplyDuration"
units="ms" expires_after="M92">
<owner>rajendrant@chromium.org</owner>
<owner>mcrouse@chromium.org</owner>
<summary>
Records the duration that the robots rules decider took to apply the rules.
Recorded once for each image fetch. There could be multiple robots rules for
an image origin, and the total duration needed to match with all the rules
and make the final Allow/Disallow decision is recorded here.
</summary>
</histogram>
<histogram name="SubresourceRedirect.RobotRulesDecider.Count" units="count"
expires_after="M92">
<owner>rajendrant@chromium.org</owner>
<owner>mcrouse@chromium.org</owner>
<summary>
Records the total number of robots rules received in the decider. Recorded
whenever robots rules decider object receives successfully parsable robots
rules.
</summary>
</histogram>
<histogram name="SubresourceRedirect.RobotRulesDecider.ReceiveResult"
enum="SubresourceRedirectRobotsRulesReceiveResult" expires_after="M92">
<owner>rajendrant@chromium.org</owner>
<owner>mcrouse@chromium.org</owner>
<summary>
Records the final result of retrieving robots.txt rules. Recorded once per
robots rules decider object, which is roughly the number of unique image
origins in the page.
</summary>
</histogram>
</histograms>
</histogram-configuration>
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment