Commit 8525b6e8 authored by Jeffrey Kardatzke, committed by Commit Bot

Retain better redaction context and enhance serial redaction

When data was redacted with context, previously just a unique numeric
identifier was put in place of the data. This wasn't obvious to users of
the logs because sometimes one number was just getting replaced with
another. This is now made clear so that even when we redact with
context, we put a redaction identifier in place of what we removed.

This change also enhances serial number redaction to cover a wider range
of occurrences of serial numbers in logs.

BUG=chromium:940884
TEST=Unit tests pass, manually inspected uploaded feedback report

Change-Id: I57e9a7f612be30eb0823fd0bb9d13ffc16bd3a8a
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1740132
Reviewed-by: Mike Frysinger <vapier@chromium.org>
Reviewed-by: J Kardatzke <jkardatzke@chromium.org>
Commit-Queue: Jeffrey Kardatzke <jkardatzke@google.com>
Cr-Commit-Position: refs/heads/master@{#685291}
parent cf7f598e
......@@ -44,24 +44,28 @@ namespace {
// (?i) turns on case insensitivity for the remainder of the regex.
// (?-s) turns off "dot matches newline" for the remainder of the regex.
// (?:regex) denotes non-capturing parentheses group.
constexpr const char* kCustomPatternsWithContext[] = {
CustomPatternWithAlias kCustomPatternsWithContext[] = {
// ModemManager
"(\\bCell ID: ')([0-9a-fA-F]+)(')",
"(\\bLocation area code: ')([0-9a-fA-F]+)(')",
{"CellID", "(\\bCell ID: ')([0-9a-fA-F]+)(')"},
{"LocAC", "(\\bLocation area code: ')([0-9a-fA-F]+)(')"},
// wpa_supplicant
"(?i-s)(\\bssid[= ]')(.+)(')",
"(?-s)(\\bSSID - hexdump\\(len=[0-9]+\\): )(.+)()",
{"SSID", "(?i-s)(\\bssid[= ]')(.+)(')"},
{"SSIDHex", "(?-s)(\\bSSID - hexdump\\(len=[0-9]+\\): )(.+)()"},
// shill
"(?-s)(\\[SSID=)(.+?)(\\])",
{"SSID", "(?-s)(\\[SSID=)(.+?)(\\])"},
// Serial numbers
"(?i-s)(serial\\s*(?:number)?\\s*[:=]\\s*)([0-9a-zA-Z\\-\"]+)()",
// Serial numbers. The actual serial number itself can include any alphanum
// char as well as dashes, periods, colons, slashes and unprintable ASCII
// chars (except newline).
{"Serial",
"(?i-s)(\\bserial\\s*_?(?:number)?['\"]?\\s*[:=]\\s*['\"]?)"
"([0-9a-zA-Z\\-.:\\/\\\\\\x00-\\x09\\x0B-\\x1F]+)(\\b)"},
// GAIA IDs
R"xxx((\"?\bgaia_id\"?[=:]['\"])(\d+)(\b['\"]))xxx",
R"xxx((\{id: )(\d+)(, email:))xxx",
{"GAIA", R"xxx((\"?\bgaia_id\"?[=:]['\"])(\d+)(\b['\"]))xxx"},
{"GAIA", R"xxx((\{id: )(\d+)(, email:))xxx"},
};
bool MaybeUnmapAddress(net::IPAddress* addr) {
......@@ -292,7 +296,7 @@ std::string MaybeScrubIPAddress(const std::string& addr) {
// The |kCustomPatternsWithoutContext| array defines further patterns to match
// and anonymize. Each pattern consists of a single capturing group.
CustomPatternWithoutContext kCustomPatternsWithoutContext[] = {
CustomPatternWithAlias kCustomPatternsWithoutContext[] = {
{"URL", "(?i)(" IRI ")"},
// Email Addresses need to come after URLs because they can be part
// of a query parameter.
......@@ -363,10 +367,7 @@ constexpr size_t kNumNonAnonymizedMacs = base::size(kNonAnonymizedMacAddresses);
} // namespace
AnonymizerTool::AnonymizerTool(const char* const* first_party_extension_ids)
: first_party_extension_ids_(first_party_extension_ids),
custom_patterns_with_context_(base::size(kCustomPatternsWithContext)),
custom_patterns_without_context_(
base::size(kCustomPatternsWithoutContext)) {
: first_party_extension_ids_(first_party_extension_ids) {
DETACH_FROM_SEQUENCE(sequence_checker_);
// Identity-map these, so we don't mangle them.
for (const char* mac : kNonAnonymizedMacAddresses)
......@@ -562,23 +563,22 @@ std::string AnonymizerTool::AnonymizeAndroidAppStoragePaths(
std::string AnonymizerTool::AnonymizeCustomPatterns(std::string input) {
for (size_t i = 0; i < base::size(kCustomPatternsWithContext); i++) {
input =
AnonymizeCustomPatternWithContext(input, kCustomPatternsWithContext[i],
&custom_patterns_with_context_[i]);
AnonymizeCustomPatternWithContext(input, kCustomPatternsWithContext[i]);
}
for (size_t i = 0; i < base::size(kCustomPatternsWithoutContext); i++) {
input = AnonymizeCustomPatternWithoutContext(
input, kCustomPatternsWithoutContext[i],
&custom_patterns_without_context_[i]);
input, kCustomPatternsWithoutContext[i]);
}
return input;
}
std::string AnonymizerTool::AnonymizeCustomPatternWithContext(
const std::string& input,
const std::string& pattern,
std::map<std::string, std::string>* identifier_space) {
RE2* re = GetRegExp(pattern);
const CustomPatternWithAlias& pattern) {
RE2* re = GetRegExp(pattern.pattern);
DCHECK_EQ(3, re->NumberOfCapturingGroups());
std::map<std::string, std::string>* identifier_space =
&custom_patterns_with_context_[pattern.alias];
std::string result;
result.reserve(input.size());
......@@ -592,7 +592,11 @@ std::string AnonymizerTool::AnonymizeCustomPatternWithContext(
std::string matched_id_as_string = matched_id.as_string();
std::string replacement_id = (*identifier_space)[matched_id_as_string];
if (replacement_id.empty()) {
replacement_id = base::NumberToString(identifier_space->size());
// The weird NumberToString trick is because Windows does not like
// to deal with %zu and a size_t in printf, nor does it support %llu.
replacement_id = base::StringPrintf(
"<%s: %s>", pattern.alias,
base::NumberToString(identifier_space->size()).c_str());
(*identifier_space)[matched_id_as_string] = replacement_id;
}
......@@ -652,11 +656,13 @@ bool IsUrlWhitelisted(re2::StringPiece url,
std::string AnonymizerTool::AnonymizeCustomPatternWithoutContext(
const std::string& input,
const CustomPatternWithoutContext& pattern,
std::map<std::string, std::string>* identifier_space) {
const CustomPatternWithAlias& pattern) {
RE2* re = GetRegExp(pattern.pattern);
DCHECK_EQ(1, re->NumberOfCapturingGroups());
std::map<std::string, std::string>* identifier_space =
&custom_patterns_without_context_[pattern.alias];
std::string result;
result.reserve(input.size());
......@@ -675,7 +681,7 @@ std::string AnonymizerTool::AnonymizeCustomPatternWithoutContext(
if (replacement_id.empty()) {
replacement_id = MaybeScrubIPAddress(matched_id_as_string);
if (replacement_id != matched_id_as_string) {
// The weird Uint64toString trick is because Windows does not like
// The weird NumberToString trick is because Windows does not like
// to deal with %zu and a size_t in printf, nor does it support %llu.
replacement_id = base::StringPrintf(
"<%s: %s>",
......
......@@ -21,12 +21,12 @@ class RE2;
namespace feedback {
struct CustomPatternWithoutContext {
struct CustomPatternWithAlias {
// A string literal used in anonymized tests. Matches to the |pattern| are
// replaced with <|alias|: 1>, <|alias|: 2>, ...
const char* alias;
// A RE2 regexp with exactly one capture group. Matches will be replaced by
// the alias reference described above.
// A RE2 regexp used in the replacing logic. Matches will be replaced by the
// alias reference described above.
const char* pattern;
};
......@@ -55,12 +55,10 @@ class AnonymizerTool {
std::string AnonymizeCustomPatterns(std::string input);
std::string AnonymizeCustomPatternWithContext(
const std::string& input,
const std::string& pattern,
std::map<std::string, std::string>* identifier_space);
const CustomPatternWithAlias& pattern);
std::string AnonymizeCustomPatternWithoutContext(
const std::string& input,
const CustomPatternWithoutContext& pattern,
std::map<std::string, std::string>* identifier_space);
const CustomPatternWithAlias& pattern);
// Null-terminated list of first party extension IDs. We need to have this
// passed into us because we can't refer to the code where these are defined.
......@@ -81,10 +79,13 @@ class AnonymizerTool {
std::map<std::string, std::string> hashes_;
// Like mac addresses, identifiers in custom patterns are anonymized.
// custom_patterns_with_context_[i] contains a map of original identifier to
// anonymized identifier for custom pattern number i.
std::vector<std::map<std::string, std::string>> custom_patterns_with_context_;
std::vector<std::map<std::string, std::string>>
// custom_patterns_with_context_["alias"] contains a map of original
// identifier to anonymized identifier for custom pattern with the given
// "alias". We key on alias to allow different patterns to use the same
// replacement maps.
std::map<std::string, std::map<std::string, std::string>>
custom_patterns_with_context_;
std::map<std::string, std::map<std::string, std::string>>
custom_patterns_without_context_;
// Cache to prevent the repeated compilation of the same regular expression
......
......@@ -33,17 +33,14 @@ class AnonymizerToolTest : public testing::Test {
std::string AnonymizeCustomPatternWithContext(
const std::string& input,
const std::string& pattern,
std::map<std::string, std::string>* space) {
return anonymizer_.AnonymizeCustomPatternWithContext(input, pattern, space);
const CustomPatternWithAlias& pattern) {
return anonymizer_.AnonymizeCustomPatternWithContext(input, pattern);
}
std::string AnonymizeCustomPatternWithoutContext(
const std::string& input,
const CustomPatternWithoutContext& pattern,
std::map<std::string, std::string>* space) {
return anonymizer_.AnonymizeCustomPatternWithoutContext(input, pattern,
space);
const CustomPatternWithAlias& pattern) {
return anonymizer_.AnonymizeCustomPatternWithoutContext(input, pattern);
}
AnonymizerTool anonymizer_{kFakeFirstPartyExtensionIDs};
......@@ -62,7 +59,8 @@ TEST_F(AnonymizerToolTest, Anonymize) {
anonymizer_.Anonymize("11223344556677889900AABBCCDDEEFF"));
// Make sure custom pattern anonymization is invoked.
EXPECT_EQ("Cell ID: '1'", AnonymizeCustomPatterns("Cell ID: 'A1B2'"));
EXPECT_EQ("Cell ID: '<CellID: 1>'",
AnonymizeCustomPatterns("Cell ID: 'A1B2'"));
// Make sure UUIDs are anonymized.
EXPECT_EQ(
......@@ -158,40 +156,50 @@ TEST_F(AnonymizerToolTest, AnonymizeHashes) {
TEST_F(AnonymizerToolTest, AnonymizeCustomPatterns) {
EXPECT_EQ("", AnonymizeCustomPatterns(""));
EXPECT_EQ("Cell ID: '1'", AnonymizeCustomPatterns("Cell ID: 'A1B2'"));
EXPECT_EQ("Cell ID: '2'", AnonymizeCustomPatterns("Cell ID: 'C1D2'"));
EXPECT_EQ("foo Cell ID: '1' bar",
EXPECT_EQ("Cell ID: '<CellID: 1>'",
AnonymizeCustomPatterns("Cell ID: 'A1B2'"));
EXPECT_EQ("Cell ID: '<CellID: 2>'",
AnonymizeCustomPatterns("Cell ID: 'C1D2'"));
EXPECT_EQ("foo Cell ID: '<CellID: 1>' bar",
AnonymizeCustomPatterns("foo Cell ID: 'A1B2' bar"));
EXPECT_EQ("foo Location area code: '1' bar",
EXPECT_EQ("foo Location area code: '<LocAC: 1>' bar",
AnonymizeCustomPatterns("foo Location area code: 'A1B2' bar"));
EXPECT_EQ("foo\na SSID='1' b\n'",
EXPECT_EQ("foo\na SSID='<SSID: 1>' b\n'",
AnonymizeCustomPatterns("foo\na SSID='Joe's' b\n'"));
EXPECT_EQ("ssid '2'", AnonymizeCustomPatterns("ssid 'My AP'"));
EXPECT_EQ("ssid '<SSID: 2>'", AnonymizeCustomPatterns("ssid 'My AP'"));
EXPECT_EQ("bssid 'aa:bb'", AnonymizeCustomPatterns("bssid 'aa:bb'"));
EXPECT_EQ("Scan SSID - hexdump(len=6): 1\nfoo",
EXPECT_EQ("Scan SSID - hexdump(len=6): <SSIDHex: 1>\nfoo",
AnonymizeCustomPatterns(
"Scan SSID - hexdump(len=6): 47 6f 6f 67 6c 65\nfoo"));
EXPECT_EQ(
"a\nb [SSID=1] [SSID=2] [SSID=foo\nbar] b",
AnonymizeCustomPatterns("a\nb [SSID=foo] [SSID=bar] [SSID=foo\nbar] b"));
EXPECT_EQ("a\nb [SSID=<SSID: 3>] [SSID=<SSID: 1>] [SSID=foo\nbar] b",
AnonymizeCustomPatterns(
"a\nb [SSID=foo] [SSID=Joe's] [SSID=foo\nbar] b"));
EXPECT_EQ("SerialNumber: 1",
EXPECT_EQ("SerialNumber: <Serial: 1>",
AnonymizeCustomPatterns("SerialNumber: 1217D7EF"));
EXPECT_EQ("serial number: 2",
EXPECT_EQ("serial number: <Serial: 2>",
AnonymizeCustomPatterns("serial number: 50C971FEE7F3x010900"));
EXPECT_EQ("SerialNumber: 3",
EXPECT_EQ("SerialNumber: <Serial: 3>",
AnonymizeCustomPatterns("SerialNumber: EVT23-17BA01-004"));
EXPECT_EQ("serial=4", AnonymizeCustomPatterns("serial=\"1234AA5678\""));
EXPECT_EQ("\"gaia_id\":\"1\"",
EXPECT_EQ("serial=\"<Serial: 4>\"",
AnonymizeCustomPatterns("serial=\"1234AA5678\""));
EXPECT_EQ("\"serial_number\"=\"<Serial: 1>\"",
AnonymizeCustomPatterns("\"serial_number\"=\"1217D7EF\""));
EXPECT_EQ("SerialNumber: <Serial: 5>",
AnonymizeCustomPatterns("SerialNumber: 5:00:14.0"));
EXPECT_EQ("Serial: <Serial: 6>",
AnonymizeCustomPatterns("Serial: ABCEFG\x01kjmn-as:342/234\\432"));
EXPECT_EQ("\"gaia_id\":\"<GAIA: 1>\"",
AnonymizeCustomPatterns("\"gaia_id\":\"1234567890\""));
EXPECT_EQ("gaia_id='2'", AnonymizeCustomPatterns("gaia_id='987654321'"));
EXPECT_EQ("{id: 1, email:",
AnonymizeCustomPatterns("{id: 123454321, email:"));
EXPECT_EQ("gaia_id='<GAIA: 2>'",
AnonymizeCustomPatterns("gaia_id='987654321'"));
EXPECT_EQ("{id: <GAIA: 1>, email:",
AnonymizeCustomPatterns("{id: 1234567890, email:"));
EXPECT_EQ("<email: 1>",
AnonymizeCustomPatterns("foo@bar.com"));
......@@ -210,8 +218,9 @@ TEST_F(AnonymizerToolTest, AnonymizeCustomPatterns) {
EXPECT_EQ("<URL: 1>",
AnonymizeCustomPatterns("http://example.com/foo?test=1"));
EXPECT_EQ("Foo <URL: 2> Bar",
AnonymizeCustomPatterns("Foo http://192.168.0.1/foo?test=1#123 Bar"));
EXPECT_EQ(
"Foo <URL: 2> Bar",
AnonymizeCustomPatterns("Foo http://192.168.0.1/foo?test=1#123 Bar"));
const char* kURLs[] = {
"http://example.com/foo?test=1",
"http://userid:password@example.com:8080",
......@@ -242,38 +251,38 @@ TEST_F(AnonymizerToolTest, AnonymizeCustomPatterns) {
}
TEST_F(AnonymizerToolTest, AnonymizeCustomPatternWithContext) {
const char kPattern[] = "(\\b(?i)id:? ')(\\d+)(')";
std::map<std::string, std::string> space;
EXPECT_EQ("", AnonymizeCustomPatternWithContext("", kPattern, &space));
const CustomPatternWithAlias kPattern1 = {"ID", "(\\b(?i)id:? ')(\\d+)(')"};
const CustomPatternWithAlias kPattern2 = {"ID", "(\\b(?i)id=')(\\d+)(')"};
const CustomPatternWithAlias kPattern3 = {"IDG", "(\\b(?i)idg=')(\\d+)(')"};
EXPECT_EQ("", AnonymizeCustomPatternWithContext("", kPattern1));
EXPECT_EQ("foo\nbar\n",
AnonymizeCustomPatternWithContext("foo\nbar\n", kPattern, &space));
EXPECT_EQ("id '1'",
AnonymizeCustomPatternWithContext("id '2345'", kPattern, &space));
EXPECT_EQ("id '2'",
AnonymizeCustomPatternWithContext("id '1234'", kPattern, &space));
EXPECT_EQ("id: '2'",
AnonymizeCustomPatternWithContext("id: '1234'", kPattern, &space));
EXPECT_EQ("ID: '1'",
AnonymizeCustomPatternWithContext("ID: '2345'", kPattern, &space));
EXPECT_EQ("x1 id '1' 1x id '2'\nid '1'\n",
AnonymizeCustomPatternWithContext("foo\nbar\n", kPattern1));
EXPECT_EQ("id '<ID: 1>'",
AnonymizeCustomPatternWithContext("id '2345'", kPattern1));
EXPECT_EQ("id '<ID: 2>'",
AnonymizeCustomPatternWithContext("id '1234'", kPattern1));
EXPECT_EQ("id: '<ID: 2>'",
AnonymizeCustomPatternWithContext("id: '1234'", kPattern1));
EXPECT_EQ("ID: '<ID: 1>'",
AnonymizeCustomPatternWithContext("ID: '2345'", kPattern1));
EXPECT_EQ("x1 id '<ID: 1>' 1x id '<ID: 2>'\nid '<ID: 1>'\n",
AnonymizeCustomPatternWithContext(
"x1 id '2345' 1x id '1234'\nid '2345'\n", kPattern, &space));
space.clear();
EXPECT_EQ("id '1'",
AnonymizeCustomPatternWithContext("id '1234'", kPattern, &space));
space.clear();
EXPECT_EQ("x1z",
AnonymizeCustomPatternWithContext("xyz", "()(y+)()", &space));
"x1 id '2345' 1x id '1234'\nid '2345'\n", kPattern1));
// Different pattern with same alias should reuse the replacements.
EXPECT_EQ("id='<ID: 2>'",
AnonymizeCustomPatternWithContext("id='1234'", kPattern2));
// Different alias should not reuse replacement from another pattern.
EXPECT_EQ("idg='<IDG: 1>'",
AnonymizeCustomPatternWithContext("idg='1234'", kPattern3));
EXPECT_EQ("x<FOO: 1>z",
AnonymizeCustomPatternWithContext("xyz", {"FOO", "()(y+)()"}));
}
TEST_F(AnonymizerToolTest, AnonymizeCustomPatternWithoutContext) {
CustomPatternWithoutContext kPattern = {"pattern", "(o+)"};
std::map<std::string, std::string> space;
EXPECT_EQ("", AnonymizeCustomPatternWithoutContext("", kPattern, &space));
CustomPatternWithAlias kPattern = {"pattern", "(o+)"};
EXPECT_EQ("", AnonymizeCustomPatternWithoutContext("", kPattern));
EXPECT_EQ("f<pattern: 1>\nf<pattern: 2>z\nf<pattern: 1>l\n",
AnonymizeCustomPatternWithoutContext("fo\nfooz\nfol\n", kPattern,
&space));
AnonymizeCustomPatternWithoutContext("fo\nfooz\nfol\n", kPattern));
}
TEST_F(AnonymizerToolTest, AnonymizeChunk) {
......@@ -282,7 +291,7 @@ TEST_F(AnonymizerToolTest, AnonymizeChunk) {
// output of the anonymizer.
std::pair<std::string, std::string> data[] = {
{"aaaaaaaa [SSID=123aaaaaa]aaaaa", // SSID.
"aaaaaaaa [SSID=1]aaaaa"},
"aaaaaaaa [SSID=<SSID: 1>]aaaaa"},
{"aaaaaaaahttp://tets.comaaaaaaa", // URL.
"aaaaaaaa<URL: 1>"},
{"aaaaaemail@example.comaaa", // Email address.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment