Commit c359d0b8 authored by Jeffrey Kardatzke, committed by Commit Bot

Expand MAC anonymization and add hash anonymization

This expands MAC anonymization to allow for dash and underscore
separators which are generally seen in Bluetooth MACs. It also adds
anonymization of 32, 40 and 64 char length hex hash values and puts the
4 char hash prefix in the redacted value. It also looks for a special
case which was happening with modetest and ignores that so the data dump
there is preserved.

Bug: 940884
Test: Unit tests pass, uploaded feedback report redaction looks good
Change-Id: I7ced3399f9387392eed24c866d6b60652762e984
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1733749
Commit-Queue: Jeffrey Kardatzke <jkardatzke@google.com>
Reviewed-by: Michael Giuffrida <michaelpg@chromium.org>
Reviewed-by: Jorge Lucangeli Obes <jorgelo@chromium.org>
Reviewed-by: Mike Frysinger <vapier@chromium.org>
Reviewed-by: J Kardatzke <jkardatzke@chromium.org>
Cr-Commit-Position: refs/heads/master@{#684095}
parent c2c22b98
...@@ -385,6 +385,9 @@ std::string AnonymizerTool::Anonymize(const std::string& input) { ...@@ -385,6 +385,9 @@ std::string AnonymizerTool::Anonymize(const std::string& input) {
std::string anonymized = AnonymizeMACAddresses(input); std::string anonymized = AnonymizeMACAddresses(input);
anonymized = AnonymizeAndroidAppStoragePaths(std::move(anonymized)); anonymized = AnonymizeAndroidAppStoragePaths(std::move(anonymized));
anonymized = AnonymizeCustomPatterns(std::move(anonymized)); anonymized = AnonymizeCustomPatterns(std::move(anonymized));
// Do hashes last since they may appear in URLs and they also prevent us from
// properly recognizing the Android storage paths.
anonymized = AnonymizeHashes(std::move(anonymized));
return anonymized; return anonymized;
} }
...@@ -404,14 +407,15 @@ RE2* AnonymizerTool::GetRegExp(const std::string& pattern) { ...@@ -404,14 +407,15 @@ RE2* AnonymizerTool::GetRegExp(const std::string& pattern) {
std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) { std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) {
// This regular expression finds the next MAC address. It splits the data into // This regular expression finds the next MAC address. It splits the data into
// an OUI (Organizationally Unique Identifier) part and a NIC (Network // an OUI (Organizationally Unique Identifier) part and a NIC (Network
// Interface Controller) specific part. // Interface Controller) specific part. We also match on dash and underscore
// because we have seen instances of both of those occurring.
RE2* mac_re = GetRegExp( RE2* mac_re = GetRegExp(
"([0-9a-fA-F][0-9a-fA-F]:" "([0-9a-fA-F][0-9a-fA-F][:\\-_]"
"[0-9a-fA-F][0-9a-fA-F]:" "[0-9a-fA-F][0-9a-fA-F][:\\-_]"
"[0-9a-fA-F][0-9a-fA-F]):(" "[0-9a-fA-F][0-9a-fA-F])[:\\-_]("
"[0-9a-fA-F][0-9a-fA-F]:" "[0-9a-fA-F][0-9a-fA-F][:\\-_]"
"[0-9a-fA-F][0-9a-fA-F]:" "[0-9a-fA-F][0-9a-fA-F][:\\-_]"
"[0-9a-fA-F][0-9a-fA-F])"); "[0-9a-fA-F][0-9a-fA-F])");
std::string result; std::string result;
...@@ -419,12 +423,15 @@ std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) { ...@@ -419,12 +423,15 @@ std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) {
// Keep consuming, building up a result string as we go. // Keep consuming, building up a result string as we go.
re2::StringPiece text(input); re2::StringPiece text(input);
re2::StringPiece skipped; re2::StringPiece skipped, oui, nic;
re2::StringPiece pre_mac, oui, nic; static const char kMacSeparatorChars[] = "-_";
while (FindAndConsumeAndGetSkipped(&text, *mac_re, &skipped, &oui, &nic)) { while (FindAndConsumeAndGetSkipped(&text, *mac_re, &skipped, &oui, &nic)) {
// Look up the MAC address in the hash. // Look up the MAC address in the hash. Force the separator to be a colon
// so that the same MAC with a different format will match in all cases.
std::string oui_string = base::ToLowerASCII(oui.as_string()); std::string oui_string = base::ToLowerASCII(oui.as_string());
base::ReplaceChars(oui_string, kMacSeparatorChars, ":", &oui_string);
std::string nic_string = base::ToLowerASCII(nic.as_string()); std::string nic_string = base::ToLowerASCII(nic.as_string());
base::ReplaceChars(nic_string, kMacSeparatorChars, ":", &nic_string);
std::string mac = oui_string + ":" + nic_string; std::string mac = oui_string + ":" + nic_string;
std::string replacement_mac = mac_addresses_[mac]; std::string replacement_mac = mac_addresses_[mac];
if (replacement_mac.empty()) { if (replacement_mac.empty()) {
...@@ -444,6 +451,58 @@ std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) { ...@@ -444,6 +451,58 @@ std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) {
return result; return result;
} }
// Replaces hexadecimal hash strings found in |input| with placeholders of the
// form "<HASH:pppp N>", where "pppp" is the lower-cased first four hex chars
// of the hash and N is a counter assigned the first time a given hash is seen.
// The same hash (compared case-insensitively) always maps to the same
// placeholder for the lifetime of |hashes_|. Returns the rewritten string;
// text that is not recognized as a hash is copied through unchanged.
std::string AnonymizerTool::AnonymizeHashes(const std::string& input) {
  // This will match hexadecimal strings from length 32 to 64 that have a word
  // boundary at each end. We then check to make sure they are one of our valid
  // hash lengths before replacing.
  // NOTE: There are some occurrences in the dump data (specifically modetest)
  // where relevant data is formatted with 32 hex chars on a line. In this case,
  // it is preceded by at least 3 whitespace chars, so check for that and in
  // that case do not redact.
  RE2* hash_re = GetRegExp(R"((\s*)\b([0-9a-fA-F]{4})([0-9a-fA-F]{28,60})\b)");

  std::string result;
  result.reserve(input.size());

  // Keep consuming, building up a result string as we go.
  re2::StringPiece text(input);
  re2::StringPiece skipped, pre_whitespace, hash_prefix, hash_suffix;
  while (FindAndConsumeAndGetSkipped(&text, *hash_re, &skipped, &pre_whitespace,
                                     &hash_prefix, &hash_suffix)) {
    // The text before the match and the captured leading whitespace are
    // always emitted verbatim; only the hex digits may be replaced.
    skipped.AppendToString(&result);
    pre_whitespace.AppendToString(&result);

    // Check if it's a valid length for our hashes or if we need to skip due to
    // the whitespace check.
    const size_t hash_length = hash_prefix.length() + hash_suffix.length();
    const bool is_hash_length =
        hash_length == 32 || hash_length == 40 || hash_length == 64;
    const bool is_indented_hex_dump =
        hash_length == 32 && pre_whitespace.length() >= 3;
    if (!is_hash_length || is_indented_hex_dump) {
      // This is not a hash string, keep it as-is.
      hash_prefix.AppendToString(&result);
      hash_suffix.AppendToString(&result);
      continue;
    }

    // Look up the hash value in the map of replacements. Hashes are keyed in
    // lower case so differently-cased spellings share one placeholder.
    const std::string hash_prefix_string =
        base::ToLowerASCII(hash_prefix.as_string());
    const std::string hash =
        hash_prefix_string + base::ToLowerASCII(hash_suffix.as_string());

    // operator[] inserts an empty entry for a new hash, so by the time
    // hashes_.size() is read below it already counts this hash. That is what
    // makes the numbering start at 1. Binding a reference avoids a second map
    // lookup and a copy of the stored replacement.
    std::string& replacement_hash = hashes_[hash];
    if (replacement_hash.empty()) {
      // If not found, build up a replacement value. hashes_.size() is a
      // size_t, so it is formatted with %zu rather than the signed %zd.
      replacement_hash = base::StringPrintf(
          "<HASH:%s %zu>", hash_prefix_string.c_str(), hashes_.size());
    }
    result += replacement_hash;
  }

  // Whatever is left after the last match contains no hash candidates.
  text.AppendToString(&result);
  return result;
}
std::string AnonymizerTool::AnonymizeAndroidAppStoragePaths( std::string AnonymizerTool::AnonymizeAndroidAppStoragePaths(
const std::string& input) { const std::string& input) {
// We only use this on Chrome OS and there's differences in the API for // We only use this on Chrome OS and there's differences in the API for
......
...@@ -51,6 +51,7 @@ class AnonymizerTool { ...@@ -51,6 +51,7 @@ class AnonymizerTool {
std::string AnonymizeMACAddresses(const std::string& input); std::string AnonymizeMACAddresses(const std::string& input);
std::string AnonymizeAndroidAppStoragePaths(const std::string& input); std::string AnonymizeAndroidAppStoragePaths(const std::string& input);
std::string AnonymizeHashes(const std::string& input);
std::string AnonymizeCustomPatterns(std::string input); std::string AnonymizeCustomPatterns(std::string input);
std::string AnonymizeCustomPatternWithContext( std::string AnonymizeCustomPatternWithContext(
const std::string& input, const std::string& input,
...@@ -66,12 +67,19 @@ class AnonymizerTool { ...@@ -66,12 +67,19 @@ class AnonymizerTool {
const char* const* first_party_extension_ids_; // Not owned. const char* const* first_party_extension_ids_; // Not owned.
// Map of MAC addresses discovered in anonymized strings to anonymized // Map of MAC addresses discovered in anonymized strings to anonymized
// representations. 11:22:33:44:55:66 gets anonymized to 11:22:33:00:00:01, // representations. 11:22:33:44:55:66 gets anonymized to
// where the first three bytes represent the manufacturer. The last three // [MAC OUI=11:22:33 IFACE=1], where the first three bytes (OUI) represent the
// bytes are used to distinguish different MAC addresses and are incremented // manufacturer. The IFACE value is incremented for each newly discovered MAC
// for each newly discovered MAC address. // address.
std::map<std::string, std::string> mac_addresses_; std::map<std::string, std::string> mac_addresses_;
// Map of hashes discovered in anonymized strings to anonymized
// representations. Hexadecimal strings of length 32, 40 and 64 are considered
// to be hashes. 11223344556677889900aabbccddeeff gets anonymized to
// <HASH:1122 1> where the first 2 bytes of the hash are retained as-is and
// the value after that is incremented for each newly discovered hash.
std::map<std::string, std::string> hashes_;
// Like mac addresses, identifiers in custom patterns are anonymized. // Like mac addresses, identifiers in custom patterns are anonymized.
// custom_patterns_with_context_[i] contains a map of original identifier to // custom_patterns_with_context_[i] contains a map of original identifier to
// anonymized identifier for custom pattern number i. // anonymized identifier for custom pattern number i.
......
...@@ -19,6 +19,10 @@ class AnonymizerToolTest : public testing::Test { ...@@ -19,6 +19,10 @@ class AnonymizerToolTest : public testing::Test {
return anonymizer_.AnonymizeMACAddresses(input); return anonymizer_.AnonymizeMACAddresses(input);
} }
// Test helper: forwards |input| to AnonymizeHashes() on the fixture's
// |anonymizer_| and returns its result unchanged.
std::string AnonymizeHashes(const std::string& input) {
return anonymizer_.AnonymizeHashes(input);
}
std::string AnonymizeAndroidAppStoragePaths(const std::string& input) { std::string AnonymizeAndroidAppStoragePaths(const std::string& input) {
return anonymizer_.AnonymizeAndroidAppStoragePaths(input); return anonymizer_.AnonymizeAndroidAppStoragePaths(input);
} }
...@@ -53,6 +57,10 @@ TEST_F(AnonymizerToolTest, Anonymize) { ...@@ -53,6 +57,10 @@ TEST_F(AnonymizerToolTest, Anonymize) {
EXPECT_EQ("[MAC OUI=02:46:8a IFACE=1]", EXPECT_EQ("[MAC OUI=02:46:8a IFACE=1]",
anonymizer_.Anonymize("02:46:8a:ce:13:57")); anonymizer_.Anonymize("02:46:8a:ce:13:57"));
// Make sure hash anonymization is invoked.
EXPECT_EQ("<HASH:1122 1>",
anonymizer_.Anonymize("11223344556677889900AABBCCDDEEFF"));
// Make sure custom pattern anonymization is invoked. // Make sure custom pattern anonymization is invoked.
EXPECT_EQ("Cell ID: '1'", AnonymizeCustomPatterns("Cell ID: 'A1B2'")); EXPECT_EQ("Cell ID: '1'", AnonymizeCustomPatterns("Cell ID: 'A1B2'"));
...@@ -79,6 +87,10 @@ TEST_F(AnonymizerToolTest, AnonymizeMACAddresses) { ...@@ -79,6 +87,10 @@ TEST_F(AnonymizerToolTest, AnonymizeMACAddresses) {
EXPECT_EQ("11:22:33:44:55", AnonymizeMACAddresses("11:22:33:44:55")); EXPECT_EQ("11:22:33:44:55", AnonymizeMACAddresses("11:22:33:44:55"));
EXPECT_EQ("[MAC OUI=aa:bb:cc IFACE=1]", EXPECT_EQ("[MAC OUI=aa:bb:cc IFACE=1]",
AnonymizeMACAddresses("aa:bb:cc:dd:ee:ff")); AnonymizeMACAddresses("aa:bb:cc:dd:ee:ff"));
EXPECT_EQ("[MAC OUI=aa:bb:cc IFACE=1]",
AnonymizeMACAddresses("aa_bb_cc_dd_ee_ff"));
EXPECT_EQ("[MAC OUI=aa:bb:cc IFACE=1]",
AnonymizeMACAddresses("aa-bb-cc-dd-ee-ff"));
EXPECT_EQ("00:00:00:00:00:00", AnonymizeMACAddresses("00:00:00:00:00:00")); EXPECT_EQ("00:00:00:00:00:00", AnonymizeMACAddresses("00:00:00:00:00:00"));
EXPECT_EQ("ff:ff:ff:ff:ff:ff", AnonymizeMACAddresses("ff:ff:ff:ff:ff:ff")); EXPECT_EQ("ff:ff:ff:ff:ff:ff", AnonymizeMACAddresses("ff:ff:ff:ff:ff:ff"));
EXPECT_EQ( EXPECT_EQ(
...@@ -98,6 +110,51 @@ TEST_F(AnonymizerToolTest, AnonymizeMACAddresses) { ...@@ -98,6 +110,51 @@ TEST_F(AnonymizerToolTest, AnonymizeMACAddresses) {
AnonymizeMACAddresses("Remember bB:Cc:DD:ee:ff:00?")); AnonymizeMACAddresses("Remember bB:Cc:DD:ee:ff:00?"));
} }
// Exercises AnonymizeHashes(): inputs that must pass through untouched, the
// three accepted hash lengths (32, 40 and 64 hex chars), the leading-whitespace
// exemption for 32-char strings, and a multi-line input. All expectations share
// one anonymizer instance, so the placeholder counter keeps increasing across
// them and a previously seen hash keeps its original number.
TEST_F(AnonymizerToolTest, AnonymizeHashes) {
  EXPECT_EQ("", AnonymizeHashes(""));
  EXPECT_EQ("foo\nbar\n", AnonymizeHashes("foo\nbar\n"));
  // Too short.
  EXPECT_EQ("11223344556677889900aabbccddee",
            AnonymizeHashes("11223344556677889900aabbccddee"));
  // Not the right length.
  EXPECT_EQ("11223344556677889900aabbccddeeff1122",
            AnonymizeHashes("11223344556677889900aabbccddeeff1122"));
  // Too long.
  EXPECT_EQ(
      "11223344556677889900aabbccddeeff11223344556677889900aabbccddeeff11",
      AnonymizeHashes("11223344556677889900aabbccddeeff11223344556677889900aabb"
                      "ccddeeff11"));
  // Test all 3 valid lengths.
  EXPECT_EQ("<HASH:aabb 1>",
            AnonymizeHashes("aabbccddeeff00112233445566778899"));
  EXPECT_EQ("<HASH:aabb 2>",
            AnonymizeHashes("aabbccddeeff00112233445566778899aabbccdd"));
  EXPECT_EQ(
      "<HASH:9988 3>",
      AnonymizeHashes(
          "99887766554433221100ffeeddccbbaaaabbccddeeff00112233445566778899"));
  // 32-character hashes preceded by fewer than 3 whitespace chars are still
  // redacted (2 leading spaces here); the whitespace itself is preserved.
  EXPECT_EQ("  <HASH:aabb 1>",
            AnonymizeHashes("  aabbccddeeff00112233445566778899"));
  // Skip 32-character hashes that have at least 3 whitespace chars before
  // them (3 leading spaces here).
  EXPECT_EQ("   aabbccddeeff00112233445566778899",
            AnonymizeHashes("   aabbccddeeff00112233445566778899"));
  // Multiline test.
  EXPECT_EQ(
      "Hash value=<HASH:aabb 1>, should be replaced as\n"
      "well as /<HASH:aabb 1>/ and mixed case of\n"
      "<HASH:aabb 1> but we don't go across lines\n"
      "aabbccddeeff\n00112233445566778899 but allow multiple on a line "
      "<HASH:aabb 4>-"
      "<HASH:0011 5>\n",
      AnonymizeHashes(
          "Hash value=aabbccddeeff00112233445566778899, should be replaced as\n"
          "well as /aabbccddeeff00112233445566778899/ and mixed case of\n"
          "AaBbCCddEeFf00112233445566778899 but we don't go across lines\n"
          "aabbccddeeff\n00112233445566778899 but allow multiple on a line "
          "aabbccddeeffaabbccddeeffaabbccddeeffaabb-"
          "00112233445566778899aabbccddeeff\n"));
}
TEST_F(AnonymizerToolTest, AnonymizeCustomPatterns) { TEST_F(AnonymizerToolTest, AnonymizeCustomPatterns) {
EXPECT_EQ("", AnonymizeCustomPatterns("")); EXPECT_EQ("", AnonymizeCustomPatterns(""));
...@@ -366,7 +423,9 @@ TEST_F(AnonymizerToolTest, AnonymizeChunk) { ...@@ -366,7 +423,9 @@ TEST_F(AnonymizerToolTest, AnonymizeChunk) {
{"chrome://resources/f?user=bar", // Potentially PII in parameter. {"chrome://resources/f?user=bar", // Potentially PII in parameter.
"<URL: 2>"}, "<URL: 2>"},
{"chrome-extension://nkoccljplnhpfnfiajclkommnmllphnl/foobar.js?bar=x", {"chrome-extension://nkoccljplnhpfnfiajclkommnmllphnl/foobar.js?bar=x",
"<URL: 3>"}, // Potentially PII in parameter. "<URL: 3>"}, // Potentially PII in parameter.
{"/root/27540283740a0897ab7c8de0f809add2bacde78f/foo",
"/root/<HASH:2754 1>/foo"}, // Hash string.
#if defined(OS_CHROMEOS) // We only anonymize Android paths on Chrome OS. #if defined(OS_CHROMEOS) // We only anonymize Android paths on Chrome OS.
// Allowed android storage path. // Allowed android storage path.
{"112K\t/home/root/deadbeef1234/android-data/data/system_de", {"112K\t/home/root/deadbeef1234/android-data/data/system_de",
......
...@@ -60,8 +60,8 @@ TEST_F(ShellSystemLogsFetcherTest, TestLogSources) { ...@@ -60,8 +60,8 @@ TEST_F(ShellSystemLogsFetcherTest, TestLogSources) {
EXPECT_TRUE(registry); EXPECT_TRUE(registry);
std::vector<scoped_refptr<const Extension>> extensions{ std::vector<scoped_refptr<const Extension>> extensions{
BuildExtension("My First Extension", "1.1", std::string(32, 'a')), BuildExtension("My First Extension", "1.1", std::string(32, 'g')),
BuildExtension("My Second Extension", "1.2", std::string(32, 'b'))}; BuildExtension("My Second Extension", "1.2", std::string(32, 'h'))};
for (const scoped_refptr<const Extension>& extension : extensions) for (const scoped_refptr<const Extension>& extension : extensions)
registry->AddEnabled(extension); registry->AddEnabled(extension);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment