Commit 7f1542b7 authored by Samuel Huang's avatar Samuel Huang Committed by Commit Bot

[Zucchini] Introduce OutlierDetector.

OutlierDetector is a component of (upcoming) ensemble patching. Its goal
is to reduce the likelihood of false-positive matches by applying simple
statistical tests on BinaryDataHistogram scores.

Bug: 778316
Change-Id: I57e9859b73b9fc62ffffe973ca47ac723e64f6fd
Reviewed-on: https://chromium-review.googlesource.com/798899
Reviewed-by: Greg Thompson <grt@chromium.org>
Commit-Queue: Samuel Huang <huangs@chromium.org>
Cr-Commit-Position: refs/heads/master@{#521342}
parent 41b1f381
......@@ -4,14 +4,59 @@
#include "chrome/installer/zucchini/binary_data_histogram.h"
#include <algorithm>
#include <cmath>
#include <limits>
#include "base/format_macros.h"
#include "base/logging.h"
#include "base/memory/ptr_util.h"
#include "base/strings/stringprintf.h"
namespace zucchini {
/******** OutlierDetector ********/
// Both special members are compiler-generated. They are defined out-of-line
// here rather than in the class declaration.
OutlierDetector::OutlierDetector() = default;
OutlierDetector::~OutlierDetector() = default;
// Folds |sample| into the running count, sum and sum of squares. The derived
// statistics (mean, standard deviation) are not refreshed here; Prepare() does
// that. For BinaryDataHistogram, |sample| is typically in interval [0, 1].
void OutlierDetector::Add(double sample) {
  const double squared = sample * sample;
  sum_of_squares_ += squared;
  sum_ += sample;
  ++n_;
}
// Finalizes the statistics consumed by DecideOutlier() and RenderStats():
// computes the mean and the sample standard deviation (divisor n - 1) from the
// running sums gathered by Add(). Does nothing if no sample has been added.
void OutlierDetector::Prepare() {
  if (n_ == 0)
    return;

  mean_ = sum_ / n_;

  // With a single sample, n_ - 1 is 0; divide by 1 instead to avoid 0 / 0.
  const size_t divisor = std::max(static_cast<size_t>(1), n_ - 1);
  double variance = (sum_of_squares_ - sum_ * mean_) / divisor;

  // Mathematically the variance is non-negative, but floating-point
  // cancellation on tightly clustered samples can leave a tiny negative value,
  // which would turn the square root into NaN. Clamp it to 0. The comparison
  // form (rather than std::max) lets a NaN caused by NaN samples propagate
  // unchanged.
  if (variance < 0.0)
    variance = 0.0;

  standard_deviation_ = std::sqrt(variance);
}
// Formats the mean, standard deviation and sample count into a single
// human-readable line. The mean and standard deviation are the values last
// computed by Prepare().
std::string OutlierDetector::RenderStats() {
  std::string rendered = base::StringPrintf(
      "Mean = %.5f, StdDev = %.5f over %" PRIuS " samples", mean_,
      standard_deviation_, n_);
  return rendered;
}
// Classifies |sample| against the statistics computed by Prepare(): returns 1
// if it lies more than kSigmaBound tolerances above the mean, -1 if it lies
// more than kSigmaBound tolerances below, and 0 otherwise. With fewer than two
// samples there is no spread to judge by, so the result is always 0.
// Constants are chosen for BinaryDataHistogram, where |sample| is typically in
// [0, 1].
int OutlierDetector::DecideOutlier(double sample) {
  // Number of standard deviations away from mean for value to become outlier.
  constexpr double kSigmaBound = 1.9;
  // Floor for the tolerance: avoids divide-by-zero and keeps tight clusters
  // from being penalized.
  constexpr double kMinTolerance = 0.1;

  if (n_ <= 1)
    return 0;

  const double deviation = sample - mean_;
  const double num_sigma =
      deviation / std::max(kMinTolerance, standard_deviation_);

  if (num_sigma > kSigmaBound)
    return 1;
  if (num_sigma < -kSigmaBound)
    return -1;
  return 0;
}
/******** BinaryDataHistogram ********/
// Both special members are compiler-generated. They are defined out-of-line
// here rather than in the class declaration.
BinaryDataHistogram::BinaryDataHistogram() = default;
BinaryDataHistogram::~BinaryDataHistogram() = default;
......@@ -24,7 +69,7 @@ bool BinaryDataHistogram::Compute(ConstBufferView region) {
DCHECK_LE(region.size(),
static_cast<size_t>(std::numeric_limits<int32_t>::max()));
histogram_ = base::MakeUnique<int32_t[]>(kNumBins);
histogram_ = std::make_unique<int32_t[]>(kNumBins);
size_ = region.size();
// Number of 2-byte intervals fully contained in |region|.
size_t bound = size_ - sizeof(uint16_t) + 1;
......
......@@ -9,12 +9,48 @@
#include <stdint.h>
#include <memory>
#include <string>
#include "base/macros.h"
#include "chrome/installer/zucchini/buffer_view.h"
namespace zucchini {
// A class to detect outliers in a list of doubles using Chauvenet's criterion:
// Compute mean and standard deviation of observations, then determine whether
// a query value lies beyond a fixed number of standard deviations (sigmas) from
// the mean. The purpose of this test is to reduce the chance of false-positive
// ensemble matches.
// NOTE(review): the sigma bound used by DecideOutlier() is a fixed constant
// and does not vary with the number of samples -- confirm that "Chauvenet's
// criterion" is the intended name for this test.
//
// Usage: call Add() for every observation, then Prepare() once, then
// DecideOutlier() / RenderStats() as needed.
class OutlierDetector {
public:
OutlierDetector();
~OutlierDetector();
// Incorporates |sample| into mean and standard deviation. Only the running
// totals are updated; the derived statistics are refreshed by Prepare().
void Add(double sample);
// Prepares basic statistics for DecideOutlier() calls. Should be called after
// all samples have been added. Does nothing if no sample was added.
void Prepare();
// Renders current statistics (mean, standard deviation, sample count) as a
// string for logging.
std::string RenderStats();
// Heuristically decides whether |sample| is an outlier. Returns 1 if |sample|
// is "too high", 0 if |sample| is "normal", and -1 if |sample| is "too low".
// Always returns 0 when fewer than two samples have been added.
// Must be called after Prepare().
int DecideOutlier(double sample);
private:
// Number of samples passed to Add().
size_t n_ = 0;
// Running sum of all samples.
double sum_ = 0;
// Running sum of the squares of all samples.
double sum_of_squares_ = 0;
// Mean of the samples; written by Prepare().
double mean_ = 0;
// Standard deviation of the samples; written by Prepare().
double standard_deviation_ = 0;
DISALLOW_COPY_AND_ASSIGN(OutlierDetector);
};
// A class to compute similarity score between binary data. The heuristic here
// preprocesses input data to a size-65536 histogram, counting the frequency of
// consecutive 2-byte sequences. Therefore data with lengths < 2 are considered
......
......@@ -6,6 +6,7 @@
#include <stddef.h>
#include <memory>
#include <vector>
#include "chrome/installer/zucchini/buffer_view.h"
......@@ -13,6 +14,65 @@
namespace zucchini {
// Exercises OutlierDetector on progressively richer sample sets, checking the
// verdict of DecideOutlier() for values below, around and above the mean.
TEST(OutlierDetectorTest, Basic) {
  // Builds a detector that has consumed |samples| and is ready for queries.
  // The detector is heap-allocated because OutlierDetector is not copyable.
  const auto build_detector = [](const std::vector<double>& samples) {
    auto result = std::make_unique<OutlierDetector>();
    for (const double sample : samples)
      result->Add(sample);
    result->Prepare();
    return result;
  };

  // No data: Should at least not cause error.
  {
    const auto detector = build_detector({});
    EXPECT_EQ(0, detector->DecideOutlier(0.0));
  }

  // Single point: Trivially inert.
  {
    const auto detector = build_detector({0.5});
    EXPECT_EQ(0, detector->DecideOutlier(0.1));
    EXPECT_EQ(0, detector->DecideOutlier(0.5));
    EXPECT_EQ(0, detector->DecideOutlier(0.9));
  }

  // Two identical points: StdDev is 0, so falls back to built-in tolerance.
  {
    const auto detector = build_detector({0.5, 0.5});
    EXPECT_EQ(-1, detector->DecideOutlier(0.3));
    EXPECT_EQ(0, detector->DecideOutlier(0.499));
    EXPECT_EQ(0, detector->DecideOutlier(0.5));
    EXPECT_EQ(0, detector->DecideOutlier(0.501));
    EXPECT_EQ(1, detector->DecideOutlier(0.7));
  }

  // Two separate points: Outlier test is pretty lax.
  {
    const auto detector = build_detector({0.4, 0.6});
    EXPECT_EQ(-1, detector->DecideOutlier(0.2));
    EXPECT_EQ(0, detector->DecideOutlier(0.3));
    EXPECT_EQ(0, detector->DecideOutlier(0.5));
    EXPECT_EQ(0, detector->DecideOutlier(0.7));
    EXPECT_EQ(1, detector->DecideOutlier(0.8));
  }

  // Sharpen distribution by clustering toward norm: Now test is stricter.
  {
    const auto detector =
        build_detector({0.4, 0.47, 0.48, 0.49, 0.50, 0.51, 0.52, 0.6});
    EXPECT_EQ(-1, detector->DecideOutlier(0.3));
    EXPECT_EQ(0, detector->DecideOutlier(0.4));
    EXPECT_EQ(0, detector->DecideOutlier(0.5));
    EXPECT_EQ(0, detector->DecideOutlier(0.6));
    EXPECT_EQ(1, detector->DecideOutlier(0.7));
  }

  // Shift numbers around: Mean is 0.3, and data order scrambled.
  {
    const auto detector =
        build_detector({0.28, 0.2, 0.31, 0.4, 0.29, 0.32, 0.27, 0.30});
    EXPECT_EQ(-1, detector->DecideOutlier(0.0));
    EXPECT_EQ(-1, detector->DecideOutlier(0.1));
    EXPECT_EQ(0, detector->DecideOutlier(0.2));
    EXPECT_EQ(0, detector->DecideOutlier(0.3));
    EXPECT_EQ(0, detector->DecideOutlier(0.4));
    EXPECT_EQ(1, detector->DecideOutlier(0.5));
    EXPECT_EQ(1, detector->DecideOutlier(1.0));
  }

  // Typical usage: Potential outlier would be part of original input data!
  {
    const auto detector =
        build_detector({0.3, 0.29, 0.31, 0.0, 0.3, 0.32, 0.3, 0.29, 0.6});
    EXPECT_EQ(-1, detector->DecideOutlier(0.0));
    EXPECT_EQ(0, detector->DecideOutlier(0.28));
    EXPECT_EQ(0, detector->DecideOutlier(0.29));
    EXPECT_EQ(0, detector->DecideOutlier(0.3));
    EXPECT_EQ(0, detector->DecideOutlier(0.31));
    EXPECT_EQ(0, detector->DecideOutlier(0.32));
    EXPECT_EQ(1, detector->DecideOutlier(0.6));
  }
}
TEST(BinaryDataHistogramTest, Basic) {
constexpr double kUninitScore = -1;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment