Commit e82151b9 authored by liberato@chromium.org, committed by Commit Bot

Add regression support to ExtraTrees.

Since it turns out to be much easier to record results for a
regressor than a classifier, add support for it to ExtraTrees.
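For context, the distinction this change cares about: a classifier reports the
plurality TargetValue from a leaf's distribution, while a regressor reports the
distribution's weighted average. A minimal sketch of the two call patterns,
using PredictDistribution(), FindSingularMax(), and Average() from this CL;
the surrounding call site is illustrative, not part of the change:

  TargetDistribution dist = model->PredictDistribution(features);

  // Classification: report the most common target value, if unique.
  TargetValue label(0);
  if (dist.FindSingularMax(&label)) {
    // |label| holds the plurality class.
  }

  // Regression: report the weighted average of the observed targets.
  double prediction = dist.Average();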

Change-Id: I0734c81bafd87e06c23df8f74b7a0c4c80d616e6
Reviewed-on: https://chromium-review.googlesource.com/c/1391327
Commit-Queue: Frank Liberato <liberato@chromium.org>
Reviewed-by: Dan Sanders <sandersd@chromium.org>
Cr-Commit-Position: refs/heads/master@{#619540}
parent 84043b43
......@@ -60,8 +60,6 @@ TEST_P(ExtraTreesTest, FisherIrisDataset) {
}
// Expect very high accuracy. We should get ~100%.
// We get about 96% for one-hot features, and 100% for numeric. Since the
// data really is numeric, that seems reasonable.
double train_accuracy = ((double)num_correct) / training_data.total_weight();
EXPECT_GT(train_accuracy, 0.95);
}
......@@ -94,6 +92,99 @@ TEST_P(ExtraTreesTest, WeightedTrainingSetIsSupported) {
EXPECT_EQ(predicted_value, example_1.target_value);
}
TEST_P(ExtraTreesTest, RegressionWorks) {
// Create a training set with non-separable data, but give one example in
// each pair a large weight. See if the prediction lands near the heavily
// weighted target.
SetupFeatures(2);
TrainingExample example_1({FeatureValue(1), FeatureValue(123)},
TargetValue(1));
TrainingExample example_1_a({FeatureValue(1), FeatureValue(123)},
TargetValue(5));
TrainingExample example_2({FeatureValue(1), FeatureValue(456)},
TargetValue(20));
TrainingExample example_2_a({FeatureValue(1), FeatureValue(456)},
TargetValue(25));
TrainingData training_data;
example_1.weight = 100;
training_data.push_back(example_1);
training_data.push_back(example_1_a);
example_2.weight = 100;
training_data.push_back(example_2);
training_data.push_back(example_2_a);
task_.target_description.ordering = LearningTask::Ordering::kNumeric;
// Train a model on the weighted training data.
auto model = trainer_.Train(task_, training_data);
// Make sure that the results are in the right range.
TargetDistribution distribution =
model->PredictDistribution(example_1.features);
EXPECT_GT(distribution.Average(), example_1.target_value.value() * 0.95);
EXPECT_LT(distribution.Average(), example_1.target_value.value() * 1.05);
distribution = model->PredictDistribution(example_2.features);
EXPECT_GT(distribution.Average(), example_2.target_value.value() * 0.95);
EXPECT_LT(distribution.Average(), example_2.target_value.value() * 1.05);
}
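As a check on the bounds above (assuming each leaf reproduces its training
distribution), the expected averages are (100 * 1 + 1 * 5) / 101 ≈ 1.04 for
example_1, inside (0.95, 1.05), and (100 * 20 + 1 * 25) / 101 ≈ 20.05 for
example_2, inside (19, 21).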
TEST_P(ExtraTreesTest, RegressionVsBinaryClassification) {
// Create a binary classification task and a regression task that are roughly
// the same. Verify that the results are the same, too. In particular, for
// each set of features, we choose a regression target |pct| between 0 and
// 100. For the corresponding binary classification problem, we add |pct|
// true instances, and 100-|pct| false instances. The predicted averages
// should be roughly the same.
SetupFeatures(3);
TrainingData c_data, r_data;
std::set<TrainingExample> r_examples;
for (size_t i = 0; i < 4 * 4 * 4; i++) {
FeatureValue f1(i & 3);
FeatureValue f2((i >> 2) & 3);
FeatureValue f3((i >> 4) & 3);
int pct = (100 * (f1.value() + f2.value() + f3.value())) / 9;
TrainingExample e({f1, f2, f3}, TargetValue(0));
// TODO(liberato): Consider adding noise, and verifying that the model
// predictions are roughly the same as each other, rather than the same as
// the currently noise-free target.
// Push false and true instances in the right ratio for |pct|. We use
// targets of 0 and 100 rather than 0 and 1 so that the classifier's average
// is scaled to the same range as the regression targets.
e.weight = 100 - pct;
if (e.weight > 0)
c_data.push_back(e);
e.target_value = TargetValue(100);
e.weight = pct;
if (e.weight > 0)
c_data.push_back(e);
// For the regression data, add an example with |pct| directly. Also save
// it so that we can look up the right answer below.
TrainingExample r_example(TrainingExample({f1, f2, f3}, TargetValue(pct)));
r_examples.insert(r_example);
r_data.push_back(r_example);
}
// Train a model on the binary classification task and the regression task.
auto c_model = trainer_.Train(task_, c_data);
task_.target_description.ordering = LearningTask::Ordering::kNumeric;
auto r_model = trainer_.Train(task_, r_data);
// Verify that, for all feature combinations, the models roughly agree. Since
// the data is noise-free and separable, the agreement should be nearly exact.
for (auto& r_example : r_examples) {
const FeatureVector& fv = r_example.features;
TargetDistribution c_dist = c_model->PredictDistribution(fv);
EXPECT_LE(c_dist.Average(), r_example.target_value.value() * 1.05);
EXPECT_GE(c_dist.Average(), r_example.target_value.value() * 0.95);
TargetDistribution r_dist = r_model->PredictDistribution(fv);
EXPECT_LE(r_dist.Average(), r_example.target_value.value() * 1.05);
EXPECT_GE(r_dist.Average(), r_example.target_value.value() * 0.95);
}
}
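To make the construction concrete: for (f1, f2, f3) = (3, 2, 1), pct =
(100 * 6) / 9 = 66, so c_data holds weight 34 at TargetValue(0) and weight 66
at TargetValue(100). The weighted average for those features is
(34 * 0 + 66 * 100) / 100 = 66, exactly the regression target, which is why
the two models should agree within the 5% bounds.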
INSTANTIATE_TEST_CASE_P(ExtraTreesTest,
ExtraTreesTest,
testing::ValuesIn({LearningTask::Ordering::kUnordered,
......
......@@ -80,8 +80,10 @@ void OneHotConverter::ProcessOneFeature(
const TrainingData& training_data) {
// Collect all the distinct values for |index|.
std::set<Value> values;
for (auto& example : training_data)
for (auto& example : training_data) {
DCHECK_GT(example.features.size(), index);
values.insert(example.features[index]);
}
// We let the set's ordering be the one-hot value. It doesn't really matter
// as long as we don't change it once we pick it.
......
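The set's iteration order is what gives each distinct value its one-hot index.
A minimal sketch of the mapping this produces (the map and loop are
illustrative, not code from this CL):

  std::map<Value, size_t> value_to_index;  // |value_to_index| is hypothetical.
  size_t next = 0;
  for (const Value& v : values)
    value_to_index[v] = next++;
  // A feature equal to |v| then becomes a vector with a 1 at index
  // value_to_index[v] and 0 everywhere else.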
......@@ -120,9 +120,15 @@ struct InteriorNode : public Model {
struct LeafNode : public Model {
LeafNode(const TrainingData& training_data,
const std::vector<size_t> training_idx) {
const std::vector<size_t> training_idx,
LearningTask::Ordering ordering) {
for (size_t idx : training_idx)
distribution_ += training_data[idx];
// Note that we don't treat numeric targets any differently: keeping the
// full distribution weights the leaf by its number of examples, while
// collapsing it to an average here would only introduce rounding error.
// The average of the final distribution can be taken at prediction time.
}
// TreeNode
......@@ -155,32 +161,77 @@ std::unique_ptr<Model> RandomTreeTrainer::Train(
const LearningTask& task,
const TrainingData& training_data,
const std::vector<size_t>& training_idx) {
if (training_data.empty())
return std::make_unique<LeafNode>(training_data, std::vector<size_t>());
if (training_data.empty()) {
return std::make_unique<LeafNode>(training_data, std::vector<size_t>(),
LearningTask::Ordering::kUnordered);
}
DCHECK_EQ(task.feature_descriptions.size(), training_data[0].features.size());
return Build(task, training_data, training_idx, FeatureSet());
// Start with all features unused.
FeatureSet unused_set;
for (size_t idx = 0; idx < task.feature_descriptions.size(); idx++)
unused_set.insert(idx);
return Build(task, training_data, training_idx, unused_set);
}
std::unique_ptr<Model> RandomTreeTrainer::Build(
const LearningTask& task,
const TrainingData& training_data,
const std::vector<size_t>& training_idx,
const FeatureSet& used_set) {
const FeatureSet& unused_set) {
DCHECK_GT(training_idx.size(), 0u);
// TODO(liberato): Does it help if we refuse to split without an info gain?
Split best_potential_split;
// TODO(liberato): Enforce a minimum number of samples. ExtraTrees uses 2
// for classification and 5 for regression.
// Select the feature subset to consider at this leaf.
FeatureSet feature_candidates;
for (size_t i = 0; i < training_data[0].features.size(); i++) {
if (used_set.find(i) != used_set.end())
continue;
feature_candidates.insert(i);
// Remove any constant attributes in |training_data| from |unused_set|. Also
// check if our training data has a constant target value.
std::set<TargetValue> target_values;
std::vector<std::set<FeatureValue>> feature_values;
feature_values.resize(training_data[0].features.size());
for (size_t idx : training_idx) {
const TrainingExample& example = training_data[idx];
// Record this target value to see if there is more than one. We skip the
// insertion if we've already determined that it's not constant.
if (target_values.size() < 2)
target_values.insert(example.target_value);
// For each feature in |unused_set|, see if it's constant in our subset of
// the training data.
for (size_t feature_idx : unused_set) {
auto& values = feature_values[feature_idx];
if (values.size() < 2)
values.insert(example.features[feature_idx]);
}
}
// Is the target constant in |training_data|? If so, generate a leaf. If
// we're not normalizing leaves, then this matters, since this training data
// might otherwise be split across multiple leaves.
if (target_values.size() == 1) {
return std::make_unique<LeafNode>(training_data, training_idx,
task.target_description.ordering);
}
// Remove any constant features from the unused set, so that we don't try
// to split on them. It would work, but it would be trivially useless, and we
// don't want to waste one of our per-split candidates on it.
FeatureSet new_unused_set = unused_set;
for (size_t feature_idx : unused_set) {
auto& values = feature_values[feature_idx];
if (values.size() == 1)
new_unused_set.erase(feature_idx);
}
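For example, if every example reaching this node has feature 3 equal to
FeatureValue(7), then feature_values[3] ends up with a single element and
index 3 is erased from |new_unused_set|; a split on it would send every
example down one branch, which the |branch_infos.size() < 2| check below
would collapse into a leaf anyway.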
// Select the feature subset to consider at this leaf.
FeatureSet feature_candidates = new_unused_set;
// TODO(liberato): Let our caller override this.
const size_t features_per_split =
std::min(static_cast<int>(sqrt(feature_candidates.size())), 3);
std::max(static_cast<int>(sqrt(feature_candidates.size())), 3);
// Note that it's okay if there are fewer features left; we'll select all of
// them instead.
while (feature_candidates.size() > features_per_split) {
// Remove a random feature.
size_t which = rng()->Generate(feature_candidates.size());
......@@ -190,6 +241,9 @@ std::unique_ptr<Model> RandomTreeTrainer::Build(
feature_candidates.erase(iter);
}
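Worked numbers for the min-to-max change above: with 16 candidate features,
sqrt gives 4 and max(4, 3) = 4 survive the loop; with 4 candidates,
max(2, 3) = 3 survive; with only 2 candidates the loop never runs and both
are kept, per the comment about having fewer features left.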
// TODO(liberato): Does it help if we refuse to split without an info gain?
Split best_potential_split;
// Find the best split among the candidates that we have.
for (int i : feature_candidates) {
Split potential_split =
......@@ -204,7 +258,8 @@ std::unique_ptr<Model> RandomTreeTrainer::Build(
// but all had the same value). Either way, we should end up with a leaf.
if (best_potential_split.branch_infos.size() < 2) {
// Stop when there is no more tree.
return std::make_unique<LeafNode>(training_data, training_idx);
return std::make_unique<LeafNode>(training_data, training_idx,
task.target_description.ordering);
}
// Build an interior node
......@@ -215,16 +270,17 @@ std::unique_ptr<Model> RandomTreeTrainer::Build(
// there's nothing left to split. For numeric splits, we might want to split
// it further. Note that if there is only one branch for this split, then
// we returned a leaf anyway.
FeatureSet new_used_set(used_set);
if (task.feature_descriptions[best_potential_split.split_index].ordering ==
LearningTask::Ordering::kUnordered) {
new_used_set.insert(best_potential_split.split_index);
DCHECK(new_unused_set.find(best_potential_split.split_index) !=
new_unused_set.end());
new_unused_set.erase(best_potential_split.split_index);
}
for (auto& branch_iter : best_potential_split.branch_infos) {
node->AddChild(branch_iter.first,
Build(task, training_data, branch_iter.second.training_idx,
new_used_set));
new_unused_set));
}
return node;
......@@ -242,6 +298,9 @@ RandomTreeTrainer::Split RandomTreeTrainer::ConstructSplit(
Split split(split_index);
base::Optional<FeatureValue> split_point;
// TODO(liberato): Consider removing nominal feature support and RF. That
// would make this code somewhat simpler.
// For a numeric split, find the split point. Otherwise, we'll split on every
// nominal value that this feature has in |training_data|.
if (task.feature_descriptions[split_index].ordering ==
......@@ -282,9 +341,24 @@ RandomTreeTrainer::Split RandomTreeTrainer::ConstructSplit(
branch_info.target_distribution += example;
}
// Figure out how good / bad this split is.
switch (task.target_description.ordering) {
case LearningTask::Ordering::kUnordered:
ComputeNominalSplitScore(&split, total_weight);
break;
case LearningTask::Ordering::kNumeric:
ComputeNumericSplitScore(&split, total_weight);
break;
}
return split;
}
void RandomTreeTrainer::ComputeNominalSplitScore(Split* split,
double total_weight) {
// Compute the nats given that we're at this node.
split.nats_remaining = 0;
for (auto& info_iter : split.branch_infos) {
split->nats_remaining = 0;
for (auto& info_iter : split->branch_infos) {
Split::BranchInfo& branch_info = info_iter.second;
const double total_counts = branch_info.target_distribution.total_counts();
......@@ -294,11 +368,37 @@ RandomTreeTrainer::Split RandomTreeTrainer::ConstructSplit(
double p = iter.second / total_counts;
// p*log(p) is the expected nats if the answer is |iter|. We multiply
// that by the probability of being in this bucket at all.
split.nats_remaining -= (p * log(p)) * p_branch;
split->nats_remaining -= (p * log(p)) * p_branch;
}
}
}
return split;
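As a worked instance of the nominal score: a branch holding 25 counts of one
target and 75 of another, out of total_weight 200, has p_branch = 100 / 200 =
0.5 and contributes -(0.25 * ln(0.25) + 0.75 * ln(0.75)) * 0.5 ≈ 0.562 * 0.5 ≈
0.281 nats to |nats_remaining|; a pure branch contributes 0.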
void RandomTreeTrainer::ComputeNumericSplitScore(Split* split,
double total_weight) {
// Compute the weighted squared error, which we store in |nats_remaining|,
// given that we're at this node.
split->nats_remaining = 0;
for (auto& info_iter : split->branch_infos) {
Split::BranchInfo& branch_info = info_iter.second;
const double total_counts = branch_info.target_distribution.total_counts();
// |p_branch| is the probability of following this branch.
const double p_branch = total_counts / total_weight;
// Compute the average at this node. We don't know that a leaf here would
// actually predict the average, but in practice it should match. It would
// be nice if we could compute the value (or TargetDistribution) as part of
// computing the split, and hand that distribution to the leaf if this
// branch ends up as one.
double average = branch_info.target_distribution.Average();
for (auto& iter : branch_info.target_distribution) {
// Compute the squared error for all |iter.second| counts that each have a
// value of |iter.first|, when this leaf approximates them as |average|.
double sq_err = (iter.first.value() - average) *
(iter.first.value() - average) * iter.second;
split->nats_remaining += sq_err * p_branch;
}
}
}
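A worked instance of the numeric score: a branch whose distribution holds two
counts at value 10 and two at value 20 has Average() = 15, so its buckets
contribute (10 - 15)^2 * 2 = 50 and (20 - 15)^2 * 2 = 50; with total_weight 8,
p_branch = 4 / 8 = 0.5 and the branch adds (50 + 50) * 0.5 = 50. Despite the
shared |nats_remaining| field, this is a weighted squared error, not an
entropy.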
FeatureValue RandomTreeTrainer::FindNumericSplitPoint(
......
......@@ -158,6 +158,13 @@ class COMPONENT_EXPORT(LEARNING_IMPL) RandomTreeTrainer
const std::vector<size_t>& training_idx,
int index);
// Fill in |nats_remaining| for |split| for a nominal target. |total_weight|
// is the total weight of all instances coming into this split.
void ComputeNominalSplitScore(Split* split, double total_weight);
// Fill in |nats_remaining| for |split| for a numeric target.
void ComputeNumericSplitScore(Split* split, double total_weight);
// Compute the split point for |training_data| for a numeric feature.
FeatureValue FindNumericSplitPoint(size_t index,
const TrainingData& training_data,
......
......@@ -153,7 +153,7 @@ TEST_P(RandomTreeTest, ComplexSeparableTrainingData) {
}
TEST_P(RandomTreeTest, UnseparableTrainingData) {
SetupFeatures(2);
SetupFeatures(1);
TrainingData training_data;
TrainingExample example_1({FeatureValue(123)}, TargetValue(1));
TrainingExample example_2({FeatureValue(123)}, TargetValue(2));
......@@ -177,7 +177,7 @@ TEST_P(RandomTreeTest, UnseparableTrainingData) {
TEST_P(RandomTreeTest, UnknownFeatureValueHandling) {
// Verify how a previously unseen feature value is handled.
SetupFeatures(2);
SetupFeatures(1);
TrainingData training_data;
TrainingExample example_1({FeatureValue(123)}, TargetValue(1));
TrainingExample example_2({FeatureValue(456)}, TargetValue(2));
......@@ -219,7 +219,7 @@ TEST_P(RandomTreeTest, UnknownFeatureValueHandling) {
TEST_P(RandomTreeTest, NumericFeaturesSplitMultipleTimes) {
// Verify that numeric features can be split more than once in the tree.
// This should also pass for nominal features, though it's less interesting.
SetupFeatures(2);
SetupFeatures(1);
TrainingData training_data;
const int feature_mult = 10;
for (size_t i = 0; i < 4; i++) {
......
......@@ -85,6 +85,20 @@ bool TargetDistribution::FindSingularMax(TargetValue* value_out,
return singular_max;
}
double TargetDistribution::Average() const {
double total_value = 0.;
size_t total_counts = 0;
for (auto& iter : counts_) {
total_value += iter.first.value() * iter.second;
total_counts += iter.second;
}
if (!total_counts)
return 0.;
return total_value / total_counts;
}
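Reading directly from the loop above: a distribution with counts
{TargetValue(1): 3, TargetValue(4): 1} yields (1 * 3 + 4 * 1) / 4 = 1.75, and
an empty distribution returns 0 via the early-out rather than dividing by
zero.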
std::string TargetDistribution::ToString() const {
std::ostringstream ss;
ss << "[";
......
......@@ -70,6 +70,10 @@ class COMPONENT_EXPORT(LEARNING_IMPL) TargetDistribution {
bool FindSingularMax(TargetValue* value_out,
size_t* counts_out = nullptr) const;
// Return the average value of the entries in this distribution. Of course,
// this only makes sense if the TargetValues can be interpreted as numeric.
double Average() const;
std::string ToString() const;
private:
......