Commit e82151b9 authored by liberato@chromium.org's avatar liberato@chromium.org Committed by Commit Bot

Add regression support to ExtraTrees.

Since it turns out to be much easier to record results for a
regressor than a classifier, add support for it to ExtraTrees.

Change-Id: I0734c81bafd87e06c23df8f74b7a0c4c80d616e6
Reviewed-on: https://chromium-review.googlesource.com/c/1391327
Commit-Queue: Frank Liberato <liberato@chromium.org>
Reviewed-by: default avatarDan Sanders <sandersd@chromium.org>
Cr-Commit-Position: refs/heads/master@{#619540}
parent 84043b43
...@@ -60,8 +60,6 @@ TEST_P(ExtraTreesTest, FisherIrisDataset) { ...@@ -60,8 +60,6 @@ TEST_P(ExtraTreesTest, FisherIrisDataset) {
} }
// Expect very high accuracy. We should get ~100%. // Expect very high accuracy. We should get ~100%.
// We get about 96% for one-hot features, and 100% for numeric. Since the
// data really is numeric, that seems reasonable.
double train_accuracy = ((double)num_correct) / training_data.total_weight(); double train_accuracy = ((double)num_correct) / training_data.total_weight();
EXPECT_GT(train_accuracy, 0.95); EXPECT_GT(train_accuracy, 0.95);
} }
...@@ -94,6 +92,99 @@ TEST_P(ExtraTreesTest, WeightedTrainingSetIsSupported) { ...@@ -94,6 +92,99 @@ TEST_P(ExtraTreesTest, WeightedTrainingSetIsSupported) {
EXPECT_EQ(predicted_value, example_1.target_value); EXPECT_EQ(predicted_value, example_1.target_value);
} }
// Verify that ExtraTrees works as a regressor: with a numeric target, the
// predicted distribution's average should approximate the weighted mean of
// the training targets for a given feature vector.
TEST_P(ExtraTreesTest, RegressionWorks) {
  // Each feature vector appears twice with different target values, so the
  // data is unseparable.  The heavily weighted example (weight 100 vs. 1)
  // should dominate the weighted average that the regressor predicts for
  // that feature vector.
  SetupFeatures(2);
  TrainingExample example_1({FeatureValue(1), FeatureValue(123)},
                            TargetValue(1));
  TrainingExample example_1_a({FeatureValue(1), FeatureValue(123)},
                              TargetValue(5));
  TrainingExample example_2({FeatureValue(1), FeatureValue(456)},
                            TargetValue(20));
  TrainingExample example_2_a({FeatureValue(1), FeatureValue(456)},
                              TargetValue(25));
  TrainingData training_data;
  example_1.weight = 100;
  training_data.push_back(example_1);
  training_data.push_back(example_1_a);
  example_2.weight = 100;
  training_data.push_back(example_2);
  training_data.push_back(example_2_a);

  // Switch the task's target to numeric so that the trainer builds a
  // regressor rather than a classifier.
  task_.target_description.ordering = LearningTask::Ordering::kNumeric;
  auto model = trainer_.Train(task_, training_data);

  // Make sure that the results are in the right range: within 5% of the
  // dominant (heavily weighted) target value for each feature vector.
  TargetDistribution distribution =
      model->PredictDistribution(example_1.features);
  EXPECT_GT(distribution.Average(), example_1.target_value.value() * 0.95);
  EXPECT_LT(distribution.Average(), example_1.target_value.value() * 1.05);
  distribution = model->PredictDistribution(example_2.features);
  EXPECT_GT(distribution.Average(), example_2.target_value.value() * 0.95);
  EXPECT_LT(distribution.Average(), example_2.target_value.value() * 1.05);
}
// Verify that regression and binary classification agree on equivalent data.
TEST_P(ExtraTreesTest, RegressionVsBinaryClassification) {
  // Create a binary classification task and a regression task that are roughly
  // the same.  Verify that the results are the same, too.  In particular, for
  // each set of features, we choose a regression target |pct| between 0 and
  // 100.  For the corresponding binary classification problem, we add |pct|
  // true instances, and 100-|pct| false instances.  The predicted averages
  // should be roughly the same.
  SetupFeatures(3);
  TrainingData c_data, r_data;
  std::set<TrainingExample> r_examples;
  // Enumerate every combination of three features, each in [0, 3].
  for (size_t i = 0; i < 4 * 4 * 4; i++) {
    FeatureValue f1(i & 3);
    FeatureValue f2((i >> 2) & 3);
    FeatureValue f3((i >> 4) & 3);
    // f1+f2+f3 ranges over [0, 9], so |pct| spans [0, 100].
    int pct = (100 * (f1.value() + f2.value() + f3.value())) / 9;
    TrainingExample e({f1, f2, f3}, TargetValue(0));
    // TODO(liberato): Consider adding noise, and verifying that the model
    // predictions are roughly the same as each other, rather than the same as
    // the currently noise-free target.

    // Push some number of false and some number of true instances that is in
    // the right ratio for |pct|.  We use weights rather than duplicated
    // examples, and a "true" target of 100 instead of 1 so that the
    // classification averages are scaled to the same range as the regression
    // targets.
    e.weight = 100 - pct;
    if (e.weight > 0)
      c_data.push_back(e);
    e.target_value = TargetValue(100);
    e.weight = pct;
    if (e.weight > 0)
      c_data.push_back(e);

    // For the regression data, add an example with |pct| directly.  Also save
    // it so that we can look up the right answer below.
    TrainingExample r_example(TrainingExample({f1, f2, f3}, TargetValue(pct)));
    r_examples.insert(r_example);
    r_data.push_back(r_example);
  }

  // Train a model on the binary classification task and the regression task.
  // |task_| defaults to a classification target; flip it to numeric before
  // training the regressor.
  auto c_model = trainer_.Train(task_, c_data);
  task_.target_description.ordering = LearningTask::Ordering::kNumeric;
  auto r_model = trainer_.Train(task_, r_data);

  // Verify that, for all feature combinations, the models roughly agree
  // (within 5% of the noise-free target).  Since the data is separable, it
  // probably should be exact.
  for (auto& r_example : r_examples) {
    const FeatureVector& fv = r_example.features;
    TargetDistribution c_dist = c_model->PredictDistribution(fv);
    EXPECT_LE(c_dist.Average(), r_example.target_value.value() * 1.05);
    EXPECT_GE(c_dist.Average(), r_example.target_value.value() * 0.95);
    TargetDistribution r_dist = r_model->PredictDistribution(fv);
    EXPECT_LE(r_dist.Average(), r_example.target_value.value() * 1.05);
    EXPECT_GE(r_dist.Average(), r_example.target_value.value() * 0.95);
  }
}
INSTANTIATE_TEST_CASE_P(ExtraTreesTest, INSTANTIATE_TEST_CASE_P(ExtraTreesTest,
ExtraTreesTest, ExtraTreesTest,
testing::ValuesIn({LearningTask::Ordering::kUnordered, testing::ValuesIn({LearningTask::Ordering::kUnordered,
......
...@@ -80,8 +80,10 @@ void OneHotConverter::ProcessOneFeature( ...@@ -80,8 +80,10 @@ void OneHotConverter::ProcessOneFeature(
const TrainingData& training_data) { const TrainingData& training_data) {
// Collect all the distinct values for |index|. // Collect all the distinct values for |index|.
std::set<Value> values; std::set<Value> values;
for (auto& example : training_data) for (auto& example : training_data) {
DCHECK_GE(example.features.size(), index);
values.insert(example.features[index]); values.insert(example.features[index]);
}
// We let the set's ordering be the one-hot value. It doesn't really matter // We let the set's ordering be the one-hot value. It doesn't really matter
// as long as we don't change it once we pick it. // as long as we don't change it once we pick it.
......
...@@ -120,9 +120,15 @@ struct InteriorNode : public Model { ...@@ -120,9 +120,15 @@ struct InteriorNode : public Model {
struct LeafNode : public Model { struct LeafNode : public Model {
LeafNode(const TrainingData& training_data, LeafNode(const TrainingData& training_data,
const std::vector<size_t> training_idx) { const std::vector<size_t> training_idx,
LearningTask::Ordering ordering) {
for (size_t idx : training_idx) for (size_t idx : training_idx)
distribution_ += training_data[idx]; distribution_ += training_data[idx];
// Note that we don't treat numeric targets any differently. We want to
// weight the leaf by the number of examples, so replacing it with an
// average would just introduce rounding errors. One might as well take the
// average of the final distribution.
} }
// TreeNode // TreeNode
...@@ -155,32 +161,77 @@ std::unique_ptr<Model> RandomTreeTrainer::Train( ...@@ -155,32 +161,77 @@ std::unique_ptr<Model> RandomTreeTrainer::Train(
const LearningTask& task, const LearningTask& task,
const TrainingData& training_data, const TrainingData& training_data,
const std::vector<size_t>& training_idx) { const std::vector<size_t>& training_idx) {
if (training_data.empty()) if (training_data.empty()) {
return std::make_unique<LeafNode>(training_data, std::vector<size_t>()); return std::make_unique<LeafNode>(training_data, std::vector<size_t>(),
LearningTask::Ordering::kUnordered);
}
DCHECK_EQ(task.feature_descriptions.size(), training_data[0].features.size());
return Build(task, training_data, training_idx, FeatureSet()); // Start with all features unused.
FeatureSet unused_set;
for (size_t idx = 0; idx < task.feature_descriptions.size(); idx++)
unused_set.insert(idx);
return Build(task, training_data, training_idx, unused_set);
} }
std::unique_ptr<Model> RandomTreeTrainer::Build( std::unique_ptr<Model> RandomTreeTrainer::Build(
const LearningTask& task, const LearningTask& task,
const TrainingData& training_data, const TrainingData& training_data,
const std::vector<size_t>& training_idx, const std::vector<size_t>& training_idx,
const FeatureSet& used_set) { const FeatureSet& unused_set) {
DCHECK_GT(training_idx.size(), 0u); DCHECK_GT(training_idx.size(), 0u);
// TODO(liberato): Does it help if we refuse to split without an info gain? // TODO: enforce a minimum number of samples. ExtraTrees uses 2 for
Split best_potential_split; // classification, and 5 for regression.
// Select the feature subset to consider at this leaf. // Remove any constant attributes in |training_data| from |unused_set|. Also
FeatureSet feature_candidates; // check if our training data has a constant target value.
for (size_t i = 0; i < training_data[0].features.size(); i++) { std::set<TargetValue> target_values;
if (used_set.find(i) != used_set.end()) std::vector<std::set<FeatureValue>> feature_values;
continue; feature_values.resize(training_data[0].features.size());
feature_candidates.insert(i); for (size_t idx : training_idx) {
const TrainingExample& example = training_data[idx];
// Record this target value to see if there is more than one. We skip the
// insertion if we've already determined that it's not constant.
if (target_values.size() < 2)
target_values.insert(example.target_value);
// For all features in |unused_set|, see if it's a constant in our subset of
// the training data.
for (size_t feature_idx : unused_set) {
auto& values = feature_values[feature_idx];
if (values.size() < 2)
values.insert(example.features[feature_idx]);
}
} }
// Is the output constant in |training_data|? If so, then generate a leaf.
// If we're not normalizing leaves, then this matters since this training data
// might be split across multiple leaves.
if (target_values.size() == 1) {
return std::make_unique<LeafNode>(training_data, training_idx,
task.target_description.ordering);
}
// Remove any constant features from the unused set, so that we don't try to
// split on them. It would work, but it would be trivially useless. We also
// don't want to use one of our potential splits on it.
FeatureSet new_unused_set = unused_set;
for (size_t feature_idx : unused_set) {
auto& values = feature_values[feature_idx];
if (values.size() == 1)
new_unused_set.erase(feature_idx);
}
// Select the feature subset to consider at this leaf.
FeatureSet feature_candidates = new_unused_set;
// TODO(liberato): Let our caller override this. // TODO(liberato): Let our caller override this.
const size_t features_per_split = const size_t features_per_split =
std::min(static_cast<int>(sqrt(feature_candidates.size())), 3); std::max(static_cast<int>(sqrt(feature_candidates.size())), 3);
// Note that it's okay if there are fewer features left; we'll select all of
// them instead.
while (feature_candidates.size() > features_per_split) { while (feature_candidates.size() > features_per_split) {
// Remove a random feature. // Remove a random feature.
size_t which = rng()->Generate(feature_candidates.size()); size_t which = rng()->Generate(feature_candidates.size());
...@@ -190,6 +241,9 @@ std::unique_ptr<Model> RandomTreeTrainer::Build( ...@@ -190,6 +241,9 @@ std::unique_ptr<Model> RandomTreeTrainer::Build(
feature_candidates.erase(iter); feature_candidates.erase(iter);
} }
// TODO(liberato): Does it help if we refuse to split without an info gain?
Split best_potential_split;
// Find the best split among the candidates that we have. // Find the best split among the candidates that we have.
for (int i : feature_candidates) { for (int i : feature_candidates) {
Split potential_split = Split potential_split =
...@@ -204,7 +258,8 @@ std::unique_ptr<Model> RandomTreeTrainer::Build( ...@@ -204,7 +258,8 @@ std::unique_ptr<Model> RandomTreeTrainer::Build(
// but all had the same value). Either way, we should end up with a leaf. // but all had the same value). Either way, we should end up with a leaf.
if (best_potential_split.branch_infos.size() < 2) { if (best_potential_split.branch_infos.size() < 2) {
// Stop when there is no more tree. // Stop when there is no more tree.
return std::make_unique<LeafNode>(training_data, training_idx); return std::make_unique<LeafNode>(training_data, training_idx,
task.target_description.ordering);
} }
// Build an interior node // Build an interior node
...@@ -215,16 +270,17 @@ std::unique_ptr<Model> RandomTreeTrainer::Build( ...@@ -215,16 +270,17 @@ std::unique_ptr<Model> RandomTreeTrainer::Build(
// there's nothing left to split. For numeric splits, we might want to split // there's nothing left to split. For numeric splits, we might want to split
// it further. Note that if there is only one branch for this split, then // it further. Note that if there is only one branch for this split, then
// we returned a leaf anyway. // we returned a leaf anyway.
FeatureSet new_used_set(used_set);
if (task.feature_descriptions[best_potential_split.split_index].ordering == if (task.feature_descriptions[best_potential_split.split_index].ordering ==
LearningTask::Ordering::kUnordered) { LearningTask::Ordering::kUnordered) {
new_used_set.insert(best_potential_split.split_index); DCHECK(new_unused_set.find(best_potential_split.split_index) !=
new_unused_set.end());
new_unused_set.erase(best_potential_split.split_index);
} }
for (auto& branch_iter : best_potential_split.branch_infos) { for (auto& branch_iter : best_potential_split.branch_infos) {
node->AddChild(branch_iter.first, node->AddChild(branch_iter.first,
Build(task, training_data, branch_iter.second.training_idx, Build(task, training_data, branch_iter.second.training_idx,
new_used_set)); new_unused_set));
} }
return node; return node;
...@@ -242,6 +298,9 @@ RandomTreeTrainer::Split RandomTreeTrainer::ConstructSplit( ...@@ -242,6 +298,9 @@ RandomTreeTrainer::Split RandomTreeTrainer::ConstructSplit(
Split split(split_index); Split split(split_index);
base::Optional<FeatureValue> split_point; base::Optional<FeatureValue> split_point;
// TODO(liberato): Consider removing nominal feature support and RF. That
// would make this code somewhat simpler.
// For a numeric split, find the split point. Otherwise, we'll split on every // For a numeric split, find the split point. Otherwise, we'll split on every
// nominal value that this feature has in |training_data|. // nominal value that this feature has in |training_data|.
if (task.feature_descriptions[split_index].ordering == if (task.feature_descriptions[split_index].ordering ==
...@@ -282,9 +341,24 @@ RandomTreeTrainer::Split RandomTreeTrainer::ConstructSplit( ...@@ -282,9 +341,24 @@ RandomTreeTrainer::Split RandomTreeTrainer::ConstructSplit(
branch_info.target_distribution += example; branch_info.target_distribution += example;
} }
// Figure out how good / bad this split is.
switch (task.target_description.ordering) {
case LearningTask::Ordering::kUnordered:
ComputeNominalSplitScore(&split, total_weight);
break;
case LearningTask::Ordering::kNumeric:
ComputeNumericSplitScore(&split, total_weight);
break;
}
return split;
}
void RandomTreeTrainer::ComputeNominalSplitScore(Split* split,
double total_weight) {
// Compute the nats given that we're at this node. // Compute the nats given that we're at this node.
split.nats_remaining = 0; split->nats_remaining = 0;
for (auto& info_iter : split.branch_infos) { for (auto& info_iter : split->branch_infos) {
Split::BranchInfo& branch_info = info_iter.second; Split::BranchInfo& branch_info = info_iter.second;
const double total_counts = branch_info.target_distribution.total_counts(); const double total_counts = branch_info.target_distribution.total_counts();
...@@ -294,11 +368,37 @@ RandomTreeTrainer::Split RandomTreeTrainer::ConstructSplit( ...@@ -294,11 +368,37 @@ RandomTreeTrainer::Split RandomTreeTrainer::ConstructSplit(
double p = iter.second / total_counts; double p = iter.second / total_counts;
// p*log(p) is the expected nats if the answer is |iter|. We multiply // p*log(p) is the expected nats if the answer is |iter|. We multiply
// that by the probability of being in this bucket at all. // that by the probability of being in this bucket at all.
split.nats_remaining -= (p * log(p)) * p_branch; split->nats_remaining -= (p * log(p)) * p_branch;
} }
} }
}
return split; void RandomTreeTrainer::ComputeNumericSplitScore(Split* split,
double total_weight) {
// Compute the nats given that we're at this node.
split->nats_remaining = 0;
for (auto& info_iter : split->branch_infos) {
Split::BranchInfo& branch_info = info_iter.second;
const double total_counts = branch_info.target_distribution.total_counts();
// |p_branch| is the probability of following this branch.
const double p_branch = total_counts / total_weight;
// Compute the average at this node. Note that we have no idea if the leaf
// node would actually use an average, but really it should match. It would
// be really nice if we could compute the value (or TargetDistribution) as
// part of computing the split, and have somebody just hand that target
// distribution to the leaf if it ends up as one.
double average = branch_info.target_distribution.Average();
for (auto& iter : branch_info.target_distribution) {
// Compute the squared error for all |iter.second| counts that each have a
// value of |iter.first|, when this leaf approximates them as |average|.
double sq_err = (iter.first.value() - average) *
(iter.first.value() - average) * iter.second;
split->nats_remaining += sq_err * p_branch;
}
}
} }
FeatureValue RandomTreeTrainer::FindNumericSplitPoint( FeatureValue RandomTreeTrainer::FindNumericSplitPoint(
......
...@@ -158,6 +158,13 @@ class COMPONENT_EXPORT(LEARNING_IMPL) RandomTreeTrainer ...@@ -158,6 +158,13 @@ class COMPONENT_EXPORT(LEARNING_IMPL) RandomTreeTrainer
const std::vector<size_t>& training_idx, const std::vector<size_t>& training_idx,
int index); int index);
// Fill in |nats_remaining| for |split| for a nominal target. |total_weight|
// is the total weight of all instances coming into this split.
void ComputeNominalSplitScore(Split* split, double total_weight);
// Fill in |nats_remaining| for |split| for a numeric target.
void ComputeNumericSplitScore(Split* split, double total_weight);
// Compute the split point for |training_data| for a numeric feature. // Compute the split point for |training_data| for a numeric feature.
FeatureValue FindNumericSplitPoint(size_t index, FeatureValue FindNumericSplitPoint(size_t index,
const TrainingData& training_data, const TrainingData& training_data,
......
...@@ -153,7 +153,7 @@ TEST_P(RandomTreeTest, ComplexSeparableTrainingData) { ...@@ -153,7 +153,7 @@ TEST_P(RandomTreeTest, ComplexSeparableTrainingData) {
} }
TEST_P(RandomTreeTest, UnseparableTrainingData) { TEST_P(RandomTreeTest, UnseparableTrainingData) {
SetupFeatures(2); SetupFeatures(1);
TrainingData training_data; TrainingData training_data;
TrainingExample example_1({FeatureValue(123)}, TargetValue(1)); TrainingExample example_1({FeatureValue(123)}, TargetValue(1));
TrainingExample example_2({FeatureValue(123)}, TargetValue(2)); TrainingExample example_2({FeatureValue(123)}, TargetValue(2));
...@@ -177,7 +177,7 @@ TEST_P(RandomTreeTest, UnseparableTrainingData) { ...@@ -177,7 +177,7 @@ TEST_P(RandomTreeTest, UnseparableTrainingData) {
TEST_P(RandomTreeTest, UnknownFeatureValueHandling) { TEST_P(RandomTreeTest, UnknownFeatureValueHandling) {
// Verify how a previously unseen feature value is handled. // Verify how a previously unseen feature value is handled.
SetupFeatures(2); SetupFeatures(1);
TrainingData training_data; TrainingData training_data;
TrainingExample example_1({FeatureValue(123)}, TargetValue(1)); TrainingExample example_1({FeatureValue(123)}, TargetValue(1));
TrainingExample example_2({FeatureValue(456)}, TargetValue(2)); TrainingExample example_2({FeatureValue(456)}, TargetValue(2));
...@@ -219,7 +219,7 @@ TEST_P(RandomTreeTest, UnknownFeatureValueHandling) { ...@@ -219,7 +219,7 @@ TEST_P(RandomTreeTest, UnknownFeatureValueHandling) {
TEST_P(RandomTreeTest, NumericFeaturesSplitMultipleTimes) { TEST_P(RandomTreeTest, NumericFeaturesSplitMultipleTimes) {
// Verify that numeric features can be split more than once in the tree. // Verify that numeric features can be split more than once in the tree.
// This should also pass for nominal features, though it's less interesting. // This should also pass for nominal features, though it's less interesting.
SetupFeatures(2); SetupFeatures(1);
TrainingData training_data; TrainingData training_data;
const int feature_mult = 10; const int feature_mult = 10;
for (size_t i = 0; i < 4; i++) { for (size_t i = 0; i < 4; i++) {
......
...@@ -85,6 +85,20 @@ bool TargetDistribution::FindSingularMax(TargetValue* value_out, ...@@ -85,6 +85,20 @@ bool TargetDistribution::FindSingularMax(TargetValue* value_out,
return singular_max; return singular_max;
} }
double TargetDistribution::Average() const {
double total_value = 0.;
size_t total_counts = 0;
for (auto& iter : counts_) {
total_value += iter.first.value() * iter.second;
total_counts += iter.second;
}
if (!total_counts)
return 0.;
return total_value / total_counts;
}
std::string TargetDistribution::ToString() const { std::string TargetDistribution::ToString() const {
std::ostringstream ss; std::ostringstream ss;
ss << "["; ss << "[";
......
...@@ -70,6 +70,10 @@ class COMPONENT_EXPORT(LEARNING_IMPL) TargetDistribution { ...@@ -70,6 +70,10 @@ class COMPONENT_EXPORT(LEARNING_IMPL) TargetDistribution {
bool FindSingularMax(TargetValue* value_out, bool FindSingularMax(TargetValue* value_out,
size_t* counts_out = nullptr) const; size_t* counts_out = nullptr) const;
// Return the average value of the entries in this distribution. Of course,
// this only makes sense if the TargetValues can be interpreted as numeric.
double Average() const;
std::string ToString() const; std::string ToString() const;
private: private:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment