Commit e82151b9 authored by liberato@chromium.org, committed by Commit Bot

Add regression support to ExtraTrees.

Since it turns out to be much easier to record results for a
regressor than a classifier, add support for it to ExtraTrees.
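For context, the distinction this change cares about: a classifier reports the
plurality TargetValue from a leaf's distribution, while a regressor reports the
distribution's weighted average. A minimal sketch of the two call patterns,
using PredictDistribution(), FindSingularMax(), and Average() from this CL;
the surrounding call site is illustrative, not part of the change:

  TargetDistribution dist = model->PredictDistribution(features);

  // Classification: report the most common target value, if unique.
  TargetValue label(0);
  if (dist.FindSingularMax(&label)) {
    // |label| holds the plurality class.
  }

  // Regression: report the weighted average of the observed targets.
  double prediction = dist.Average();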

Change-Id: I0734c81bafd87e06c23df8f74b7a0c4c80d616e6
Reviewed-on: https://chromium-review.googlesource.com/c/1391327
Commit-Queue: Frank Liberato <liberato@chromium.org>
Reviewed-by: Dan Sanders <sandersd@chromium.org>
Cr-Commit-Position: refs/heads/master@{#619540}
parent 84043b43
......@@ -60,8 +60,6 @@ TEST_P(ExtraTreesTest, FisherIrisDataset) {
}
// Expect very high accuracy. We should get ~100%.
// We get about 96% for one-hot features, and 100% for numeric. Since the
// data really is numeric, that seems reasonable.
double train_accuracy = ((double)num_correct) / training_data.total_weight();
EXPECT_GT(train_accuracy, 0.95);
}
......@@ -94,6 +92,99 @@ TEST_P(ExtraTreesTest, WeightedTrainingSetIsSupported) {
EXPECT_EQ(predicted_value, example_1.target_value);
}
TEST_P(ExtraTreesTest, RegressionWorks) {
// Create a training set with non-separable data, but give one example in
// each pair a large weight. See if the prediction lands near the heavily
// weighted target.
SetupFeatures(2);
TrainingExample example_1({FeatureValue(1), FeatureValue(123)},
TargetValue(1));
TrainingExample example_1_a({FeatureValue(1), FeatureValue(123)},
TargetValue(5));
TrainingExample example_2({FeatureValue(1), FeatureValue(456)},
TargetValue(20));
TrainingExample example_2_a({FeatureValue(1), FeatureValue(456)},
TargetValue(25));
TrainingData training_data;
example_1.weight = 100;
training_data.push_back(example_1);
training_data.push_back(example_1_a);
example_2.weight = 100;
training_data.push_back(example_2);
training_data.push_back(example_2_a);
task_.target_description.ordering = LearningTask::Ordering::kNumeric;
// Train a model on the weighted training data.
auto model = trainer_.Train(task_, training_data);
// Make sure that the results are in the right range.
TargetDistribution distribution =
model->PredictDistribution(example_1.features);
EXPECT_GT(distribution.Average(), example_1.target_value.value() * 0.95);
EXPECT_LT(distribution.Average(), example_1.target_value.value() * 1.05);
distribution = model->PredictDistribution(example_2.features);
EXPECT_GT(distribution.Average(), example_2.target_value.value() * 0.95);
EXPECT_LT(distribution.Average(), example_2.target_value.value() * 1.05);
}
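As a check on the bounds above (assuming each leaf reproduces its training
distribution), the expected averages are (100 * 1 + 1 * 5) / 101 ≈ 1.04 for
example_1, inside (0.95, 1.05), and (100 * 20 + 1 * 25) / 101 ≈ 20.05 for
example_2, inside (19, 21).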
TEST_P(ExtraTreesTest, RegressionVsBinaryClassification) {
// Create a binary classification task and a regression task that are roughly
// the same. Verify that the results are the same, too. In particular, for
// each set of features, we choose a regression target |pct| between 0 and
// 100. For the corresponding binary classification problem, we add |pct|
// true instances, and 100-|pct| false instances. The predicted averages
// should be roughly the same.
SetupFeatures(3);
TrainingData c_data, r_data;
std::set<TrainingExample> r_examples;
for (size_t i = 0; i < 4 * 4 * 4; i++) {
FeatureValue f1(i & 3);
FeatureValue f2((i >> 2) & 3);
FeatureValue f3((i >> 4) & 3);
int pct = (100 * (f1.value() + f2.value() + f3.value())) / 9;
TrainingExample e({f1, f2, f3}, TargetValue(0));
// TODO(liberato): Consider adding noise, and verifying that the model
// predictions are roughly the same as each other, rather than the same as
// the currently noise-free target.
// Push false and true instances in the right ratio for |pct|. We use
// targets of 0 and 100 rather than 0 and 1 so that the classifier's average
// is scaled to the same range as the regression targets.
e.weight = 100 - pct;
if (e.weight > 0)
c_data.push_back(e);
e.target_value = TargetValue(100);
e.weight = pct;
if (e.weight > 0)
c_data.push_back(e);
// For the regression data, add an example with |pct| directly. Also save
// it so that we can look up the right answer below.
TrainingExample r_example(TrainingExample({f1, f2, f3}, TargetValue(pct)));
r_examples.insert(r_example);
r_data.push_back(r_example);
}
// Train a model on the binary classification task and the regression task.
auto c_model = trainer_.Train(task_, c_data);
task_.target_description.ordering = LearningTask::Ordering::kNumeric;
auto r_model = trainer_.Train(task_, r_data);
// Verify that, for all feature combinations, the models roughly agree. Since
// the data is noise-free and separable, the agreement should be nearly exact.
for (auto& r_example : r_examples) {
const FeatureVector& fv = r_example.features;
TargetDistribution c_dist = c_model->PredictDistribution(fv);
EXPECT_LE(c_dist.Average(), r_example.target_value.value() * 1.05);
EXPECT_GE(c_dist.Average(), r_example.target_value.value() * 0.95);
TargetDistribution r_dist = r_model->PredictDistribution(fv);
EXPECT_LE(r_dist.Average(), r_example.target_value.value() * 1.05);
EXPECT_GE(r_dist.Average(), r_example.target_value.value() * 0.95);
}
}
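To make the construction concrete: for (f1, f2, f3) = (3, 2, 1), pct =
(100 * 6) / 9 = 66, so c_data holds weight 34 at TargetValue(0) and weight 66
at TargetValue(100). The weighted average for those features is
(34 * 0 + 66 * 100) / 100 = 66, exactly the regression target, which is why
the two models should agree within the 5% bounds.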
INSTANTIATE_TEST_CASE_P(ExtraTreesTest,
ExtraTreesTest,
testing::ValuesIn({LearningTask::Ordering::kUnordered,
......
......@@ -80,8 +80,10 @@ void OneHotConverter::ProcessOneFeature(
const TrainingData& training_data) {
// Collect all the distinct values for |index|.
std::set<Value> values;
for (auto& example : training_data)
for (auto& example : training_data) {
DCHECK_GT(example.features.size(), index);
values.insert(example.features[index]);
}
// We let the set's ordering be the one-hot value. It doesn't really matter
// as long as we don't change it once we pick it.
......
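The set's iteration order is what gives each distinct value its one-hot index.
A minimal sketch of the mapping this produces (the map and loop are
illustrative, not code from this CL):

  std::map<Value, size_t> value_to_index;  // |value_to_index| is hypothetical.
  size_t next = 0;
  for (const Value& v : values)
    value_to_index[v] = next++;
  // A feature equal to |v| then becomes a vector with a 1 at index
  // value_to_index[v] and 0 everywhere else.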
......@@ -120,9 +120,15 @@ struct InteriorNode : public Model {
struct LeafNode : public Model {
LeafNode(const TrainingData& training_data,
const std::vector<size_t> training_idx) {
const std::vector<size_t> training_idx,
LearningTask::Ordering ordering) {
for (size_t idx : training_idx)
distribution_ += training_data[idx];
// Note that we don't treat numeric targets any differently: keeping the
// full distribution weights the leaf by its number of examples, while
// collapsing it to an average here would only introduce rounding error.
// The average of the final distribution can be taken at prediction time.
}
// TreeNode
......@@ -155,32 +161,77 @@ std::unique_ptr<Model> RandomTreeTrainer::Train(
const LearningTask& task,
const TrainingData& training_data,
const std::vector<size_t>& training_idx) {
if (training_data.empty())
return std::make_unique<LeafNode>(training_data, std::vector<size_t>());
if (training_data.empty()) {
return std::make_unique<LeafNode>(training_data, std::vector<size_t>(),
LearningTask::Ordering::kUnordered);
}
DCHECK_EQ(task.feature_descriptions.size(), training_data[0].features.size());
return Build(task, training_data, training_idx, FeatureSet());
// Start with all features unused.
FeatureSet unused_set;
for (size_t idx = 0; idx < task.feature_descriptions.size(); idx++)
unused_set.insert(idx);
return Build(task, training_data, training_idx, unused_set);
}
std::unique_ptr<Model> RandomTreeTrainer::Build(
const LearningTask& task,
const TrainingData& training_data,
const std::vector<size_t>& training_idx,
const FeatureSet& used_set) {
const FeatureSet& unused_set) {
DCHECK_GT(training_idx.size(), 0u);
// TODO(liberato): Does it help if we refuse to split without an info gain?
Split best_potential_split;
// TODO(liberato): Enforce a minimum number of samples. ExtraTrees uses 2
// for classification and 5 for regression.
// Select the feature subset to consider at this leaf.
FeatureSet feature_candidates;
for (size_t i = 0; i < training_data[0].features.size(); i++) {
if (used_set.find(i) != used_set.end())
continue;
feature_candidates.insert(i);
// Remove any constant attributes in |training_data| from |unused_set|. Also
// check if our training data has a constant target value.
std::set<TargetValue> target_values;
std::vector<std::set<FeatureValue>> feature_values;
feature_values.resize(training_data[0].features.size());
for (size_t idx : training_idx) {
const TrainingExample& example = training_data[idx];
// Record this target value to see if there is more than one. We skip the
// insertion if we've already determined that it's not constant.
if (target_values.size() < 2)
target_values.insert(example.target_value);
// For each feature in |unused_set|, see if it's constant in our subset of
// the training data.
for (size_t feature_idx : unused_set) {
auto& values = feature_values[feature_idx];
if (values.size() < 2)
values.insert(example.features[feature_idx]);
}
}
// Is the target constant in |training_data|? If so, generate a leaf. If
// we're not normalizing leaves, then this matters, since this training data
// might otherwise be split across multiple leaves.
if (target_values.size() == 1) {
return std::make_unique<LeafNode>(training_data, training_idx,
task.target_description.ordering);
}
// Remove any constant features from the unused set, so that we don't try
// to split on them. It would work, but it would be trivially useless, and we
// don't want to waste one of our per-split candidates on it.
FeatureSet new_unused_set = unused_set;
for (size_t feature_idx : unused_set) {
auto& values = feature_values[feature_idx];
if (values.size() == 1)
new_unused_set.erase(feature_idx);
}
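For example, if every example reaching this node has feature 3 equal to
FeatureValue(7), then feature_values[3] ends up with a single element and
index 3 is erased from |new_unused_set|; a split on it would send every
example down one branch, which the |branch_infos.size() < 2| check below
would collapse into a leaf anyway.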
// Select the feature subset to consider at this leaf.
FeatureSet feature_candidates = new_unused_set;
// TODO(liberato): Let our caller override this.
const size_t features_per_split =
std::min(static_cast<int>(sqrt(feature_candidates.size())), 3);
std::max(static_cast<int>(sqrt(feature_candidates.size())), 3);
// Note that it's okay if there are fewer features left; we'll select all of
// them instead.
while (feature_candidates.size() > features_per_split) {
// Remove a random feature.
size_t which = rng()->Generate(feature_candidates.size());
......@@ -190,6 +241,9 @@ std::unique_ptr<Model> RandomTreeTrainer::Build(
feature_candidates.erase(iter);
}
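Worked numbers for the min-to-max change above: with 16 candidate features,
sqrt gives 4 and max(4, 3) = 4 survive the loop; with 4 candidates,
max(2, 3) = 3 survive; with only 2 candidates the loop never runs and both
are kept, per the comment about having fewer features left.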
// TODO(liberato): Does it help if we refuse to split without an info gain?
Split best_potential_split;
// Find the best split among the candidates that we have.
for (int i : feature_candidates) {
Split potential_split =
......@@ -204,7 +258,8 @@ std::unique_ptr<Model> RandomTreeTrainer::Build(
// but all had the same value). Either way, we should end up with a leaf.
if (best_potential_split.branch_infos.size() < 2) {
// Stop when there is no more tree.
return std::make_unique<LeafNode>(training_data, training_idx);
return std::make_unique<LeafNode>(training_data, training_idx,
task.target_description.ordering);
}
// Build an interior node
......@@ -215,16 +270,17 @@ std::unique_ptr<Model> RandomTreeTrainer::Build(
// there's nothing left to split. For numeric splits, we might want to split
// it further. Note that if there is only one branch for this split, then
// we returned a leaf anyway.
FeatureSet new_used_set(used_set);
if (task.feature_descriptions[best_potential_split.split_index].ordering ==
LearningTask::Ordering::kUnordered) {
new_used_set.insert(best_potential_split.split_index);
DCHECK(new_unused_set.find(best_potential_split.split_index) !=
new_unused_set.end());
new_unused_set.erase(best_potential_split.split_index);
}
for (auto& branch_iter : best_potential_split.branch_infos) {
node->AddChild(branch_iter.first,
Build(task, training_data, branch_iter.second.training_idx,
new_used_set));
new_unused_set));
}
return node;
......@@ -242,6 +298,9 @@ RandomTreeTrainer::Split RandomTreeTrainer::ConstructSplit(
Split split(split_index);
base::Optional<FeatureValue> split_point;
// TODO(liberato): Consider removing nominal feature support and RF. That
// would make this code somewhat simpler.
// For a numeric split, find the split point. Otherwise, we'll split on every
// nominal value that this feature has in |training_data|.
if (task.feature_descriptions[split_index].ordering ==
......@@ -282,9 +341,24 @@ RandomTreeTrainer::Split RandomTreeTrainer::ConstructSplit(
branch_info.target_distribution += example;
}
// Figure out how good / bad this split is.
switch (task.target_description.ordering) {
case LearningTask::Ordering::kUnordered:
ComputeNominalSplitScore(&split, total_weight);
break;
case LearningTask::Ordering::kNumeric:
ComputeNumericSplitScore(&split, total_weight);
break;
}
return split;
}
void RandomTreeTrainer::ComputeNominalSplitScore(Split* split,
double total_weight) {
// Compute the nats given that we're at this node.
split.nats_remaining = 0;
for (auto& info_iter : split.branch_infos) {
split->nats_remaining = 0;
for (auto& info_iter : split->branch_infos) {
Split::BranchInfo& branch_info = info_iter.second;
const double total_counts = branch_info.target_distribution.total_counts();
......@@ -294,11 +368,37 @@ RandomTreeTrainer::Split RandomTreeTrainer::ConstructSplit(
double p = iter.second / total_counts;
// p*log(p) is the expected nats if the answer is |iter|. We multiply
// that by the probability of being in this bucket at all.
split.nats_remaining -= (p * log(p)) * p_branch;
split->nats_remaining -= (p * log(p)) * p_branch;
}
}
}
return split;
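As a worked instance of the nominal score: a branch holding 25 counts of one
target and 75 of another, out of total_weight 200, has p_branch = 100 / 200 =
0.5 and contributes -(0.25 * ln(0.25) + 0.75 * ln(0.75)) * 0.5 ≈ 0.562 * 0.5 ≈
0.281 nats to |nats_remaining|; a pure branch contributes 0.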
void RandomTreeTrainer::ComputeNumericSplitScore(Split* split,
double total_weight) {
// Compute the weighted squared error, which we store in |nats_remaining|,
// given that we're at this node.
split->nats_remaining = 0;
for (auto& info_iter : split->branch_infos) {
Split::BranchInfo& branch_info = info_iter.second;
const double total_counts = branch_info.target_distribution.total_counts();
// |p_branch| is the probability of following this branch.
const double p_branch = total_counts / total_weight;
// Compute the average at this node. We don't know that a leaf here would
// actually predict the average, but in practice it should match. It would
// be nice if we could compute the value (or TargetDistribution) as part of
// computing the split, and hand that distribution to the leaf if this
// branch ends up as one.
double average = branch_info.target_distribution.Average();
for (auto& iter : branch_info.target_distribution) {
// Compute the squared error for all |iter.second| counts that each have a
// value of |iter.first|, when this leaf approximates them as |average|.
double sq_err = (iter.first.value() - average) *
(iter.first.value() - average) * iter.second;
split->nats_remaining += sq_err * p_branch;
}
}
}
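A worked instance of the numeric score: a branch whose distribution holds two
counts at value 10 and two at value 20 has Average() = 15, so its buckets
contribute (10 - 15)^2 * 2 = 50 and (20 - 15)^2 * 2 = 50; with total_weight 8,
p_branch = 4 / 8 = 0.5 and the branch adds (50 + 50) * 0.5 = 50. Despite the
shared |nats_remaining| field, this is a weighted squared error, not an
entropy.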
FeatureValue RandomTreeTrainer::FindNumericSplitPoint(
......
......@@ -158,6 +158,13 @@ class COMPONENT_EXPORT(LEARNING_IMPL) RandomTreeTrainer
const std::vector<size_t>& training_idx,
int index);
// Fill in |nats_remaining| for |split| for a nominal target. |total_weight|
// is the total weight of all instances coming into this split.
void ComputeNominalSplitScore(Split* split, double total_weight);
// Fill in |nats_remaining| for |split| for a numeric target.
void ComputeNumericSplitScore(Split* split, double total_weight);
// Compute the split point for |training_data| for a numeric feature.
FeatureValue FindNumericSplitPoint(size_t index,
const TrainingData& training_data,
......
......@@ -153,7 +153,7 @@ TEST_P(RandomTreeTest, ComplexSeparableTrainingData) {
}
TEST_P(RandomTreeTest, UnseparableTrainingData) {
SetupFeatures(2);
SetupFeatures(1);
TrainingData training_data;
TrainingExample example_1({FeatureValue(123)}, TargetValue(1));
TrainingExample example_2({FeatureValue(123)}, TargetValue(2));
......@@ -177,7 +177,7 @@ TEST_P(RandomTreeTest, UnseparableTrainingData) {
TEST_P(RandomTreeTest, UnknownFeatureValueHandling) {
// Verify how a previously unseen feature value is handled.
SetupFeatures(2);
SetupFeatures(1);
TrainingData training_data;
TrainingExample example_1({FeatureValue(123)}, TargetValue(1));
TrainingExample example_2({FeatureValue(456)}, TargetValue(2));
......@@ -219,7 +219,7 @@ TEST_P(RandomTreeTest, UnknownFeatureValueHandling) {
TEST_P(RandomTreeTest, NumericFeaturesSplitMultipleTimes) {
// Verify that numeric features can be split more than once in the tree.
// This should also pass for nominal features, though it's less interesting.
SetupFeatures(2);
SetupFeatures(1);
TrainingData training_data;
const int feature_mult = 10;
for (size_t i = 0; i < 4; i++) {
......
......@@ -85,6 +85,20 @@ bool TargetDistribution::FindSingularMax(TargetValue* value_out,
return singular_max;
}
double TargetDistribution::Average() const {
double total_value = 0.;
size_t total_counts = 0;
for (auto& iter : counts_) {
total_value += iter.first.value() * iter.second;
total_counts += iter.second;
}
if (!total_counts)
return 0.;
return total_value / total_counts;
}
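Reading directly from the loop above: a distribution with counts
{TargetValue(1): 3, TargetValue(4): 1} yields (1 * 3 + 4 * 1) / 4 = 1.75, and
an empty distribution returns 0 via the early-out rather than dividing by
zero.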
std::string TargetDistribution::ToString() const {
std::ostringstream ss;
ss << "[";
......
......@@ -70,6 +70,10 @@ class COMPONENT_EXPORT(LEARNING_IMPL) TargetDistribution {
bool FindSingularMax(TargetValue* value_out,
size_t* counts_out = nullptr) const;
// Return the average value of the entries in this distribution. Of course,
// this only makes sense if the TargetValues can be interpreted as numeric.
double Average() const;
std::string ToString() const;
private:
......