Commit 833515ea authored by Charles Zhao's avatar Charles Zhao Committed by Commit Bot

Add normalizer and convert_to_string preprocessors.

(1) Normalizer rescales an int32 or float feature by dividing it by the
    normalizer.

(2) Convert_to_string preprocessor sets a feature from int32 or bool
    to string_value.

BUG=786472

Change-Id: Ia93bbb7a28be4fdd22a6d72e83c86852878bc0a6
Reviewed-on: https://chromium-review.googlesource.com/1001085
Commit-Queue: Charles . <charleszhao@chromium.org>
Reviewed-by: default avatarAndrew Moylan <amoylan@chromium.org>
Cr-Commit-Position: refs/heads/master@{#549797}
parent 121920a0
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#include "components/assist_ranker/example_preprocessing.h" #include "components/assist_ranker/example_preprocessing.h"
#include "base/numerics/ranges.h"
#include "base/strings/strcat.h" #include "base/strings/strcat.h"
#include "base/strings/string_number_conversions.h" #include "base/strings/string_number_conversions.h"
#include "components/assist_ranker/ranker_example_util.h" #include "components/assist_ranker/ranker_example_util.h"
...@@ -32,7 +33,8 @@ std::string ExamplePreprocessor::FeatureFullname( ...@@ -32,7 +33,8 @@ std::string ExamplePreprocessor::FeatureFullname(
int ExamplePreprocessor::Process(RankerExample* const example, int ExamplePreprocessor::Process(RankerExample* const example,
const bool clear_other_features) const { const bool clear_other_features) const {
return AddMissingFeatures(example) | AddBucketizedFeatures(example) | return AddMissingFeatures(example) | NormalizeFeatures(example) |
AddBucketizedFeatures(example) | ConvertToStringFeatures(example) |
Vectorization(example, clear_other_features); Vectorization(example, clear_other_features);
} }
...@@ -77,7 +79,7 @@ int ExamplePreprocessor::AddBucketizedFeatures( ...@@ -77,7 +79,7 @@ int ExamplePreprocessor::AddBucketizedFeatures(
default: default:
DVLOG(2) << "Can't bucketize feature type: " DVLOG(2) << "Can't bucketize feature type: "
<< feature.feature_type_case(); << feature.feature_type_case();
error_code |= kUnbucketizableFeatureType; error_code |= kNonbucketizableFeatureType;
continue; continue;
} }
// Get the bucket from the boundaries; the first index that value<boundary. // Get the bucket from the boundaries; the first index that value<boundary.
...@@ -93,6 +95,55 @@ int ExamplePreprocessor::AddBucketizedFeatures( ...@@ -93,6 +95,55 @@ int ExamplePreprocessor::AddBucketizedFeatures(
return error_code; return error_code;
} }
// Rescales every feature named in |config_.normalizers()|: the feature's
// value (read through GetFeatureValueAsFloat) is divided by its configured
// normalizer, clamped to [-1.0, 1.0] and written back to |example| as a
// float_value.
//
// Returns a bitwise OR of PreprocessErrorCode values:
//   - kNonNormalizableFeatureType if GetFeatureValueAsFloat() returns false
//     for a feature; nothing is written for that feature.
//   - kNormalizerIsZero if the configured normalizer is 0. The division is
//     skipped, but the value is still clamped and stored as a float_value.
int ExamplePreprocessor::NormalizeFeatures(RankerExample* example) const {
  int status = kSuccess;
  for (const MapPair<std::string, float>& normalizer : config_.normalizers()) {
    const std::string& name = normalizer.first;
    const float divisor = normalizer.second;

    float value = 0.0f;
    if (!GetFeatureValueAsFloat(name, *example, &value)) {
      status |= kNonNormalizableFeatureType;
      continue;
    }

    // Never divide by zero; report it and fall through to the clamp below.
    if (divisor == 0.0f) {
      status |= kNormalizerIsZero;
    } else {
      value = value / divisor;
    }

    // Truncate to be within [-1.0, 1.0].
    value = base::ClampToRange(value, -1.0f, 1.0f);
    (*example->mutable_features())[name].set_float_value(value);
  }
  return status;
}
// Rewrites, in place, each feature of |example| that is listed in
// |config_.convert_to_string_features()| so that it holds a string_value:
//   - bool_value  -> base::IntToString of the bool cast to int.
//   - int32_value -> base::IntToString of the value.
//   - string_value is left as is.
// Listed features that are not present in |example| are skipped silently.
// Any other feature type is logged, left unchanged, and reported by OR-ing
// kNonConvertibleToStringFeatureType into the returned error code.
int ExamplePreprocessor::ConvertToStringFeatures(RankerExample* example) const {
  int status = kSuccess;
  auto* const features = example->mutable_features();
  for (const std::string& name : config_.convert_to_string_features()) {
    const auto it = features->find(name);
    if (it == features->end())
      continue;

    auto& feature = it->second;
    const auto type = feature.feature_type_case();

    // Already a string feature; nothing to do.
    if (type == Feature::kStringValue)
      continue;

    if (type == Feature::kBoolValue) {
      feature.set_string_value(
          base::IntToString(static_cast<int>(feature.bool_value())));
      continue;
    }

    if (type == Feature::kInt32Value) {
      feature.set_string_value(base::IntToString(feature.int32_value()));
      continue;
    }

    LOG(WARNING) << "Can't convert to string feature type: " << type;
    status |= kNonConvertibleToStringFeatureType;
  }
  return status;
}
int ExamplePreprocessor::Vectorization(RankerExample* example, int ExamplePreprocessor::Vectorization(RankerExample* example,
const bool clear_other_features) const { const bool clear_other_features) const {
if (config_.feature_indices().empty()) { if (config_.feature_indices().empty()) {
......
...@@ -19,9 +19,12 @@ class ExamplePreprocessor { ...@@ -19,9 +19,12 @@ class ExamplePreprocessor {
enum PreprocessErrorCode { enum PreprocessErrorCode {
kSuccess = 0, kSuccess = 0,
kNoFeatureIndexFound = 1, kNoFeatureIndexFound = 1,
kUnbucketizableFeatureType = 2, kNonbucketizableFeatureType = 2,
kInvalidFeatureType = 4, kInvalidFeatureType = 4,
kInvalidFeatureListIndex = 8, kInvalidFeatureListIndex = 8,
kNonNormalizableFeatureType = 16,
kNonConvertibleToStringFeatureType = 32,
kNormalizerIsZero = 64,
}; };
explicit ExamplePreprocessor(const ExamplePreprocessorConfig& config) explicit ExamplePreprocessor(const ExamplePreprocessorConfig& config)
...@@ -58,6 +61,11 @@ class ExamplePreprocessor { ...@@ -58,6 +61,11 @@ class ExamplePreprocessor {
// bucketized based on the boundaries and reset as a one-hot feature with // bucketized based on the boundaries and reset as a one-hot feature with
// bucket index as it's string value. // bucket index as it's string value.
int AddBucketizedFeatures(RankerExample* example) const; int AddBucketizedFeatures(RankerExample* example) const;
// Normalizes numeric features to be within [-1.0, 1.0] as float features.
int NormalizeFeatures(RankerExample* example) const;
// Converts any features in |example| that are listed in
// |config_.convert_to_string_features()| into string-valued features.
int ConvertToStringFeatures(RankerExample* example) const;
// Add a new_float_list feature as kVectorizedFeatureDefaultName, and iterate // Add a new_float_list feature as kVectorizedFeatureDefaultName, and iterate
// for all existing features in example.features(), set corresponding // for all existing features in example.features(), set corresponding
// new_float_list.float_value(config_.feature_indices(feature_value_key)) to // new_float_list.float_value(config_.feature_indices(feature_value_key)) to
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#include "components/assist_ranker/example_preprocessing.h" #include "components/assist_ranker/example_preprocessing.h"
#include "base/strings/string_number_conversions.h"
#include "testing/gtest/include/gtest/gtest.h" #include "testing/gtest/include/gtest/gtest.h"
#include "third_party/protobuf/src/google/protobuf/map.h" #include "third_party/protobuf/src/google/protobuf/map.h"
#include "third_party/protobuf/src/google/protobuf/repeated_field.h" #include "third_party/protobuf/src/google/protobuf/repeated_field.h"
...@@ -64,14 +65,14 @@ class ExamplePreprocessorTest : public ::testing::Test { ...@@ -64,14 +65,14 @@ class ExamplePreprocessorTest : public ::testing::Test {
}; };
TEST_F(ExamplePreprocessorTest, AddMissingFeatures) { TEST_F(ExamplePreprocessorTest, AddMissingFeatures) {
RankerExample example = example_; RankerExample expected = example_;
ExamplePreprocessorConfig config; ExamplePreprocessorConfig config;
// Adding missing feature label to an existing feature has no effect. // Adding missing feature label to an existing feature has no effect.
config.add_missing_features(bool_name_); config.add_missing_features(bool_name_);
EXPECT_EQ(ExamplePreprocessor(config).Process(&example_), EXPECT_EQ(ExamplePreprocessor(config).Process(&example_),
ExamplePreprocessor::kSuccess); ExamplePreprocessor::kSuccess);
EXPECT_EQUALS_EXAMPLE(example, example_); EXPECT_EQUALS_EXAMPLE(example_, expected);
config.Clear(); config.Clear();
// Adding missing feature label to non-existing feature returns a // Adding missing feature label to non-existing feature returns a
...@@ -80,15 +81,16 @@ TEST_F(ExamplePreprocessorTest, AddMissingFeatures) { ...@@ -80,15 +81,16 @@ TEST_F(ExamplePreprocessorTest, AddMissingFeatures) {
config.add_missing_features(foo); config.add_missing_features(foo);
EXPECT_EQ(ExamplePreprocessor(config).Process(&example_), EXPECT_EQ(ExamplePreprocessor(config).Process(&example_),
ExamplePreprocessor::kSuccess); ExamplePreprocessor::kSuccess);
(*example.mutable_features())[ExamplePreprocessor::kMissingFeatureDefaultName] (*expected
.mutable_features())[ExamplePreprocessor::kMissingFeatureDefaultName]
.mutable_string_list() .mutable_string_list()
->add_string_value(foo); ->add_string_value(foo);
EXPECT_EQUALS_EXAMPLE(example, example_); EXPECT_EQUALS_EXAMPLE(example_, expected);
config.Clear(); config.Clear();
} }
TEST_F(ExamplePreprocessorTest, AddBucketizeFeatures) { TEST_F(ExamplePreprocessorTest, AddBucketizeFeatures) {
RankerExample example = example_; RankerExample expected = example_;
ExamplePreprocessorConfig config; ExamplePreprocessorConfig config;
Map<std::string, ExamplePreprocessorConfig::Boundaries>& bucketizers = Map<std::string, ExamplePreprocessorConfig::Boundaries>& bucketizers =
*config.mutable_bucketizers(); *config.mutable_bucketizers();
...@@ -98,21 +100,21 @@ TEST_F(ExamplePreprocessorTest, AddBucketizeFeatures) { ...@@ -98,21 +100,21 @@ TEST_F(ExamplePreprocessorTest, AddBucketizeFeatures) {
bucketizers[foo].add_boundaries(0.5); bucketizers[foo].add_boundaries(0.5);
EXPECT_EQ(ExamplePreprocessor(config).Process(&example_), EXPECT_EQ(ExamplePreprocessor(config).Process(&example_),
ExamplePreprocessor::kSuccess); ExamplePreprocessor::kSuccess);
EXPECT_EQUALS_EXAMPLE(example, example_); EXPECT_EQUALS_EXAMPLE(example_, expected);
config.Clear(); config.Clear();
// Bucketizing a bool feature returns same proto. // Bucketizing a bool feature returns same proto.
bucketizers[bool_name_].add_boundaries(0.5); bucketizers[bool_name_].add_boundaries(0.5);
EXPECT_EQ(ExamplePreprocessor(config).Process(&example_), EXPECT_EQ(ExamplePreprocessor(config).Process(&example_),
ExamplePreprocessor::kUnbucketizableFeatureType); ExamplePreprocessor::kNonbucketizableFeatureType);
EXPECT_EQUALS_EXAMPLE(example, example_); EXPECT_EQUALS_EXAMPLE(example_, expected);
config.Clear(); config.Clear();
// Bucketizing a string feature returns same proto. // Bucketizing a string feature returns same proto.
bucketizers[one_hot_name_].add_boundaries(0.5); bucketizers[one_hot_name_].add_boundaries(0.5);
EXPECT_EQ(ExamplePreprocessor(config).Process(&example_), EXPECT_EQ(ExamplePreprocessor(config).Process(&example_),
ExamplePreprocessor::kUnbucketizableFeatureType); ExamplePreprocessor::kNonbucketizableFeatureType);
EXPECT_EQUALS_EXAMPLE(example, example_); EXPECT_EQUALS_EXAMPLE(example_, expected);
config.Clear(); config.Clear();
// Bucketizing an int32 feature with 3 boundary. // Bucketizing an int32 feature with 3 boundary.
...@@ -121,8 +123,8 @@ TEST_F(ExamplePreprocessorTest, AddBucketizeFeatures) { ...@@ -121,8 +123,8 @@ TEST_F(ExamplePreprocessorTest, AddBucketizeFeatures) {
bucketizers[int32_name_].add_boundaries(int32_value_ + 1); bucketizers[int32_name_].add_boundaries(int32_value_ + 1);
EXPECT_EQ(ExamplePreprocessor(config).Process(&example_), EXPECT_EQ(ExamplePreprocessor(config).Process(&example_),
ExamplePreprocessor::kSuccess); ExamplePreprocessor::kSuccess);
(*example.mutable_features())[int32_name_].set_string_value("2"); (*expected.mutable_features())[int32_name_].set_string_value("2");
EXPECT_EQUALS_EXAMPLE(example, example_); EXPECT_EQUALS_EXAMPLE(example_, expected);
config.Clear(); config.Clear();
// Bucketizing a float feature with 3 boundary. // Bucketizing a float feature with 3 boundary.
...@@ -131,8 +133,8 @@ TEST_F(ExamplePreprocessorTest, AddBucketizeFeatures) { ...@@ -131,8 +133,8 @@ TEST_F(ExamplePreprocessorTest, AddBucketizeFeatures) {
bucketizers[float_name_].add_boundaries(float_value_ + 0.1); bucketizers[float_name_].add_boundaries(float_value_ + 0.1);
EXPECT_EQ(ExamplePreprocessor(config).Process(&example_), EXPECT_EQ(ExamplePreprocessor(config).Process(&example_),
ExamplePreprocessor::kSuccess); ExamplePreprocessor::kSuccess);
(*example.mutable_features())[float_name_].set_string_value("2"); (*expected.mutable_features())[float_name_].set_string_value("2");
EXPECT_EQUALS_EXAMPLE(example, example_); EXPECT_EQUALS_EXAMPLE(example_, expected);
config.Clear(); config.Clear();
// Bucketizing a float feature with value equal to a boundary. // Bucketizing a float feature with value equal to a boundary.
...@@ -143,11 +145,71 @@ TEST_F(ExamplePreprocessorTest, AddBucketizeFeatures) { ...@@ -143,11 +145,71 @@ TEST_F(ExamplePreprocessorTest, AddBucketizeFeatures) {
bucketizers[float_name_].add_boundaries(float_value_ + 0.1); bucketizers[float_name_].add_boundaries(float_value_ + 0.1);
EXPECT_EQ(ExamplePreprocessor(config).Process(&example_), EXPECT_EQ(ExamplePreprocessor(config).Process(&example_),
ExamplePreprocessor::kSuccess); ExamplePreprocessor::kSuccess);
(*example.mutable_features())[float_name_].set_string_value("3"); (*expected.mutable_features())[float_name_].set_string_value("3");
EXPECT_EQUALS_EXAMPLE(example, example_); EXPECT_EQUALS_EXAMPLE(example_, expected);
config.Clear(); config.Clear();
} }
// Checks that int32 and float features are divided by their normalizers,
// clamped to [-1.0, 1.0] and stored as float features, and that a zero
// normalizer is reported through the returned error code.
TEST_F(ExamplePreprocessorTest, NormalizeFeatures) {
  ExamplePreprocessorConfig config;
  Map<std::string, float>* const normalizer_map = config.mutable_normalizers();
  (*normalizer_map)[int32_name_] = int32_value_ - 1.0f;
  (*normalizer_map)[float_name_] = float_value_ + 1.0f;

  RankerExample expected = example_;
  auto& expected_features = *expected.mutable_features();
  // The int32 feature is expected to hit the upper clamp bound of 1.0
  // (presumably because int32_value_ / (int32_value_ - 1) exceeds 1 for the
  // fixture's value -- confirm against the fixture).
  expected_features[int32_name_].set_float_value(1.0f);
  expected_features[float_name_].set_float_value(float_value_ /
                                                 (float_value_ + 1.0f));

  EXPECT_EQ(ExamplePreprocessor(config).Process(&example_),
            ExamplePreprocessor::kSuccess);
  EXPECT_EQUALS_EXAMPLE(example_, expected);

  // Zero normalizer returns an error.
  (*normalizer_map)[float_name_] = 0.0f;
  EXPECT_EQ(ExamplePreprocessor(config).Process(&example_),
            ExamplePreprocessor::kNormalizerIsZero);
}
// A normalizer of zero cannot be divided by; Process() must report it as
// kNormalizerIsZero.
TEST_F(ExamplePreprocessorTest, ZeroNormalizerReturnsError) {
  ExamplePreprocessorConfig config;
  (*config.mutable_normalizers())[float_name_] = 0.0f;
  EXPECT_EQ(ExamplePreprocessor(config).Process(&example_),
            ExamplePreprocessor::kNormalizerIsZero);
}
// Tests that bool and int32 features are converted to string features, and
// that listing a feature that is already a string is accepted without error.
TEST_F(ExamplePreprocessorTest, ConvertToStringFeatures) {
  ExamplePreprocessorConfig config;
  config.add_convert_to_string_features(bool_name_);
  config.add_convert_to_string_features(int32_name_);
  config.add_convert_to_string_features(one_hot_name_);

  // Only the bool and int32 features are expected to change.
  RankerExample expected = example_;
  auto& expected_features = *expected.mutable_features();
  expected_features[bool_name_].set_string_value(
      base::IntToString(static_cast<int>(bool_value_)));
  expected_features[int32_name_].set_string_value(
      base::IntToString(int32_value_));

  EXPECT_EQ(ExamplePreprocessor(config).Process(&example_),
            ExamplePreprocessor::kSuccess);
  EXPECT_EQUALS_EXAMPLE(example_, expected);
}
// Float features can't be converted to string features: Process() reports
// kNonConvertibleToStringFeatureType and leaves the example unmodified.
TEST_F(ExamplePreprocessorTest,
       ConvertFloatFeatureToStringFeatureReturnsError) {
  const RankerExample expected = example_;
  ExamplePreprocessorConfig config;
  config.add_convert_to_string_features(float_name_);
  EXPECT_EQ(ExamplePreprocessor(config).Process(&example_),
            ExamplePreprocessor::kNonConvertibleToStringFeatureType);
  // The failed conversion must not have touched the float feature.
  EXPECT_EQUALS_EXAMPLE(example_, expected);
}
TEST_F(ExamplePreprocessorTest, Vectorization) { TEST_F(ExamplePreprocessorTest, Vectorization) {
ExamplePreprocessorConfig config; ExamplePreprocessorConfig config;
Map<std::string, int32_t>& feature_indices = Map<std::string, int32_t>& feature_indices =
...@@ -243,7 +305,7 @@ TEST_F(ExamplePreprocessorTest, MultipleErrorCode) { ...@@ -243,7 +305,7 @@ TEST_F(ExamplePreprocessorTest, MultipleErrorCode) {
// Error code contains features in example_ but not in feature_indices. // Error code contains features in example_ but not in feature_indices.
EXPECT_TRUE(error_code & ExamplePreprocessor::kNoFeatureIndexFound); EXPECT_TRUE(error_code & ExamplePreprocessor::kNoFeatureIndexFound);
// Error code contains features that are not bucketizable. // Error code contains features that are not bucketizable.
EXPECT_TRUE(error_code & ExamplePreprocessor::kUnbucketizableFeatureType); EXPECT_TRUE(error_code & ExamplePreprocessor::kNonbucketizableFeatureType);
// No kInvalidFeatureType error. // No kInvalidFeatureType error.
EXPECT_FALSE(error_code & ExamplePreprocessor::kInvalidFeatureType); EXPECT_FALSE(error_code & ExamplePreprocessor::kInvalidFeatureType);
// Only two elements is correctly vectorized. // Only two elements is correctly vectorized.
......
...@@ -23,4 +23,15 @@ message ExamplePreprocessorConfig { ...@@ -23,4 +23,15 @@ message ExamplePreprocessorConfig {
// (2) a combination of feature_name and feature_value if it's string_value // (2) a combination of feature_name and feature_value if it's string_value
// or i-th element of a string_list. // or i-th element of a string_list.
map<string, int32> feature_indices = 3; map<string, int32> feature_indices = 3;
// A map from feature name to a float value to normalize the original value.
// The new feature value is set as float_value of
// GetFeatureValueAsFloat(feature) / normalizers[feature_name]
// The specified feature must be convertible to float (e.g. int32 or float).
map<string, float> normalizers = 4;
// Features inside this list will be converted to string_value. bool_value,
// int32_value will be converted by base::IntToString; string_value will be
// kept as original; fails for other feature_types.
repeated string convert_to_string_features = 5;
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment