Commit 55704e79 authored by Charles Zhao's avatar Charles Zhao Committed by Commit Bot

Add example preprocessor for assist_ranker.

(1) An ExamplePreprocessorConfig proto is added.

(2) An ExamplePreprocessor class is added that is built from a config proto
    and does preprocessing, including missing-feature handling and
    bucketization.

(3) An ExampleFloatIterator is also added to iterate through all
    features of a RankerExample.

(4) Add a unittest.

Change-Id: I0de0f2fa6d6e9b7ae3b873d8b7b00009cd04ec80
Reviewed-on: https://chromium-review.googlesource.com/872831
Reviewed-by: Philippe Hamel <hamelphi@chromium.org>
Reviewed-by: Peter Kasting <pkasting@chromium.org>
Commit-Queue: Charles . <charleszhao@chromium.org>
Cr-Commit-Position: refs/heads/master@{#532089}
parent 6f59c7dc
......@@ -11,6 +11,8 @@ static_library("assist_ranker") {
"base_predictor.h",
"binary_classifier_predictor.cc",
"binary_classifier_predictor.h",
"example_preprocessing.cc",
"example_preprocessing.h",
"fake_ranker_model_loader.cc",
"fake_ranker_model_loader.h",
"generic_logistic_regression_inference.cc",
......@@ -47,6 +49,7 @@ source_set("unit_tests") {
sources = [
"base_predictor_unittest.cc",
"binary_classifier_predictor_unittest.cc",
"example_preprocessing_unittest.cc",
"generic_logistic_regression_inference_unittest.cc",
"ranker_example_util_unittest.cc",
"ranker_model_loader_impl_unittest.cc",
......
......@@ -5,4 +5,5 @@ include_rules = [
"+components/ukm",
"+net",
"+services/metrics/public",
]
\ No newline at end of file
"+third_party/protobuf",
]
// Copyright 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/assist_ranker/example_preprocessing.h"
#include "base/strings/strcat.h"
#include "base/strings/string_number_conversions.h"
#include "components/assist_ranker/ranker_example_util.h"
#include "third_party/protobuf/src/google/protobuf/map.h"
#include "third_party/protobuf/src/google/protobuf/repeated_field.h"
namespace assist_ranker {
using google::protobuf::Map;
using google::protobuf::MapPair;
using google::protobuf::RepeatedField;
// Out-of-class definitions of the reserved feature names declared as static
// members of ExamplePreprocessor:
//  - kMissingFeatureDefaultName: key of the string-list feature that collects
//    the names of configured features that are absent from an example.
//  - kVectorizedFeatureDefaultName: key of the float-list feature that holds
//    the vectorized form of an example.
const char ExamplePreprocessor::kMissingFeatureDefaultName[] =
"_MissingFeature";
const char ExamplePreprocessor::kVectorizedFeatureDefaultName[] =
"_VectorizedFeature";
// Builds the fullname under which a feature is looked up in the index map.
// Returns |feature_name| unchanged when |feature_value| is empty; otherwise
// returns "<feature_name>_<feature_value>".
std::string ExamplePreprocessor::FeatureFullname(
    const std::string& feature_name,
    const std::string& feature_value) {
  if (!feature_value.empty()) {
    return base::StrCat({feature_name, "_", feature_value});
  }
  return feature_name;
}
// Runs the whole preprocessing pipeline on |example|, strictly in this order:
//   1. AddMissingFeatures    - records configured features that are absent.
//   2. AddBucketizedFeatures - replaces numeric features with bucket indices.
//   3. Vectorization         - writes the float-list feature built from the
//                              (already rewritten) features.
// |clear_other_features| is forwarded to Vectorization().
// Returns the bitwise OR of the PreprocessErrorCode values reported by the
// three stages (kSuccess if none reported an error).
int ExamplePreprocessor::Process(RankerExample* const example,
                                 const bool clear_other_features) const {
  // The stages are deliberately separate statements: the evaluation order of
  // the operands of `|` is unspecified, so chaining the calls in a single
  // expression would allow Vectorization() to run before the earlier stages
  // have rewritten the example.
  int error_code = AddMissingFeatures(example);
  error_code |= AddBucketizedFeatures(example);
  error_code |= Vectorization(example, clear_other_features);
  return error_code;
}
// For every feature name listed in config_.missing_features() that is not a
// key of |example|'s feature map, appends that name to the string list stored
// under kMissingFeatureDefaultName. Always returns kSuccess.
int ExamplePreprocessor::AddMissingFeatures(
    RankerExample* const example) const {
  Map<std::string, Feature>& features = *example->mutable_features();
  for (const std::string& expected_name : config_.missing_features()) {
    const bool is_present = features.find(expected_name) != features.end();
    if (is_present) {
      continue;
    }
    // The marker entry is only created once a feature is actually missing,
    // so examples with nothing missing are left untouched.
    Feature& missing_marker = features[kMissingFeatureDefaultName];
    missing_marker.mutable_string_list()->add_string_value(expected_name);
  }
  return kSuccess;
}
// Replaces each feature named in config_.bucketizers() by a string feature
// holding its bucket index. The bucket index is the first position i for which
// value < boundaries[i], or boundaries.size() when no boundary is greater.
// Features absent from |example| are skipped silently. Features that are
// neither int32 nor float are left unchanged and reported through
// kUnbucketizableFeatureType in the returned error code.
int ExamplePreprocessor::AddBucketizedFeatures(
    RankerExample* const example) const {
  Map<std::string, Feature>& features = *example->mutable_features();
  int status = kSuccess;

  for (const auto& entry : config_.bucketizers()) {
    const std::string& name = entry.first;

    // A feature that is not present is not an error here; whether and how it
    // is handled later as a missing one-hot feature is up to the user.
    Feature current;
    if (!SafeGetFeature(name, *example, &current)) {
      continue;
    }

    // Only int32 and float features can be converted to a float for
    // bucketization.
    const auto type = current.feature_type_case();
    float numeric_value = 0;
    if (type == Feature::kInt32Value) {
      numeric_value = static_cast<float>(current.int32_value());
    } else if (type == Feature::kFloatValue) {
      numeric_value = current.float_value();
    } else {
      DVLOG(2) << "Can't bucketize feature type: " << type;
      status |= kUnbucketizableFeatureType;
      continue;
    }

    // Walk the boundaries until the first one that is strictly greater than
    // the value.
    const RepeatedField<float>& bounds = entry.second.boundaries();
    int bucket = 0;
    while (bucket < bounds.size() && !(numeric_value < bounds[bucket])) {
      ++bucket;
    }

    // Store the one-hot feature as features[name] = "<bucket>".
    features[name].set_string_value(base::IntToString(bucket));
  }
  return status;
}
// Builds a float-list feature of size config_.feature_indices().size() and
// stores it in |example| under kVectorizedFeatureDefaultName. Every field
// produced by ExampleFloatIterator whose fullname has an index in
// config_.feature_indices() writes its float value at that index; all other
// slots stay 0.
//
// If |clear_other_features| is true, every other feature of |example| is
// removed before the vectorized feature is stored.
//
// Returns the bitwise OR of:
//   - the error of every field the iterator could not convert;
//   - kNoFeatureIndexFound if a field's fullname has no index in the config,
//     or if the configured index lies outside [0, feature_indices().size()).
// If config_.feature_indices() is empty nothing is added and kSuccess is
// returned.
int ExamplePreprocessor::Vectorization(RankerExample* example,
                                       const bool clear_other_features) const {
  const auto& feature_indices = config_.feature_indices();
  if (feature_indices.empty()) {
    DVLOG(2) << "Feature indices are empty, can't vectorize.";
    return kSuccess;
  }

  const int vector_size = static_cast<int>(feature_indices.size());
  Feature vectorized_features;
  auto* const float_list = vectorized_features.mutable_float_list();
  float_list->mutable_float_value()->Resize(vector_size, 0.0);

  int error_code = kSuccess;
  for (const auto& field : ExampleFloatIterator(*example)) {
    error_code |= field.error;
    if (field.error != kSuccess) {
      continue;
    }

    const auto find_index = feature_indices.find(field.fullname);
    if (find_index == feature_indices.end()) {
      DVLOG(2) << "Feature has no index: " << field.fullname;
      error_code |= kNoFeatureIndexFound;
      continue;
    }

    // The indices come from the config and are not guaranteed to be a dense
    // 0..size-1 range; writing through an out-of-range index would touch
    // memory outside the float list, so such entries are rejected.
    const int index = find_index->second;
    if (index < 0 || index >= vector_size) {
      DVLOG(2) << "Feature index out of range: " << field.fullname;
      error_code |= kNoFeatureIndexFound;
      continue;
    }

    float_list->set_float_value(index, field.value);
  }

  if (clear_other_features) {
    example->clear_features();
  }
  (*example->mutable_features())[kVectorizedFeatureDefaultName] =
      vectorized_features;
  return error_code;
}
// Converts the feature (or string-list element) the iterator currently points
// at into a Field:
//   - bool / int32 / float: {feature_name, value cast to float, kSuccess};
//   - string:               {FeatureFullname(name, string_value), 1.0, kSuccess};
//   - string list:          {FeatureFullname(name, list[string_list_index_]),
//                            1.0, kSuccess}, or error kInvalidFeatureListIndex
//                           if the index is past the end of the list;
//   - anything else:        error kInvalidFeatureType.
// On error, fullname is the plain feature name and value is 1.0.
ExampleFloatIterator::Field ExampleFloatIterator::operator*() const {
  const std::string& name = feature_iterator_->first;
  const Feature& current = feature_iterator_->second;

  // Defaults match the string cases: value 1.0 and no error.
  Field result = {name, 1.0f, ExamplePreprocessor::kSuccess};

  switch (current.feature_type_case()) {
    case Feature::kBoolValue:
      result.value = static_cast<float>(current.bool_value());
      break;
    case Feature::kInt32Value:
      result.value = static_cast<float>(current.int32_value());
      break;
    case Feature::kFloatValue:
      result.value = current.float_value();
      break;
    case Feature::kStringValue:
      result.fullname =
          ExamplePreprocessor::FeatureFullname(name, current.string_value());
      break;
    case Feature::kStringList: {
      const auto& strings = current.string_list();
      if (string_list_index_ >= strings.string_value_size()) {
        // Reached when a string list feature was added without any value.
        result.error = ExamplePreprocessor::kInvalidFeatureListIndex;
        break;
      }
      result.fullname = ExamplePreprocessor::FeatureFullname(
          name, strings.string_value(string_list_index_));
      break;
    }
    default:
      DVLOG(2) << "Feature type not supported: "
               << current.feature_type_case();
      result.error = ExamplePreprocessor::kInvalidFeatureType;
      break;
  }
  return result;
}
// Advances to the next field. Inside a string list this steps to the next
// element of the list; after the last element (or for an empty list) it resets
// the list index and moves on to the next feature. Every other feature type
// moves straight to the next feature; unsupported types are logged.
ExampleFloatIterator& ExampleFloatIterator::operator++() {
  const Feature& current = feature_iterator_->second;
  const auto type = current.feature_type_case();

  if (type == Feature::kStringList) {
    const int last_index = current.string_list().string_value_size() - 1;
    if (string_list_index_ < last_index) {
      // More elements remain in this list: stay on the same feature.
      ++string_list_index_;
      return *this;
    }
    // Last element consumed (or empty list): start afresh on the next feature.
    string_list_index_ = 0;
    ++feature_iterator_;
    return *this;
  }

  const bool is_supported_scalar =
      type == Feature::kBoolValue || type == Feature::kInt32Value ||
      type == Feature::kFloatValue || type == Feature::kStringValue;
  if (!is_supported_scalar) {
    DVLOG(2) << "Feature type not supported: " << type;
  }
  ++feature_iterator_;
  return *this;
}
} // namespace assist_ranker
// Copyright 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef COMPONENTS_ASSIST_RANKER_EXAMPLE_PREPROCESSING_H_
#define COMPONENTS_ASSIST_RANKER_EXAMPLE_PREPROCESSING_H_
#include "components/assist_ranker/proto/example_preprocessor.pb.h"
#include "components/assist_ranker/proto/ranker_example.pb.h"
#include "third_party/protobuf/src/google/protobuf/map.h"
namespace assist_ranker {
// Preprocessor that converts a RankerExample into the formats needed by
// Ranker Predictors. It is constructed from an ExamplePreprocessorConfig and
// keeps its own copy of that config.
class ExamplePreprocessor {
public:
// Error codes for preprocessing. Each value is a distinct bit so that several
// errors can be combined with bitwise OR into a single int.
enum PreprocessErrorCode {
// No error.
kSuccess = 0,
// A feature fullname has no entry in config_.feature_indices().
kNoFeatureIndexFound = 1,
// A feature configured for bucketization is neither int32 nor float.
kUnbucketizableFeatureType = 2,
// A feature has a type that ExampleFloatIterator cannot convert to a float.
kInvalidFeatureType = 4,
// A string list feature was dereferenced past its end (e.g. an empty list).
kInvalidFeatureListIndex = 8,
};
// Stores a copy of |config|; the preprocessor does not reference the
// argument afterwards.
explicit ExamplePreprocessor(const ExamplePreprocessorConfig& config)
: config_(config) {}
// Processes a RankerExample with config_: handles missing features,
// bucketizes, then vectorizes.
// Clears all features except kVectorizedFeatureDefaultName if
// clear_other_features is set to true.
// Returns the error code of preprocessing, which is the bitwise OR of any of
// the error codes in PreprocessErrorCode (kSuccess if there was no error).
int Process(RankerExample* example, bool clear_other_features = false) const;
// Default feature name for missing features.
static const char kMissingFeatureDefaultName[];
// Default feature name for vectorized features.
static const char kVectorizedFeatureDefaultName[];
// Generates a feature's fullname based on feature_name and feature_value.
// A feature fullname is defined as:
// (1) feature_name if it's bool_value, int32_value or float_value (or, more
// generally, whenever feature_value is empty).
// (2) "feature_name_feature_value" if it's string_value or the i-th element
// of a string_list.
static std::string FeatureFullname(const std::string& feature_name,
const std::string& feature_value = "");
private:
// If a feature is specified in config_.missing_features() and missing in
// the example, then the feature name is added as a sparse feature value to
// the special sparse feature "_MissingFeature" in the example.
// Always returns kSuccess.
int AddMissingFeatures(RankerExample* example) const;
// If a numeric (int32 or float) feature is specified in
// config_.bucketizers(), then it is bucketized based on the boundaries and
// reset as a one-hot feature with the bucket index as its string value. The
// bucket index is the first index i with value < boundaries[i], or the number
// of boundaries if there is none.
// Features missing from the example are skipped. Returns
// kUnbucketizableFeatureType if a configured feature has another type,
// kSuccess otherwise.
int AddBucketizedFeatures(RankerExample* example) const;
// Adds a new float_list feature as kVectorizedFeatureDefaultName, and iterates
// over all existing features in example.features(), setting the corresponding
// new_float_list.float_value(config_.feature_indices(feature_fullname)) to
// be either the numeric value (for scalars) or 1.0 (for string values).
// Does nothing and returns kSuccess if config_.feature_indices() is empty.
// If clear_other_features is true, all other features are removed from the
// example. Returns the bitwise OR of the errors encountered.
int Vectorization(RankerExample* example, bool clear_other_features) const;
// Configuration proto for the preprocessor.
const ExamplePreprocessorConfig config_;
};
// Iterator (usable directly in a range-based for loop) that walks over all
// features of a RankerExample and exposes each one as a
// Field{fullname, value, error}:
// (1) A numeric feature (bool_value, int32_value, float_value) becomes
//     {feature_name, float(original_value), kSuccess}.
// (2) A string feature becomes {feature_name_string_value, 1.0, kSuccess}.
// (3) Each string_value of a string list feature becomes
//     {feature_name_string_value, 1.0, kSuccess}; an empty list yields a
//     single field whose error is kInvalidFeatureListIndex.
// The iterator holds iterators into the example's feature map, so the example
// must stay alive and unmodified while iterating.
// Example:
//   std::vector<float> ExampleToStdFloat(const RankerExample& example,
//                                        const Map& feature_indices) {
//     std::vector<float> vectorized(feature_indices.size());
//     for (const auto& field : ExampleFloatIterator(example)) {
//       if (field.error != ExamplePreprocessor::kSuccess)
//         continue;
//       const auto found = feature_indices.find(field.fullname);
//       if (found != feature_indices.end())
//         vectorized[found->second] = field.value;
//     }
//     return vectorized;
//   }
class ExampleFloatIterator {
 public:
  // Float representation of one field of a RankerExample.
  struct Field {
    std::string fullname;
    float value;
    int error;
  };

  // Creates an iterator positioned on the first feature of |example|.
  explicit ExampleFloatIterator(const RankerExample& example)
      : feature_iterator_(example.features().begin()),
        feature_end_iterator_(example.features().end()) {}

  // Range interface: the object itself is the begin iterator, and end() is an
  // iterator positioned at the end of the same feature map.
  ExampleFloatIterator begin() const { return *this; }
  ExampleFloatIterator end() const {
    return ExampleFloatIterator(feature_end_iterator_);
  }

  Field operator*() const;
  ExampleFloatIterator& operator++();

  // Two iterators compare equal when they point at the same feature and, for
  // a string_list, at the same element of the list.
  bool operator==(const ExampleFloatIterator& other) const {
    if (string_list_index_ != other.string_list_index_) {
      return false;
    }
    return feature_iterator_ == other.feature_iterator_;
  }
  bool operator!=(const ExampleFloatIterator& other) const {
    return !operator==(other);
  }

 private:
  // Builds the end iterator: current position and end both equal
  // |feature_end_iterator|.
  explicit ExampleFloatIterator(
      const google::protobuf::Map<std::string, Feature>::const_iterator&
          feature_end_iterator)
      : feature_iterator_(feature_end_iterator),
        feature_end_iterator_(feature_end_iterator) {}

  // Feature currently pointed at.
  google::protobuf::Map<std::string, Feature>::const_iterator feature_iterator_;
  // One-past-the-last feature of the example.
  google::protobuf::Map<std::string, Feature>::const_iterator
      feature_end_iterator_;
  // Position inside the current feature when it is a string_list.
  int string_list_index_ = 0;
};
} // namespace assist_ranker
#endif // COMPONENTS_ASSIST_RANKER_EXAMPLE_PREPROCESSING_H_
This diff is collapsed.
......@@ -6,6 +6,7 @@ import("//third_party/protobuf/proto_library.gni")
proto_library("proto") {
sources = [
"example_preprocessor.proto",
"generic_logistic_regression_model.proto",
"ranker_example.proto",
"ranker_model.proto",
......
// Contains features required for Ranker model inference and training.
syntax = "proto2";
option optimize_for = LITE_RUNTIME;
package assist_ranker;
// Configuration for preprocessing a RankerExample: missing-feature handling,
// bucketization and vectorization.
message ExamplePreprocessorConfig {
// A list of feature names for which you want to do special handling if
// the feature is missing in the RankerExample.
repeated string missing_features = 1;
// Boundaries to bucketize a feature. A value falls in bucket i, where i is
// the first index such that value < boundaries[i] (or the number of
// boundaries if there is no such index).
message Boundaries { repeated float boundaries = 1; }
// A map of feature_name to boundaries for bucketizing this feature.
map<string, Boundaries> bucketizers = 2;
// A map from feature_fullnames to indices for vectorizing a RankerExample.
// A feature fullname is defined as:
// (1) feature_name if it's bool_value, int32_value or float_value.
// (2) a combination of feature_name and feature_value
// ("feature_name_feature_value") if it's string_value or the i-th element
// of a string_list.
map<string, int32> feature_indices = 3;
}
......@@ -14,6 +14,11 @@ message StringList {
repeated bytes string_value = 1;
}
// A repeated list of float values. Used as the float_list member of Feature
// to represent vectorial features.
message FloatList {
repeated float float_value = 1;
}
// Generic message that can contain a variety of data types.
message Feature {
oneof feature_type {
......@@ -26,6 +31,8 @@ message Feature {
bytes string_value = 4;
// String list are used for sparse features.
StringList string_list = 5;
// Float lists represent vectorial features.
FloatList float_list = 6;
}
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment