Commit 0de27079 authored by jkrcal's avatar jkrcal Committed by Commit bot

[LanguageModel] Return top languages only with a reasonable sample set

Before, the LanguageModel returned top languages even if it had only one
data point. Thus the frequencies of top languages could change
drastically in the warm-up phase of the model.

This CL adds a minimum size for the sample set. The model returns an empty
list of top languages until this minimum size is reached.

BUG=653058

Review-Url: https://codereview.chromium.org/2391383005
Cr-Commit-Position: refs/heads/master@{#423849}
parent ad301d75
......@@ -19,6 +19,7 @@ namespace {
const char kLanguageModelCounters[] = "language_model_counters";
const int kMaxCountersSum = 1000;
const int kMinCountersSum = 100;
const float kCutoffRatio = 0.005f;
const float kDiscountFactor = 0.75f;
......@@ -60,9 +61,13 @@ void DiscountAndCleanCounters(base::DictionaryValue* dict) {
std::vector<LanguageModel::LanguageInfo> GetAllLanguages(
const base::DictionaryValue& dict) {
std::vector<LanguageModel::LanguageInfo> top_languages;
int counters_sum = GetCountersSum(dict);
// If the sample is not large enough yet, pretend there are no top languages.
if (counters_sum < kMinCountersSum)
return std::vector<LanguageModel::LanguageInfo>();
std::vector<LanguageModel::LanguageInfo> top_languages;
int counter_value = 0;
for (base::DictionaryValue::Iterator itr(dict); !itr.IsAtEnd();
itr.Advance()) {
......@@ -103,12 +108,15 @@ float LanguageModel::GetLanguageFrequency(
const std::string& language_code) const {
const base::DictionaryValue* dict =
pref_service_->GetDictionary(kLanguageModelCounters);
int counters_sum = GetCountersSum(*dict);
// If the sample is not large enough yet, report a frequency of 0.
if (counters_sum < kMinCountersSum)
return 0;
int counter_value = 0;
// If the key |language_code| does not exist, |counter_value| stays 0.
dict->GetInteger(language_code, &counter_value);
int counters_sum = GetCountersSum(*dict);
return static_cast<float>(counter_value) / counters_sum;
}
......
......@@ -30,7 +30,7 @@ class LanguageModel : public KeyedService {
// The current estimated frequency of the language share, a number between 0
// and 1 (can be understood as the probability that the next page the user
// opens is in this language). Frequencies over all LanguageInfos from
// GetTopLanguages() sum to 1.
// GetTopLanguages() sum to 1 (unless there are no top languages yet).
float frequency;
bool operator==(const LanguageInfo& m) const {
......@@ -45,7 +45,8 @@ class LanguageModel : public KeyedService {
static void RegisterProfilePrefs(PrefRegistrySimple* registry);
// Returns a list of the languages currently tracked by the model, sorted by
// frequency in decreasing order.
// frequency in decreasing order. The list is empty if the model does not yet
// have enough data points.
std::vector<LanguageInfo> GetTopLanguages() const;
// Returns the estimated frequency for the given language or 0 if the language
......
......@@ -27,10 +27,12 @@ TEST(LanguageModelTest, ListSorted) {
LanguageModel::RegisterProfilePrefs(prefs.registry());
LanguageModel model(&prefs);
model.OnPageVisited(kLang1);
model.OnPageVisited(kLang1);
model.OnPageVisited(kLang1);
model.OnPageVisited(kLang2);
for (int i = 0; i < 50; i++) {
model.OnPageVisited(kLang1);
model.OnPageVisited(kLang1);
model.OnPageVisited(kLang1);
model.OnPageVisited(kLang2);
}
EXPECT_THAT(model.GetTopLanguages(),
ElementsAre(LanguageModel::LanguageInfo{kLang1},
......@@ -42,10 +44,12 @@ TEST(LanguageModelTest, ListSortedReversed) {
LanguageModel::RegisterProfilePrefs(prefs.registry());
LanguageModel model(&prefs);
model.OnPageVisited(kLang2);
model.OnPageVisited(kLang1);
model.OnPageVisited(kLang1);
model.OnPageVisited(kLang1);
for (int i = 0; i < 50; i++) {
model.OnPageVisited(kLang2);
model.OnPageVisited(kLang1);
model.OnPageVisited(kLang1);
model.OnPageVisited(kLang1);
}
EXPECT_THAT(model.GetTopLanguages(),
ElementsAre(LanguageModel::LanguageInfo{kLang1},
......@@ -57,10 +61,12 @@ TEST(LanguageModelTest, RightFrequencies) {
LanguageModel::RegisterProfilePrefs(prefs.registry());
LanguageModel model(&prefs);
model.OnPageVisited(kLang1);
model.OnPageVisited(kLang1);
model.OnPageVisited(kLang1);
model.OnPageVisited(kLang2);
for (int i = 0; i < 50; i++) {
model.OnPageVisited(kLang1);
model.OnPageVisited(kLang1);
model.OnPageVisited(kLang1);
model.OnPageVisited(kLang2);
}
// Corresponding frequencies are given by the model.
EXPECT_THAT(model.GetLanguageFrequency(kLang1), FloatEq(0.75f));
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment