Commit 3c5d4238 authored by Jia, committed by Commit Bot

[cros search service] Implement Find in inverted index search.

Bug: 1090132
Change-Id: Ieed0da666c9d1baae3a16cf8bd667c83a196c4fa
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2272180
Reviewed-by: Thanh Nguyen <thanhdng@chromium.org>
Commit-Queue: Jia Meng <jiameng@chromium.org>
Cr-Commit-Position: refs/heads/master@{#783770}
parent a672d413
......@@ -6,11 +6,13 @@
#include <utility>
#include "base/i18n/rtl.h"
#include "base/optional.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "chrome/browser/chromeos/local_search_service/content_extraction_utils.h"
#include "chrome/browser/chromeos/local_search_service/inverted_index.h"
#include "chrome/common/string_matching/tokenized_string.h"
namespace local_search_service {
......@@ -68,11 +70,40 @@ uint32_t InvertedIndexSearch::Delete(const std::vector<std::string>& ids,
return num_deleted;
}
// Searches the inverted index for documents that approximately match |query|.
//
// |results| must be non-null; it is always cleared before anything else
// happens. Returns:
//   - kEmptyQuery if |query| is empty,
//   - kEmptyIndex if GetSize() is zero,
//   - kSuccess otherwise, including when no document matches.
// If |max_results| is non-zero, |results| is truncated to at most that many
// entries; a value of zero means the number of results is not limited.
ResponseStatus InvertedIndexSearch::Find(const base::string16& query,
                                         uint32_t max_results,
                                         std::vector<Result>* results) {
  DCHECK(results);
  results->clear();
  if (query.empty())
    return ResponseStatus::kEmptyQuery;

  if (GetSize() == 0u)
    return ResponseStatus::kEmptyIndex;

  // TODO(jiameng): actual input query may not be the same as default locale.
  // Need another way to determine actual language of the query.
  const TokenizedString::Mode mode =
      IsNonLatinLocale(base::i18n::GetConfiguredLocale())
          ? TokenizedString::Mode::kCamelCase
          : TokenizedString::Mode::kWords;

  const TokenizedString tokenized_query(query, mode);
  // Collect tokens into a set so that repeated query words are looked up once.
  std::unordered_set<base::string16> tokens;
  for (const auto& token : tokenized_query.tokens()) {
    // TODO(jiameng): we are not removing stopwords because they shouldn't
    // exist in the index. However, for performance reasons, it may be worth
    // removing them.
    tokens.insert(token);
  }

  // TODO(jiameng): allow thresholds to be passed in as search params.
  *results = inverted_index_->FindMatchingDocumentsApproximately(
      tokens, 0.1 /* prefix_threshold */, 0.6 /* block_threshold */);

  // |max_results| == 0 means "no cap", so only truncate for non-zero values.
  if (max_results > 0u && results->size() > max_results)
    results->resize(max_results);
  return ResponseStatus::kSuccess;
}
std::vector<std::pair<std::string, uint32_t>>
......
......@@ -150,4 +150,110 @@ TEST_F(InvertedIndexSearchTest, Delete) {
}
}
// Exercises InvertedIndexSearch::Find across its status codes (empty index,
// empty query, success), its scoring of one and two matching documents, and
// its handling of |max_results| (including 0, which means "no maximum").
TEST_F(InvertedIndexSearchTest, Find) {
  const std::map<std::string, std::vector<ContentWithId>> data_to_register = {
      {"id1",
       {{"cid_1", "This is a help wi-fi article"},
        {"cid_2", "Another help help wi-fi"}}},
      {"id2", {{"cid_3", "help article on wi-fi"}}}};
  const std::vector<Data> data = CreateTestData(data_to_register);

  // Nothing has been added to the index.
  std::vector<Result> results;
  EXPECT_EQ(
      search_.Find(base::UTF8ToUTF16("network"), /*max_results=*/10, &results),
      ResponseStatus::kEmptyIndex);
  EXPECT_TRUE(results.empty());

  // Data is added and then deleted from index, making the index empty.
  search_.AddOrUpdate(data);
  EXPECT_EQ(search_.GetSize(), 2u);
  EXPECT_EQ(search_.Delete({"id1", "id2"}), 2u);
  EXPECT_EQ(search_.GetSize(), 0u);
  EXPECT_EQ(
      search_.Find(base::UTF8ToUTF16("network"), /*max_results=*/10, &results),
      ResponseStatus::kEmptyIndex);
  EXPECT_TRUE(results.empty());

  // Index is populated again, but query is empty.
  search_.AddOrUpdate(data);
  EXPECT_EQ(search_.GetSize(), 2u);
  EXPECT_EQ(search_.Find(base::UTF8ToUTF16(""), /*max_results=*/10, &results),
            ResponseStatus::kEmptyQuery);
  EXPECT_TRUE(results.empty());

  // No document is found for a given query.
  EXPECT_EQ(search_.Find(base::UTF8ToUTF16("networkstuff"), /*max_results=*/10,
                         &results),
            ResponseStatus::kSuccess);
  EXPECT_TRUE(results.empty());

  {
    // A document is found.
    // Query's case is normalized.
    EXPECT_EQ(search_.Find(base::UTF8ToUTF16("ANOTHER networkstuff"),
                           /*max_results=*/10, &results),
              ResponseStatus::kSuccess);
    // Fatal assertion: |results[0]| is read below, so stop here on a mismatch
    // instead of indexing out of bounds.
    ASSERT_EQ(results.size(), 1u);

    // "another" only exists in "id1".
    const float expected_score = TfIdfScore(/*num_docs=*/2,
                                            /*num_docs_with_term=*/1,
                                            /*num_term_occurrence_in_doc=*/1,
                                            /*doc_length=*/7);
    CheckResult(results[0], "id1", expected_score,
                /*expected_number_positions=*/1);
  }

  {
    // Two documents are found.
    EXPECT_EQ(search_.Find(base::UTF8ToUTF16("another help"),
                           /*max_results=*/10, &results),
              ResponseStatus::kSuccess);
    // Fatal assertion: both |results[0]| and |results[1]| are read below.
    ASSERT_EQ(results.size(), 2u);

    // "id1" score comes from both "another" and "help".
    const float expected_score_id1 =
        TfIdfScore(/*num_docs=*/2,
                   /*num_docs_with_term=*/1,
                   /*num_term_occurrence_in_doc=*/1,
                   /*doc_length=*/7) +
        TfIdfScore(/*num_docs=*/2,
                   /*num_docs_with_term=*/2,
                   /*num_term_occurrence_in_doc=*/3,
                   /*doc_length=*/7);
    // "id2" score comes from "help".
    const float expected_score_id2 =
        TfIdfScore(/*num_docs=*/2,
                   /*num_docs_with_term=*/2,
                   /*num_term_occurrence_in_doc=*/1,
                   /*doc_length=*/3);

    // The checks below expect "id1" first, so confirm the expected scores
    // agree with that ordering.
    EXPECT_GE(expected_score_id1, expected_score_id2);
    CheckResult(results[0], "id1", expected_score_id1,
                /*expected_number_positions=*/4);
    CheckResult(results[1], "id2", expected_score_id2,
                /*expected_number_positions=*/1);
  }

  {
    // Same as above, but max number of results is set to 1.
    EXPECT_EQ(search_.Find(base::UTF8ToUTF16("another help"), /*max_results=*/1,
                           &results),
              ResponseStatus::kSuccess);
    // Fatal assertion: |results[0]| is read below.
    ASSERT_EQ(results.size(), 1u);
    EXPECT_EQ(results[0].id, "id1");
  }

  {
    // Same as above, but set max_results to 0, meaning no max.
    EXPECT_EQ(search_.Find(base::UTF8ToUTF16("another help"), /*max_results=*/0,
                           &results),
              ResponseStatus::kSuccess);
    EXPECT_EQ(results.size(), 2u);
  }
}
} // namespace local_search_service
......@@ -12,6 +12,7 @@
#include "base/strings/string16.h"
#include "base/strings/utf_string_conversions.h"
#include "chrome/browser/chromeos/local_search_service/shared_structs.h"
#include "chrome/browser/chromeos/local_search_service/test_utils.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"
......@@ -28,15 +29,6 @@ std::vector<float> GetScoresFromTfidfResult(
return scores;
}
// Expects that |result| has id |expected_id|, a score within 0.001 of
// |expected_score|, and exactly |expected_number_token_positions| entries in
// |result.positions|. All three checks are non-fatal gtest expectations, so a
// failing check does not stop the remaining ones from running.
void CheckResult(const Result& result,
const std::string& expected_id,
float expected_score,
size_t expected_number_token_positions) {
EXPECT_EQ(result.id, expected_id);
// |expected_score| is a float, so compare with an absolute tolerance rather
// than exact equality.
EXPECT_NEAR(result.score, expected_score, 0.001);
EXPECT_EQ(result.positions.size(), expected_number_token_positions);
}
} // namespace
class InvertedIndexTest : public ::testing::Test {
......
......@@ -35,4 +35,23 @@ std::vector<Data> CreateTestData(
return output;
}
// Expects that |result| has id |expected_id|, a score within 0.001 of
// |expected_score|, and exactly |expected_number_positions| entries in
// |result.positions|. All three checks are non-fatal gtest expectations, so a
// failing check does not stop the remaining ones from running.
void CheckResult(const Result& result,
const std::string& expected_id,
float expected_score,
size_t expected_number_positions) {
EXPECT_EQ(result.id, expected_id);
// |expected_score| is a float, so compare with an absolute tolerance rather
// than exact equality.
EXPECT_NEAR(result.score, expected_score, 0.001);
EXPECT_EQ(result.positions.size(), expected_number_positions);
}
// Computes the expected TF-IDF score of a term within one document:
//   tf  = num_term_occurrence_in_doc / doc_length
//   idf = 1 + log((1 + num_docs) / (1 + num_docs_with_term))
// and returns tf * idf. The idf part is evaluated in double precision and
// then narrowed to float; the tf part is a float division.
// NOTE(review): |doc_length| is not checked against zero here; callers are
// presumably expected to pass a non-zero length.
float TfIdfScore(size_t num_docs,
                 size_t num_docs_with_term,
                 size_t num_term_occurrence_in_doc,
                 size_t doc_length) {
  // Smoothed ratio of all documents to documents containing the term.
  const double smoothed_doc_ratio =
      (1.0 + num_docs) / (1.0 + num_docs_with_term);
  const float inverse_doc_frequency =
      static_cast<float>(1.0 + log(smoothed_doc_ratio));

  const float occurrences = static_cast<float>(num_term_occurrence_in_doc);
  const float term_frequency = occurrences / doc_length;

  return term_frequency * inverse_doc_frequency;
}
} // namespace local_search_service
......@@ -23,6 +23,17 @@ std::vector<Data> CreateTestData(
const std::map<std::string,
std::vector<std::pair<std::string, std::string>>>& input);
// Checks that |result|'s id, score and number of matching positions are as
// expected. The score is compared against |expected_score| with an absolute
// tolerance of 0.001; the id and the position count must match exactly.
void CheckResult(const Result& result,
const std::string& expected_id,
float expected_score,
size_t expected_number_positions);
// Returns the TF-IDF score tests expect for a single term in a single
// document, computed as tf * idf where
//   tf  = |num_term_occurrence_in_doc| / |doc_length|
//   idf = 1 + log((1 + |num_docs|) / (1 + |num_docs_with_term|)).
float TfIdfScore(size_t num_docs,
size_t num_docs_with_term,
size_t num_term_occurrence_in_doc,
size_t doc_length);
} // namespace local_search_service
#endif // CHROME_BROWSER_CHROMEOS_LOCAL_SEARCH_SERVICE_TEST_UTILS_H_
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment