Commit 4004f2ba authored by Jia, committed by Commit Bot

[cros search service] Implement InvertedIndexSearch

InvertedIndexSearch is a search backend based on inverted index
and TF-IDF ranking.

This CL implements all APIs except search, which will be implemented in
the next CL.

Bug: 1090132
Change-Id: I3ad4507fba907ed73c3b2b49d7ddb931d0421ee8
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2262101
Reviewed-by: Thanh Nguyen <thanhdng@chromium.org>
Commit-Queue: Jia Meng <jiameng@chromium.org>
Cr-Commit-Position: refs/heads/master@{#781735}
parent dd4f4752
......@@ -1374,6 +1374,8 @@ source_set("chromeos") {
"local_search_service/index.h",
"local_search_service/inverted_index.cc",
"local_search_service/inverted_index.h",
"local_search_service/inverted_index_search.cc",
"local_search_service/inverted_index_search.h",
"local_search_service/linear_map_search.cc",
"local_search_service/linear_map_search.h",
"local_search_service/local_search_service.cc",
......@@ -3125,11 +3127,14 @@ source_set("unit_tests") {
"lacros/lacros_util_unittest.cc",
"local_search_service/content_extraction_utils_unittest.cc",
"local_search_service/index_unittest.cc",
"local_search_service/inverted_index_search_unittest.cc",
"local_search_service/inverted_index_unittest.cc",
"local_search_service/local_search_service_unittest.cc",
"local_search_service/proxy/local_search_service_proxy_unittest.cc",
"local_search_service/proxy/types_mojom_traits_unittest.cc",
"local_search_service/search_metrics_reporter_unittest.cc",
"local_search_service/test_utils.cc",
"local_search_service/test_utils.h",
"locale_change_guard_unittest.cc",
"lock_screen_apps/app_manager_impl_unittest.cc",
"lock_screen_apps/lock_screen_profile_creator_impl_unittest.cc",
......
......@@ -13,6 +13,7 @@
#include "base/strings/utf_string_conversions.h"
#include "chrome/browser/chromeos/local_search_service/index.h"
#include "chrome/browser/chromeos/local_search_service/shared_structs.h"
#include "chrome/browser/chromeos/local_search_service/test_utils.h"
#include "testing/gtest/include/gtest/gtest.h"
namespace local_search_service {
......@@ -23,25 +24,6 @@ namespace {
using ResultWithIds = std::pair<std::string, std::vector<std::string>>;
using ContentWithId = std::pair<std::string, std::string>;
// Creates test data to be registered to the index. |input| is a map from
// id to contents (id and content).
std::vector<Data> CreateTestData(
const std::map<std::string, std::vector<ContentWithId>>& input) {
std::vector<Data> output;
for (const auto& item : input) {
Data data;
data.id = item.first;
std::vector<Content>& contents = data.contents;
for (const auto& content_with_id : item.second) {
const Content content(content_with_id.first,
base::UTF8ToUTF16(content_with_id.second));
contents.push_back(content);
}
output.push_back(data);
}
return output;
}
void CheckSearchParams(const SearchParams& actual,
const SearchParams& expected) {
EXPECT_DOUBLE_EQ(actual.relevance_threshold, expected.relevance_threshold);
......
......@@ -46,8 +46,11 @@ void InvertedIndex::AddDocument(const std::string& document_id,
}
}
void InvertedIndex::RemoveDocument(const std::string& document_id) {
doc_length_.erase(document_id);
uint32_t InvertedIndex::RemoveDocument(const std::string& document_id) {
const int num_erased = doc_length_.erase(document_id);
if (num_erased == 0)
return num_erased;
for (auto it = dictionary_.begin(); it != dictionary_.end();) {
if (it->second.find(document_id) != it->second.end()) {
......@@ -62,6 +65,7 @@ void InvertedIndex::RemoveDocument(const std::string& document_id) {
it++;
}
}
return num_erased;
}
std::vector<TfidfResult> InvertedIndex::GetTfidf(
......
......@@ -71,9 +71,10 @@ class InvertedIndex {
const std::vector<Token>& tokens);
// Removes a document from the inverted index. Do nothing if document_id is
// not in the index. This function doesn't modify any cache. It only removes
// not in the index. Returns number of documents deleted.
// This function doesn't modify any cache. It only removes
// documents and tokens from the index.
void RemoveDocument(const std::string& document_id);
uint32_t RemoveDocument(const std::string& document_id);
// Gets TF-IDF scores for a term. This function returns the TF-IDF score from
// the cache.
......@@ -89,6 +90,9 @@ class InvertedIndex {
// since the last time the index has been built.
bool IsInvertedIndexBuilt() const { return terms_to_be_updated_.empty(); }
// Returns number of documents in the index.
uint64_t NumberDocuments() const { return doc_length_.size(); }
private:
friend class InvertedIndexTest;
......
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/browser/chromeos/local_search_service/inverted_index_search.h"
#include <algorithm>
#include <utility>
#include "base/optional.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "chrome/browser/chromeos/local_search_service/content_extraction_utils.h"
#include "chrome/browser/chromeos/local_search_service/inverted_index.h"
namespace local_search_service {
namespace {
// Tokenizes every content entry of |data| with ExtractContent() for the given
// |locale|, concatenates the per-content tokens in content order, and returns
// the result of ConsolidateToken() on the combined list.
std::vector<Token> ExtractDocumentTokens(const Data& data,
                                         const std::string& locale) {
  std::vector<Token> all_tokens;
  for (const Content& entry : data.contents) {
    const std::vector<Token> entry_tokens =
        ExtractContent(entry.id, entry.content, locale);
    for (const Token& token : entry_tokens)
      all_tokens.push_back(token);
  }
  return ConsolidateToken(all_tokens);
}
} // namespace
// Creates the search backend together with the (initially empty) inverted
// index it owns.
InvertedIndexSearch::InvertedIndexSearch()
    : inverted_index_(std::make_unique<InvertedIndex>()) {}

// Defined out-of-line so that InvertedIndex only needs to be a complete type
// in this translation unit, not in the header.
InvertedIndexSearch::~InvertedIndexSearch() = default;
// Returns how many documents the underlying inverted index currently holds.
uint64_t InvertedIndexSearch::GetSize() {
  const uint64_t num_documents = inverted_index_->NumberDocuments();
  return num_documents;
}
// Tokenizes each item in |data| and hands it to the inverted index under the
// item's id. When |build_index| is true, the inverted index is built once
// after all items have been added.
//
// Preconditions (checked in debug builds only):
//  - every item has a non-empty id, matching the contract stated in the
//    header and the check already performed by Delete();
//  - every item yields at least one token.
void InvertedIndexSearch::AddOrUpdate(
    const std::vector<local_search_service::Data>& data,
    bool build_index) {
  for (const Data& d : data) {
    DCHECK(!d.id.empty());
    // TODO(jiameng): use different locales.
    const std::vector<Token> document_tokens = ExtractDocumentTokens(d, "en");
    DCHECK(!document_tokens.empty());
    inverted_index_->AddDocument(d.id, document_tokens);
  }

  if (build_index) {
    inverted_index_->BuildInvertedIndex();
  }
}
// Removes the documents named in |ids| from the inverted index and returns
// how many were actually removed (the sum of what RemoveDocument() reports
// for each id). Ids must be non-empty. When |build_index| is true, the
// inverted index is built once after all removals.
uint32_t InvertedIndexSearch::Delete(const std::vector<std::string>& ids,
                                     bool build_index) {
  uint32_t removed_count = 0u;
  for (const std::string& id : ids) {
    DCHECK(!id.empty());
    removed_count += inverted_index_->RemoveDocument(id);
  }

  if (build_index)
    inverted_index_->BuildInvertedIndex();

  return removed_count;
}
// TODO(jiameng): add impl.
// Placeholder until search is implemented: ignores |query| and |max_results|,
// leaves |results| untouched and always reports an empty index.
ResponseStatus InvertedIndexSearch::Find(const base::string16& query,
                                         uint32_t max_results,
                                         std::vector<Result>* results) {
  const ResponseStatus status = ResponseStatus::kEmptyIndex;
  return status;
}
// Test-only helper. Looks up |term| in the inverted index and returns one
// (document id, number of occurrences) pair per document in the term's
// posting list. The result is empty if the posting list is empty.
//
// The header documents that ids come back in alphabetical order, so the
// result is sorted explicitly rather than relying on whatever iteration order
// PostingList happens to provide.
std::vector<std::pair<std::string, uint32_t>>
InvertedIndexSearch::FindTermForTesting(const base::string16& term) const {
  const PostingList posting_list = inverted_index_->FindTerm(term);

  std::vector<std::pair<std::string, uint32_t>> doc_with_freq;
  doc_with_freq.reserve(posting_list.size());
  for (const auto& kv : posting_list) {
    // The number of stored positions is the number of occurrences of |term|
    // in that document; make the narrowing to uint32_t explicit.
    doc_with_freq.emplace_back(kv.first,
                               static_cast<uint32_t>(kv.second.size()));
  }

  // Document ids are unique within a posting list, so the default pair
  // ordering sorts by id.
  std::sort(doc_with_freq.begin(), doc_with_freq.end());
  return doc_with_freq;
}
} // namespace local_search_service
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CHROME_BROWSER_CHROMEOS_LOCAL_SEARCH_SERVICE_INVERTED_INDEX_SEARCH_H_
#define CHROME_BROWSER_CHROMEOS_LOCAL_SEARCH_SERVICE_INVERTED_INDEX_SEARCH_H_
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "base/macros.h"
#include "base/strings/string16.h"
#include "chrome/browser/chromeos/local_search_service/shared_structs.h"
namespace local_search_service {
class InvertedIndex;
// A search via the inverted index backend with TF-IDF based document ranking.
// The object owns its InvertedIndex and is neither copyable nor
// copy-assignable.
class InvertedIndexSearch {
public:
InvertedIndexSearch();
~InvertedIndexSearch();
InvertedIndexSearch(const InvertedIndexSearch&) = delete;
InvertedIndexSearch& operator=(const InvertedIndexSearch&) = delete;
// Returns number of data items, i.e. the number of documents currently held
// by the underlying inverted index.
uint64_t GetSize();
// Adds or updates data.
// IDs of data should not be empty, and every data item is expected to yield
// at least one token (checked with a DCHECK in the implementation).
// If |build_index| is true, the inverted index is built once after all items
// in |data| have been added.
void AddOrUpdate(const std::vector<Data>& data, bool build_index = true);
// Deletes data with |ids| and returns number of items deleted.
// If an id doesn't exist in the InvertedIndexSearch, no operation will be
// done. IDs should not be empty.
// If |build_index| is true, the inverted index is built once after all
// removals.
uint32_t Delete(const std::vector<std::string>& ids, bool build_index = true);
// Returns matching results for a given query.
// Zero |max_results| means no max.
// NOTE: search is not implemented yet; the current implementation always
// returns ResponseStatus::kEmptyIndex and leaves |results| untouched.
ResponseStatus Find(const base::string16& query,
uint32_t max_results,
std::vector<Result>* results);
// Returns document id and number of occurrences of |term|.
// Document ids are sorted in alphabetical order.
// The result is empty if no document's posting list contains |term|.
std::vector<std::pair<std::string, uint32_t>> FindTermForTesting(
const base::string16& term) const;
private:
// The inverted index that stores all registered documents; created in the
// constructor.
std::unique_ptr<InvertedIndex> inverted_index_;
};
} // namespace local_search_service
#endif // CHROME_BROWSER_CHROMEOS_LOCAL_SEARCH_SERVICE_INVERTED_INDEX_SEARCH_H_
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/browser/chromeos/local_search_service/inverted_index_search.h"
#include "base/strings/utf_string_conversions.h"
#include "chrome/browser/chromeos/local_search_service/test_utils.h"
#include "testing/gtest/include/gtest/gtest.h"
namespace local_search_service {
namespace {
// (content-id, content).
using ContentWithId = std::pair<std::string, std::string>;
// (document-id, number-of-occurrences).
using TermOccurrence = std::vector<std::pair<std::string, uint32_t>>;
} // namespace
// Test fixture that gives every test case its own default-constructed
// InvertedIndexSearch instance.
class InvertedIndexSearchTest : public testing::Test {
protected:
// Search backend under test.
InvertedIndexSearch search_;
};
// Registers two documents and checks, via FindTermForTesting(), which terms
// end up in the index and with what per-document occurrence counts.
// Size checks that guard element accesses use ASSERT_EQ so that a wrong size
// stops the test instead of indexing out of bounds.
TEST_F(InvertedIndexSearchTest, Add) {
  const std::map<std::string, std::vector<ContentWithId>> data_to_register = {
      {"id1",
       {{"cid_1", "This is a help wi-fi article"},
        {"cid_2", "Another help help wi-fi"}}},
      {"id2", {{"cid_3", "help article on wi-fi"}}}};
  const std::vector<Data> data = CreateTestData(data_to_register);
  search_.AddOrUpdate(data);
  EXPECT_EQ(search_.GetSize(), 2u);

  {
    // "network" does not exist in the index.
    const TermOccurrence doc_with_freq =
        search_.FindTermForTesting(base::UTF8ToUTF16("network"));
    EXPECT_TRUE(doc_with_freq.empty());
  }

  {
    // "help" exists in the index.
    const TermOccurrence doc_with_freq =
        search_.FindTermForTesting(base::UTF8ToUTF16("help"));
    ASSERT_EQ(doc_with_freq.size(), 2u);
    EXPECT_EQ(doc_with_freq[0].first, "id1");
    EXPECT_EQ(doc_with_freq[0].second, 3u);
    EXPECT_EQ(doc_with_freq[1].first, "id2");
    EXPECT_EQ(doc_with_freq[1].second, 1u);
  }

  {
    // "wifi" exists in the index but "wi-fi" doesn't because of normalization.
    TermOccurrence doc_with_freq =
        search_.FindTermForTesting(base::UTF8ToUTF16("wifi"));
    ASSERT_EQ(doc_with_freq.size(), 2u);
    EXPECT_EQ(doc_with_freq[0].first, "id1");
    EXPECT_EQ(doc_with_freq[0].second, 2u);
    EXPECT_EQ(doc_with_freq[1].first, "id2");
    EXPECT_EQ(doc_with_freq[1].second, 1u);

    doc_with_freq = search_.FindTermForTesting(base::UTF8ToUTF16("wi-fi"));
    EXPECT_TRUE(doc_with_freq.empty());

    // "WiFi" doesn't exist because the index stores normalized word.
    doc_with_freq = search_.FindTermForTesting(base::UTF8ToUTF16("WiFi"));
    EXPECT_TRUE(doc_with_freq.empty());
  }

  {
    // "this" does not exist in the index because it's a stopword
    const TermOccurrence doc_with_freq =
        search_.FindTermForTesting(base::UTF8ToUTF16("this"));
    EXPECT_TRUE(doc_with_freq.empty());
  }
}
// Registers two documents, then calls AddOrUpdate() again with new contents
// for "id1" plus a new document "id3", and checks that lookups reflect the
// replaced contents of "id1" and the added document.
// Size checks that guard element accesses use ASSERT_EQ so that a wrong size
// stops the test instead of indexing out of bounds.
TEST_F(InvertedIndexSearchTest, Update) {
  const std::map<std::string, std::vector<ContentWithId>> data_to_register = {
      {"id1",
       {{"cid_1", "This is a help wi-fi article"},
        {"cid_2", "Another help help wi-fi"}}},
      {"id2", {{"cid_3", "help article on wi-fi"}}}};
  const std::vector<Data> data = CreateTestData(data_to_register);
  search_.AddOrUpdate(data);
  EXPECT_EQ(search_.GetSize(), 2u);

  const std::map<std::string, std::vector<ContentWithId>> data_to_update = {
      {"id1",
       {{"cid_1", "This is a help bluetooth article"},
        {"cid_2", "Google Playstore Google Music"}}},
      {"id3", {{"cid_3", "Google Map"}}}};
  const std::vector<Data> updated_data = CreateTestData(data_to_update);
  search_.AddOrUpdate(updated_data);
  EXPECT_EQ(search_.GetSize(), 3u);

  {
    // "bluetooth" only appears in the updated contents of "id1".
    const TermOccurrence doc_with_freq =
        search_.FindTermForTesting(base::UTF8ToUTF16("bluetooth"));
    ASSERT_EQ(doc_with_freq.size(), 1u);
    EXPECT_EQ(doc_with_freq[0].first, "id1");
    EXPECT_EQ(doc_with_freq[0].second, 1u);
  }

  {
    // "wifi" was dropped from "id1" by the update, so only "id2" remains.
    const TermOccurrence doc_with_freq =
        search_.FindTermForTesting(base::UTF8ToUTF16("wifi"));
    ASSERT_EQ(doc_with_freq.size(), 1u);
    EXPECT_EQ(doc_with_freq[0].first, "id2");
    EXPECT_EQ(doc_with_freq[0].second, 1u);
  }

  {
    // "google" appears twice in the updated "id1" and once in the new "id3".
    const TermOccurrence doc_with_freq =
        search_.FindTermForTesting(base::UTF8ToUTF16("google"));
    ASSERT_EQ(doc_with_freq.size(), 2u);
    EXPECT_EQ(doc_with_freq[0].first, "id1");
    EXPECT_EQ(doc_with_freq[0].second, 2u);
    EXPECT_EQ(doc_with_freq[1].first, "id3");
    EXPECT_EQ(doc_with_freq[1].second, 1u);
  }
}
// Registers two documents, deletes one existing id ("id1") together with an
// id that was never registered ("id3"), and checks that only one document is
// reported as deleted and that only "id2" is left in the index.
// The size check that guards element accesses uses ASSERT_EQ so that a wrong
// size stops the test instead of indexing out of bounds.
TEST_F(InvertedIndexSearchTest, Delete) {
  const std::map<std::string, std::vector<ContentWithId>> data_to_register = {
      {"id1",
       {{"cid_1", "This is a help wi-fi article"},
        {"cid_2", "Another help help wi-fi"}}},
      {"id2", {{"cid_3", "help article on wi-fi"}}}};
  const std::vector<Data> data = CreateTestData(data_to_register);
  search_.AddOrUpdate(data);
  EXPECT_EQ(search_.GetSize(), 2u);

  EXPECT_EQ(search_.Delete({"id1", "id3"}), 1u);
  // Only "id2" should remain registered.
  EXPECT_EQ(search_.GetSize(), 1u);

  {
    const TermOccurrence doc_with_freq =
        search_.FindTermForTesting(base::UTF8ToUTF16("wifi"));
    ASSERT_EQ(doc_with_freq.size(), 1u);
    EXPECT_EQ(doc_with_freq[0].first, "id2");
    EXPECT_EQ(doc_with_freq[0].second, 1u);
  }
}
} // namespace local_search_service
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/browser/chromeos/local_search_service/test_utils.h"
#include <map>
#include <string>
#include <utility>
#include <vector>
namespace local_search_service {
namespace {
// (content-id, content).
using ContentWithId = std::pair<std::string, std::string>;
} // namespace
std::vector<Data> CreateTestData(
const std::map<std::string, std::vector<ContentWithId>>& input) {
std::vector<Data> output;
for (const auto& item : input) {
Data data;
data.id = item.first;
std::vector<Content>& contents = data.contents;
for (const auto& content_with_id : item.second) {
const Content content(content_with_id.first,
base::UTF8ToUTF16(content_with_id.second));
contents.push_back(content);
}
output.push_back(data);
}
return output;
}
} // namespace local_search_service
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CHROME_BROWSER_CHROMEOS_LOCAL_SEARCH_SERVICE_TEST_UTILS_H_
#define CHROME_BROWSER_CHROMEOS_LOCAL_SEARCH_SERVICE_TEST_UTILS_H_
#include <map>
#include <string>
#include <utility>
#include <vector>
#include "base/strings/string16.h"
#include "base/strings/utf_string_conversions.h"
#include "chrome/browser/chromeos/local_search_service/shared_structs.h"
#include "testing/gtest/include/gtest/gtest.h"
namespace local_search_service {
// Creates test data to be registered to the index. |input| is a map from
// data id to that item's contents, each given as a (content id, content text)
// pair. The content text is UTF-8 and is converted to UTF-16 in the returned
// Data. One Data item is returned per map entry, in the map's iteration
// order.
std::vector<Data> CreateTestData(
const std::map<std::string,
std::vector<std::pair<std::string, std::string>>>& input);
} // namespace local_search_service
#endif // CHROME_BROWSER_CHROMEOS_LOCAL_SEARCH_SERVICE_TEST_UTILS_H_
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment