Commit 0b2e0bd8 authored by Thanh Nguyen, committed by Commit Bot

[local-search-service] Implement InvertedIndexSearch with callback

This CL implements the InvertedIndexSearch functions that are inherited
from the new Index interface.

Design doc: go/lss-sandboxing
Implementation plan: go/lss-sandboxing-impl

Bug: 1137560
Change-Id: I3a1439f9b43d9d52bbb6819308535aa28ef7fe3d
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2531078
Reviewed-by: Jia Meng <jiameng@chromium.org>
Commit-Queue: Thanh Nguyen <thanhdng@chromium.org>
Cr-Commit-Position: refs/heads/master@{#827660}
parent 1ade577e
......@@ -85,8 +85,8 @@ TfidfCache BuildTfidf(uint32_t num_docs_from_last_update,
}
// Removes a document from document state variables given its ID. Don't do
// anything if the ID doesn't exist.
void RemoveDocumentIfExist(const std::string& document_id,
// anything if the ID doesn't exist. Return true if the document is removed.
bool RemoveDocumentIfExist(const std::string& document_id,
DocLength* doc_length,
Dictionary* dictionary,
TermSet* terms_to_be_updated) {
......@@ -94,13 +94,15 @@ void RemoveDocumentIfExist(const std::string& document_id,
CHECK(dictionary);
CHECK(terms_to_be_updated);
DCHECK(!content::BrowserThread::CurrentlyOn(content::BrowserThread::UI));
bool document_removed = false;
if (doc_length->find(document_id) == doc_length->end())
return;
return document_removed;
doc_length->erase(document_id);
for (auto it = dictionary->begin(); it != dictionary->end();) {
if (it->second.find(document_id) != it->second.end()) {
terms_to_be_updated->insert(it->first);
it->second.erase(document_id);
document_removed = true;
}
// Removes term from the dictionary if its posting list is empty.
......@@ -110,33 +112,42 @@ void RemoveDocumentIfExist(const std::string& document_id,
it++;
}
}
return document_removed;
}
// Given list of documents to update and document state variables, returns new
// document state variables.
DocumentStateVariables UpdateDocuments(DocumentToUpdate&& documents_to_update,
// document state variables and number of deleted documents.
std::pair<DocumentStateVariables, uint32_t> UpdateDocumentStateVariables(
DocumentToUpdate&& documents_to_update,
const DocLength& doc_length,
Dictionary&& dictionary,
TermSet&& terms_to_be_updated) {
DCHECK(!::content::BrowserThread::CurrentlyOn(::content::BrowserThread::UI));
DocLength new_doc_length(doc_length);
uint32_t num_deleted = 0u;
for (const auto& document : documents_to_update) {
const std::string document_id(document.first);
RemoveDocumentIfExist(document_id, &new_doc_length, &dictionary,
&terms_to_be_updated);
bool is_deleted = RemoveDocumentIfExist(document_id, &new_doc_length,
&dictionary, &terms_to_be_updated);
// Update the document if necessary.
if (!document.second.empty()) {
// If document content is not empty, it is being updated but not
// deleted.
is_deleted = false;
for (const auto& token : document.second) {
dictionary[token.content][document_id] = token.positions;
new_doc_length[document_id] += token.positions.size();
terms_to_be_updated.insert(token.content);
}
}
num_deleted += (is_deleted) ? 1 : 0;
}
return std::make_tuple(std::move(new_doc_length), std::move(dictionary),
std::move(terms_to_be_updated));
return std::make_pair(
std::make_tuple(std::move(new_doc_length), std::move(dictionary),
std::move(terms_to_be_updated)),
num_deleted);
}
// Given the index variables, clear all the data.
......@@ -159,13 +170,12 @@ std::pair<DocumentStateVariables, TfidfCache> ClearData(
}
} // namespace
InvertedIndex::InvertedIndex() = default;
InvertedIndex::~InvertedIndex() = default;
void InvertedIndex::RegisterIndexBuiltCallback(
base::RepeatingCallback<void()> on_index_built) {
on_index_built_ = std::move(on_index_built);
InvertedIndex::InvertedIndex() {
task_runner_ = base::ThreadPool::CreateSequencedTaskRunner(
{base::TaskPriority::BEST_EFFORT, base::MayBlock(),
base::TaskShutdownBehavior::CONTINUE_ON_SHUTDOWN});
}
InvertedIndex::~InvertedIndex() = default;
PostingList InvertedIndex::FindTerm(const base::string16& term) const {
if (dictionary_.find(term) != dictionary_.end())
......@@ -238,6 +248,20 @@ void InvertedIndex::AddDocuments(const DocumentToUpdate& documents) {
InvertedIndexController();
}
void InvertedIndex::AddDocuments(const DocumentToUpdate& documents,
base::OnceCallback<void()> callback) {
if (documents.empty())
return;
task_runner_->PostTaskAndReplyWithResult(
FROM_HERE,
base::BindOnce(&UpdateDocumentStateVariables, documents,
std::move(doc_length_), std::move(dictionary_),
std::move(terms_to_be_updated_)),
base::BindOnce(&InvertedIndex::OnAddDocumentsComplete,
weak_ptr_factory_.GetWeakPtr(), std::move(callback)));
}
uint32_t InvertedIndex::RemoveDocuments(
const std::vector<std::string>& document_ids) {
if (document_ids.empty())
......@@ -254,6 +278,35 @@ uint32_t InvertedIndex::RemoveDocuments(
return document_ids.size();
}
void InvertedIndex::RemoveDocuments(
const std::vector<std::string>& document_ids,
base::OnceCallback<void(uint32_t)> callback) {
DocumentToUpdate documents;
for (const auto& id : document_ids) {
documents.push_back({id, std::vector<Token>()});
}
task_runner_->PostTaskAndReplyWithResult(
FROM_HERE,
base::BindOnce(&UpdateDocumentStateVariables, documents,
std::move(doc_length_), std::move(dictionary_),
std::move(terms_to_be_updated_)),
base::BindOnce(&InvertedIndex::OnUpdateDocumentsComplete,
weak_ptr_factory_.GetWeakPtr(), std::move(callback)));
}
void InvertedIndex::UpdateDocuments(
const DocumentToUpdate& documents,
base::OnceCallback<void(uint32_t)> callback) {
task_runner_->PostTaskAndReplyWithResult(
FROM_HERE,
base::BindOnce(&UpdateDocumentStateVariables, documents,
std::move(doc_length_), std::move(dictionary_),
std::move(terms_to_be_updated_)),
base::BindOnce(&InvertedIndex::OnUpdateDocumentsComplete,
weak_ptr_factory_.GetWeakPtr(), std::move(callback)));
}
std::vector<TfidfResult> InvertedIndex::GetTfidf(
const base::string16& term) const {
if (tfidf_cache_.find(term) != tfidf_cache_.end()) {
......@@ -269,12 +322,34 @@ void InvertedIndex::BuildInvertedIndex() {
InvertedIndexController();
}
// Rebuilds the TF-IDF cache on |task_runner_| and runs |callback| once the new
// cache has been installed by OnBuildTfidfComplete().
//
// |doc_length_|, |dictionary_| and |tfidf_cache_| are copied into the task;
// |terms_to_be_updated_| is moved into it.
void InvertedIndex::BuildInvertedIndex(base::OnceCallback<void()> callback) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);

  auto build_task =
      base::BindOnce(&BuildTfidf, num_docs_from_last_update_, doc_length_,
                     dictionary_, std::move(terms_to_be_updated_),
                     tfidf_cache_);
  auto on_built =
      base::BindOnce(&InvertedIndex::OnBuildTfidfComplete,
                     weak_ptr_factory_.GetWeakPtr(), std::move(callback));

  task_runner_->PostTaskAndReplyWithResult(FROM_HERE, std::move(build_task),
                                           std::move(on_built));
}
// Requests that all index data be cleared. This overload only records the
// request in |request_to_clear_index_| and hands control to
// InvertedIndexController(), which decides when the clearing task is posted.
// No completion notification is given to the caller.
void InvertedIndex::ClearInvertedIndex() {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
// The flag must be set before the controller runs so that it sees the request.
request_to_clear_index_ = true;
InvertedIndexController();
}
// Clears all index data on |task_runner_| and runs |callback| once
// OnDataCleared() has stored the cleared state.
//
// Everything except |doc_length_| is moved into the ClearData task;
// |doc_length_| is passed as a copy.
void InvertedIndex::ClearInvertedIndex(base::OnceCallback<void()> callback) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);

  auto clear_task =
      base::BindOnce(&ClearData, std::move(documents_to_update_), doc_length_,
                     std::move(dictionary_), std::move(terms_to_be_updated_),
                     std::move(tfidf_cache_));
  auto on_cleared =
      base::BindOnce(&InvertedIndex::OnDataCleared,
                     weak_ptr_factory_.GetWeakPtr(), std::move(callback));

  task_runner_->PostTaskAndReplyWithResult(FROM_HERE, std::move(clear_task),
                                           std::move(on_cleared));
}
void InvertedIndex::InvertedIndexController() {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
// TODO(thanhdng): A clear-index call should ideally cancel all other update
......@@ -285,12 +360,12 @@ void InvertedIndex::InvertedIndexController() {
if (request_to_clear_index_) {
update_in_progress_ = true;
request_to_clear_index_ = false;
base::ThreadPool::PostTaskAndReplyWithResult(
FROM_HERE, {base::MayBlock(), base::TaskPriority::BEST_EFFORT},
task_runner_->PostTaskAndReplyWithResult(
FROM_HERE,
base::BindOnce(&ClearData, std::move(documents_to_update_), doc_length_,
std::move(dictionary_), std::move(terms_to_be_updated_),
std::move(tfidf_cache_)),
base::BindOnce(&InvertedIndex::OnDataCleared,
base::BindOnce(&InvertedIndex::OnDataClearedSync,
weak_ptr_factory_.GetWeakPtr()));
return;
}
......@@ -300,35 +375,33 @@ void InvertedIndex::InvertedIndexController() {
update_in_progress_ = true;
index_building_in_progress_ = true;
request_to_build_index_ = false;
base::ThreadPool::PostTaskAndReplyWithResult(
FROM_HERE, {base::MayBlock(), base::TaskPriority::BEST_EFFORT},
task_runner_->PostTaskAndReplyWithResult(
FROM_HERE,
base::BindOnce(&BuildTfidf, num_docs_from_last_update_, doc_length_,
dictionary_, std::move(terms_to_be_updated_),
tfidf_cache_),
base::BindOnce(&InvertedIndex::OnBuildTfidfComplete,
base::BindOnce(&InvertedIndex::OnBuildTfidfCompleteSync,
weak_ptr_factory_.GetWeakPtr()));
} else if (terms_to_be_updated_.empty()) {
// If there's no more work to do and all changed terms have been used to
// update the index, then mark index is built and make the callback.
// update the index, then mark index is built.
is_index_built_ = true;
if (!on_index_built_.is_null())
on_index_built_.Run();
}
} else {
update_in_progress_ = true;
base::ThreadPool::PostTaskAndReplyWithResult(
FROM_HERE, {base::MayBlock(), base::TaskPriority::BEST_EFFORT},
task_runner_->PostTaskAndReplyWithResult(
FROM_HERE,
// TODO(jiameng): |doc_length_| can be moved since it's not used for
// document existence checking any more.
base::BindOnce(&UpdateDocuments, std::move(documents_to_update_),
doc_length_, std::move(dictionary_),
std::move(terms_to_be_updated_)),
base::BindOnce(&InvertedIndex::OnUpdateDocumentsComplete,
base::BindOnce(&UpdateDocumentStateVariables,
std::move(documents_to_update_), doc_length_,
std::move(dictionary_), std::move(terms_to_be_updated_)),
base::BindOnce(&InvertedIndex::OnUpdateDocumentsCompleteSync,
weak_ptr_factory_.GetWeakPtr()));
}
}
void InvertedIndex::OnBuildTfidfComplete(TfidfCache&& new_cache) {
void InvertedIndex::OnBuildTfidfCompleteSync(TfidfCache&& new_cache) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
num_docs_from_last_update_ = doc_length_.size();
tfidf_cache_ = std::move(new_cache);
......@@ -338,18 +411,66 @@ void InvertedIndex::OnBuildTfidfComplete(TfidfCache&& new_cache) {
InvertedIndexController();
}
void InvertedIndex::OnUpdateDocumentsComplete(
DocumentStateVariables&& document_state_variables) {
void InvertedIndex::OnBuildTfidfComplete(base::OnceCallback<void()> callback,
TfidfCache&& new_cache) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
doc_length_ = std::move(std::get<0>(document_state_variables));
dictionary_ = std::move(std::get<1>(document_state_variables));
terms_to_be_updated_ = std::move(std::get<2>(document_state_variables));
num_docs_from_last_update_ = doc_length_.size();
tfidf_cache_ = std::move(new_cache);
std::move(callback).Run();
}
void InvertedIndex::OnUpdateDocumentsCompleteSync(
std::pair<DocumentStateVariables, uint32_t>&&
document_state_variables_and_num_deleted) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
doc_length_ =
std::move(std::get<0>(document_state_variables_and_num_deleted.first));
dictionary_ =
std::move(std::get<1>(document_state_variables_and_num_deleted.first));
terms_to_be_updated_ =
std::move(std::get<2>(document_state_variables_and_num_deleted.first));
update_in_progress_ = false;
InvertedIndexController();
}
void InvertedIndex::OnDataCleared(
// Reply for the UpdateDocumentStateVariables task posted by the callback-based
// RemoveDocuments()/UpdateDocuments(). Installs the updated document state,
// then rebuilds the TF-IDF cache; |callback| is run with the number of deleted
// documents once the rebuild has finished.
void InvertedIndex::OnUpdateDocumentsComplete(
    base::OnceCallback<void(uint32_t)> callback,
    std::pair<DocumentStateVariables, uint32_t>&&
        document_state_variables_and_num_deleted) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);

  // (doc length, dictionary, terms to be updated), in that order.
  DocumentStateVariables& new_state =
      document_state_variables_and_num_deleted.first;
  doc_length_ = std::move(std::get<0>(new_state));
  dictionary_ = std::move(std::get<1>(new_state));
  terms_to_be_updated_ = std::move(std::get<2>(new_state));

  // Bind the count now so the rebuild's completion closure takes no arguments.
  const uint32_t num_deleted = document_state_variables_and_num_deleted.second;
  BuildInvertedIndex(base::BindOnce(std::move(callback), num_deleted));
}
// Reply for the UpdateDocumentStateVariables task posted by the callback-based
// AddDocuments(). Installs the updated document state and rebuilds the TF-IDF
// cache; |callback| is run when the rebuild has finished. Adding documents is
// expected to delete none, which is DCHECKed.
void InvertedIndex::OnAddDocumentsComplete(
    base::OnceCallback<void()> callback,
    std::pair<DocumentStateVariables, uint32_t>&&
        document_state_variables_and_num_deleted) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
  DCHECK_EQ(document_state_variables_and_num_deleted.second, 0u);

  // (doc length, dictionary, terms to be updated), in that order.
  DocumentStateVariables& new_state =
      document_state_variables_and_num_deleted.first;
  doc_length_ = std::move(std::get<0>(new_state));
  dictionary_ = std::move(std::get<1>(new_state));
  terms_to_be_updated_ = std::move(std::get<2>(new_state));

  BuildInvertedIndex(std::move(callback));
}
void InvertedIndex::OnDataClearedSync(
std::pair<DocumentStateVariables, TfidfCache>&& inverted_index_data) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
doc_length_ = std::move(std::get<0>(inverted_index_data.first));
......@@ -362,5 +483,18 @@ void InvertedIndex::OnDataCleared(
InvertedIndexController();
}
// Reply for the ClearData task posted by ClearInvertedIndex(callback).
// Installs the cleared document state and TF-IDF cache, resets the document
// count used for the last TF-IDF update, and then runs |callback|.
void InvertedIndex::OnDataCleared(
    base::OnceCallback<void()> callback,
    std::pair<DocumentStateVariables, TfidfCache>&& inverted_index_data) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);

  // (doc length, dictionary, terms to be updated), in that order.
  DocumentStateVariables& cleared_state = inverted_index_data.first;
  doc_length_ = std::move(std::get<0>(cleared_state));
  dictionary_ = std::move(std::get<1>(cleared_state));
  terms_to_be_updated_ = std::move(std::get<2>(cleared_state));

  tfidf_cache_ = std::move(inverted_index_data.second);
  num_docs_from_last_update_ = 0;

  std::move(callback).Run();
}
} // namespace local_search_service
} // namespace chromeos
......@@ -14,6 +14,7 @@
#include "base/callback.h"
#include "base/gtest_prod_util.h"
#include "base/memory/weak_ptr.h"
#include "base/sequenced_task_runner.h"
#include "base/strings/string16.h"
#include "chromeos/components/local_search_service/shared_structs.h"
......@@ -60,10 +61,6 @@ class InvertedIndex {
InvertedIndex(const InvertedIndex&) = delete;
InvertedIndex& operator=(const InvertedIndex&) = delete;
// |on_index_built| will be called after the index is built.
void RegisterIndexBuiltCallback(
base::RepeatingCallback<void()> on_index_built);
// Returns document ID and positions of a term.
PostingList FindTerm(const base::string16& term) const;
......@@ -79,6 +76,10 @@ class InvertedIndex {
// unique (have unique content). This function doesn't modify any cache. It
// only adds documents and tokens to the index.
void AddDocuments(const DocumentToUpdate& documents);
// Similar to the above function, but it will build TF-IDF cache after adding
// documents.
void AddDocuments(const DocumentToUpdate& documents,
base::OnceCallback<void()> callback);
// Removes documents from the inverted index. Do nothing if the document id is
// not in the index.
......@@ -87,6 +88,22 @@ class InvertedIndex {
// As other operations may be running on a separate thread, this function
// returns size of |document_ids| and not actually deleted documents.
uint32_t RemoveDocuments(const std::vector<std::string>& document_ids);
// Similar to the above function, but it will build TF-IDF cache after
// removing documents.
void RemoveDocuments(const std::vector<std::string>& document_ids,
base::OnceCallback<void(uint32_t)> callback);
// Updates documents from the inverted index. It combines two functions:
// AddDocuments and RemoveDocuments. This function will return the number of
// documents to be removed (number of documents that have empty content).
// - If a document ID is not in the index, add the document to the index.
// - If a document ID is in the index and its new content isn't empty,
// update its content in the index.
// - If a document ID is in the index and its content is empty, remove it
// from the index.
// It will build TF-IDF cache after updating the documents.
void UpdateDocuments(const DocumentToUpdate& documents,
base::OnceCallback<void(uint32_t)> callback);
// Gets TF-IDF scores for a term. This function returns the TF-IDF score from
// the cache.
......@@ -96,9 +113,11 @@ class InvertedIndex {
// Builds the inverted index.
void BuildInvertedIndex();
void BuildInvertedIndex(base::OnceCallback<void()> callback);
// Clears all the data from the inverted index.
void ClearInvertedIndex();
void ClearInvertedIndex(base::OnceCallback<void()> callback);
// Checks if the inverted index has been built: returns |true| if the inverted
// index is up to date, returns |false| if there are some modified document
......@@ -118,17 +137,26 @@ class InvertedIndex {
void InvertedIndexController();
// Called on the main thread after BuildTfidf is completed.
void OnBuildTfidfComplete(TfidfCache&& new_cache);
// Called on the main thread after UpdateDocuments is completed.
void OnUpdateDocumentsComplete(
DocumentStateVariables&& document_state_variables);
void OnBuildTfidfCompleteSync(TfidfCache&& new_cache);
void OnBuildTfidfComplete(base::OnceCallback<void()> callback,
TfidfCache&& new_cache);
// Called on the main thread after UpdateDocumentStateVariables is completed.
void OnUpdateDocumentsCompleteSync(
std::pair<DocumentStateVariables, uint32_t>&&
document_state_variables_and_num_deleted);
void OnUpdateDocumentsComplete(base::OnceCallback<void(uint32_t)> callback,
std::pair<DocumentStateVariables, uint32_t>&&
document_state_variables_and_num_deleted);
void OnAddDocumentsComplete(base::OnceCallback<void()> callback,
std::pair<DocumentStateVariables, uint32_t>&&
document_state_variables_and_num_deleted);
void OnDataClearedSync(
std::pair<DocumentStateVariables, TfidfCache>&& inverted_index_data);
void OnDataCleared(
base::OnceCallback<void()> callback,
std::pair<DocumentStateVariables, TfidfCache>&& inverted_index_data);
base::RepeatingCallback<void()> on_index_built_;
// |is_index_built_| is only true if index's TF-IDF is consistent with the
// documents in the index. This means as soon as documents are modified
// (added, updated or deleted), |is_index_built_| will be set to false. While
......@@ -155,6 +183,7 @@ class InvertedIndex {
bool index_building_in_progress_ = false;
bool request_to_clear_index_ = false;
scoped_refptr<base::SequencedTaskRunner> task_runner_;
SEQUENCE_CHECKER(sequence_checker_);
base::WeakPtrFactory<InvertedIndex> weak_ptr_factory_{this};
......
......@@ -51,18 +51,38 @@ ExtractedContent ExtractDocumentsContent(const std::vector<Data>& data) {
ExtractedContent documents;
for (const Data& d : data) {
const std::vector<Token> document_tokens = ExtractDocumentTokens(d);
DCHECK(!document_tokens.empty());
documents.push_back({d.id, document_tokens});
}
return documents;
}
// Splits |query| into tokens and returns them as a set (duplicates collapse).
// The tokenization mode is chosen from the configured locale: camel-case mode
// for non-Latin locales, word mode otherwise.
std::unordered_set<base::string16> GetTokenizedQuery(
    const base::string16& query) {
  // TODO(jiameng): actual input query may not be the same as default locale.
  // Need another way to determine actual language of the query.
  const bool is_non_latin =
      IsNonLatinLocale(base::i18n::GetConfiguredLocale());
  const TokenizedString::Mode mode = is_non_latin
                                         ? TokenizedString::Mode::kCamelCase
                                         : TokenizedString::Mode::kWords;
  const TokenizedString tokenized_query(query, mode);

  // TODO(jiameng): we are not removing stopword because they shouldn't exist
  // in the index. However, for performance reason, it may be worth to be
  // removed.
  const auto& query_tokens = tokenized_query.tokens();
  return std::unordered_set<base::string16>(query_tokens.begin(),
                                            query_tokens.end());
}
} // namespace
InvertedIndexSearch::InvertedIndexSearch(IndexId index_id,
PrefService* local_state)
: IndexSync(index_id, Backend::kInvertedIndex, local_state),
Index(index_id, Backend::kInvertedIndex),
inverted_index_(std::make_unique<InvertedIndex>()),
blocking_task_runner_(base::ThreadPool::CreateSequencedTaskRunner(
{base::TaskPriority::BEST_EFFORT, base::MayBlock(),
......@@ -85,7 +105,7 @@ void InvertedIndexSearch::AddOrUpdateSync(
base::PostTaskAndReplyWithResult(
blocking_task_runner_.get(), FROM_HERE,
base::BindOnce(&ExtractDocumentsContent, data),
base::BindOnce(&InvertedIndexSearch::FinalizeAddOrUpdate,
base::BindOnce(&InvertedIndexSearch::FinalizeAddOrUpdateSync,
weak_ptr_factory_.GetWeakPtr()));
}
......@@ -107,7 +127,7 @@ uint32_t InvertedIndexSearch::DeleteSync(const std::vector<std::string>& ids) {
++num_queued_index_updates_;
blocking_task_runner_->PostTaskAndReply(
FROM_HERE, base::DoNothing(),
base::BindOnce(&InvertedIndexSearch::FinalizeDelete,
base::BindOnce(&InvertedIndexSearch::FinalizeDeleteSync,
weak_ptr_factory_.GetWeakPtr(), ids));
return ids.size();
......@@ -135,24 +155,9 @@ ResponseStatus InvertedIndexSearch::FindSync(const base::string16& query,
return status;
}
// TODO(jiameng): actual input query may not be the same as default locale.
// Need another way to determine actual language of the query.
const TokenizedString::Mode mode =
IsNonLatinLocale(base::i18n::GetConfiguredLocale())
? TokenizedString::Mode::kCamelCase
: TokenizedString::Mode::kWords;
const TokenizedString tokenized_query(query, mode);
std::unordered_set<base::string16> tokens;
for (const auto& token : tokenized_query.tokens()) {
// TODO(jiameng): we are not removing stopword because they shouldn't exist
// in the index. However, for performance reason, it may be worth to be
// removed.
tokens.insert(token);
}
*results = inverted_index_->FindMatchingDocumentsApproximately(
tokens, search_params_.prefix_threshold, search_params_.fuzzy_threshold);
GetTokenizedQuery(query), search_params_.prefix_threshold,
search_params_.fuzzy_threshold);
if (results->size() > max_results && max_results > 0u)
results->resize(max_results);
......@@ -163,6 +168,71 @@ ResponseStatus InvertedIndexSearch::FindSync(const base::string16& query,
return status;
}
// Index override. Reports the number of documents currently held by the
// inverted index by running |callback| synchronously.
void InvertedIndexSearch::GetSize(GetSizeCallback callback) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
  const auto num_documents = inverted_index_->NumberDocuments();
  std::move(callback).Run(num_documents);
}
// Index override. Extracts the tokens of |data| on |blocking_task_runner_| and
// passes the result, together with |callback|, to FinalizeAddOrUpdate() on
// this sequence. |data| must not be empty.
void InvertedIndexSearch::AddOrUpdate(const std::vector<Data>& data,
                                      AddOrUpdateCallback callback) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
  DCHECK(!data.empty());

  auto extract_content = base::BindOnce(&ExtractDocumentsContent, data);
  auto finalize =
      base::BindOnce(&InvertedIndexSearch::FinalizeAddOrUpdate,
                     weak_ptr_factory_.GetWeakPtr(), std::move(callback));

  base::PostTaskAndReplyWithResult(blocking_task_runner_.get(), FROM_HERE,
                                   std::move(extract_content),
                                   std::move(finalize));
}
// Index override. Schedules removal of the documents named by |ids|; |ids|
// must not be empty.
//
// A no-op task is posted to |blocking_task_runner_| and FinalizeDelete() runs
// as its reply on this sequence. NOTE(review): presumably this keeps the
// delete ordered after content-extraction tasks already queued on that
// runner - confirm.
void InvertedIndexSearch::Delete(const std::vector<std::string>& ids,
                                 DeleteCallback callback) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
  DCHECK(!ids.empty());

  auto finalize = base::BindOnce(&InvertedIndexSearch::FinalizeDelete,
                                 weak_ptr_factory_.GetWeakPtr(),
                                 std::move(callback), ids);
  blocking_task_runner_->PostTaskAndReply(FROM_HERE, base::DoNothing(),
                                          std::move(finalize));
}
// Index override. Extracts the tokens of |data| on |blocking_task_runner_| and
// passes the result, together with |callback|, to FinalizeUpdateDocuments() on
// this sequence. |data| must not be empty.
void InvertedIndexSearch::UpdateDocuments(const std::vector<Data>& data,
                                          UpdateDocumentsCallback callback) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
  DCHECK(!data.empty());

  auto extract_content = base::BindOnce(&ExtractDocumentsContent, data);
  auto finalize =
      base::BindOnce(&InvertedIndexSearch::FinalizeUpdateDocuments,
                     weak_ptr_factory_.GetWeakPtr(), std::move(callback));

  base::PostTaskAndReplyWithResult(blocking_task_runner_.get(), FROM_HERE,
                                   std::move(extract_content),
                                   std::move(finalize));
}
// Index override. Searches the inverted index for |query| and reports the
// outcome through |callback|, which is run synchronously:
//  - kEmptyQuery with no results if |query| is empty;
//  - kEmptyIndex with no results if the index holds no documents;
//  - kSuccess with the matching documents otherwise, truncated to
//    |max_results| entries when |max_results| is greater than zero (zero means
//    no limit).
void InvertedIndexSearch::Find(const base::string16& query,
                               uint32_t max_results,
                               FindCallback callback) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
  if (query.empty()) {
    std::move(callback).Run(ResponseStatus::kEmptyQuery, base::nullopt);
    return;
  }
  if (inverted_index_->NumberDocuments() == 0u) {
    std::move(callback).Run(ResponseStatus::kEmptyIndex, base::nullopt);
    return;
  }

  std::vector<Result> results =
      inverted_index_->FindMatchingDocumentsApproximately(
          GetTokenizedQuery(query), search_params_.prefix_threshold,
          search_params_.fuzzy_threshold);

  if (max_results > 0u && results.size() > max_results)
    results.resize(max_results);

  // |results| is not used after this point, so hand it over instead of
  // copying the whole vector into the callback argument.
  std::move(callback).Run(ResponseStatus::kSuccess, std::move(results));
}
// Index override. Clears all data held by the inverted index and runs
// |callback| once the clear has completed.
void InvertedIndexSearch::ClearIndex(ClearIndexCallback callback) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
  // Use the callback overload: the overload without a callback only queues a
  // clear request, so running |callback| right after it would report
  // completion before the data has been cleared.
  inverted_index_->ClearInvertedIndex(std::move(callback));
}
std::vector<std::pair<std::string, uint32_t>>
InvertedIndexSearch::FindTermForTesting(const base::string16& term) const {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
......@@ -175,7 +245,7 @@ InvertedIndexSearch::FindTermForTesting(const base::string16& term) const {
return doc_with_freq;
}
void InvertedIndexSearch::FinalizeAddOrUpdate(
void InvertedIndexSearch::FinalizeAddOrUpdateSync(
const ExtractedContent& documents) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
--num_queued_index_updates_;
......@@ -183,13 +253,34 @@ void InvertedIndexSearch::FinalizeAddOrUpdate(
MaybeBuildInvertedIndex();
}
void InvertedIndexSearch::FinalizeDelete(const std::vector<std::string>& ids) {
// Reply half of DeleteSync(): runs on this object's sequence after the no-op
// task posted to |blocking_task_runner_| has completed. Removes |ids| from the
// inverted index and then lets MaybeBuildInvertedIndex() decide whether the
// index should be rebuilt.
void InvertedIndexSearch::FinalizeDeleteSync(
const std::vector<std::string>& ids) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
// Balances the increment done when the delete was queued.
--num_queued_index_updates_;
inverted_index_->RemoveDocuments(ids);
MaybeBuildInvertedIndex();
}
// Reply half of AddOrUpdate(): receives the content extracted on
// |blocking_task_runner_| and forwards it to the inverted index. |callback| is
// handed to InvertedIndex::AddDocuments(), which is responsible for running
// it.
void InvertedIndexSearch::FinalizeAddOrUpdate(
AddOrUpdateCallback callback,
const ExtractedContent& documents) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
inverted_index_->AddDocuments(documents, std::move(callback));
}
// Reply half of Delete(): forwards |ids| to the inverted index for removal.
// |callback| is handed to InvertedIndex::RemoveDocuments(), which runs it with
// the number of removed documents.
void InvertedIndexSearch::FinalizeDelete(DeleteCallback callback,
const std::vector<std::string>& ids) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
inverted_index_->RemoveDocuments(ids, std::move(callback));
}
// Reply half of UpdateDocuments(): receives the content extracted on
// |blocking_task_runner_| and forwards it to the inverted index. |callback| is
// handed to InvertedIndex::UpdateDocuments(), which runs it with the number of
// removed documents.
void InvertedIndexSearch::FinalizeUpdateDocuments(
UpdateDocumentsCallback callback,
const ExtractedContent& documents) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
inverted_index_->UpdateDocuments(documents, std::move(callback));
}
void InvertedIndexSearch::MaybeBuildInvertedIndex() {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
if (num_queued_index_updates_ == 0) {
......
......@@ -14,6 +14,7 @@
#include "base/sequence_checker.h"
#include "base/sequenced_task_runner.h"
#include "base/strings/string16.h"
#include "chromeos/components/local_search_service/index.h"
#include "chromeos/components/local_search_service/index_sync.h"
#include "chromeos/components/local_search_service/shared_structs.h"
......@@ -24,7 +25,7 @@ class InvertedIndex;
// An implementation of Index.
// A search via the inverted index backend with TF-IDF based document ranking.
class InvertedIndexSearch : public IndexSync {
class InvertedIndexSearch : public IndexSync, public Index {
public:
InvertedIndexSearch(IndexId index_id, PrefService* local_state);
~InvertedIndexSearch() override;
......@@ -32,7 +33,7 @@ class InvertedIndexSearch : public IndexSync {
InvertedIndexSearch(const InvertedIndexSearch&) = delete;
InvertedIndexSearch& operator=(const InvertedIndexSearch&) = delete;
// Index overrides:
// IndexSync overrides:
uint64_t GetSizeSync() override;
// TODO(jiameng): we always build the index after documents are updated. May
// revise this strategy if there is a different use case.
......@@ -52,19 +53,47 @@ class InvertedIndexSearch : public IndexSync {
uint32_t max_results,
std::vector<Result>* results) override;
// Index overrides:
// GetSize is only accurate once the index has finished updating.
void GetSize(GetSizeCallback callback) override;
void AddOrUpdate(const std::vector<Data>& data,
AddOrUpdateCallback callback) override;
void Delete(const std::vector<std::string>& ids,
DeleteCallback callback) override;
void UpdateDocuments(const std::vector<Data>& data,
UpdateDocumentsCallback callback) override;
void Find(const base::string16& query,
uint32_t max_results,
FindCallback callback) override;
void ClearIndex(ClearIndexCallback callback) override;
// Returns document id and number of occurrences of |term|.
// Document ids are sorted in alphabetical order.
std::vector<std::pair<std::string, uint32_t>> FindTermForTesting(
const base::string16& term) const;
private:
void FinalizeAddOrUpdateSync(
const std::vector<std::pair<std::string, std::vector<Token>>>& documents);
// FinalizeDeleteSync is called if Delete cannot be immediately done because
// there's another index updating operation before it, i.e.
// |num_queued_index_updates_| is not zero.
void FinalizeDeleteSync(const std::vector<std::string>& ids);
void FinalizeAddOrUpdate(
AddOrUpdateCallback callback,
const std::vector<std::pair<std::string, std::vector<Token>>>& documents);
// FinalizeDelete is called if Delete cannot be immediately done because
// there's another index updating operation before it, i.e.
// |num_queued_index_updates_| is not zero.
void FinalizeDelete(const std::vector<std::string>& ids);
void FinalizeDelete(DeleteCallback callback,
const std::vector<std::string>& ids);
void FinalizeUpdateDocuments(
UpdateDocumentsCallback callback,
const std::vector<std::pair<std::string, std::vector<Token>>>& documents);
// In order to reduce unnecessary inverted index building, we only build the
// index if there's no upcoming modification to the index's document list.
......
......@@ -13,6 +13,8 @@ namespace chromeos {
namespace local_search_service {
namespace {
// This is (data-id, content-ids).
using ResultWithIds = std::pair<std::string, std::vector<std::string>>;
// (content-id, content).
using ContentWithId = std::pair<std::string, std::string>;
......@@ -23,6 +25,102 @@ using WeightedContentWithId = std::tuple<std::string, std::string, float>;
// (document-id, number-of-occurrences).
using TermOccurrence = std::vector<std::pair<std::string, uint32_t>>;
// Test helper: calls GetSize() on |index|, drains |task_environment|, and
// checks that the callback ran and reported |expectd_num_items| documents.
void GetSizeAndCheckResults(InvertedIndexSearch* index,
                            base::test::TaskEnvironment* task_environment,
                            uint32_t expectd_num_items) {
  DCHECK(index);
  bool callback_done = false;
  uint32_t num_items = 0;

  auto on_size = base::BindOnce(
      [](bool* done, uint32_t* out_size, uint64_t size) {
        *done = true;
        *out_size = size;
      },
      &callback_done, &num_items);
  index->GetSize(std::move(on_size));

  task_environment->RunUntilIdle();
  ASSERT_TRUE(callback_done);
  EXPECT_EQ(num_items, expectd_num_items);
}
// Test helper: calls AddOrUpdate() on |index| with |data|, drains
// |task_environment|, and asserts that the completion callback ran.
void AddOrUpdate(InvertedIndexSearch* index,
                 base::test::TaskEnvironment* task_environment,
                 const std::vector<Data>& data) {
  DCHECK(index);
  bool callback_done = false;

  auto on_done =
      base::BindOnce([](bool* done) { *done = true; }, &callback_done);
  index->AddOrUpdate(data, std::move(on_done));

  task_environment->RunUntilIdle();
  ASSERT_TRUE(callback_done);
}
// Test helper: calls Delete() on |index| with |ids|, drains
// |task_environment|, and checks that the callback ran and reported
// |expect_num_deleted| deleted documents.
void Delete(InvertedIndexSearch* index,
            base::test::TaskEnvironment* task_environment,
            const std::vector<std::string>& ids,
            uint32_t expect_num_deleted) {
  DCHECK(index);
  bool callback_done = false;
  uint32_t num_deleted = 0u;

  auto on_deleted = base::BindOnce(
      [](bool* done, uint32_t* out_deleted, uint32_t reported_deleted) {
        *done = true;
        *out_deleted = reported_deleted;
      },
      &callback_done, &num_deleted);
  index->Delete(ids, std::move(on_deleted));

  task_environment->RunUntilIdle();
  ASSERT_TRUE(callback_done);
  EXPECT_EQ(num_deleted, expect_num_deleted);
}
// Test helper: calls UpdateDocuments() on |index| with |data|, drains
// |task_environment|, and checks that the callback ran and reported
// |expect_num_deleted| deleted documents.
void UpdateDocuments(InvertedIndexSearch* index,
                     base::test::TaskEnvironment* task_environment,
                     const std::vector<Data>& data,
                     uint32_t expect_num_deleted) {
  DCHECK(index);
  bool callback_done = false;
  uint32_t num_deleted = 0u;

  auto on_updated = base::BindOnce(
      [](bool* done, uint32_t* out_deleted, uint32_t reported_deleted) {
        *done = true;
        *out_deleted = reported_deleted;
      },
      &callback_done, &num_deleted);
  index->UpdateDocuments(data, std::move(on_updated));

  task_environment->RunUntilIdle();
  ASSERT_TRUE(callback_done);
  EXPECT_EQ(num_deleted, expect_num_deleted);
}
// Test helper: calls Find() on |index| with |query| and |max_results|, drains
// |task_environment|, and checks that the callback ran with
// |expected_status|. Returns the results passed to the callback, or an empty
// vector if the callback carried no results or never ran.
std::vector<Result> Find(InvertedIndexSearch* index,
                         base::test::TaskEnvironment* task_environment,
                         std::string query,
                         int32_t max_results,
                         ResponseStatus expected_status) {
  DCHECK(index);
  bool callback_done = false;
  // Value-initialized so it never holds an indeterminate value.
  ResponseStatus status{};
  std::vector<Result> results;
  index->Find(
      base::UTF8ToUTF16(query), max_results,
      base::BindOnce(
          [](bool* callback_done, ResponseStatus* status,
             std::vector<Result>* results, ResponseStatus status_callback,
             const base::Optional<std::vector<Result>>& results_callback) {
            *callback_done = true;
            *status = status_callback;
            if (results_callback.has_value())
              *results = results_callback.value();
          },
          &callback_done, &status, &results));
  task_environment->RunUntilIdle();
  // EXPECT_* does not abort the helper, so only compare |status| when the
  // callback actually filled it in.
  EXPECT_TRUE(callback_done);
  if (callback_done) {
    EXPECT_EQ(status, expected_status);
  }
  return results;
}
} // namespace
class InvertedIndexSearchTest : public testing::Test {
......@@ -40,7 +138,7 @@ class InvertedIndexSearchTest : public testing::Test {
base::test::TaskEnvironment::ThreadPoolExecutionMode::QUEUED};
};
TEST_F(InvertedIndexSearchTest, Add) {
TEST_F(InvertedIndexSearchTest, AddSync) {
const std::map<std::string, std::vector<ContentWithId>> data_to_register = {
{"id1",
{{"cid_1", "This is a help wi-fi article"},
......@@ -96,7 +194,7 @@ TEST_F(InvertedIndexSearchTest, Add) {
}
}
TEST_F(InvertedIndexSearchTest, Update) {
TEST_F(InvertedIndexSearchTest, UpdateSync) {
const std::map<std::string, std::vector<ContentWithId>> data_to_register = {
{"id1",
{{"cid_1", "This is a help wi-fi article"},
......@@ -188,7 +286,7 @@ TEST_F(InvertedIndexSearchTest, ClearIndexSync) {
EXPECT_EQ(search_->GetSizeSync(), 0u);
}
TEST_F(InvertedIndexSearchTest, Find) {
TEST_F(InvertedIndexSearchTest, FindSync) {
const std::map<std::string, std::vector<WeightedContentWithId>>
data_to_register = {{"id1",
{{"cid_1", "This is a help wi-fi article", 0.8},
......@@ -299,7 +397,7 @@ TEST_F(InvertedIndexSearchTest, Find) {
}
}
TEST_F(InvertedIndexSearchTest, SequenceOfDeleteSyncs) {
TEST_F(InvertedIndexSearchTest, SequenceOfDeletesSync) {
const std::map<std::string, std::vector<ContentWithId>> data_to_register = {
{"id1",
{{"cid_1", "This is a help wi-fi article"},
......@@ -326,5 +424,312 @@ TEST_F(InvertedIndexSearchTest, SequenceOfDeleteSyncs) {
EXPECT_EQ(search_->GetSizeSync(), 0u);
}
// Registers two documents through the AddOrUpdate test helper and checks the
// per-document term frequencies reported by FindTermForTesting(), including
// the effects of normalization and stopword removal.
TEST_F(InvertedIndexSearchTest, Add) {
  const std::map<std::string, std::vector<ContentWithId>> data_to_register = {
      {"id1",
       {{"cid_1", "This is a help wi-fi article"},
        {"cid_2", "Another help help wi-fi"}}},
      {"id2", {{"cid_3", "help article on wi-fi"}}}};
  const std::vector<Data> data = CreateTestData(data_to_register);
  AddOrUpdate(search_.get(), &task_environment_, data);
  GetSizeAndCheckResults(search_.get(), &task_environment_, 2u);

  {
    // "network" does not exist in the index.
    const TermOccurrence doc_with_freq =
        search_->FindTermForTesting(base::UTF8ToUTF16("network"));
    EXPECT_TRUE(doc_with_freq.empty());
  }

  {
    // "help" exists in the index.
    const TermOccurrence doc_with_freq =
        search_->FindTermForTesting(base::UTF8ToUTF16("help"));
    // Fatal check: the element accesses below would be out of bounds if the
    // size were wrong.
    ASSERT_EQ(doc_with_freq.size(), 2u);
    EXPECT_EQ(doc_with_freq[0].first, "id1");
    EXPECT_EQ(doc_with_freq[0].second, 3u);
    EXPECT_EQ(doc_with_freq[1].first, "id2");
    EXPECT_EQ(doc_with_freq[1].second, 1u);
  }

  {
    // "wifi" exists in the index but "wi-fi" doesn't because of normalization.
    TermOccurrence doc_with_freq =
        search_->FindTermForTesting(base::UTF8ToUTF16("wifi"));
    // Fatal check: guards the indexed accesses that follow.
    ASSERT_EQ(doc_with_freq.size(), 2u);
    EXPECT_EQ(doc_with_freq[0].first, "id1");
    EXPECT_EQ(doc_with_freq[0].second, 2u);
    EXPECT_EQ(doc_with_freq[1].first, "id2");
    EXPECT_EQ(doc_with_freq[1].second, 1u);

    doc_with_freq = search_->FindTermForTesting(base::UTF8ToUTF16("wi-fi"));
    EXPECT_TRUE(doc_with_freq.empty());

    // "WiFi" doesn't exist because the index stores normalized word.
    doc_with_freq = search_->FindTermForTesting(base::UTF8ToUTF16("WiFi"));
    EXPECT_TRUE(doc_with_freq.empty());
  }

  {
    // "this" does not exist in the index because it's a stopword
    const TermOccurrence doc_with_freq =
        search_->FindTermForTesting(base::UTF8ToUTF16("this"));
    EXPECT_TRUE(doc_with_freq.empty());
  }
}
// Registers two documents, then re-registers "id1" with new content and adds
// "id3". Checks that the postings reflect the replaced content of "id1" while
// "id2" is left untouched.
TEST_F(InvertedIndexSearchTest, Update) {
  const std::map<std::string, std::vector<ContentWithId>> data_to_register = {
      {"id1",
       {{"cid_1", "This is a help wi-fi article"},
        {"cid_2", "Another help help wi-fi"}}},
      {"id2", {{"cid_3", "help article on wi-fi"}}}};
  const std::vector<Data> data = CreateTestData(data_to_register);
  AddOrUpdate(search_.get(), &task_environment_, data);
  GetSizeAndCheckResults(search_.get(), &task_environment_, 2u);

  const std::map<std::string, std::vector<ContentWithId>> data_to_update = {
      {"id1",
       {{"cid_1", "This is a help bluetooth article"},
        {"cid_2", "Google Playstore Google Music"}}},
      {"id3", {{"cid_3", "Google Map"}}}};
  const std::vector<Data> updated_data = CreateTestData(data_to_update);
  AddOrUpdate(search_.get(), &task_environment_, updated_data);
  GetSizeAndCheckResults(search_.get(), &task_environment_, 3u);

  {
    const TermOccurrence doc_with_freq =
        search_->FindTermForTesting(base::UTF8ToUTF16("bluetooth"));
    // Fatal check: guards the indexed accesses that follow.
    ASSERT_EQ(doc_with_freq.size(), 1u);
    EXPECT_EQ(doc_with_freq[0].first, "id1");
    EXPECT_EQ(doc_with_freq[0].second, 1u);
  }

  {
    // "id1" no longer mentions wi-fi, so only "id2" is expected.
    const TermOccurrence doc_with_freq =
        search_->FindTermForTesting(base::UTF8ToUTF16("wifi"));
    ASSERT_EQ(doc_with_freq.size(), 1u);
    EXPECT_EQ(doc_with_freq[0].first, "id2");
    EXPECT_EQ(doc_with_freq[0].second, 1u);
  }

  {
    const TermOccurrence doc_with_freq =
        search_->FindTermForTesting(base::UTF8ToUTF16("google"));
    ASSERT_EQ(doc_with_freq.size(), 2u);
    EXPECT_EQ(doc_with_freq[0].first, "id1");
    EXPECT_EQ(doc_with_freq[0].second, 2u);
    EXPECT_EQ(doc_with_freq[1].first, "id3");
    EXPECT_EQ(doc_with_freq[1].second, 1u);
  }
}
// Registers two documents, deletes "id1" through the Delete test helper
// (expecting one deletion), and checks that only "id2" remains in the "wifi"
// postings.
TEST_F(InvertedIndexSearchTest, Delete) {
  const std::map<std::string, std::vector<ContentWithId>> data_to_register = {
      {"id1",
       {{"cid_1", "This is a help wi-fi article"},
        {"cid_2", "Another help help wi-fi"}}},
      {"id2", {{"cid_3", "help article on wi-fi"}}}};
  const std::vector<Data> data = CreateTestData(data_to_register);
  AddOrUpdate(search_.get(), &task_environment_, data);
  GetSizeAndCheckResults(search_.get(), &task_environment_, 2u);

  Delete(search_.get(), &task_environment_, {"id1"}, 1u);

  {
    const TermOccurrence doc_with_freq =
        search_->FindTermForTesting(base::UTF8ToUTF16("wifi"));
    // Fatal check: guards the indexed accesses that follow.
    ASSERT_EQ(doc_with_freq.size(), 1u);
    EXPECT_EQ(doc_with_freq[0].first, "id2");
    EXPECT_EQ(doc_with_freq[0].second, 1u);
  }
}
// Populates the index with two documents, clears it through the
// callback-taking ClearIndex(), and checks that the callback ran and the
// reported size dropped to zero.
TEST_F(InvertedIndexSearchTest, ClearIndex) {
  const std::map<std::string, std::vector<ContentWithId>> initial_contents = {
      {"id1",
       {{"cid_1", "This is a help wi-fi article"},
        {"cid_2", "Another help help wi-fi"}}},
      {"id2", {{"cid_3", "help article on wi-fi"}}}};
  const std::vector<Data> initial_data = CreateTestData(initial_contents);

  AddOrUpdate(search_.get(), &task_environment_, initial_data);
  GetSizeAndCheckResults(search_.get(), &task_environment_, 2u);

  // Set to true by the bound lambda when ClearIndex() reports completion.
  bool callback_done = false;
  search_->ClearIndex(base::BindOnce(
      [](bool* done_out) { *done_out = true; }, &callback_done));
  Wait();
  ASSERT_TRUE(callback_done);

  GetSizeAndCheckResults(search_.get(), &task_environment_, 0u);
}
// Exercises the Find test helper across the response statuses this test
// expects: an empty index, an empty query, a successful search with no match,
// and successful searches with one or two matches, including the effect of
// |max_results|.
TEST_F(InvertedIndexSearchTest, FindTest) {
  const std::map<std::string, std::vector<WeightedContentWithId>>
      data_to_register = {{"id1",
                           {{"cid_1", "This is a help wi-fi article", 0.8},
                            {"cid_2", "Another help help wi-fi", 0.6}}},
                          {"id2", {{"cid_3", "help article on wi-fi", 0.6}}}};
  const std::vector<Data> data = CreateTestData(data_to_register);

  // Nothing has been added to the index.
  std::vector<Result> results =
      Find(search_.get(), &task_environment_, "network",
           /*max_results=*/10, ResponseStatus::kEmptyIndex);
  EXPECT_TRUE(results.empty());

  // Data is added and then deleted from index, making the index empty.
  AddOrUpdate(search_.get(), &task_environment_, data);
  GetSizeAndCheckResults(search_.get(), &task_environment_, 2u);
  Delete(search_.get(), &task_environment_, {"id1", "id2"}, 2u);
  GetSizeAndCheckResults(search_.get(), &task_environment_, 0u);
  results = Find(search_.get(), &task_environment_, "network",
                 /*max_results=*/10, ResponseStatus::kEmptyIndex);
  EXPECT_TRUE(results.empty());

  // Index is populated again, but query is empty.
  AddOrUpdate(search_.get(), &task_environment_, data);
  GetSizeAndCheckResults(search_.get(), &task_environment_, 2u);
  results = Find(search_.get(), &task_environment_, "", /*max_results=*/10,
                 ResponseStatus::kEmptyQuery);
  EXPECT_TRUE(results.empty());

  // No document is found for a given query.
  results = Find(search_.get(), &task_environment_, "networkstuff",
                 /*max_results=*/10, ResponseStatus::kSuccess);
  EXPECT_TRUE(results.empty());

  {
    // A document is found.
    // Query's case is normalized.
    results = Find(search_.get(), &task_environment_, "ANOTHER networkstuff",
                   /*max_results=*/10, ResponseStatus::kSuccess);
    // Fatal check: |results[0]| below would be out of bounds otherwise.
    ASSERT_EQ(results.size(), 1u);

    // "another" only exists in "id1".
    const float expected_score =
        TfIdfScore(/*num_docs=*/2,
                   /*num_docs_with_term=*/1,
                   /*weighted_num_term_occurrence_in_doc=*/0.6,
                   /*doc_length=*/7);
    CheckResult(results[0], "id1", expected_score,
                /*expected_number_positions=*/1);
  }

  {
    // Two documents are found.
    results = Find(search_.get(), &task_environment_, "another help",
                   /*max_results=*/10, ResponseStatus::kSuccess);
    // Fatal check: both |results[0]| and |results[1]| are accessed below.
    ASSERT_EQ(results.size(), 2u);

    // "id1" score comes from both "another" and "help".
    const float expected_score_id1 =
        TfIdfScore(/*num_docs=*/2,
                   /*num_docs_with_term=*/1,
                   /*weighted_num_term_occurrence_in_doc=*/0.6,
                   /*doc_length=*/7) +
        TfIdfScore(/*num_docs=*/2,
                   /*num_docs_with_term=*/2,
                   /*weighted_num_term_occurrence_in_doc=*/0.8 + 0.6 * 2,
                   /*doc_length=*/7);
    // "id2" score comes from "help".
    const float expected_score_id2 =
        TfIdfScore(/*num_docs=*/2,
                   /*num_docs_with_term=*/2,
                   /*weighted_num_term_occurrence_in_doc=*/0.6,
                   /*doc_length=*/3);

    EXPECT_GE(expected_score_id1, expected_score_id2);
    CheckResult(results[0], "id1", expected_score_id1,
                /*expected_number_positions=*/4);
    CheckResult(results[1], "id2", expected_score_id2,
                /*expected_number_positions=*/1);
  }

  {
    // Same as above, but max number of results is set to 1.
    results = Find(search_.get(), &task_environment_, "another help",
                   /*max_results=*/1, ResponseStatus::kSuccess);
    // Fatal check: |results[0]| is accessed below.
    ASSERT_EQ(results.size(), 1u);
    EXPECT_EQ(results[0].id, "id1");
  }

  {
    // Same as above, but set max_results to 0, meaning no max.
    results = Find(search_.get(), &task_environment_, "another help",
                   /*max_results=*/0, ResponseStatus::kSuccess);
    EXPECT_EQ(results.size(), 2u);
  }
}
// Registers two documents, applies an update that rewrites "id1" and adds
// "id3", then issues two consecutive deletes and checks the index ends up
// empty.
TEST_F(InvertedIndexSearchTest, SequenceOfDeletes) {
  const std::map<std::string, std::vector<ContentWithId>> initial_contents = {
      {"id1",
       {{"cid_1", "This is a help wi-fi article"},
        {"cid_2", "Another help help wi-fi"}}},
      {"id2", {{"cid_3", "help article on wi-fi"}}}};
  const std::vector<Data> initial_data = CreateTestData(initial_contents);
  AddOrUpdate(search_.get(), &task_environment_, initial_data);

  const std::map<std::string, std::vector<ContentWithId>> revised_contents = {
      {"id1",
       {{"cid_1", "This is a help bluetooth article"},
        {"cid_2", "Google Playstore Google Music"}}},
      {"id3", {{"cid_3", "Google Map"}}}};
  const std::vector<Data> revised_data = CreateTestData(revised_contents);
  AddOrUpdate(search_.get(), &task_environment_, revised_data);

  // Three distinct ids are expected at this point: "id1", "id2" and "id3".
  GetSizeAndCheckResults(search_.get(), &task_environment_, 3u);

  Delete(search_.get(), &task_environment_, {"id1"}, 1u);
  Delete(search_.get(), &task_environment_, {"id2", "id3"}, 2u);

  GetSizeAndCheckResults(search_.get(), &task_environment_, 0u);
}
// Registers two documents, then calls the UpdateDocuments test helper with new
// content for "id1", an empty content list for "id2", and a new "id3". The
// test expects one reported deletion and a final size of two, then checks
// that a search for "bluetooth" returns the rewritten "id1".
TEST_F(InvertedIndexSearchTest, UpdateDocumentsTest) {
  const std::map<std::string, std::vector<ContentWithId>> data_to_register = {
      {"id1",
       {{"cid_1", "This is a help wi-fi article"},
        {"cid_2", "Another help help wi-fi"}}},
      {"id2", {{"cid_3", "help article on wi-fi"}}}};
  const std::vector<Data> data = CreateTestData(data_to_register);
  AddOrUpdate(search_.get(), &task_environment_, data);
  GetSizeAndCheckResults(search_.get(), &task_environment_, 2u);

  const std::map<std::string, std::vector<ContentWithId>> data_to_update = {
      {"id1",
       {{"cid_1", "This is a help bluetooth article"},
        {"cid_2", "Google Playstore Google Music"}}},
      {"id2", {}},
      {"id3", {{"cid_3", "Google Map"}}}};
  const std::vector<Data> updated_data = CreateTestData(data_to_update);
  UpdateDocuments(search_.get(), &task_environment_, updated_data, 1u);
  GetSizeAndCheckResults(search_.get(), &task_environment_, 2u);

  // Check if "id1" has been updated
  std::vector<Result> results =
      Find(search_.get(), &task_environment_, "bluetooth",
           /*max_results=*/10, ResponseStatus::kSuccess);
  // Fatal check: |results[0]| below would be out of bounds otherwise.
  ASSERT_EQ(results.size(), 1u);

  // "bluetooth" only exists in "id1".
  const float expected_score =
      TfIdfScore(/*num_docs=*/2,
                 /*num_docs_with_term=*/1,
                 /*weighted_num_term_occurrence_in_doc=*/1,
                 /*doc_length=*/7);
  CheckResult(results[0], "id1", expected_score,
              /*expected_number_positions=*/1);
}
} // namespace local_search_service
} // namespace chromeos
......@@ -38,11 +38,6 @@ constexpr double kDefaultWeight = 1.0;
class InvertedIndexTest : public ::testing::Test {
public:
InvertedIndexTest() {
index_.RegisterIndexBuiltCallback(base::BindRepeating(
&InvertedIndexTest::OnIndexBuilt, base::Unretained(this)));
}
void SetUp() override {
// All content weights are initialized to |kDefaultWeight|.
index_.doc_length_ =
......@@ -99,12 +94,64 @@ class InvertedIndexTest : public ::testing::Test {
index_.AddDocuments(documents);
}
// Adds |documents| through the callback-taking AddDocuments() overload, calls
// Wait(), and asserts that the completion callback was invoked.
void AddDocumentsAndCheck(const DocumentToUpdate& documents) {
  // Set to true by the bound lambda when the add reports completion.
  bool callback_done = false;

  index_.AddDocuments(
      documents,
      base::BindOnce([](bool* done_out) { *done_out = true; },
                     &callback_done));

  Wait();
  ASSERT_TRUE(callback_done);
}
// Forwards |doc_ids| to the RemoveDocuments() overload that takes no callback.
void RemoveDocuments(const std::vector<std::string>& doc_ids) {
  index_.RemoveDocuments(doc_ids);
}
// Removes |doc_ids| through the callback-taking RemoveDocuments() overload,
// calls Wait(), and checks that the callback ran and reported
// |expect_num_deleted| deleted documents.
void RemoveDocumentsAndCheck(const std::vector<std::string>& doc_ids,
                             uint32_t expect_num_deleted) {
  // Both locals are written by the bound lambda once removal finishes.
  bool callback_done = false;
  uint32_t num_deleted = 0u;

  index_.RemoveDocuments(
      doc_ids,
      base::BindOnce(
          [](bool* done_out, uint32_t* deleted_out, uint32_t deleted) {
            *done_out = true;
            *deleted_out = deleted;
          },
          &callback_done, &num_deleted));

  Wait();
  ASSERT_TRUE(callback_done);
  EXPECT_EQ(num_deleted, expect_num_deleted);
}
// Applies |documents| through InvertedIndex::UpdateDocuments(), calls Wait(),
// and checks that the callback ran and reported |expect_num_deleted| deleted
// documents.
void UpdateDocumentsAndCheck(const DocumentToUpdate& documents,
                             uint32_t expect_num_deleted) {
  // Both locals are written by the bound lambda once the update finishes.
  bool callback_done = false;
  uint32_t num_deleted = 0u;

  index_.UpdateDocuments(
      documents,
      base::BindOnce(
          [](bool* done_out, uint32_t* deleted_out, uint32_t deleted) {
            *done_out = true;
            *deleted_out = deleted;
          },
          &callback_done, &num_deleted));

  Wait();
  ASSERT_TRUE(callback_done);
  EXPECT_EQ(num_deleted, expect_num_deleted);
}
// Calls the ClearInvertedIndex() overload that takes no callback.
void ClearInvertedIndex() {
  index_.ClearInvertedIndex();
}
// Clears the index through the callback-taking ClearInvertedIndex() overload,
// calls Wait(), and asserts that the completion callback was invoked.
void ClearInvertedIndexAndCheck() {
  // Set to true by the bound lambda when the clear reports completion.
  bool callback_done = false;

  index_.ClearInvertedIndex(base::BindOnce(
      [](bool* done_out) { *done_out = true; }, &callback_done));

  Wait();
  ASSERT_TRUE(callback_done);
}
// Returns whatever InvertedIndex::GetTfidf() yields for |term|.
std::vector<TfidfResult> GetTfidf(const base::string16& term) {
  return index_.GetTfidf(term);
}
......@@ -130,6 +177,14 @@ class InvertedIndexTest : public ::testing::Test {
// Calls the BuildInvertedIndex() overload that takes no callback.
void BuildInvertedIndex() {
  index_.BuildInvertedIndex();
}
// Builds the index through the callback-taking BuildInvertedIndex() overload,
// calls Wait(), and asserts that the completion callback was invoked.
void BuildInvertedIndexAndCheck() {
  // Set to true by the bound lambda when the build reports completion.
  bool callback_done = false;

  index_.BuildInvertedIndex(base::BindOnce(
      [](bool* done_out) { *done_out = true; }, &callback_done));

  Wait();
  ASSERT_TRUE(callback_done);
}
// Forwards to InvertedIndex::IsInvertedIndexBuilt().
bool IsInvertedIndexBuilt() {
  return index_.IsInvertedIndexBuilt();
}
std::unordered_map<base::string16, PostingList> GetDictionary() {
......@@ -159,20 +214,16 @@ class InvertedIndexTest : public ::testing::Test {
// Returns the negation of the index's |update_in_progress_| member.
bool UpdateDocumentsCompleted() {
  return !index_.update_in_progress_;
}
// Increments the |num_built_| counter by one.
void OnIndexBuilt() {
  ++num_built_;
}
// Returns the current value of the |num_built_| counter.
int NumBuilt() {
  return num_built_;
}
private:
int num_built_ = 0;
InvertedIndex index_;
protected:
base::test::TaskEnvironment task_environment_{
base::test::TaskEnvironment::MainThreadType::DEFAULT,
base::test::TaskEnvironment::ThreadPoolExecutionMode::QUEUED};
private:
InvertedIndex index_;
};
TEST_F(InvertedIndexTest, FindTermTest) {
EXPECT_EQ(NumBuilt(), 0);
PostingList result = FindTerm(base::UTF8ToUTF16("A"));
ASSERT_EQ(result.size(), 2u);
EXPECT_EQ(result["doc1"][0].weight, kDefaultWeight);
......@@ -194,7 +245,6 @@ TEST_F(InvertedIndexTest, AddNewDocumentTest) {
const base::string16 a_utf16(base::UTF8ToUTF16("A"));
const base::string16 d_utf16(base::UTF8ToUTF16("D"));
EXPECT_EQ(NumBuilt(), 0);
AddDocuments({{"doc3",
{{a_utf16,
{{kDefaultWeight, {"header", 1, 1}},
......@@ -206,7 +256,6 @@ TEST_F(InvertedIndexTest, AddNewDocumentTest) {
EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
EXPECT_EQ(GetTermToBeUpdated().size(), 0u);
Wait();
EXPECT_EQ(NumBuilt(), 0);
EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
// 4 terms "A", "B", "C", "D" need to be updated.
EXPECT_EQ(GetTermToBeUpdated().size(), 4u);
......@@ -252,7 +301,6 @@ TEST_F(InvertedIndexTest, AddNewDocumentTest) {
EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
EXPECT_EQ(GetTermToBeUpdated().size(), 0u);
Wait();
EXPECT_EQ(NumBuilt(), 0);
EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
// 7 terms "A", "B", "C", "D", "E", "F", "G" need to be updated.
EXPECT_EQ(GetTermToBeUpdated().size(), 7u);
......@@ -267,11 +315,70 @@ TEST_F(InvertedIndexTest, AddNewDocumentTest) {
ASSERT_EQ(result.size(), 1u);
}
// Adds "doc3" through AddDocumentsAndCheck() and checks its document length
// and the postings of its terms, then adds "doc4" and "doc5" in a single call
// and checks how many documents contain "E" and "F".
TEST_F(InvertedIndexTest, AddNewDocumentTestCallback) {
  const base::string16 a_utf16(base::UTF8ToUTF16("A"));
  const base::string16 d_utf16(base::UTF8ToUTF16("D"));
  const base::string16 e_utf16(base::UTF8ToUTF16("E"));
  const base::string16 f_utf16(base::UTF8ToUTF16("F"));
  const base::string16 g_utf16(base::UTF8ToUTF16("G"));

  // Single new document containing "A" (three positions) and "D" (two).
  AddDocumentsAndCheck({{"doc3",
                         {{a_utf16,
                           {{kDefaultWeight, {"header", 1, 1}},
                            {kDefaultWeight / 2, {"body", 2, 1}},
                            {kDefaultWeight, {"header", 4, 1}}}},
                          {d_utf16,
                           {{kDefaultWeight, {"header", 3, 1}},
                            {kDefaultWeight / 2, {"body", 5, 1}}}}}}});

  EXPECT_EQ(GetDocLength()["doc3"], 5u);

  // Find "A"
  PostingList result = FindTerm(a_utf16);
  ASSERT_EQ(result.size(), 3u);
  EXPECT_EQ(result["doc3"][0].weight, kDefaultWeight);
  EXPECT_EQ(result["doc3"][0].position.start, 1u);
  EXPECT_EQ(result["doc3"][1].weight, kDefaultWeight / 2);
  EXPECT_EQ(result["doc3"][1].position.start, 2u);
  EXPECT_EQ(result["doc3"][2].weight, kDefaultWeight);
  EXPECT_EQ(result["doc3"][2].position.start, 4u);

  // Find "D"
  result = FindTerm(d_utf16);
  ASSERT_EQ(result.size(), 1u);
  EXPECT_EQ(result["doc3"][0].weight, kDefaultWeight);
  EXPECT_EQ(result["doc3"][0].position.start, 3u);
  EXPECT_EQ(result["doc3"][1].weight, kDefaultWeight / 2);
  EXPECT_EQ(result["doc3"][1].position.start, 5u);

  // Add multiple documents
  AddDocumentsAndCheck({{"doc4",
                         {{e_utf16,
                           {{kDefaultWeight, {"header", 1, 1}},
                            {kDefaultWeight / 2, {"body", 2, 1}},
                            {kDefaultWeight, {"header", 4, 1}}}},
                          {f_utf16,
                           {{kDefaultWeight, {"header", 3, 1}},
                            {kDefaultWeight / 2, {"body", 5, 1}}}}}},
                        {"doc5",
                         {{e_utf16,
                           {{kDefaultWeight, {"header", 1, 1}},
                            {kDefaultWeight / 2, {"body", 2, 1}},
                            {kDefaultWeight, {"header", 4, 1}}}},
                          {g_utf16,
                           {{kDefaultWeight, {"header", 3, 1}},
                            {kDefaultWeight / 2, {"body", 5, 1}}}}}}});

  // Find "E"
  result = FindTerm(e_utf16);
  ASSERT_EQ(result.size(), 2u);

  // Find "F"
  result = FindTerm(f_utf16);
  ASSERT_EQ(result.size(), 1u);
}
TEST_F(InvertedIndexTest, ReplaceDocumentTest) {
const base::string16 a_utf16(base::UTF8ToUTF16("A"));
const base::string16 d_utf16(base::UTF8ToUTF16("D"));
EXPECT_EQ(NumBuilt(), 0);
AddDocuments({{"doc1",
{{a_utf16,
{{kDefaultWeight, {"header", 1, 1}},
......@@ -282,7 +389,6 @@ TEST_F(InvertedIndexTest, ReplaceDocumentTest) {
{kDefaultWeight / 5, {"body", 5, 1}}}}}}});
EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
Wait();
EXPECT_EQ(NumBuilt(), 0);
EXPECT_TRUE(UpdateDocumentsCompleted());
EXPECT_EQ(GetDocLength()["doc1"], 5u);
......@@ -311,14 +417,51 @@ TEST_F(InvertedIndexTest, ReplaceDocumentTest) {
EXPECT_EQ(result["doc1"][1].position.start, 5u);
}
// Re-adds "doc1" with new content through AddDocumentsAndCheck() and checks
// that document lengths and the postings for "A", "B" and "D" match the
// replacement content.
TEST_F(InvertedIndexTest, ReplaceDocumentTestCallback) {
  const base::string16 a_utf16(base::UTF8ToUTF16("A"));
  const base::string16 b_utf16(base::UTF8ToUTF16("B"));
  const base::string16 d_utf16(base::UTF8ToUTF16("D"));

  AddDocumentsAndCheck({{"doc1",
                         {{a_utf16,
                           {{kDefaultWeight, {"header", 1, 1}},
                            {kDefaultWeight / 4, {"body", 2, 1}},
                            {kDefaultWeight / 2, {"header", 4, 1}}}},
                          {d_utf16,
                           {{kDefaultWeight / 3, {"header", 3, 1}},
                            {kDefaultWeight / 5, {"body", 5, 1}}}}}}});

  EXPECT_EQ(GetDocLength()["doc1"], 5u);
  EXPECT_EQ(GetDocLength()["doc2"], 6u);

  // Find "A"
  PostingList result = FindTerm(a_utf16);
  ASSERT_EQ(result.size(), 2u);
  EXPECT_EQ(result["doc1"][0].weight, kDefaultWeight);
  EXPECT_EQ(result["doc1"][0].position.start, 1u);
  EXPECT_EQ(result["doc1"][1].weight, kDefaultWeight / 4);
  EXPECT_EQ(result["doc1"][1].position.start, 2u);
  EXPECT_EQ(result["doc1"][2].weight, kDefaultWeight / 2);
  EXPECT_EQ(result["doc1"][2].position.start, 4u);

  // Find "B": the test expects no postings for it after the replacement.
  result = FindTerm(b_utf16);
  ASSERT_EQ(result.size(), 0u);

  // Find "D"
  result = FindTerm(d_utf16);
  ASSERT_EQ(result.size(), 1u);
  EXPECT_EQ(result["doc1"][0].weight, kDefaultWeight / 3);
  EXPECT_EQ(result["doc1"][0].position.start, 3u);
  EXPECT_EQ(result["doc1"][1].weight, kDefaultWeight / 5);
  EXPECT_EQ(result["doc1"][1].position.start, 5u);
}
TEST_F(InvertedIndexTest, RemoveDocumentTest) {
EXPECT_EQ(GetDictionary().size(), 3u);
EXPECT_EQ(GetDocLength().size(), 2u);
EXPECT_EQ(NumBuilt(), 0);
RemoveDocuments({"doc1"});
Wait();
EXPECT_EQ(NumBuilt(), 0);
EXPECT_TRUE(UpdateDocumentsCompleted());
EXPECT_EQ(GetDictionary().size(), 2u);
......@@ -352,19 +495,57 @@ TEST_F(InvertedIndexTest, RemoveDocumentTest) {
// Removes multiple documents
RemoveDocuments({"doc1", "doc2", "doc3"});
Wait();
EXPECT_EQ(NumBuilt(), 0);
EXPECT_TRUE(UpdateDocumentsCompleted());
EXPECT_EQ(GetDictionary().size(), 0u);
EXPECT_EQ(GetDocLength().size(), 0u);
}
// Removes "doc1" through RemoveDocumentsAndCheck() and checks the remaining
// dictionary, document lengths and postings, then issues a second removal
// that names ids which are not expected to be present.
TEST_F(InvertedIndexTest, RemoveDocumentTestCallback) {
  const base::string16 a_utf16(base::UTF8ToUTF16("A"));
  const base::string16 b_utf16(base::UTF8ToUTF16("B"));
  const base::string16 c_utf16(base::UTF8ToUTF16("C"));

  // State expected from the fixture before anything is removed.
  EXPECT_EQ(GetDictionary().size(), 3u);
  EXPECT_EQ(GetDocLength().size(), 2u);

  RemoveDocumentsAndCheck({"doc1"}, 1u);

  EXPECT_EQ(GetDictionary().size(), 2u);
  EXPECT_EQ(GetDocLength().size(), 1u);
  EXPECT_EQ(GetDocLength()["doc2"], 6u);

  // Find "A"
  PostingList result = FindTerm(a_utf16);
  ASSERT_EQ(result.size(), 1u);
  EXPECT_EQ(result["doc2"][0].weight, kDefaultWeight);
  EXPECT_EQ(result["doc2"][0].position.start, 2u);
  EXPECT_EQ(result["doc2"][1].weight, kDefaultWeight);
  EXPECT_EQ(result["doc2"][1].position.start, 4u);

  // Find "B"
  result = FindTerm(b_utf16);
  ASSERT_EQ(result.size(), 0u);

  // Find "C"
  result = FindTerm(c_utf16);
  ASSERT_EQ(result.size(), 1u);
  EXPECT_EQ(result["doc2"][0].weight, kDefaultWeight);
  EXPECT_EQ(result["doc2"][0].position.start, 1u);
  EXPECT_EQ(result["doc2"][1].weight, kDefaultWeight);
  EXPECT_EQ(result["doc2"][1].position.start, 3u);
  EXPECT_EQ(result["doc2"][2].weight, kDefaultWeight);
  EXPECT_EQ(result["doc2"][2].position.start, 5u);
  EXPECT_EQ(result["doc2"][3].weight, kDefaultWeight);
  EXPECT_EQ(result["doc2"][3].position.start, 7u);

  // Three ids are requested, but only "doc2" is actually removed: "doc1" was
  // removed above and "doc3" doesn't exist, so one deletion is expected.
  RemoveDocumentsAndCheck({"doc1", "doc2", "doc3"}, 1u);

  EXPECT_EQ(GetDictionary().size(), 0u);
  EXPECT_EQ(GetDocLength().size(), 0u);
}
TEST_F(InvertedIndexTest, TfidfFromZeroTest) {
EXPECT_EQ(GetTfidfCache().size(), 0u);
EXPECT_FALSE(IsInvertedIndexBuilt());
EXPECT_EQ(NumBuilt(), 0);
BuildInvertedIndex();
Wait();
EXPECT_EQ(NumBuilt(), 1);
EXPECT_TRUE(BuildIndexCompleted());
std::vector<TfidfResult> results = GetTfidf(base::UTF8ToUTF16("A"));
......@@ -389,7 +570,6 @@ TEST_F(InvertedIndexTest, UpdateIndexTest) {
EXPECT_EQ(GetTfidfCache().size(), 0u);
BuildInvertedIndex();
Wait();
EXPECT_EQ(NumBuilt(), 1);
EXPECT_TRUE(BuildIndexCompleted());
EXPECT_TRUE(IsInvertedIndexBuilt());
......@@ -406,13 +586,11 @@ TEST_F(InvertedIndexTest, UpdateIndexTest) {
{kDefaultWeight, {"body", 5, 1}}}}}}});
EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
Wait();
EXPECT_EQ(NumBuilt(), 1);
EXPECT_TRUE(UpdateDocumentsCompleted());
EXPECT_FALSE(IsInvertedIndexBuilt());
BuildInvertedIndex();
Wait();
EXPECT_EQ(NumBuilt(), 2);
EXPECT_TRUE(BuildIndexCompleted());
EXPECT_EQ(GetTfidfCache().size(), 3u);
......@@ -469,11 +647,106 @@ TEST_F(InvertedIndexTest, UpdateIndexTest) {
testing::UnorderedElementsAre(expected_tfidf_D_doc1));
}
// Builds the index, replaces "doc1" through AddDocumentsAndCheck(), rebuilds,
// and compares the TF-IDF scores returned for "A", "B", "C" and "D" against
// values computed with TfIdfScore() and rounded to two decimals.
TEST_F(InvertedIndexTest, UpdateIndexTestCallback) {
  const base::string16 a_utf16(base::UTF8ToUTF16("A"));
  const base::string16 b_utf16(base::UTF8ToUTF16("B"));
  const base::string16 c_utf16(base::UTF8ToUTF16("C"));
  const base::string16 d_utf16(base::UTF8ToUTF16("D"));

  EXPECT_EQ(GetTfidfCache().size(), 0u);
  BuildInvertedIndexAndCheck();
  EXPECT_EQ(GetTfidfCache().size(), 3u);

  // Replaces "doc1"
  AddDocumentsAndCheck({{"doc1",
                         {{a_utf16,
                           {{kDefaultWeight / 2, {"header", 1, 1}},
                            {kDefaultWeight / 4, {"body", 2, 1}},
                            {kDefaultWeight / 2, {"header", 4, 1}}}},
                          {d_utf16,
                           {{kDefaultWeight, {"header", 3, 1}},
                            {kDefaultWeight, {"body", 5, 1}}}}}}});

  // The test expects the index to report "not built" until it is rebuilt.
  EXPECT_FALSE(IsInvertedIndexBuilt());
  BuildInvertedIndexAndCheck();
  EXPECT_EQ(GetTfidfCache().size(), 3u);

  // Term "A": present in both "doc1" and "doc2".
  std::vector<TfidfResult> results = GetTfidf(a_utf16);
  const double expected_tfidf_A_doc1 =
      std::roundf(
          TfIdfScore(
              /*num_docs=*/2,
              /*num_docs_with_term=*/2,
              /*weighted_num_term_occurrence_in_doc=*/kDefaultWeight / 2 +
                  kDefaultWeight / 4 + kDefaultWeight / 2,
              /*doc_length=*/5) *
          100) /
      100;
  const double expected_tfidf_A_doc2 =
      std::roundf(
          TfIdfScore(/*num_docs=*/2,
                     /*num_docs_with_term=*/2,
                     /*weighted_num_term_occurrence_in_doc=*/kDefaultWeight * 2,
                     /*doc_length=*/6) *
          100) /
      100;
  EXPECT_THAT(GetScoresFromTfidfResult(results),
              testing::UnorderedElementsAre(expected_tfidf_A_doc1,
                                            expected_tfidf_A_doc2));

  // Term "B": no scores are expected.
  results = GetTfidf(b_utf16);
  EXPECT_THAT(GetScoresFromTfidfResult(results),
              testing::UnorderedElementsAre());

  // Term "C": a single score, for "doc2".
  results = GetTfidf(c_utf16);
  const double expected_tfidf_C_doc2 =
      std::roundf(
          TfIdfScore(/*num_docs=*/2,
                     /*num_docs_with_term=*/1,
                     /*weighted_num_term_occurrence_in_doc=*/kDefaultWeight * 4,
                     /*doc_length=*/6) *
          100) /
      100;
  EXPECT_THAT(GetScoresFromTfidfResult(results),
              testing::UnorderedElementsAre(expected_tfidf_C_doc2));

  // Term "D": a single score, for the replaced "doc1".
  results = GetTfidf(d_utf16);
  const double expected_tfidf_D_doc1 =
      std::roundf(
          TfIdfScore(/*num_docs=*/2,
                     /*num_docs_with_term=*/1,
                     /*weighted_num_term_occurrence_in_doc=*/kDefaultWeight * 2,
                     /*doc_length=*/5) *
          100) /
      100;
  EXPECT_THAT(GetScoresFromTfidfResult(results),
              testing::UnorderedElementsAre(expected_tfidf_D_doc1));
}
// Builds the index, then calls UpdateDocumentsAndCheck() with new content for
// "doc1" and empty content for "doc2" (one deletion expected). After a
// rebuild the TF-IDF cache is expected to hold two terms and "C" to have no
// results.
TEST_F(InvertedIndexTest, UpdateDocumentsTest) {
  const base::string16 a_utf16(base::UTF8ToUTF16("A"));
  const base::string16 c_utf16(base::UTF8ToUTF16("C"));
  const base::string16 d_utf16(base::UTF8ToUTF16("D"));

  EXPECT_EQ(GetTfidfCache().size(), 0u);
  BuildInvertedIndexAndCheck();
  EXPECT_EQ(GetTfidfCache().size(), 3u);

  // Replaces "doc1" and remove "doc2"
  UpdateDocumentsAndCheck({{"doc1",
                            {{a_utf16,
                              {{kDefaultWeight / 2, {"header", 1, 1}},
                               {kDefaultWeight / 4, {"body", 2, 1}},
                               {kDefaultWeight / 2, {"header", 4, 1}}}},
                             {d_utf16,
                              {{kDefaultWeight, {"header", 3, 1}},
                               {kDefaultWeight, {"body", 5, 1}}}}}},
                           {"doc2", {}}},
                          1u);

  BuildInvertedIndexAndCheck();
  EXPECT_EQ(GetTfidfCache().size(), 2u);

  std::vector<TfidfResult> results = GetTfidf(c_utf16);
  EXPECT_EQ(results.size(), 0u);
}
TEST_F(InvertedIndexTest, ClearInvertedIndexTest) {
EXPECT_EQ(GetTfidfCache().size(), 0u);
BuildInvertedIndex();
Wait();
EXPECT_EQ(NumBuilt(), 1);
EXPECT_TRUE(BuildIndexCompleted());
EXPECT_TRUE(IsInvertedIndexBuilt());
......@@ -500,6 +773,31 @@ TEST_F(InvertedIndexTest, ClearInvertedIndexTest) {
EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
}
// Builds the index, adds "doc3", clears the index through
// ClearInvertedIndexAndCheck(), and checks that every piece of index state
// inspected here is empty afterwards.
TEST_F(InvertedIndexTest, ClearInvertedIndexTestCallback) {
  const base::string16 a_utf16(base::UTF8ToUTF16("A"));
  const base::string16 d_utf16(base::UTF8ToUTF16("D"));

  EXPECT_EQ(GetTfidfCache().size(), 0u);
  BuildInvertedIndexAndCheck();
  EXPECT_EQ(GetTfidfCache().size(), 3u);

  // Add a document and then clear the index. Each *AndCheck helper calls
  // Wait() and asserts on its callback before returning, so the two
  // operations are issued one after the other here.
  AddDocumentsAndCheck({{"doc3",
                         {{a_utf16,
                           {{kDefaultWeight, {"header", 1, 1}},
                            {kDefaultWeight / 2, {"body", 2, 1}},
                            {kDefaultWeight, {"header", 4, 1}}}},
                          {d_utf16,
                           {{kDefaultWeight, {"header", 3, 1}},
                            {kDefaultWeight / 2, {"body", 5, 1}}}}}}});
  ClearInvertedIndexAndCheck();

  EXPECT_EQ(GetTfidfCache().size(), 0u);
  EXPECT_EQ(GetTermToBeUpdated().size(), 0u);
  EXPECT_EQ(GetDocLength().size(), 0u);
  EXPECT_EQ(GetDictionary().size(), 0u);
  EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
}
TEST_F(InvertedIndexTest, FindMatchingDocumentsApproximatelyTest) {
const double prefix_threshold = 1.0;
const double block_threshold = 1.0;
......@@ -508,10 +806,8 @@ TEST_F(InvertedIndexTest, FindMatchingDocumentsApproximatelyTest) {
const base::string16 c_utf16(base::UTF8ToUTF16("C"));
const base::string16 d_utf16(base::UTF8ToUTF16("D"));
EXPECT_EQ(NumBuilt(), 0);
// Replace doc1, same occurrences, just different weights.
AddDocuments({{"doc1",
AddDocumentsAndCheck({{"doc1",
{{a_utf16,
{{kDefaultWeight, {"header", 1, 1}},
{kDefaultWeight, {"header", 3, 1}},
......@@ -522,15 +818,6 @@ TEST_F(InvertedIndexTest, FindMatchingDocumentsApproximatelyTest) {
{kDefaultWeight / 2, {"header", 6, 1}},
{kDefaultWeight / 3, {"body", 4, 1}},
{kDefaultWeight / 3, {"body", 5, 1}}}}}}});
EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
Wait();
EXPECT_EQ(NumBuilt(), 0);
EXPECT_TRUE(UpdateDocumentsCompleted());
BuildInvertedIndex();
Wait();
EXPECT_EQ(NumBuilt(), 1);
EXPECT_TRUE(BuildIndexCompleted());
{
// "A" exists in "doc1" and "doc2". The score of each document is simply A's
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment