Commit 0b2e0bd8 authored by Thanh Nguyen, committed by Commit Bot

[local-search-service] Implement InvertedIndexSearch with callback

This CL implements InvertedIndexSearch functions that are inherited
from new Index interface.

Design doc: go/lss-sandboxing
Implementation plan: go/lss-sandboxing-impl

Bug: 1137560
Change-Id: I3a1439f9b43d9d52bbb6819308535aa28ef7fe3d
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2531078
Reviewed-by: Jia Meng <jiameng@chromium.org>
Commit-Queue: Thanh Nguyen <thanhdng@chromium.org>
Cr-Commit-Position: refs/heads/master@{#827660}
parent 1ade577e
...@@ -85,8 +85,8 @@ TfidfCache BuildTfidf(uint32_t num_docs_from_last_update, ...@@ -85,8 +85,8 @@ TfidfCache BuildTfidf(uint32_t num_docs_from_last_update,
} }
// Removes a document from document state variables given it's ID. Don't do // Removes a document from document state variables given it's ID. Don't do
// anything if the ID doesn't exist. // anything if the ID doesn't exist. Return true if the document is removed.
void RemoveDocumentIfExist(const std::string& document_id, bool RemoveDocumentIfExist(const std::string& document_id,
DocLength* doc_length, DocLength* doc_length,
Dictionary* dictionary, Dictionary* dictionary,
TermSet* terms_to_be_updated) { TermSet* terms_to_be_updated) {
...@@ -94,13 +94,15 @@ void RemoveDocumentIfExist(const std::string& document_id, ...@@ -94,13 +94,15 @@ void RemoveDocumentIfExist(const std::string& document_id,
CHECK(dictionary); CHECK(dictionary);
CHECK(terms_to_be_updated); CHECK(terms_to_be_updated);
DCHECK(!content::BrowserThread::CurrentlyOn(content::BrowserThread::UI)); DCHECK(!content::BrowserThread::CurrentlyOn(content::BrowserThread::UI));
bool document_removed = false;
if (doc_length->find(document_id) == doc_length->end()) if (doc_length->find(document_id) == doc_length->end())
return; return document_removed;
doc_length->erase(document_id); doc_length->erase(document_id);
for (auto it = dictionary->begin(); it != dictionary->end();) { for (auto it = dictionary->begin(); it != dictionary->end();) {
if (it->second.find(document_id) != it->second.end()) { if (it->second.find(document_id) != it->second.end()) {
terms_to_be_updated->insert(it->first); terms_to_be_updated->insert(it->first);
it->second.erase(document_id); it->second.erase(document_id);
document_removed = true;
} }
// Removes term from the dictionary if its posting list is empty. // Removes term from the dictionary if its posting list is empty.
...@@ -110,33 +112,42 @@ void RemoveDocumentIfExist(const std::string& document_id, ...@@ -110,33 +112,42 @@ void RemoveDocumentIfExist(const std::string& document_id,
it++; it++;
} }
} }
return document_removed;
} }
// Given list of documents to update and document state variables, returns new // Given list of documents to update and document state variables, returns new
// document state variables. // document state variables and number of deleted documents.
DocumentStateVariables UpdateDocuments(DocumentToUpdate&& documents_to_update, std::pair<DocumentStateVariables, uint32_t> UpdateDocumentStateVariables(
DocumentToUpdate&& documents_to_update,
const DocLength& doc_length, const DocLength& doc_length,
Dictionary&& dictionary, Dictionary&& dictionary,
TermSet&& terms_to_be_updated) { TermSet&& terms_to_be_updated) {
DCHECK(!::content::BrowserThread::CurrentlyOn(::content::BrowserThread::UI)); DCHECK(!::content::BrowserThread::CurrentlyOn(::content::BrowserThread::UI));
DocLength new_doc_length(doc_length); DocLength new_doc_length(doc_length);
uint32_t num_deleted = 0u;
for (const auto& document : documents_to_update) { for (const auto& document : documents_to_update) {
const std::string document_id(document.first); const std::string document_id(document.first);
RemoveDocumentIfExist(document_id, &new_doc_length, &dictionary, bool is_deleted = RemoveDocumentIfExist(document_id, &new_doc_length,
&terms_to_be_updated); &dictionary, &terms_to_be_updated);
// Update the document if necessary. // Update the document if necessary.
if (!document.second.empty()) { if (!document.second.empty()) {
// If document content is not empty, it is being updated but not
// deleted.
is_deleted = false;
for (const auto& token : document.second) { for (const auto& token : document.second) {
dictionary[token.content][document_id] = token.positions; dictionary[token.content][document_id] = token.positions;
new_doc_length[document_id] += token.positions.size(); new_doc_length[document_id] += token.positions.size();
terms_to_be_updated.insert(token.content); terms_to_be_updated.insert(token.content);
} }
} }
num_deleted += (is_deleted) ? 1 : 0;
} }
return std::make_tuple(std::move(new_doc_length), std::move(dictionary), return std::make_pair(
std::move(terms_to_be_updated)); std::make_tuple(std::move(new_doc_length), std::move(dictionary),
std::move(terms_to_be_updated)),
num_deleted);
} }
// Given the index variables, clear all the data. // Given the index variables, clear all the data.
...@@ -159,13 +170,12 @@ std::pair<DocumentStateVariables, TfidfCache> ClearData( ...@@ -159,13 +170,12 @@ std::pair<DocumentStateVariables, TfidfCache> ClearData(
} }
} // namespace } // namespace
InvertedIndex::InvertedIndex() = default; InvertedIndex::InvertedIndex() {
InvertedIndex::~InvertedIndex() = default; task_runner_ = base::ThreadPool::CreateSequencedTaskRunner(
{base::TaskPriority::BEST_EFFORT, base::MayBlock(),
void InvertedIndex::RegisterIndexBuiltCallback( base::TaskShutdownBehavior::CONTINUE_ON_SHUTDOWN});
base::RepeatingCallback<void()> on_index_built) {
on_index_built_ = std::move(on_index_built);
} }
InvertedIndex::~InvertedIndex() = default;
PostingList InvertedIndex::FindTerm(const base::string16& term) const { PostingList InvertedIndex::FindTerm(const base::string16& term) const {
if (dictionary_.find(term) != dictionary_.end()) if (dictionary_.find(term) != dictionary_.end())
...@@ -238,6 +248,20 @@ void InvertedIndex::AddDocuments(const DocumentToUpdate& documents) { ...@@ -238,6 +248,20 @@ void InvertedIndex::AddDocuments(const DocumentToUpdate& documents) {
InvertedIndexController(); InvertedIndexController();
} }
void InvertedIndex::AddDocuments(const DocumentToUpdate& documents,
base::OnceCallback<void()> callback) {
if (documents.empty())
return;
task_runner_->PostTaskAndReplyWithResult(
FROM_HERE,
base::BindOnce(&UpdateDocumentStateVariables, documents,
std::move(doc_length_), std::move(dictionary_),
std::move(terms_to_be_updated_)),
base::BindOnce(&InvertedIndex::OnAddDocumentsComplete,
weak_ptr_factory_.GetWeakPtr(), std::move(callback)));
}
uint32_t InvertedIndex::RemoveDocuments( uint32_t InvertedIndex::RemoveDocuments(
const std::vector<std::string>& document_ids) { const std::vector<std::string>& document_ids) {
if (document_ids.empty()) if (document_ids.empty())
...@@ -254,6 +278,35 @@ uint32_t InvertedIndex::RemoveDocuments( ...@@ -254,6 +278,35 @@ uint32_t InvertedIndex::RemoveDocuments(
return document_ids.size(); return document_ids.size();
} }
// Removes the documents named by |document_ids| and then rebuilds the TF-IDF
// cache.
//
// Each id is turned into a document with an empty token list, which
// UpdateDocumentStateVariables() treats as a removal request. The work runs on
// |task_runner_|; OnUpdateDocumentsComplete() receives the new state together
// with the number of documents that were removed and forwards that count to
// |callback| once the TF-IDF cache has been rebuilt.
void InvertedIndex::RemoveDocuments(
    const std::vector<std::string>& document_ids,
    base::OnceCallback<void(uint32_t)> callback) {
  DocumentToUpdate documents;
  for (const auto& id : document_ids) {
    documents.push_back({id, std::vector<Token>()});
  }
  // |documents| is a local that is not used again, so move it into the bound
  // task rather than deep-copying every id.
  task_runner_->PostTaskAndReplyWithResult(
      FROM_HERE,
      base::BindOnce(&UpdateDocumentStateVariables, std::move(documents),
                     std::move(doc_length_), std::move(dictionary_),
                     std::move(terms_to_be_updated_)),
      base::BindOnce(&InvertedIndex::OnUpdateDocumentsComplete,
                     weak_ptr_factory_.GetWeakPtr(), std::move(callback)));
}
// Applies |documents| (additions, content updates and removals) to the index
// and then rebuilds the TF-IDF cache. The document state is moved into a task
// on |task_runner_|; OnUpdateDocumentsComplete() picks up the result and
// passes the number of removed documents to |callback|.
void InvertedIndex::UpdateDocuments(
    const DocumentToUpdate& documents,
    base::OnceCallback<void(uint32_t)> callback) {
  // Work item: computes the new document state away from the calling sequence.
  auto update_task = base::BindOnce(
      &UpdateDocumentStateVariables, documents, std::move(doc_length_),
      std::move(dictionary_), std::move(terms_to_be_updated_));

  // Reply: bound to a weak pointer so it is skipped if |this| is gone.
  auto on_updated =
      base::BindOnce(&InvertedIndex::OnUpdateDocumentsComplete,
                     weak_ptr_factory_.GetWeakPtr(), std::move(callback));

  task_runner_->PostTaskAndReplyWithResult(FROM_HERE, std::move(update_task),
                                           std::move(on_updated));
}
std::vector<TfidfResult> InvertedIndex::GetTfidf( std::vector<TfidfResult> InvertedIndex::GetTfidf(
const base::string16& term) const { const base::string16& term) const {
if (tfidf_cache_.find(term) != tfidf_cache_.end()) { if (tfidf_cache_.find(term) != tfidf_cache_.end()) {
...@@ -269,12 +322,34 @@ void InvertedIndex::BuildInvertedIndex() { ...@@ -269,12 +322,34 @@ void InvertedIndex::BuildInvertedIndex() {
InvertedIndexController(); InvertedIndexController();
} }
// Rebuilds the TF-IDF cache on |task_runner_| and runs |callback| afterwards
// (via OnBuildTfidfComplete()). |doc_length_|, |dictionary_| and
// |tfidf_cache_| are copied into the task, while |terms_to_be_updated_| is
// moved into it.
void InvertedIndex::BuildInvertedIndex(base::OnceCallback<void()> callback) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);

  auto build_task = base::BindOnce(
      &BuildTfidf, num_docs_from_last_update_, doc_length_, dictionary_,
      std::move(terms_to_be_updated_), tfidf_cache_);

  auto on_built =
      base::BindOnce(&InvertedIndex::OnBuildTfidfComplete,
                     weak_ptr_factory_.GetWeakPtr(), std::move(callback));

  task_runner_->PostTaskAndReplyWithResult(FROM_HERE, std::move(build_task),
                                           std::move(on_built));
}
void InvertedIndex::ClearInvertedIndex() { void InvertedIndex::ClearInvertedIndex() {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
request_to_clear_index_ = true; request_to_clear_index_ = true;
InvertedIndexController(); InvertedIndexController();
} }
// Clears all index data on |task_runner_| and runs |callback| once the
// cleared state has been stored back by OnDataCleared(). Everything except
// |doc_length_| (which is copied) is moved into the posted task.
void InvertedIndex::ClearInvertedIndex(base::OnceCallback<void()> callback) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);

  auto clear_task = base::BindOnce(
      &ClearData, std::move(documents_to_update_), doc_length_,
      std::move(dictionary_), std::move(terms_to_be_updated_),
      std::move(tfidf_cache_));

  auto on_cleared =
      base::BindOnce(&InvertedIndex::OnDataCleared,
                     weak_ptr_factory_.GetWeakPtr(), std::move(callback));

  task_runner_->PostTaskAndReplyWithResult(FROM_HERE, std::move(clear_task),
                                           std::move(on_cleared));
}
void InvertedIndex::InvertedIndexController() { void InvertedIndex::InvertedIndexController() {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
// TODO(thanhdng): A clear-index call should ideally cancel all other update // TODO(thanhdng): A clear-index call should ideally cancel all other update
...@@ -285,12 +360,12 @@ void InvertedIndex::InvertedIndexController() { ...@@ -285,12 +360,12 @@ void InvertedIndex::InvertedIndexController() {
if (request_to_clear_index_) { if (request_to_clear_index_) {
update_in_progress_ = true; update_in_progress_ = true;
request_to_clear_index_ = false; request_to_clear_index_ = false;
base::ThreadPool::PostTaskAndReplyWithResult( task_runner_->PostTaskAndReplyWithResult(
FROM_HERE, {base::MayBlock(), base::TaskPriority::BEST_EFFORT}, FROM_HERE,
base::BindOnce(&ClearData, std::move(documents_to_update_), doc_length_, base::BindOnce(&ClearData, std::move(documents_to_update_), doc_length_,
std::move(dictionary_), std::move(terms_to_be_updated_), std::move(dictionary_), std::move(terms_to_be_updated_),
std::move(tfidf_cache_)), std::move(tfidf_cache_)),
base::BindOnce(&InvertedIndex::OnDataCleared, base::BindOnce(&InvertedIndex::OnDataClearedSync,
weak_ptr_factory_.GetWeakPtr())); weak_ptr_factory_.GetWeakPtr()));
return; return;
} }
...@@ -300,35 +375,33 @@ void InvertedIndex::InvertedIndexController() { ...@@ -300,35 +375,33 @@ void InvertedIndex::InvertedIndexController() {
update_in_progress_ = true; update_in_progress_ = true;
index_building_in_progress_ = true; index_building_in_progress_ = true;
request_to_build_index_ = false; request_to_build_index_ = false;
base::ThreadPool::PostTaskAndReplyWithResult( task_runner_->PostTaskAndReplyWithResult(
FROM_HERE, {base::MayBlock(), base::TaskPriority::BEST_EFFORT}, FROM_HERE,
base::BindOnce(&BuildTfidf, num_docs_from_last_update_, doc_length_, base::BindOnce(&BuildTfidf, num_docs_from_last_update_, doc_length_,
dictionary_, std::move(terms_to_be_updated_), dictionary_, std::move(terms_to_be_updated_),
tfidf_cache_), tfidf_cache_),
base::BindOnce(&InvertedIndex::OnBuildTfidfComplete, base::BindOnce(&InvertedIndex::OnBuildTfidfCompleteSync,
weak_ptr_factory_.GetWeakPtr())); weak_ptr_factory_.GetWeakPtr()));
} else if (terms_to_be_updated_.empty()) { } else if (terms_to_be_updated_.empty()) {
// If there's no more work to do and all changed terms have been used to // If there's no more work to do and all changed terms have been used to
// update the index, then mark index is built and make the callback. // update the index, then mark index is built.
is_index_built_ = true; is_index_built_ = true;
if (!on_index_built_.is_null())
on_index_built_.Run();
} }
} else { } else {
update_in_progress_ = true; update_in_progress_ = true;
base::ThreadPool::PostTaskAndReplyWithResult( task_runner_->PostTaskAndReplyWithResult(
FROM_HERE, {base::MayBlock(), base::TaskPriority::BEST_EFFORT}, FROM_HERE,
// TODO(jiameng): |doc_length_| can be moved since it's not used for // TODO(jiameng): |doc_length_| can be moved since it's not used for
// document existence checking any more. // document existence checking any more.
base::BindOnce(&UpdateDocuments, std::move(documents_to_update_), base::BindOnce(&UpdateDocumentStateVariables,
doc_length_, std::move(dictionary_), std::move(documents_to_update_), doc_length_,
std::move(terms_to_be_updated_)), std::move(dictionary_), std::move(terms_to_be_updated_)),
base::BindOnce(&InvertedIndex::OnUpdateDocumentsComplete, base::BindOnce(&InvertedIndex::OnUpdateDocumentsCompleteSync,
weak_ptr_factory_.GetWeakPtr())); weak_ptr_factory_.GetWeakPtr()));
} }
} }
void InvertedIndex::OnBuildTfidfComplete(TfidfCache&& new_cache) { void InvertedIndex::OnBuildTfidfCompleteSync(TfidfCache&& new_cache) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
num_docs_from_last_update_ = doc_length_.size(); num_docs_from_last_update_ = doc_length_.size();
tfidf_cache_ = std::move(new_cache); tfidf_cache_ = std::move(new_cache);
...@@ -338,18 +411,66 @@ void InvertedIndex::OnBuildTfidfComplete(TfidfCache&& new_cache) { ...@@ -338,18 +411,66 @@ void InvertedIndex::OnBuildTfidfComplete(TfidfCache&& new_cache) {
InvertedIndexController(); InvertedIndexController();
} }
void InvertedIndex::OnUpdateDocumentsComplete( void InvertedIndex::OnBuildTfidfComplete(base::OnceCallback<void()> callback,
DocumentStateVariables&& document_state_variables) { TfidfCache&& new_cache) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
doc_length_ = std::move(std::get<0>(document_state_variables)); num_docs_from_last_update_ = doc_length_.size();
dictionary_ = std::move(std::get<1>(document_state_variables)); tfidf_cache_ = std::move(new_cache);
terms_to_be_updated_ = std::move(std::get<2>(document_state_variables));
std::move(callback).Run();
}
void InvertedIndex::OnUpdateDocumentsCompleteSync(
std::pair<DocumentStateVariables, uint32_t>&&
document_state_variables_and_num_deleted) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
doc_length_ =
std::move(std::get<0>(document_state_variables_and_num_deleted.first));
dictionary_ =
std::move(std::get<1>(document_state_variables_and_num_deleted.first));
terms_to_be_updated_ =
std::move(std::get<2>(document_state_variables_and_num_deleted.first));
update_in_progress_ = false; update_in_progress_ = false;
InvertedIndexController(); InvertedIndexController();
} }
void InvertedIndex::OnDataCleared( void InvertedIndex::OnUpdateDocumentsComplete(
base::OnceCallback<void(uint32_t)> callback,
std::pair<DocumentStateVariables, uint32_t>&&
document_state_variables_and_num_deleted) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
doc_length_ =
std::move(std::get<0>(document_state_variables_and_num_deleted.first));
dictionary_ =
std::move(std::get<1>(document_state_variables_and_num_deleted.first));
terms_to_be_updated_ =
std::move(std::get<2>(document_state_variables_and_num_deleted.first));
BuildInvertedIndex(base::BindOnce(
[](base::OnceCallback<void(uint32_t)> callback, uint32_t num_deleted) {
std::move(callback).Run(num_deleted);
},
std::move(callback), document_state_variables_and_num_deleted.second));
}
// Reply for the callback flavour of AddDocuments(). Stores the updated
// document state back into the members and then kicks off a TF-IDF rebuild
// that will run |callback| when done. The deleted-document count is expected
// to be zero here and is only checked, not used.
void InvertedIndex::OnAddDocumentsComplete(
    base::OnceCallback<void()> callback,
    std::pair<DocumentStateVariables, uint32_t>&&
        document_state_variables_and_num_deleted) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
  DCHECK_EQ(document_state_variables_and_num_deleted.second, 0u);

  // Tuple layout: <0> doc lengths, <1> dictionary, <2> terms to be updated.
  DocumentStateVariables& new_state =
      document_state_variables_and_num_deleted.first;
  doc_length_ = std::move(std::get<0>(new_state));
  dictionary_ = std::move(std::get<1>(new_state));
  terms_to_be_updated_ = std::move(std::get<2>(new_state));

  BuildInvertedIndex(std::move(callback));
}
void InvertedIndex::OnDataClearedSync(
std::pair<DocumentStateVariables, TfidfCache>&& inverted_index_data) { std::pair<DocumentStateVariables, TfidfCache>&& inverted_index_data) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
doc_length_ = std::move(std::get<0>(inverted_index_data.first)); doc_length_ = std::move(std::get<0>(inverted_index_data.first));
...@@ -362,5 +483,18 @@ void InvertedIndex::OnDataCleared( ...@@ -362,5 +483,18 @@ void InvertedIndex::OnDataCleared(
InvertedIndexController(); InvertedIndexController();
} }
// Reply for the callback flavour of ClearInvertedIndex(). Stores the cleared
// document state and TF-IDF cache, resets |num_docs_from_last_update_| and
// then runs |callback|.
void InvertedIndex::OnDataCleared(
    base::OnceCallback<void()> callback,
    std::pair<DocumentStateVariables, TfidfCache>&& inverted_index_data) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);

  // Tuple layout: <0> doc lengths, <1> dictionary, <2> terms to be updated.
  DocumentStateVariables& cleared_state = inverted_index_data.first;
  doc_length_ = std::move(std::get<0>(cleared_state));
  dictionary_ = std::move(std::get<1>(cleared_state));
  terms_to_be_updated_ = std::move(std::get<2>(cleared_state));

  tfidf_cache_ = std::move(inverted_index_data.second);
  num_docs_from_last_update_ = 0;

  std::move(callback).Run();
}
} // namespace local_search_service } // namespace local_search_service
} // namespace chromeos } // namespace chromeos
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "base/callback.h" #include "base/callback.h"
#include "base/gtest_prod_util.h" #include "base/gtest_prod_util.h"
#include "base/memory/weak_ptr.h" #include "base/memory/weak_ptr.h"
#include "base/sequenced_task_runner.h"
#include "base/strings/string16.h" #include "base/strings/string16.h"
#include "chromeos/components/local_search_service/shared_structs.h" #include "chromeos/components/local_search_service/shared_structs.h"
...@@ -60,10 +61,6 @@ class InvertedIndex { ...@@ -60,10 +61,6 @@ class InvertedIndex {
InvertedIndex(const InvertedIndex&) = delete; InvertedIndex(const InvertedIndex&) = delete;
InvertedIndex& operator=(const InvertedIndex&) = delete; InvertedIndex& operator=(const InvertedIndex&) = delete;
// |on_index_built| will be called after the index is built.
void RegisterIndexBuiltCallback(
base::RepeatingCallback<void()> on_index_built);
// Returns document ID and positions of a term. // Returns document ID and positions of a term.
PostingList FindTerm(const base::string16& term) const; PostingList FindTerm(const base::string16& term) const;
...@@ -79,6 +76,10 @@ class InvertedIndex { ...@@ -79,6 +76,10 @@ class InvertedIndex {
// unique (have unique content). This function doesn't modify any cache. It // unique (have unique content). This function doesn't modify any cache. It
// only adds documents and tokens to the index. // only adds documents and tokens to the index.
void AddDocuments(const DocumentToUpdate& documents); void AddDocuments(const DocumentToUpdate& documents);
// Similar to the above function, but it will build TF-IDF cache after adding
// documents.
void AddDocuments(const DocumentToUpdate& documents,
base::OnceCallback<void()> callback);
// Removes documents from the inverted index. Do nothing if the document id is // Removes documents from the inverted index. Do nothing if the document id is
// not in the index. // not in the index.
...@@ -87,6 +88,22 @@ class InvertedIndex { ...@@ -87,6 +88,22 @@ class InvertedIndex {
// As other operations may be running on a separate thread, this function // As other operations may be running on a separate thread, this function
// returns size of |document_ids| and not actually deleted documents. // returns size of |document_ids| and not actually deleted documents.
uint32_t RemoveDocuments(const std::vector<std::string>& document_ids); uint32_t RemoveDocuments(const std::vector<std::string>& document_ids);
// Similar to the above function, but it will build TF-IDF cache after
// removing documents.
void RemoveDocuments(const std::vector<std::string>& document_ids,
base::OnceCallback<void(uint32_t)> callback);
// Updates documents in the inverted index. It combines two functions:
// AddDocuments and RemoveDocuments. The number of documents removed
// (documents that have empty content) is reported through |callback|.
// - If a document ID is not in the index, add the document to the index.
// - If a document ID is in the index and its new content isn't empty,
// update its content in the index.
// - If a document ID is in the index and its content is empty, remove it
// from the index.
// It will build the TF-IDF cache after updating the documents.
void UpdateDocuments(const DocumentToUpdate& documents,
base::OnceCallback<void(uint32_t)> callback);
// Gets TF-IDF scores for a term. This function returns the TF-IDF score from // Gets TF-IDF scores for a term. This function returns the TF-IDF score from
// the cache. // the cache.
...@@ -96,9 +113,11 @@ class InvertedIndex { ...@@ -96,9 +113,11 @@ class InvertedIndex {
// Builds the inverted index. // Builds the inverted index.
void BuildInvertedIndex(); void BuildInvertedIndex();
void BuildInvertedIndex(base::OnceCallback<void()> callback);
// Clears all the data from the inverted index. // Clears all the data from the inverted index.
void ClearInvertedIndex(); void ClearInvertedIndex();
void ClearInvertedIndex(base::OnceCallback<void()> callback);
// Checks if the inverted index has been built: returns |true| if the inverted // Checks if the inverted index has been built: returns |true| if the inverted
// index is up to date, returns |false| if there are some modified document // index is up to date, returns |false| if there are some modified document
...@@ -118,17 +137,26 @@ class InvertedIndex { ...@@ -118,17 +137,26 @@ class InvertedIndex {
void InvertedIndexController(); void InvertedIndexController();
// Called on the main thread after BuildTfidf is completed. // Called on the main thread after BuildTfidf is completed.
void OnBuildTfidfComplete(TfidfCache&& new_cache); void OnBuildTfidfCompleteSync(TfidfCache&& new_cache);
void OnBuildTfidfComplete(base::OnceCallback<void()> callback,
// Called on the main thread after UpdateDocuments is completed. TfidfCache&& new_cache);
void OnUpdateDocumentsComplete( // Called on the main thread after UpdateDocumentsStateVariables is completed.
DocumentStateVariables&& document_state_variables); void OnUpdateDocumentsCompleteSync(
std::pair<DocumentStateVariables, uint32_t>&&
document_state_variables_and_num_deleted);
void OnUpdateDocumentsComplete(base::OnceCallback<void(uint32_t)> callback,
std::pair<DocumentStateVariables, uint32_t>&&
document_state_variables_and_num_deleted);
void OnAddDocumentsComplete(base::OnceCallback<void()> callback,
std::pair<DocumentStateVariables, uint32_t>&&
document_state_variables_and_num_deleted);
void OnDataClearedSync(
std::pair<DocumentStateVariables, TfidfCache>&& inverted_index_data);
void OnDataCleared( void OnDataCleared(
base::OnceCallback<void()> callback,
std::pair<DocumentStateVariables, TfidfCache>&& inverted_index_data); std::pair<DocumentStateVariables, TfidfCache>&& inverted_index_data);
base::RepeatingCallback<void()> on_index_built_;
// |is_index_built_| is only true if index's TF-IDF is consistent with the // |is_index_built_| is only true if index's TF-IDF is consistent with the
// documents in the index. This means as soon as documents are modified // documents in the index. This means as soon as documents are modified
// (added, updated or deleted), |is_index_built_| will be set to false. While // (added, updated or deleted), |is_index_built_| will be set to false. While
...@@ -155,6 +183,7 @@ class InvertedIndex { ...@@ -155,6 +183,7 @@ class InvertedIndex {
bool index_building_in_progress_ = false; bool index_building_in_progress_ = false;
bool request_to_clear_index_ = false; bool request_to_clear_index_ = false;
scoped_refptr<base::SequencedTaskRunner> task_runner_;
SEQUENCE_CHECKER(sequence_checker_); SEQUENCE_CHECKER(sequence_checker_);
base::WeakPtrFactory<InvertedIndex> weak_ptr_factory_{this}; base::WeakPtrFactory<InvertedIndex> weak_ptr_factory_{this};
......
...@@ -51,18 +51,38 @@ ExtractedContent ExtractDocumentsContent(const std::vector<Data>& data) { ...@@ -51,18 +51,38 @@ ExtractedContent ExtractDocumentsContent(const std::vector<Data>& data) {
ExtractedContent documents; ExtractedContent documents;
for (const Data& d : data) { for (const Data& d : data) {
const std::vector<Token> document_tokens = ExtractDocumentTokens(d); const std::vector<Token> document_tokens = ExtractDocumentTokens(d);
DCHECK(!document_tokens.empty());
documents.push_back({d.id, document_tokens}); documents.push_back({d.id, document_tokens});
} }
return documents; return documents;
} }
// Splits |query| into its set of distinct tokens. The tokenization mode is
// chosen from the configured locale: camel-case mode for non-Latin locales,
// word mode otherwise.
std::unordered_set<base::string16> GetTokenizedQuery(
    const base::string16& query) {
  // TODO(jiameng): actual input query may not be the same as default locale.
  // Need another way to determine actual language of the query.
  const bool is_non_latin =
      IsNonLatinLocale(base::i18n::GetConfiguredLocale());
  const TokenizedString tokenized_query(
      query, is_non_latin ? TokenizedString::Mode::kCamelCase
                          : TokenizedString::Mode::kWords);

  // TODO(jiameng): we are not removing stopword because they shouldn't exist
  // in the index. However, for performance reason, it may be worth to be
  // removed.
  const auto& query_tokens = tokenized_query.tokens();
  return std::unordered_set<base::string16>(query_tokens.begin(),
                                            query_tokens.end());
}
} // namespace } // namespace
InvertedIndexSearch::InvertedIndexSearch(IndexId index_id, InvertedIndexSearch::InvertedIndexSearch(IndexId index_id,
PrefService* local_state) PrefService* local_state)
: IndexSync(index_id, Backend::kInvertedIndex, local_state), : IndexSync(index_id, Backend::kInvertedIndex, local_state),
Index(index_id, Backend::kInvertedIndex),
inverted_index_(std::make_unique<InvertedIndex>()), inverted_index_(std::make_unique<InvertedIndex>()),
blocking_task_runner_(base::ThreadPool::CreateSequencedTaskRunner( blocking_task_runner_(base::ThreadPool::CreateSequencedTaskRunner(
{base::TaskPriority::BEST_EFFORT, base::MayBlock(), {base::TaskPriority::BEST_EFFORT, base::MayBlock(),
...@@ -85,7 +105,7 @@ void InvertedIndexSearch::AddOrUpdateSync( ...@@ -85,7 +105,7 @@ void InvertedIndexSearch::AddOrUpdateSync(
base::PostTaskAndReplyWithResult( base::PostTaskAndReplyWithResult(
blocking_task_runner_.get(), FROM_HERE, blocking_task_runner_.get(), FROM_HERE,
base::BindOnce(&ExtractDocumentsContent, data), base::BindOnce(&ExtractDocumentsContent, data),
base::BindOnce(&InvertedIndexSearch::FinalizeAddOrUpdate, base::BindOnce(&InvertedIndexSearch::FinalizeAddOrUpdateSync,
weak_ptr_factory_.GetWeakPtr())); weak_ptr_factory_.GetWeakPtr()));
} }
...@@ -107,7 +127,7 @@ uint32_t InvertedIndexSearch::DeleteSync(const std::vector<std::string>& ids) { ...@@ -107,7 +127,7 @@ uint32_t InvertedIndexSearch::DeleteSync(const std::vector<std::string>& ids) {
++num_queued_index_updates_; ++num_queued_index_updates_;
blocking_task_runner_->PostTaskAndReply( blocking_task_runner_->PostTaskAndReply(
FROM_HERE, base::DoNothing(), FROM_HERE, base::DoNothing(),
base::BindOnce(&InvertedIndexSearch::FinalizeDelete, base::BindOnce(&InvertedIndexSearch::FinalizeDeleteSync,
weak_ptr_factory_.GetWeakPtr(), ids)); weak_ptr_factory_.GetWeakPtr(), ids));
return ids.size(); return ids.size();
...@@ -135,24 +155,9 @@ ResponseStatus InvertedIndexSearch::FindSync(const base::string16& query, ...@@ -135,24 +155,9 @@ ResponseStatus InvertedIndexSearch::FindSync(const base::string16& query,
return status; return status;
} }
// TODO(jiameng): actual input query may not be the same as default locale.
// Need another way to determine actual language of the query.
const TokenizedString::Mode mode =
IsNonLatinLocale(base::i18n::GetConfiguredLocale())
? TokenizedString::Mode::kCamelCase
: TokenizedString::Mode::kWords;
const TokenizedString tokenized_query(query, mode);
std::unordered_set<base::string16> tokens;
for (const auto& token : tokenized_query.tokens()) {
// TODO(jiameng): we are not removing stopword because they shouldn't exist
// in the index. However, for performance reason, it may be worth to be
// removed.
tokens.insert(token);
}
*results = inverted_index_->FindMatchingDocumentsApproximately( *results = inverted_index_->FindMatchingDocumentsApproximately(
tokens, search_params_.prefix_threshold, search_params_.fuzzy_threshold); GetTokenizedQuery(query), search_params_.prefix_threshold,
search_params_.fuzzy_threshold);
if (results->size() > max_results && max_results > 0u) if (results->size() > max_results && max_results > 0u)
results->resize(max_results); results->resize(max_results);
...@@ -163,6 +168,71 @@ ResponseStatus InvertedIndexSearch::FindSync(const base::string16& query, ...@@ -163,6 +168,71 @@ ResponseStatus InvertedIndexSearch::FindSync(const base::string16& query,
return status; return status;
} }
// Reports the number of documents currently held by the inverted index by
// running |callback| synchronously with that count.
void InvertedIndexSearch::GetSize(GetSizeCallback callback) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
  const auto num_documents = inverted_index_->NumberDocuments();
  std::move(callback).Run(num_documents);
}
// Extracts the token content of |data| on |blocking_task_runner_| and then
// hands the extracted documents to FinalizeAddOrUpdate() together with
// |callback|. |data| must not be empty.
void InvertedIndexSearch::AddOrUpdate(const std::vector<Data>& data,
                                      AddOrUpdateCallback callback) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
  DCHECK(!data.empty());

  auto extract_task = base::BindOnce(&ExtractDocumentsContent, data);
  auto on_extracted =
      base::BindOnce(&InvertedIndexSearch::FinalizeAddOrUpdate,
                     weak_ptr_factory_.GetWeakPtr(), std::move(callback));

  base::PostTaskAndReplyWithResult(blocking_task_runner_.get(), FROM_HERE,
                                   std::move(extract_task),
                                   std::move(on_extracted));
}
// Schedules removal of the documents in |ids|. A no-op task is posted to
// |blocking_task_runner_| and FinalizeDelete() runs as its reply, receiving
// |callback| and a copy of |ids|. |ids| must not be empty.
void InvertedIndexSearch::Delete(const std::vector<std::string>& ids,
                                 DeleteCallback callback) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
  DCHECK(!ids.empty());

  auto on_reply =
      base::BindOnce(&InvertedIndexSearch::FinalizeDelete,
                     weak_ptr_factory_.GetWeakPtr(), std::move(callback), ids);

  blocking_task_runner_->PostTaskAndReply(FROM_HERE, base::DoNothing(),
                                          std::move(on_reply));
}
// Extracts the token content of |data| on |blocking_task_runner_| and then
// hands the extracted documents to FinalizeUpdateDocuments() together with
// |callback|. |data| must not be empty.
void InvertedIndexSearch::UpdateDocuments(const std::vector<Data>& data,
                                          UpdateDocumentsCallback callback) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
  DCHECK(!data.empty());

  auto extract_task = base::BindOnce(&ExtractDocumentsContent, data);
  auto on_extracted =
      base::BindOnce(&InvertedIndexSearch::FinalizeUpdateDocuments,
                     weak_ptr_factory_.GetWeakPtr(), std::move(callback));

  base::PostTaskAndReplyWithResult(blocking_task_runner_.get(), FROM_HERE,
                                   std::move(extract_task),
                                   std::move(on_extracted));
}
// Searches the index for documents matching |query| and reports the outcome
// through |callback|, which is always run synchronously:
//   - kEmptyQuery with no results if |query| is empty;
//   - kEmptyIndex with no results if the index holds no documents;
//   - kSuccess with the matching documents otherwise.
// When |max_results| is non-zero the result list is truncated to at most that
// many entries; zero means no limit.
void InvertedIndexSearch::Find(const base::string16& query,
                               uint32_t max_results,
                               FindCallback callback) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
  if (query.empty()) {
    std::move(callback).Run(ResponseStatus::kEmptyQuery, base::nullopt);
    return;
  }
  if (inverted_index_->NumberDocuments() == 0u) {
    std::move(callback).Run(ResponseStatus::kEmptyIndex, base::nullopt);
    return;
  }

  std::vector<Result> results =
      inverted_index_->FindMatchingDocumentsApproximately(
          GetTokenizedQuery(query), search_params_.prefix_threshold,
          search_params_.fuzzy_threshold);

  if (results.size() > max_results && max_results > 0u)
    results.resize(max_results);

  // |results| is not used again, so move it into the callback argument
  // instead of copying the whole vector.
  std::move(callback).Run(ResponseStatus::kSuccess, std::move(results));
}
// Clears the whole index and runs |callback| once the index data has
// actually been cleared.
//
// The callback-taking ClearInvertedIndex() overload is used so that
// |callback| runs from the clear operation's completion handler. Calling the
// no-argument overload and then running |callback| right away would signal
// completion while the clear is still only scheduled.
void InvertedIndexSearch::ClearIndex(ClearIndexCallback callback) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
  inverted_index_->ClearInvertedIndex(std::move(callback));
}
std::vector<std::pair<std::string, uint32_t>> std::vector<std::pair<std::string, uint32_t>>
InvertedIndexSearch::FindTermForTesting(const base::string16& term) const { InvertedIndexSearch::FindTermForTesting(const base::string16& term) const {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
...@@ -175,7 +245,7 @@ InvertedIndexSearch::FindTermForTesting(const base::string16& term) const { ...@@ -175,7 +245,7 @@ InvertedIndexSearch::FindTermForTesting(const base::string16& term) const {
return doc_with_freq; return doc_with_freq;
} }
void InvertedIndexSearch::FinalizeAddOrUpdate( void InvertedIndexSearch::FinalizeAddOrUpdateSync(
const ExtractedContent& documents) { const ExtractedContent& documents) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
--num_queued_index_updates_; --num_queued_index_updates_;
...@@ -183,13 +253,34 @@ void InvertedIndexSearch::FinalizeAddOrUpdate( ...@@ -183,13 +253,34 @@ void InvertedIndexSearch::FinalizeAddOrUpdate(
MaybeBuildInvertedIndex(); MaybeBuildInvertedIndex();
} }
void InvertedIndexSearch::FinalizeDelete(const std::vector<std::string>& ids) { void InvertedIndexSearch::FinalizeDeleteSync(
const std::vector<std::string>& ids) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
--num_queued_index_updates_; --num_queued_index_updates_;
inverted_index_->RemoveDocuments(ids); inverted_index_->RemoveDocuments(ids);
MaybeBuildInvertedIndex(); MaybeBuildInvertedIndex();
} }
// Forwards extracted |documents| to the inverted index, which takes ownership
// of |callback|.
void InvertedIndexSearch::FinalizeAddOrUpdate(
    AddOrUpdateCallback callback,
    const ExtractedContent& documents) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);

  inverted_index_->AddDocuments(documents, std::move(callback));
}
// Asks the inverted index to remove the documents named by |ids|, handing
// |callback| over to it.
void InvertedIndexSearch::FinalizeDelete(DeleteCallback callback,
                                         const std::vector<std::string>& ids) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);

  inverted_index_->RemoveDocuments(ids, std::move(callback));
}
// Reply half of UpdateDocuments(): forwards the extracted |documents| and
// |callback| to the inverted index.
void InvertedIndexSearch::FinalizeUpdateDocuments(
    UpdateDocumentsCallback callback,
    const ExtractedContent& documents) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);

  inverted_index_->UpdateDocuments(documents, std::move(callback));
}
void InvertedIndexSearch::MaybeBuildInvertedIndex() { void InvertedIndexSearch::MaybeBuildInvertedIndex() {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
if (num_queued_index_updates_ == 0) { if (num_queued_index_updates_ == 0) {
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "base/sequence_checker.h" #include "base/sequence_checker.h"
#include "base/sequenced_task_runner.h" #include "base/sequenced_task_runner.h"
#include "base/strings/string16.h" #include "base/strings/string16.h"
#include "chromeos/components/local_search_service/index.h"
#include "chromeos/components/local_search_service/index_sync.h" #include "chromeos/components/local_search_service/index_sync.h"
#include "chromeos/components/local_search_service/shared_structs.h" #include "chromeos/components/local_search_service/shared_structs.h"
...@@ -24,7 +25,7 @@ class InvertedIndex; ...@@ -24,7 +25,7 @@ class InvertedIndex;
// An implementation of Index. // An implementation of Index.
// A search via the inverted index backend with TF-IDF based document ranking. // A search via the inverted index backend with TF-IDF based document ranking.
class InvertedIndexSearch : public IndexSync { class InvertedIndexSearch : public IndexSync, public Index {
public: public:
InvertedIndexSearch(IndexId index_id, PrefService* local_state); InvertedIndexSearch(IndexId index_id, PrefService* local_state);
~InvertedIndexSearch() override; ~InvertedIndexSearch() override;
...@@ -32,7 +33,7 @@ class InvertedIndexSearch : public IndexSync { ...@@ -32,7 +33,7 @@ class InvertedIndexSearch : public IndexSync {
InvertedIndexSearch(const InvertedIndexSearch&) = delete; InvertedIndexSearch(const InvertedIndexSearch&) = delete;
InvertedIndexSearch& operator=(const InvertedIndexSearch&) = delete; InvertedIndexSearch& operator=(const InvertedIndexSearch&) = delete;
// Index overrides: // IndexSync overrides:
uint64_t GetSizeSync() override; uint64_t GetSizeSync() override;
// TODO(jiameng): we always build the index after documents are updated. May // TODO(jiameng): we always build the index after documents are updated. May
// revise this strategy if there is a different use case. // revise this strategy if there is a different use case.
...@@ -52,19 +53,47 @@ class InvertedIndexSearch : public IndexSync { ...@@ -52,19 +53,47 @@ class InvertedIndexSearch : public IndexSync {
uint32_t max_results, uint32_t max_results,
std::vector<Result>* results) override; std::vector<Result>* results) override;
// Index overrides:
// GetSize is only accurate once the index has finished updating.
void GetSize(GetSizeCallback callback) override;
void AddOrUpdate(const std::vector<Data>& data,
AddOrUpdateCallback callback) override;
void Delete(const std::vector<std::string>& ids,
DeleteCallback callback) override;
void UpdateDocuments(const std::vector<Data>& data,
UpdateDocumentsCallback callback) override;
void Find(const base::string16& query,
uint32_t max_results,
FindCallback callback) override;
void ClearIndex(ClearIndexCallback callback) override;
// Returns document id and number of occurrences of |term|. // Returns document id and number of occurrences of |term|.
// Document ids are sorted in alphabetical order. // Document ids are sorted in alphabetical order.
std::vector<std::pair<std::string, uint32_t>> FindTermForTesting( std::vector<std::pair<std::string, uint32_t>> FindTermForTesting(
const base::string16& term) const; const base::string16& term) const;
private: private:
void FinalizeAddOrUpdateSync(
const std::vector<std::pair<std::string, std::vector<Token>>>& documents);
// FinalizeDeleteSync is called if Delete cannot be immediately done because
// there's another index updating operation before it, i.e.
// |num_queued_index_updates_| is not zero.
void FinalizeDeleteSync(const std::vector<std::string>& ids);
void FinalizeAddOrUpdate( void FinalizeAddOrUpdate(
AddOrUpdateCallback callback,
const std::vector<std::pair<std::string, std::vector<Token>>>& documents); const std::vector<std::pair<std::string, std::vector<Token>>>& documents);
// FinalizeDelete is called if Delete cannot be immediately done because // FinalizeDelete is called if Delete cannot be immediately done because
// there's another index updating operation before it, i.e. // there's another index updating operation before it, i.e.
// |num_queued_index_updates_| is not zero. // |num_queued_index_updates_| is not zero.
void FinalizeDelete(const std::vector<std::string>& ids); void FinalizeDelete(DeleteCallback callback,
const std::vector<std::string>& ids);
void FinalizeUpdateDocuments(
UpdateDocumentsCallback callback,
const std::vector<std::pair<std::string, std::vector<Token>>>& documents);
// In order to reduce unnecessary inverted index building, we only build the // In order to reduce unnecessary inverted index building, we only build the
// index if there's no upcoming modification to the index's document list. // index if there's no upcoming modification to the index's document list.
......
...@@ -13,6 +13,8 @@ namespace chromeos { ...@@ -13,6 +13,8 @@ namespace chromeos {
namespace local_search_service { namespace local_search_service {
namespace { namespace {
// This is (data-id, content-ids).
using ResultWithIds = std::pair<std::string, std::vector<std::string>>;
// (content-id, content). // (content-id, content).
using ContentWithId = std::pair<std::string, std::string>; using ContentWithId = std::pair<std::string, std::string>;
...@@ -23,6 +25,102 @@ using WeightedContentWithId = std::tuple<std::string, std::string, float>; ...@@ -23,6 +25,102 @@ using WeightedContentWithId = std::tuple<std::string, std::string, float>;
// (document-id, number-of-occurrences). // (document-id, number-of-occurrences).
using TermOccurrence = std::vector<std::pair<std::string, uint32_t>>; using TermOccurrence = std::vector<std::pair<std::string, uint32_t>>;
void GetSizeAndCheckResults(InvertedIndexSearch* index,
base::test::TaskEnvironment* task_environment,
uint32_t expectd_num_items) {
DCHECK(index);
bool callback_done = false;
uint32_t num_items = 0;
index->GetSize(base::BindOnce(
[](bool* callback_done, uint32_t* num_items, uint64_t size) {
*callback_done = true;
*num_items = size;
},
&callback_done, &num_items));
task_environment->RunUntilIdle();
ASSERT_TRUE(callback_done);
EXPECT_EQ(num_items, expectd_num_items);
}
// Test helper: issues AddOrUpdate(|data|) on |index|, runs
// |task_environment| until idle, and asserts that the completion callback was
// invoked.
void AddOrUpdate(InvertedIndexSearch* index,
                 base::test::TaskEnvironment* task_environment,
                 const std::vector<Data>& data) {
  DCHECK(index);

  bool finished = false;
  auto on_done =
      base::BindOnce([](bool* flag) { *flag = true; }, &finished);
  index->AddOrUpdate(data, std::move(on_done));

  task_environment->RunUntilIdle();
  ASSERT_TRUE(finished);
}
// Test helper: deletes |ids| from |index|, runs |task_environment| until
// idle, then checks that the callback ran and reported |expect_num_deleted|.
void Delete(InvertedIndexSearch* index,
            base::test::TaskEnvironment* task_environment,
            const std::vector<std::string>& ids,
            uint32_t expect_num_deleted) {
  DCHECK(index);

  bool finished = false;
  uint32_t reported_deleted = 0u;
  auto on_done = base::BindOnce(
      [](bool* flag, uint32_t* out, uint32_t deleted_count) {
        *flag = true;
        *out = deleted_count;
      },
      &finished, &reported_deleted);
  index->Delete(ids, std::move(on_done));

  task_environment->RunUntilIdle();
  ASSERT_TRUE(finished);
  EXPECT_EQ(reported_deleted, expect_num_deleted);
}
// Test helper: calls UpdateDocuments(|data|) on |index|, runs
// |task_environment| until idle, then checks that the callback ran and
// reported |expect_num_deleted|.
void UpdateDocuments(InvertedIndexSearch* index,
                     base::test::TaskEnvironment* task_environment,
                     const std::vector<Data>& data,
                     uint32_t expect_num_deleted) {
  DCHECK(index);

  bool finished = false;
  uint32_t reported_deleted = 0u;
  auto on_done = base::BindOnce(
      [](bool* flag, uint32_t* out, uint32_t deleted_count) {
        *flag = true;
        *out = deleted_count;
      },
      &finished, &reported_deleted);
  index->UpdateDocuments(data, std::move(on_done));

  task_environment->RunUntilIdle();
  ASSERT_TRUE(finished);
  EXPECT_EQ(reported_deleted, expect_num_deleted);
}
std::vector<Result> Find(InvertedIndexSearch* index,
base::test::TaskEnvironment* task_environment,
std::string query,
int32_t max_results,
ResponseStatus expected_status) {
DCHECK(index);
bool callback_done = false;
ResponseStatus status;
std::vector<Result> results;
index->Find(
base::UTF8ToUTF16(query), max_results,
base::BindOnce(
[](bool* callback_done, ResponseStatus* status,
std::vector<Result>* results, ResponseStatus status_callback,
const base::Optional<std::vector<Result>>& results_callback) {
*callback_done = true;
*status = status_callback;
if (results_callback.has_value())
*results = results_callback.value();
},
&callback_done, &status, &results));
task_environment->RunUntilIdle();
EXPECT_TRUE(callback_done);
EXPECT_EQ(status, expected_status);
return results;
}
} // namespace } // namespace
class InvertedIndexSearchTest : public testing::Test { class InvertedIndexSearchTest : public testing::Test {
...@@ -40,7 +138,7 @@ class InvertedIndexSearchTest : public testing::Test { ...@@ -40,7 +138,7 @@ class InvertedIndexSearchTest : public testing::Test {
base::test::TaskEnvironment::ThreadPoolExecutionMode::QUEUED}; base::test::TaskEnvironment::ThreadPoolExecutionMode::QUEUED};
}; };
TEST_F(InvertedIndexSearchTest, Add) { TEST_F(InvertedIndexSearchTest, AddSync) {
const std::map<std::string, std::vector<ContentWithId>> data_to_register = { const std::map<std::string, std::vector<ContentWithId>> data_to_register = {
{"id1", {"id1",
{{"cid_1", "This is a help wi-fi article"}, {{"cid_1", "This is a help wi-fi article"},
...@@ -96,7 +194,7 @@ TEST_F(InvertedIndexSearchTest, Add) { ...@@ -96,7 +194,7 @@ TEST_F(InvertedIndexSearchTest, Add) {
} }
} }
TEST_F(InvertedIndexSearchTest, Update) { TEST_F(InvertedIndexSearchTest, UpdateSync) {
const std::map<std::string, std::vector<ContentWithId>> data_to_register = { const std::map<std::string, std::vector<ContentWithId>> data_to_register = {
{"id1", {"id1",
{{"cid_1", "This is a help wi-fi article"}, {{"cid_1", "This is a help wi-fi article"},
...@@ -188,7 +286,7 @@ TEST_F(InvertedIndexSearchTest, ClearIndexSync) { ...@@ -188,7 +286,7 @@ TEST_F(InvertedIndexSearchTest, ClearIndexSync) {
EXPECT_EQ(search_->GetSizeSync(), 0u); EXPECT_EQ(search_->GetSizeSync(), 0u);
} }
TEST_F(InvertedIndexSearchTest, Find) { TEST_F(InvertedIndexSearchTest, FindSync) {
const std::map<std::string, std::vector<WeightedContentWithId>> const std::map<std::string, std::vector<WeightedContentWithId>>
data_to_register = {{"id1", data_to_register = {{"id1",
{{"cid_1", "This is a help wi-fi article", 0.8}, {{"cid_1", "This is a help wi-fi article", 0.8},
...@@ -299,7 +397,7 @@ TEST_F(InvertedIndexSearchTest, Find) { ...@@ -299,7 +397,7 @@ TEST_F(InvertedIndexSearchTest, Find) {
} }
} }
TEST_F(InvertedIndexSearchTest, SequenceOfDeleteSyncs) { TEST_F(InvertedIndexSearchTest, SequenceOfDeletesSync) {
const std::map<std::string, std::vector<ContentWithId>> data_to_register = { const std::map<std::string, std::vector<ContentWithId>> data_to_register = {
{"id1", {"id1",
{{"cid_1", "This is a help wi-fi article"}, {{"cid_1", "This is a help wi-fi article"},
...@@ -326,5 +424,312 @@ TEST_F(InvertedIndexSearchTest, SequenceOfDeleteSyncs) { ...@@ -326,5 +424,312 @@ TEST_F(InvertedIndexSearchTest, SequenceOfDeleteSyncs) {
EXPECT_EQ(search_->GetSizeSync(), 0u); EXPECT_EQ(search_->GetSizeSync(), 0u);
} }
// Adds two documents through the callback API and verifies per-term
// occurrence counts, normalization and stopword removal.
TEST_F(InvertedIndexSearchTest, Add) {
  const std::map<std::string, std::vector<ContentWithId>> data_to_register = {
      {"id1",
       {{"cid_1", "This is a help wi-fi article"},
        {"cid_2", "Another help help wi-fi"}}},
      {"id2", {{"cid_3", "help article on wi-fi"}}}};
  const std::vector<Data> data = CreateTestData(data_to_register);
  AddOrUpdate(search_.get(), &task_environment_, data);
  GetSizeAndCheckResults(search_.get(), &task_environment_, 2u);

  {
    // "network" does not exist in the index.
    const TermOccurrence doc_with_freq =
        search_->FindTermForTesting(base::UTF8ToUTF16("network"));
    EXPECT_TRUE(doc_with_freq.empty());
  }

  {
    // "help" exists in the index.
    const TermOccurrence doc_with_freq =
        search_->FindTermForTesting(base::UTF8ToUTF16("help"));
    // ASSERT: the element accesses below are only valid for this size.
    ASSERT_EQ(doc_with_freq.size(), 2u);
    EXPECT_EQ(doc_with_freq[0].first, "id1");
    EXPECT_EQ(doc_with_freq[0].second, 3u);
    EXPECT_EQ(doc_with_freq[1].first, "id2");
    EXPECT_EQ(doc_with_freq[1].second, 1u);
  }

  {
    // "wifi" exists in the index but "wi-fi" doesn't because of normalization.
    TermOccurrence doc_with_freq =
        search_->FindTermForTesting(base::UTF8ToUTF16("wifi"));
    ASSERT_EQ(doc_with_freq.size(), 2u);
    EXPECT_EQ(doc_with_freq[0].first, "id1");
    EXPECT_EQ(doc_with_freq[0].second, 2u);
    EXPECT_EQ(doc_with_freq[1].first, "id2");
    EXPECT_EQ(doc_with_freq[1].second, 1u);

    doc_with_freq = search_->FindTermForTesting(base::UTF8ToUTF16("wi-fi"));
    EXPECT_TRUE(doc_with_freq.empty());

    // "WiFi" doesn't exist because the index stores normalized words.
    doc_with_freq = search_->FindTermForTesting(base::UTF8ToUTF16("WiFi"));
    EXPECT_TRUE(doc_with_freq.empty());
  }

  {
    // "this" does not exist in the index because it's a stopword.
    const TermOccurrence doc_with_freq =
        search_->FindTermForTesting(base::UTF8ToUTF16("this"));
    EXPECT_TRUE(doc_with_freq.empty());
  }
}
// Registers two documents, then re-registers "id1" with new content and adds
// "id3"; verifies that term occurrences reflect the replaced content.
TEST_F(InvertedIndexSearchTest, Update) {
  const std::map<std::string, std::vector<ContentWithId>> data_to_register = {
      {"id1",
       {{"cid_1", "This is a help wi-fi article"},
        {"cid_2", "Another help help wi-fi"}}},
      {"id2", {{"cid_3", "help article on wi-fi"}}}};
  const std::vector<Data> data = CreateTestData(data_to_register);
  AddOrUpdate(search_.get(), &task_environment_, data);
  GetSizeAndCheckResults(search_.get(), &task_environment_, 2u);

  const std::map<std::string, std::vector<ContentWithId>> data_to_update = {
      {"id1",
       {{"cid_1", "This is a help bluetooth article"},
        {"cid_2", "Google Playstore Google Music"}}},
      {"id3", {{"cid_3", "Google Map"}}}};
  const std::vector<Data> updated_data = CreateTestData(data_to_update);
  AddOrUpdate(search_.get(), &task_environment_, updated_data);
  GetSizeAndCheckResults(search_.get(), &task_environment_, 3u);

  {
    const TermOccurrence doc_with_freq =
        search_->FindTermForTesting(base::UTF8ToUTF16("bluetooth"));
    // ASSERT: the element accesses below are only valid for this size.
    ASSERT_EQ(doc_with_freq.size(), 1u);
    EXPECT_EQ(doc_with_freq[0].first, "id1");
    EXPECT_EQ(doc_with_freq[0].second, 1u);
  }

  {
    // "id1" no longer mentions wi-fi, so only "id2" remains.
    const TermOccurrence doc_with_freq =
        search_->FindTermForTesting(base::UTF8ToUTF16("wifi"));
    ASSERT_EQ(doc_with_freq.size(), 1u);
    EXPECT_EQ(doc_with_freq[0].first, "id2");
    EXPECT_EQ(doc_with_freq[0].second, 1u);
  }

  {
    const TermOccurrence doc_with_freq =
        search_->FindTermForTesting(base::UTF8ToUTF16("google"));
    ASSERT_EQ(doc_with_freq.size(), 2u);
    EXPECT_EQ(doc_with_freq[0].first, "id1");
    EXPECT_EQ(doc_with_freq[0].second, 2u);
    EXPECT_EQ(doc_with_freq[1].first, "id3");
    EXPECT_EQ(doc_with_freq[1].second, 1u);
  }
}
// Registers two documents, deletes "id1" and verifies that only "id2" is
// still indexed.
TEST_F(InvertedIndexSearchTest, Delete) {
  const std::map<std::string, std::vector<ContentWithId>> data_to_register = {
      {"id1",
       {{"cid_1", "This is a help wi-fi article"},
        {"cid_2", "Another help help wi-fi"}}},
      {"id2", {{"cid_3", "help article on wi-fi"}}}};
  const std::vector<Data> data = CreateTestData(data_to_register);
  AddOrUpdate(search_.get(), &task_environment_, data);
  GetSizeAndCheckResults(search_.get(), &task_environment_, 2u);

  Delete(search_.get(), &task_environment_, {"id1"}, 1u);
  // One of the two registered documents should remain.
  GetSizeAndCheckResults(search_.get(), &task_environment_, 1u);

  {
    const TermOccurrence doc_with_freq =
        search_->FindTermForTesting(base::UTF8ToUTF16("wifi"));
    // ASSERT: the element access below is only valid for a non-empty result.
    ASSERT_EQ(doc_with_freq.size(), 1u);
    EXPECT_EQ(doc_with_freq[0].first, "id2");
    EXPECT_EQ(doc_with_freq[0].second, 1u);
  }
}
// Populates the index, clears it through the callback API and verifies that
// the callback ran and the reported size dropped to zero.
TEST_F(InvertedIndexSearchTest, ClearIndex) {
  const std::map<std::string, std::vector<ContentWithId>> initial_docs = {
      {"id1",
       {{"cid_1", "This is a help wi-fi article"},
        {"cid_2", "Another help help wi-fi"}}},
      {"id2", {{"cid_3", "help article on wi-fi"}}}};
  const std::vector<Data> docs = CreateTestData(initial_docs);

  AddOrUpdate(search_.get(), &task_environment_, docs);
  GetSizeAndCheckResults(search_.get(), &task_environment_, 2u);

  bool cleared = false;
  auto on_cleared =
      base::BindOnce([](bool* flag) { *flag = true; }, &cleared);
  search_->ClearIndex(std::move(on_cleared));
  Wait();
  ASSERT_TRUE(cleared);

  GetSizeAndCheckResults(search_.get(), &task_environment_, 0u);
}
// Exercises Find() through the callback API: empty index, empty query, no
// match, single match, multiple matches, and the |max_results| cap.
TEST_F(InvertedIndexSearchTest, FindTest) {
  const std::map<std::string, std::vector<WeightedContentWithId>>
      data_to_register = {{"id1",
                           {{"cid_1", "This is a help wi-fi article", 0.8},
                            {"cid_2", "Another help help wi-fi", 0.6}}},
                          {"id2", {{"cid_3", "help article on wi-fi", 0.6}}}};
  const std::vector<Data> data = CreateTestData(data_to_register);

  // Nothing has been added to the index.
  std::vector<Result> results =
      Find(search_.get(), &task_environment_, "network",
           /*max_results=*/10, ResponseStatus::kEmptyIndex);
  EXPECT_TRUE(results.empty());

  // Data is added and then deleted from index, making the index empty.
  AddOrUpdate(search_.get(), &task_environment_, data);
  GetSizeAndCheckResults(search_.get(), &task_environment_, 2u);
  Delete(search_.get(), &task_environment_, {"id1", "id2"}, 2u);
  GetSizeAndCheckResults(search_.get(), &task_environment_, 0u);

  results = Find(search_.get(), &task_environment_, "network",
                 /*max_results=*/10, ResponseStatus::kEmptyIndex);
  EXPECT_TRUE(results.empty());

  // Index is populated again, but query is empty.
  AddOrUpdate(search_.get(), &task_environment_, data);
  GetSizeAndCheckResults(search_.get(), &task_environment_, 2u);
  results = Find(search_.get(), &task_environment_, "", /*max_results=*/10,
                 ResponseStatus::kEmptyQuery);
  EXPECT_TRUE(results.empty());

  // No document is found for a given query.
  results = Find(search_.get(), &task_environment_, "networkstuff",
                 /*max_results=*/10, ResponseStatus::kSuccess);
  EXPECT_TRUE(results.empty());

  {
    // A document is found.
    // Query's case is normalized.
    results = Find(search_.get(), &task_environment_, "ANOTHER networkstuff",
                   /*max_results=*/10, ResponseStatus::kSuccess);
    // ASSERT: results[0] is accessed below.
    ASSERT_EQ(results.size(), 1u);

    // "another" only exists in "id1".
    const float expected_score =
        TfIdfScore(/*num_docs=*/2,
                   /*num_docs_with_term=*/1,
                   /*weighted_num_term_occurrence_in_doc=*/0.6,
                   /*doc_length=*/7);
    CheckResult(results[0], "id1", expected_score,
                /*expected_number_positions=*/1);
  }

  {
    // Two documents are found.
    results = Find(search_.get(), &task_environment_, "another help",
                   /*max_results=*/10, ResponseStatus::kSuccess);
    // ASSERT: results[0] and results[1] are accessed below.
    ASSERT_EQ(results.size(), 2u);

    // "id1" score comes from both "another" and "help".
    const float expected_score_id1 =
        TfIdfScore(/*num_docs=*/2,
                   /*num_docs_with_term=*/1,
                   /*weighted_num_term_occurrence_in_doc=*/0.6,
                   /*doc_length=*/7) +
        TfIdfScore(/*num_docs=*/2,
                   /*num_docs_with_term=*/2,
                   /*weighted_num_term_occurrence_in_doc=*/0.8 + 0.6 * 2,
                   /*doc_length=*/7);
    // "id2" score comes from "help".
    const float expected_score_id2 =
        TfIdfScore(/*num_docs=*/2,
                   /*num_docs_with_term=*/2,
                   /*weighted_num_term_occurrence_in_doc=*/0.6,
                   /*doc_length=*/3);

    EXPECT_GE(expected_score_id1, expected_score_id2);
    CheckResult(results[0], "id1", expected_score_id1,
                /*expected_number_positions=*/4);
    CheckResult(results[1], "id2", expected_score_id2,
                /*expected_number_positions=*/1);
  }

  {
    // Same as above, but max number of results is set to 1.
    results = Find(search_.get(), &task_environment_, "another help",
                   /*max_results=*/1, ResponseStatus::kSuccess);
    // ASSERT: results[0] is accessed below.
    ASSERT_EQ(results.size(), 1u);
    EXPECT_EQ(results[0].id, "id1");
  }

  {
    // Same as above, but set max_results to 0, meaning no max.
    results = Find(search_.get(), &task_environment_, "another help",
                   /*max_results=*/0, ResponseStatus::kSuccess);
    EXPECT_EQ(results.size(), 2u);
  }
}
// Adds documents in two batches (three distinct ids in total), then removes
// them with two consecutive Delete calls and expects an empty index.
TEST_F(InvertedIndexSearchTest, SequenceOfDeletes) {
  // First batch: "id1" and "id2".
  const std::map<std::string, std::vector<ContentWithId>> first_batch = {
      {"id1",
       {{"cid_1", "This is a help wi-fi article"},
        {"cid_2", "Another help help wi-fi"}}},
      {"id2", {{"cid_3", "help article on wi-fi"}}}};
  const std::vector<Data> first_docs = CreateTestData(first_batch);
  AddOrUpdate(search_.get(), &task_environment_, first_docs);

  // Second batch: replaces "id1" and introduces "id3".
  const std::map<std::string, std::vector<ContentWithId>> second_batch = {
      {"id1",
       {{"cid_1", "This is a help bluetooth article"},
        {"cid_2", "Google Playstore Google Music"}}},
      {"id3", {{"cid_3", "Google Map"}}}};
  const std::vector<Data> second_docs = CreateTestData(second_batch);
  AddOrUpdate(search_.get(), &task_environment_, second_docs);
  GetSizeAndCheckResults(search_.get(), &task_environment_, 3u);

  Delete(search_.get(), &task_environment_, {"id1"}, 1u);
  Delete(search_.get(), &task_environment_, {"id2", "id3"}, 2u);
  GetSizeAndCheckResults(search_.get(), &task_environment_, 0u);
}
// Registers two documents, then calls UpdateDocuments with new content for
// "id1", empty content for "id2" and a new "id3". Expects one deletion to be
// reported, two documents to remain, and "id1" to carry its new content.
TEST_F(InvertedIndexSearchTest, UpdateDocumentsTest) {
  const std::map<std::string, std::vector<ContentWithId>> data_to_register = {
      {"id1",
       {{"cid_1", "This is a help wi-fi article"},
        {"cid_2", "Another help help wi-fi"}}},
      {"id2", {{"cid_3", "help article on wi-fi"}}}};
  const std::vector<Data> data = CreateTestData(data_to_register);
  AddOrUpdate(search_.get(), &task_environment_, data);
  GetSizeAndCheckResults(search_.get(), &task_environment_, 2u);

  const std::map<std::string, std::vector<ContentWithId>> data_to_update = {
      {"id1",
       {{"cid_1", "This is a help bluetooth article"},
        {"cid_2", "Google Playstore Google Music"}}},
      {"id2", {}},
      {"id3", {{"cid_3", "Google Map"}}}};
  const std::vector<Data> updated_data = CreateTestData(data_to_update);
  UpdateDocuments(search_.get(), &task_environment_, updated_data, 1u);
  GetSizeAndCheckResults(search_.get(), &task_environment_, 2u);

  // Check if "id1" has been updated.
  std::vector<Result> results =
      Find(search_.get(), &task_environment_, "bluetooth",
           /*max_results=*/10, ResponseStatus::kSuccess);
  // ASSERT: results[0] is accessed below.
  ASSERT_EQ(results.size(), 1u);

  // "bluetooth" only exists in "id1".
  const float expected_score =
      TfIdfScore(/*num_docs=*/2,
                 /*num_docs_with_term=*/1,
                 /*weighted_num_term_occurrence_in_doc=*/1,
                 /*doc_length=*/7);
  CheckResult(results[0], "id1", expected_score,
              /*expected_number_positions=*/1);
}
} // namespace local_search_service } // namespace local_search_service
} // namespace chromeos } // namespace chromeos
...@@ -38,11 +38,6 @@ constexpr double kDefaultWeight = 1.0; ...@@ -38,11 +38,6 @@ constexpr double kDefaultWeight = 1.0;
class InvertedIndexTest : public ::testing::Test { class InvertedIndexTest : public ::testing::Test {
public: public:
InvertedIndexTest() {
index_.RegisterIndexBuiltCallback(base::BindRepeating(
&InvertedIndexTest::OnIndexBuilt, base::Unretained(this)));
}
void SetUp() override { void SetUp() override {
// All content weights are initialized to |kDefaultWeight|. // All content weights are initialized to |kDefaultWeight|.
index_.doc_length_ = index_.doc_length_ =
...@@ -99,12 +94,64 @@ class InvertedIndexTest : public ::testing::Test { ...@@ -99,12 +94,64 @@ class InvertedIndexTest : public ::testing::Test {
index_.AddDocuments(documents); index_.AddDocuments(documents);
} }
// Adds |documents| via the callback overload of AddDocuments, calls Wait(),
// and asserts that the completion callback was invoked.
void AddDocumentsAndCheck(const DocumentToUpdate& documents) {
  bool finished = false;
  auto on_done =
      base::BindOnce([](bool* flag) { *flag = true; }, &finished);
  index_.AddDocuments(documents, std::move(on_done));

  Wait();
  ASSERT_TRUE(finished);
}
void RemoveDocuments(const std::vector<std::string>& doc_ids) { void RemoveDocuments(const std::vector<std::string>& doc_ids) {
index_.RemoveDocuments(doc_ids); index_.RemoveDocuments(doc_ids);
} }
// Removes |doc_ids| via the callback overload of RemoveDocuments, calls
// Wait(), then checks that the callback ran and reported
// |expect_num_deleted|.
void RemoveDocumentsAndCheck(const std::vector<std::string>& doc_ids,
                             uint32_t expect_num_deleted) {
  bool finished = false;
  uint32_t reported_deleted = 0u;
  auto on_done = base::BindOnce(
      [](bool* flag, uint32_t* out, uint32_t deleted_count) {
        *flag = true;
        *out = deleted_count;
      },
      &finished, &reported_deleted);
  index_.RemoveDocuments(doc_ids, std::move(on_done));

  Wait();
  ASSERT_TRUE(finished);
  EXPECT_EQ(reported_deleted, expect_num_deleted);
}
// Applies |documents| via the callback overload of UpdateDocuments, calls
// Wait(), then checks that the callback ran and reported
// |expect_num_deleted|.
void UpdateDocumentsAndCheck(const DocumentToUpdate& documents,
                             uint32_t expect_num_deleted) {
  bool finished = false;
  uint32_t reported_deleted = 0u;
  auto on_done = base::BindOnce(
      [](bool* flag, uint32_t* out, uint32_t deleted_count) {
        *flag = true;
        *out = deleted_count;
      },
      &finished, &reported_deleted);
  index_.UpdateDocuments(documents, std::move(on_done));

  Wait();
  ASSERT_TRUE(finished);
  EXPECT_EQ(reported_deleted, expect_num_deleted);
}
void ClearInvertedIndex() { index_.ClearInvertedIndex(); } void ClearInvertedIndex() { index_.ClearInvertedIndex(); }
// Clears the index via the callback overload of ClearInvertedIndex, calls
// Wait(), and asserts that the completion callback was invoked.
void ClearInvertedIndexAndCheck() {
  bool finished = false;
  auto on_done =
      base::BindOnce([](bool* flag) { *flag = true; }, &finished);
  index_.ClearInvertedIndex(std::move(on_done));

  Wait();
  ASSERT_TRUE(finished);
}
std::vector<TfidfResult> GetTfidf(const base::string16& term) { std::vector<TfidfResult> GetTfidf(const base::string16& term) {
return index_.GetTfidf(term); return index_.GetTfidf(term);
} }
...@@ -130,6 +177,14 @@ class InvertedIndexTest : public ::testing::Test { ...@@ -130,6 +177,14 @@ class InvertedIndexTest : public ::testing::Test {
void BuildInvertedIndex() { index_.BuildInvertedIndex(); } void BuildInvertedIndex() { index_.BuildInvertedIndex(); }
// Builds the index via the callback overload of BuildInvertedIndex, calls
// Wait(), and asserts that the completion callback was invoked.
void BuildInvertedIndexAndCheck() {
  bool finished = false;
  auto on_done =
      base::BindOnce([](bool* flag) { *flag = true; }, &finished);
  index_.BuildInvertedIndex(std::move(on_done));

  Wait();
  ASSERT_TRUE(finished);
}
bool IsInvertedIndexBuilt() { return index_.IsInvertedIndexBuilt(); } bool IsInvertedIndexBuilt() { return index_.IsInvertedIndexBuilt(); }
std::unordered_map<base::string16, PostingList> GetDictionary() { std::unordered_map<base::string16, PostingList> GetDictionary() {
...@@ -159,20 +214,16 @@ class InvertedIndexTest : public ::testing::Test { ...@@ -159,20 +214,16 @@ class InvertedIndexTest : public ::testing::Test {
bool UpdateDocumentsCompleted() { return !index_.update_in_progress_; } bool UpdateDocumentsCompleted() { return !index_.update_in_progress_; }
void OnIndexBuilt() { ++num_built_; } protected:
int NumBuilt() { return num_built_; }
private:
int num_built_ = 0;
InvertedIndex index_;
base::test::TaskEnvironment task_environment_{ base::test::TaskEnvironment task_environment_{
base::test::TaskEnvironment::MainThreadType::DEFAULT, base::test::TaskEnvironment::MainThreadType::DEFAULT,
base::test::TaskEnvironment::ThreadPoolExecutionMode::QUEUED}; base::test::TaskEnvironment::ThreadPoolExecutionMode::QUEUED};
private:
InvertedIndex index_;
}; };
TEST_F(InvertedIndexTest, FindTermTest) { TEST_F(InvertedIndexTest, FindTermTest) {
EXPECT_EQ(NumBuilt(), 0);
PostingList result = FindTerm(base::UTF8ToUTF16("A")); PostingList result = FindTerm(base::UTF8ToUTF16("A"));
ASSERT_EQ(result.size(), 2u); ASSERT_EQ(result.size(), 2u);
EXPECT_EQ(result["doc1"][0].weight, kDefaultWeight); EXPECT_EQ(result["doc1"][0].weight, kDefaultWeight);
...@@ -194,7 +245,6 @@ TEST_F(InvertedIndexTest, AddNewDocumentTest) { ...@@ -194,7 +245,6 @@ TEST_F(InvertedIndexTest, AddNewDocumentTest) {
const base::string16 a_utf16(base::UTF8ToUTF16("A")); const base::string16 a_utf16(base::UTF8ToUTF16("A"));
const base::string16 d_utf16(base::UTF8ToUTF16("D")); const base::string16 d_utf16(base::UTF8ToUTF16("D"));
EXPECT_EQ(NumBuilt(), 0);
AddDocuments({{"doc3", AddDocuments({{"doc3",
{{a_utf16, {{a_utf16,
{{kDefaultWeight, {"header", 1, 1}}, {{kDefaultWeight, {"header", 1, 1}},
...@@ -206,7 +256,6 @@ TEST_F(InvertedIndexTest, AddNewDocumentTest) { ...@@ -206,7 +256,6 @@ TEST_F(InvertedIndexTest, AddNewDocumentTest) {
EXPECT_EQ(GetDocumentsToUpdate().size(), 0u); EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
EXPECT_EQ(GetTermToBeUpdated().size(), 0u); EXPECT_EQ(GetTermToBeUpdated().size(), 0u);
Wait(); Wait();
EXPECT_EQ(NumBuilt(), 0);
EXPECT_EQ(GetDocumentsToUpdate().size(), 0u); EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
// 4 terms "A", "B", "C", "D" need to be updated. // 4 terms "A", "B", "C", "D" need to be updated.
EXPECT_EQ(GetTermToBeUpdated().size(), 4u); EXPECT_EQ(GetTermToBeUpdated().size(), 4u);
...@@ -252,7 +301,6 @@ TEST_F(InvertedIndexTest, AddNewDocumentTest) { ...@@ -252,7 +301,6 @@ TEST_F(InvertedIndexTest, AddNewDocumentTest) {
EXPECT_EQ(GetDocumentsToUpdate().size(), 0u); EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
EXPECT_EQ(GetTermToBeUpdated().size(), 0u); EXPECT_EQ(GetTermToBeUpdated().size(), 0u);
Wait(); Wait();
EXPECT_EQ(NumBuilt(), 0);
EXPECT_EQ(GetDocumentsToUpdate().size(), 0u); EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
// 7 terms "A", "B", "C", "D", "E", "F", "G" need to be updated. // 7 terms "A", "B", "C", "D", "E", "F", "G" need to be updated.
EXPECT_EQ(GetTermToBeUpdated().size(), 7u); EXPECT_EQ(GetTermToBeUpdated().size(), 7u);
...@@ -267,11 +315,70 @@ TEST_F(InvertedIndexTest, AddNewDocumentTest) { ...@@ -267,11 +315,70 @@ TEST_F(InvertedIndexTest, AddNewDocumentTest) {
ASSERT_EQ(result.size(), 1u); ASSERT_EQ(result.size(), 1u);
} }
// Adds new documents through the callback overload of AddDocuments and
// verifies the resulting posting lists and document length.
TEST_F(InvertedIndexTest, AddNewDocumentTestCallback) {
  const base::string16 a_utf16(base::UTF8ToUTF16("A"));
  const base::string16 d_utf16(base::UTF8ToUTF16("D"));

  AddDocumentsAndCheck({{"doc3",
                         {{a_utf16,
                           {{kDefaultWeight, {"header", 1, 1}},
                            {kDefaultWeight / 2, {"body", 2, 1}},
                            {kDefaultWeight, {"header", 4, 1}}}},
                          {d_utf16,
                           {{kDefaultWeight, {"header", 3, 1}},
                            {kDefaultWeight / 2, {"body", 5, 1}}}}}}});

  EXPECT_EQ(GetDocLength()["doc3"], 5u);

  // Find "A"
  PostingList result = FindTerm(a_utf16);
  ASSERT_EQ(result.size(), 3u);
  // Guard the indexed accesses below: "doc3" was added with three positions
  // for "A".
  ASSERT_EQ(result["doc3"].size(), 3u);
  EXPECT_EQ(result["doc3"][0].weight, kDefaultWeight);
  EXPECT_EQ(result["doc3"][0].position.start, 1u);
  EXPECT_EQ(result["doc3"][1].weight, kDefaultWeight / 2);
  EXPECT_EQ(result["doc3"][1].position.start, 2u);
  EXPECT_EQ(result["doc3"][2].weight, kDefaultWeight);
  EXPECT_EQ(result["doc3"][2].position.start, 4u);

  // Find "D"
  result = FindTerm(d_utf16);
  ASSERT_EQ(result.size(), 1u);
  // "doc3" was added with two positions for "D".
  ASSERT_EQ(result["doc3"].size(), 2u);
  EXPECT_EQ(result["doc3"][0].weight, kDefaultWeight);
  EXPECT_EQ(result["doc3"][0].position.start, 3u);
  EXPECT_EQ(result["doc3"][1].weight, kDefaultWeight / 2);
  EXPECT_EQ(result["doc3"][1].position.start, 5u);

  // Add multiple documents
  AddDocumentsAndCheck({{"doc4",
                         {{base::UTF8ToUTF16("E"),
                           {{kDefaultWeight, {"header", 1, 1}},
                            {kDefaultWeight / 2, {"body", 2, 1}},
                            {kDefaultWeight, {"header", 4, 1}}}},
                          {base::UTF8ToUTF16("F"),
                           {{kDefaultWeight, {"header", 3, 1}},
                            {kDefaultWeight / 2, {"body", 5, 1}}}}}},
                        {"doc5",
                         {{base::UTF8ToUTF16("E"),
                           {{kDefaultWeight, {"header", 1, 1}},
                            {kDefaultWeight / 2, {"body", 2, 1}},
                            {kDefaultWeight, {"header", 4, 1}}}},
                          {base::UTF8ToUTF16("G"),
                           {{kDefaultWeight, {"header", 3, 1}},
                            {kDefaultWeight / 2, {"body", 5, 1}}}}}}});

  // Find "E"
  result = FindTerm(base::UTF8ToUTF16("E"));
  ASSERT_EQ(result.size(), 2u);

  // Find "F"
  result = FindTerm(base::UTF8ToUTF16("F"));
  ASSERT_EQ(result.size(), 1u);
}
TEST_F(InvertedIndexTest, ReplaceDocumentTest) { TEST_F(InvertedIndexTest, ReplaceDocumentTest) {
const base::string16 a_utf16(base::UTF8ToUTF16("A")); const base::string16 a_utf16(base::UTF8ToUTF16("A"));
const base::string16 d_utf16(base::UTF8ToUTF16("D")); const base::string16 d_utf16(base::UTF8ToUTF16("D"));
EXPECT_EQ(NumBuilt(), 0);
AddDocuments({{"doc1", AddDocuments({{"doc1",
{{a_utf16, {{a_utf16,
{{kDefaultWeight, {"header", 1, 1}}, {{kDefaultWeight, {"header", 1, 1}},
...@@ -282,7 +389,6 @@ TEST_F(InvertedIndexTest, ReplaceDocumentTest) { ...@@ -282,7 +389,6 @@ TEST_F(InvertedIndexTest, ReplaceDocumentTest) {
{kDefaultWeight / 5, {"body", 5, 1}}}}}}}); {kDefaultWeight / 5, {"body", 5, 1}}}}}}});
EXPECT_EQ(GetDocumentsToUpdate().size(), 0u); EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
Wait(); Wait();
EXPECT_EQ(NumBuilt(), 0);
EXPECT_TRUE(UpdateDocumentsCompleted()); EXPECT_TRUE(UpdateDocumentsCompleted());
EXPECT_EQ(GetDocLength()["doc1"], 5u); EXPECT_EQ(GetDocLength()["doc1"], 5u);
...@@ -311,14 +417,51 @@ TEST_F(InvertedIndexTest, ReplaceDocumentTest) { ...@@ -311,14 +417,51 @@ TEST_F(InvertedIndexTest, ReplaceDocumentTest) {
EXPECT_EQ(result["doc1"][1].position.start, 5u); EXPECT_EQ(result["doc1"][1].position.start, 5u);
} }
// Verifies that adding a document whose ID already exists ("doc1") through
// AddDocumentsAndCheck() replaces its previous content: the new postings for
// "A" and "D" are found, "B" is no longer found, and "doc2" keeps its length.
TEST_F(InvertedIndexTest, ReplaceDocumentTestCallback) {
  const base::string16 a_utf16(base::UTF8ToUTF16("A"));
  const base::string16 d_utf16(base::UTF8ToUTF16("D"));

  AddDocumentsAndCheck({{"doc1",
                         {{a_utf16,
                           {{kDefaultWeight, {"header", 1, 1}},
                            {kDefaultWeight / 4, {"body", 2, 1}},
                            {kDefaultWeight / 2, {"header", 4, 1}}}},
                          {d_utf16,
                           {{kDefaultWeight / 3, {"header", 3, 1}},
                            {kDefaultWeight / 5, {"body", 5, 1}}}}}}});

  EXPECT_EQ(GetDocLength()["doc1"], 5u);
  EXPECT_EQ(GetDocLength()["doc2"], 6u);

  // Find "A"
  PostingList result = FindTerm(a_utf16);
  ASSERT_EQ(result.size(), 2u);
  // Fatal size check: the element accesses below index into the posting, so
  // a missing or short posting must stop the test instead of reading out of
  // range.
  ASSERT_EQ(result["doc1"].size(), 3u);
  EXPECT_EQ(result["doc1"][0].weight, kDefaultWeight);
  EXPECT_EQ(result["doc1"][0].position.start, 1u);
  EXPECT_EQ(result["doc1"][1].weight, kDefaultWeight / 4);
  EXPECT_EQ(result["doc1"][1].position.start, 2u);
  EXPECT_EQ(result["doc1"][2].weight, kDefaultWeight / 2);
  EXPECT_EQ(result["doc1"][2].position.start, 4u);

  // Find "B"
  result = FindTerm(base::UTF8ToUTF16("B"));
  ASSERT_EQ(result.size(), 0u);

  // Find "D"
  result = FindTerm(d_utf16);
  ASSERT_EQ(result.size(), 1u);
  // Same guard as above before indexing into the "doc1" posting.
  ASSERT_EQ(result["doc1"].size(), 2u);
  EXPECT_EQ(result["doc1"][0].weight, kDefaultWeight / 3);
  EXPECT_EQ(result["doc1"][0].position.start, 3u);
  EXPECT_EQ(result["doc1"][1].weight, kDefaultWeight / 5);
  EXPECT_EQ(result["doc1"][1].position.start, 5u);
}
TEST_F(InvertedIndexTest, RemoveDocumentTest) { TEST_F(InvertedIndexTest, RemoveDocumentTest) {
EXPECT_EQ(GetDictionary().size(), 3u); EXPECT_EQ(GetDictionary().size(), 3u);
EXPECT_EQ(GetDocLength().size(), 2u); EXPECT_EQ(GetDocLength().size(), 2u);
EXPECT_EQ(NumBuilt(), 0);
RemoveDocuments({"doc1"}); RemoveDocuments({"doc1"});
Wait(); Wait();
EXPECT_EQ(NumBuilt(), 0);
EXPECT_TRUE(UpdateDocumentsCompleted()); EXPECT_TRUE(UpdateDocumentsCompleted());
EXPECT_EQ(GetDictionary().size(), 2u); EXPECT_EQ(GetDictionary().size(), 2u);
...@@ -352,19 +495,57 @@ TEST_F(InvertedIndexTest, RemoveDocumentTest) { ...@@ -352,19 +495,57 @@ TEST_F(InvertedIndexTest, RemoveDocumentTest) {
// Removes multiple documents // Removes multiple documents
RemoveDocuments({"doc1", "doc2", "doc3"}); RemoveDocuments({"doc1", "doc2", "doc3"});
Wait(); Wait();
EXPECT_EQ(NumBuilt(), 0);
EXPECT_TRUE(UpdateDocumentsCompleted()); EXPECT_TRUE(UpdateDocumentsCompleted());
EXPECT_EQ(GetDictionary().size(), 0u); EXPECT_EQ(GetDictionary().size(), 0u);
EXPECT_EQ(GetDocLength().size(), 0u); EXPECT_EQ(GetDocLength().size(), 0u);
} }
// Verifies document removal through RemoveDocumentsAndCheck(): removing
// "doc1" leaves only the "doc2" postings, and a later request that names
// non-existent documents still empties the index.
TEST_F(InvertedIndexTest, RemoveDocumentTestCallback) {
  EXPECT_EQ(GetDictionary().size(), 3u);
  EXPECT_EQ(GetDocLength().size(), 2u);

  // NOTE(review): the second argument looks like the expected number of
  // documents actually removed -- confirm against the helper.
  RemoveDocumentsAndCheck({"doc1"}, 1u);

  EXPECT_EQ(GetDictionary().size(), 2u);
  EXPECT_EQ(GetDocLength().size(), 1u);
  EXPECT_EQ(GetDocLength()["doc2"], 6u);

  // Find "A"
  PostingList result = FindTerm(base::UTF8ToUTF16("A"));
  ASSERT_EQ(result.size(), 1u);
  // Fatal size check: the element accesses below index into the posting, so
  // a missing or short posting must stop the test instead of reading out of
  // range.
  ASSERT_EQ(result["doc2"].size(), 2u);
  EXPECT_EQ(result["doc2"][0].weight, kDefaultWeight);
  EXPECT_EQ(result["doc2"][0].position.start, 2u);
  EXPECT_EQ(result["doc2"][1].weight, kDefaultWeight);
  EXPECT_EQ(result["doc2"][1].position.start, 4u);

  // Find "B"
  result = FindTerm(base::UTF8ToUTF16("B"));
  ASSERT_EQ(result.size(), 0u);

  // Find "C"
  result = FindTerm(base::UTF8ToUTF16("C"));
  ASSERT_EQ(result.size(), 1u);
  // Same guard as above before indexing into the "doc2" posting.
  ASSERT_EQ(result["doc2"].size(), 4u);
  EXPECT_EQ(result["doc2"][0].weight, kDefaultWeight);
  EXPECT_EQ(result["doc2"][0].position.start, 1u);
  EXPECT_EQ(result["doc2"][1].weight, kDefaultWeight);
  EXPECT_EQ(result["doc2"][1].position.start, 3u);
  EXPECT_EQ(result["doc2"][2].weight, kDefaultWeight);
  EXPECT_EQ(result["doc2"][2].position.start, 5u);
  EXPECT_EQ(result["doc2"][3].weight, kDefaultWeight);
  EXPECT_EQ(result["doc2"][3].position.start, 7u);

  // Removes multiple documents, but only "doc2" is actually removed since
  // "doc1" and "doc3" don't exist.
  RemoveDocumentsAndCheck({"doc1", "doc2", "doc3"}, 1u);
  EXPECT_EQ(GetDictionary().size(), 0u);
  EXPECT_EQ(GetDocLength().size(), 0u);
}
TEST_F(InvertedIndexTest, TfidfFromZeroTest) { TEST_F(InvertedIndexTest, TfidfFromZeroTest) {
EXPECT_EQ(GetTfidfCache().size(), 0u); EXPECT_EQ(GetTfidfCache().size(), 0u);
EXPECT_FALSE(IsInvertedIndexBuilt()); EXPECT_FALSE(IsInvertedIndexBuilt());
EXPECT_EQ(NumBuilt(), 0);
BuildInvertedIndex(); BuildInvertedIndex();
Wait(); Wait();
EXPECT_EQ(NumBuilt(), 1);
EXPECT_TRUE(BuildIndexCompleted()); EXPECT_TRUE(BuildIndexCompleted());
std::vector<TfidfResult> results = GetTfidf(base::UTF8ToUTF16("A")); std::vector<TfidfResult> results = GetTfidf(base::UTF8ToUTF16("A"));
...@@ -389,7 +570,6 @@ TEST_F(InvertedIndexTest, UpdateIndexTest) { ...@@ -389,7 +570,6 @@ TEST_F(InvertedIndexTest, UpdateIndexTest) {
EXPECT_EQ(GetTfidfCache().size(), 0u); EXPECT_EQ(GetTfidfCache().size(), 0u);
BuildInvertedIndex(); BuildInvertedIndex();
Wait(); Wait();
EXPECT_EQ(NumBuilt(), 1);
EXPECT_TRUE(BuildIndexCompleted()); EXPECT_TRUE(BuildIndexCompleted());
EXPECT_TRUE(IsInvertedIndexBuilt()); EXPECT_TRUE(IsInvertedIndexBuilt());
...@@ -406,13 +586,11 @@ TEST_F(InvertedIndexTest, UpdateIndexTest) { ...@@ -406,13 +586,11 @@ TEST_F(InvertedIndexTest, UpdateIndexTest) {
{kDefaultWeight, {"body", 5, 1}}}}}}}); {kDefaultWeight, {"body", 5, 1}}}}}}});
EXPECT_EQ(GetDocumentsToUpdate().size(), 0u); EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
Wait(); Wait();
EXPECT_EQ(NumBuilt(), 1);
EXPECT_TRUE(UpdateDocumentsCompleted()); EXPECT_TRUE(UpdateDocumentsCompleted());
EXPECT_FALSE(IsInvertedIndexBuilt()); EXPECT_FALSE(IsInvertedIndexBuilt());
BuildInvertedIndex(); BuildInvertedIndex();
Wait(); Wait();
EXPECT_EQ(NumBuilt(), 2);
EXPECT_TRUE(BuildIndexCompleted()); EXPECT_TRUE(BuildIndexCompleted());
EXPECT_EQ(GetTfidfCache().size(), 3u); EXPECT_EQ(GetTfidfCache().size(), 3u);
...@@ -469,11 +647,106 @@ TEST_F(InvertedIndexTest, UpdateIndexTest) { ...@@ -469,11 +647,106 @@ TEST_F(InvertedIndexTest, UpdateIndexTest) {
testing::UnorderedElementsAre(expected_tfidf_D_doc1)); testing::UnorderedElementsAre(expected_tfidf_D_doc1));
} }
// Callback-based index update test: builds the TF-IDF cache, replaces "doc1",
// rebuilds, and compares the per-term scores with values computed through
// TfIdfScore().
TEST_F(InvertedIndexTest, UpdateIndexTestCallback) {
  // Rounds a score to two decimal places, using the same float rounding for
  // every expected value below.
  const auto round_to_hundredths = [](auto score) -> double {
    return std::roundf(score * 100) / 100;
  };
  const base::string16 a_utf16(base::UTF8ToUTF16("A"));
  const base::string16 d_utf16(base::UTF8ToUTF16("D"));

  EXPECT_EQ(GetTfidfCache().size(), 0u);
  BuildInvertedIndexAndCheck();
  EXPECT_EQ(GetTfidfCache().size(), 3u);

  // Replaces "doc1"
  AddDocumentsAndCheck({{"doc1",
                         {{a_utf16,
                           {{kDefaultWeight / 2, {"header", 1, 1}},
                            {kDefaultWeight / 4, {"body", 2, 1}},
                            {kDefaultWeight / 2, {"header", 4, 1}}}},
                          {d_utf16,
                           {{kDefaultWeight, {"header", 3, 1}},
                            {kDefaultWeight, {"body", 5, 1}}}}}}});

  // The update leaves the index unbuilt until the next explicit build.
  EXPECT_FALSE(IsInvertedIndexBuilt());
  BuildInvertedIndexAndCheck();
  EXPECT_EQ(GetTfidfCache().size(), 3u);

  // "A": expected in both documents.
  const std::vector<TfidfResult> a_results = GetTfidf(a_utf16);
  const double expected_tfidf_A_doc1 = round_to_hundredths(TfIdfScore(
      /*num_docs=*/2,
      /*num_docs_with_term=*/2,
      /*weighted_num_term_occurrence_in_doc=*/kDefaultWeight / 2 +
          kDefaultWeight / 4 + kDefaultWeight / 2,
      /*doc_length=*/5));
  const double expected_tfidf_A_doc2 = round_to_hundredths(TfIdfScore(
      /*num_docs=*/2,
      /*num_docs_with_term=*/2,
      /*weighted_num_term_occurrence_in_doc=*/kDefaultWeight * 2,
      /*doc_length=*/6));
  EXPECT_THAT(GetScoresFromTfidfResult(a_results),
              testing::UnorderedElementsAre(expected_tfidf_A_doc1,
                                            expected_tfidf_A_doc2));

  // "B": no scores expected after "doc1" was replaced.
  const std::vector<TfidfResult> b_results =
      GetTfidf(base::UTF8ToUTF16("B"));
  EXPECT_THAT(GetScoresFromTfidfResult(b_results),
              testing::UnorderedElementsAre());

  // "C": expected only in "doc2".
  const std::vector<TfidfResult> c_results =
      GetTfidf(base::UTF8ToUTF16("C"));
  const double expected_tfidf_C_doc2 = round_to_hundredths(TfIdfScore(
      /*num_docs=*/2,
      /*num_docs_with_term=*/1,
      /*weighted_num_term_occurrence_in_doc=*/kDefaultWeight * 4,
      /*doc_length=*/6));
  EXPECT_THAT(GetScoresFromTfidfResult(c_results),
              testing::UnorderedElementsAre(expected_tfidf_C_doc2));

  // "D": expected only in the replaced "doc1".
  const std::vector<TfidfResult> d_results = GetTfidf(d_utf16);
  const double expected_tfidf_D_doc1 = round_to_hundredths(TfIdfScore(
      /*num_docs=*/2,
      /*num_docs_with_term=*/1,
      /*weighted_num_term_occurrence_in_doc=*/kDefaultWeight * 2,
      /*doc_length=*/5));
  EXPECT_THAT(GetScoresFromTfidfResult(d_results),
              testing::UnorderedElementsAre(expected_tfidf_D_doc1));
}
// Exercises UpdateDocumentsAndCheck() with one document that has content
// ("doc1") and one with empty content ("doc2"), then rebuilds the index and
// checks that term "C" no longer yields any TF-IDF result.
TEST_F(InvertedIndexTest, UpdateDocumentsTest) {
  const base::string16 a_utf16(base::UTF8ToUTF16("A"));
  const base::string16 c_utf16(base::UTF8ToUTF16("C"));
  const base::string16 d_utf16(base::UTF8ToUTF16("D"));

  EXPECT_EQ(GetTfidfCache().size(), 0u);
  BuildInvertedIndexAndCheck();
  EXPECT_EQ(GetTfidfCache().size(), 3u);

  // Replaces "doc1" and removes "doc2" (passed with empty content).
  // NOTE(review): the trailing 1u is presumably the expected number of
  // deleted documents -- confirm against the helper.
  UpdateDocumentsAndCheck({{"doc1",
                            {{a_utf16,
                              {{kDefaultWeight / 2, {"header", 1, 1}},
                               {kDefaultWeight / 4, {"body", 2, 1}},
                               {kDefaultWeight / 2, {"header", 4, 1}}}},
                             {d_utf16,
                              {{kDefaultWeight, {"header", 3, 1}},
                               {kDefaultWeight, {"body", 5, 1}}}}}},
                           {"doc2", {}}},
                          1u);

  BuildInvertedIndexAndCheck();
  EXPECT_EQ(GetTfidfCache().size(), 2u);

  const std::vector<TfidfResult> c_results = GetTfidf(c_utf16);
  EXPECT_EQ(c_results.size(), 0u);
}
TEST_F(InvertedIndexTest, ClearInvertedIndexTest) { TEST_F(InvertedIndexTest, ClearInvertedIndexTest) {
EXPECT_EQ(GetTfidfCache().size(), 0u); EXPECT_EQ(GetTfidfCache().size(), 0u);
BuildInvertedIndex(); BuildInvertedIndex();
Wait(); Wait();
EXPECT_EQ(NumBuilt(), 1);
EXPECT_TRUE(BuildIndexCompleted()); EXPECT_TRUE(BuildIndexCompleted());
EXPECT_TRUE(IsInvertedIndexBuilt()); EXPECT_TRUE(IsInvertedIndexBuilt());
...@@ -500,6 +773,31 @@ TEST_F(InvertedIndexTest, ClearInvertedIndexTest) { ...@@ -500,6 +773,31 @@ TEST_F(InvertedIndexTest, ClearInvertedIndexTest) {
EXPECT_EQ(GetDocumentsToUpdate().size(), 0u); EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
} }
// Verifies that ClearInvertedIndexAndCheck() empties every piece of index
// state observable through the fixture accessors, even when a document was
// added after the last build.
TEST_F(InvertedIndexTest, ClearInvertedIndexTestCallback) {
  const base::string16 a_utf16(base::UTF8ToUTF16("A"));
  const base::string16 d_utf16(base::UTF8ToUTF16("D"));

  EXPECT_EQ(GetTfidfCache().size(), 0u);
  BuildInvertedIndexAndCheck();
  EXPECT_EQ(GetTfidfCache().size(), 3u);

  // Add a document, then clear the index.
  // NOTE(review): the *AndCheck helpers appear to finish before returning, so
  // these two steps look sequential rather than simultaneous -- confirm.
  AddDocumentsAndCheck({{"doc3",
                         {{a_utf16,
                           {{kDefaultWeight, {"header", 1, 1}},
                            {kDefaultWeight / 2, {"body", 2, 1}},
                            {kDefaultWeight, {"header", 4, 1}}}},
                          {d_utf16,
                           {{kDefaultWeight, {"header", 3, 1}},
                            {kDefaultWeight / 2, {"body", 5, 1}}}}}}});
  ClearInvertedIndexAndCheck();

  // Everything must be empty after the clear.
  EXPECT_EQ(GetTfidfCache().size(), 0u);
  EXPECT_EQ(GetTermToBeUpdated().size(), 0u);
  EXPECT_EQ(GetDocLength().size(), 0u);
  EXPECT_EQ(GetDictionary().size(), 0u);
  EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
}
TEST_F(InvertedIndexTest, FindMatchingDocumentsApproximatelyTest) { TEST_F(InvertedIndexTest, FindMatchingDocumentsApproximatelyTest) {
const double prefix_threshold = 1.0; const double prefix_threshold = 1.0;
const double block_threshold = 1.0; const double block_threshold = 1.0;
...@@ -508,10 +806,8 @@ TEST_F(InvertedIndexTest, FindMatchingDocumentsApproximatelyTest) { ...@@ -508,10 +806,8 @@ TEST_F(InvertedIndexTest, FindMatchingDocumentsApproximatelyTest) {
const base::string16 c_utf16(base::UTF8ToUTF16("C")); const base::string16 c_utf16(base::UTF8ToUTF16("C"));
const base::string16 d_utf16(base::UTF8ToUTF16("D")); const base::string16 d_utf16(base::UTF8ToUTF16("D"));
EXPECT_EQ(NumBuilt(), 0);
// Replace doc1, same occurrences, just different weights. // Replace doc1, same occurrences, just different weights.
AddDocuments({{"doc1", AddDocumentsAndCheck({{"doc1",
{{a_utf16, {{a_utf16,
{{kDefaultWeight, {"header", 1, 1}}, {{kDefaultWeight, {"header", 1, 1}},
{kDefaultWeight, {"header", 3, 1}}, {kDefaultWeight, {"header", 3, 1}},
...@@ -522,15 +818,6 @@ TEST_F(InvertedIndexTest, FindMatchingDocumentsApproximatelyTest) { ...@@ -522,15 +818,6 @@ TEST_F(InvertedIndexTest, FindMatchingDocumentsApproximatelyTest) {
{kDefaultWeight / 2, {"header", 6, 1}}, {kDefaultWeight / 2, {"header", 6, 1}},
{kDefaultWeight / 3, {"body", 4, 1}}, {kDefaultWeight / 3, {"body", 4, 1}},
{kDefaultWeight / 3, {"body", 5, 1}}}}}}}); {kDefaultWeight / 3, {"body", 5, 1}}}}}}});
EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
Wait();
EXPECT_EQ(NumBuilt(), 0);
EXPECT_TRUE(UpdateDocumentsCompleted());
BuildInvertedIndex();
Wait();
EXPECT_EQ(NumBuilt(), 1);
EXPECT_TRUE(BuildIndexCompleted());
{ {
// "A" exists in "doc1" and "doc2". The score of each document is simply A's // "A" exists in "doc1" and "doc2". The score of each document is simply A's
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment