Commit 0b2e0bd8 authored by Thanh Nguyen, committed by Commit Bot

[local-search-service] Implement InvertedIndexSearch with callback

This CL implements the InvertedIndexSearch functions that are inherited
from the new Index interface.

Design doc: go/lss-sandboxing
Implementation plan: go/lss-sandboxing-impl

Bug: 1137560
Change-Id: I3a1439f9b43d9d52bbb6819308535aa28ef7fe3d
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2531078
Reviewed-by: Jia Meng <jiameng@chromium.org>
Commit-Queue: Thanh Nguyen <thanhdng@chromium.org>
Cr-Commit-Position: refs/heads/master@{#827660}
parent 1ade577e
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "base/callback.h" #include "base/callback.h"
#include "base/gtest_prod_util.h" #include "base/gtest_prod_util.h"
#include "base/memory/weak_ptr.h" #include "base/memory/weak_ptr.h"
#include "base/sequenced_task_runner.h"
#include "base/strings/string16.h" #include "base/strings/string16.h"
#include "chromeos/components/local_search_service/shared_structs.h" #include "chromeos/components/local_search_service/shared_structs.h"
...@@ -60,10 +61,6 @@ class InvertedIndex { ...@@ -60,10 +61,6 @@ class InvertedIndex {
InvertedIndex(const InvertedIndex&) = delete; InvertedIndex(const InvertedIndex&) = delete;
InvertedIndex& operator=(const InvertedIndex&) = delete; InvertedIndex& operator=(const InvertedIndex&) = delete;
// |on_index_built| will be called after the index is built.
void RegisterIndexBuiltCallback(
base::RepeatingCallback<void()> on_index_built);
// Returns document ID and positions of a term. // Returns document ID and positions of a term.
PostingList FindTerm(const base::string16& term) const; PostingList FindTerm(const base::string16& term) const;
...@@ -79,6 +76,10 @@ class InvertedIndex { ...@@ -79,6 +76,10 @@ class InvertedIndex {
// unique (have unique content). This function doesn't modify any cache. It // unique (have unique content). This function doesn't modify any cache. It
// only adds documents and tokens to the index. // only adds documents and tokens to the index.
void AddDocuments(const DocumentToUpdate& documents); void AddDocuments(const DocumentToUpdate& documents);
// Similar to the above function, but it will build TF-IDF cache after adding
// documents.
void AddDocuments(const DocumentToUpdate& documents,
base::OnceCallback<void()> callback);
// Removes documents from the inverted index. Do nothing if the document id is // Removes documents from the inverted index. Do nothing if the document id is
// not in the index. // not in the index.
...@@ -87,6 +88,22 @@ class InvertedIndex { ...@@ -87,6 +88,22 @@ class InvertedIndex {
// As other operations may be running on a separate thread, this function // As other operations may be running on a separate thread, this function
// returns size of |document_ids| and not actually deleted documents. // returns size of |document_ids| and not actually deleted documents.
uint32_t RemoveDocuments(const std::vector<std::string>& document_ids); uint32_t RemoveDocuments(const std::vector<std::string>& document_ids);
// Similar to the above function, but it will build TF-IDF cache after
// removing documents.
void RemoveDocuments(const std::vector<std::string>& document_ids,
base::OnceCallback<void(uint32_t)> callback);
// Updates documents in the inverted index. It combines two functions:
// AddDocuments and RemoveDocuments. This function reports (via |callback|) the
// number of documents to be removed (number of documents that have empty
// content).
// - If a document ID is not in the index, add the document to the index.
// - If a document ID is in the index and its new content isn't empty,
// update its content in the index.
// - If a document ID is in the index and its content is empty, remove it
// from the index.
// It will build TF-IDF cache after updating the documents.
void UpdateDocuments(const DocumentToUpdate& documents,
base::OnceCallback<void(uint32_t)> callback);
// Gets TF-IDF scores for a term. This function returns the TF-IDF score from // Gets TF-IDF scores for a term. This function returns the TF-IDF score from
// the cache. // the cache.
...@@ -96,9 +113,11 @@ class InvertedIndex { ...@@ -96,9 +113,11 @@ class InvertedIndex {
// Builds the inverted index. // Builds the inverted index.
void BuildInvertedIndex(); void BuildInvertedIndex();
void BuildInvertedIndex(base::OnceCallback<void()> callback);
// Clears all the data from the inverted index. // Clears all the data from the inverted index.
void ClearInvertedIndex(); void ClearInvertedIndex();
void ClearInvertedIndex(base::OnceCallback<void()> callback);
// Checks if the inverted index has been built: returns |true| if the inverted // Checks if the inverted index has been built: returns |true| if the inverted
// index is up to date, returns |false| if there are some modified document // index is up to date, returns |false| if there are some modified document
...@@ -118,17 +137,26 @@ class InvertedIndex { ...@@ -118,17 +137,26 @@ class InvertedIndex {
void InvertedIndexController(); void InvertedIndexController();
// Called on the main thread after BuildTfidf is completed. // Called on the main thread after BuildTfidf is completed.
void OnBuildTfidfComplete(TfidfCache&& new_cache); void OnBuildTfidfCompleteSync(TfidfCache&& new_cache);
void OnBuildTfidfComplete(base::OnceCallback<void()> callback,
// Called on the main thread after UpdateDocuments is completed. TfidfCache&& new_cache);
void OnUpdateDocumentsComplete( // Called on the main thread after UpdateDocumentsStateVariables is completed.
DocumentStateVariables&& document_state_variables); void OnUpdateDocumentsCompleteSync(
std::pair<DocumentStateVariables, uint32_t>&&
document_state_variables_and_num_deleted);
void OnUpdateDocumentsComplete(base::OnceCallback<void(uint32_t)> callback,
std::pair<DocumentStateVariables, uint32_t>&&
document_state_variables_and_num_deleted);
void OnAddDocumentsComplete(base::OnceCallback<void()> callback,
std::pair<DocumentStateVariables, uint32_t>&&
document_state_variables_and_num_deleted);
void OnDataClearedSync(
std::pair<DocumentStateVariables, TfidfCache>&& inverted_index_data);
void OnDataCleared( void OnDataCleared(
base::OnceCallback<void()> callback,
std::pair<DocumentStateVariables, TfidfCache>&& inverted_index_data); std::pair<DocumentStateVariables, TfidfCache>&& inverted_index_data);
base::RepeatingCallback<void()> on_index_built_;
// |is_index_built_| is only true if index's TF-IDF is consistent with the // |is_index_built_| is only true if index's TF-IDF is consistent with the
// documents in the index. This means as soon as documents are modified // documents in the index. This means as soon as documents are modified
// (added, updated or deleted), |is_index_built_| will be set to false. While // (added, updated or deleted), |is_index_built_| will be set to false. While
...@@ -155,6 +183,7 @@ class InvertedIndex { ...@@ -155,6 +183,7 @@ class InvertedIndex {
bool index_building_in_progress_ = false; bool index_building_in_progress_ = false;
bool request_to_clear_index_ = false; bool request_to_clear_index_ = false;
scoped_refptr<base::SequencedTaskRunner> task_runner_;
SEQUENCE_CHECKER(sequence_checker_); SEQUENCE_CHECKER(sequence_checker_);
base::WeakPtrFactory<InvertedIndex> weak_ptr_factory_{this}; base::WeakPtrFactory<InvertedIndex> weak_ptr_factory_{this};
......
...@@ -51,18 +51,38 @@ ExtractedContent ExtractDocumentsContent(const std::vector<Data>& data) { ...@@ -51,18 +51,38 @@ ExtractedContent ExtractDocumentsContent(const std::vector<Data>& data) {
ExtractedContent documents; ExtractedContent documents;
for (const Data& d : data) { for (const Data& d : data) {
const std::vector<Token> document_tokens = ExtractDocumentTokens(d); const std::vector<Token> document_tokens = ExtractDocumentTokens(d);
DCHECK(!document_tokens.empty());
documents.push_back({d.id, document_tokens}); documents.push_back({d.id, document_tokens});
} }
return documents; return documents;
} }
// Splits |query| into its set of distinct tokens. The tokenization mode is
// picked from the configured locale: camel-case mode for non-Latin locales,
// word mode otherwise.
std::unordered_set<base::string16> GetTokenizedQuery(
    const base::string16& query) {
  // TODO(jiameng): actual input query may not be the same as default locale.
  // Need another way to determine actual language of the query.
  TokenizedString::Mode mode = TokenizedString::Mode::kWords;
  if (IsNonLatinLocale(base::i18n::GetConfiguredLocale()))
    mode = TokenizedString::Mode::kCamelCase;

  const TokenizedString tokenized_query(query, mode);

  // TODO(jiameng): we are not removing stopword because they shouldn't exist
  // in the index. However, for performance reason, it may be worth to be
  // removed.
  const auto& query_tokens = tokenized_query.tokens();
  return std::unordered_set<base::string16>(query_tokens.begin(),
                                            query_tokens.end());
}
} // namespace } // namespace
InvertedIndexSearch::InvertedIndexSearch(IndexId index_id, InvertedIndexSearch::InvertedIndexSearch(IndexId index_id,
PrefService* local_state) PrefService* local_state)
: IndexSync(index_id, Backend::kInvertedIndex, local_state), : IndexSync(index_id, Backend::kInvertedIndex, local_state),
Index(index_id, Backend::kInvertedIndex),
inverted_index_(std::make_unique<InvertedIndex>()), inverted_index_(std::make_unique<InvertedIndex>()),
blocking_task_runner_(base::ThreadPool::CreateSequencedTaskRunner( blocking_task_runner_(base::ThreadPool::CreateSequencedTaskRunner(
{base::TaskPriority::BEST_EFFORT, base::MayBlock(), {base::TaskPriority::BEST_EFFORT, base::MayBlock(),
...@@ -85,7 +105,7 @@ void InvertedIndexSearch::AddOrUpdateSync( ...@@ -85,7 +105,7 @@ void InvertedIndexSearch::AddOrUpdateSync(
base::PostTaskAndReplyWithResult( base::PostTaskAndReplyWithResult(
blocking_task_runner_.get(), FROM_HERE, blocking_task_runner_.get(), FROM_HERE,
base::BindOnce(&ExtractDocumentsContent, data), base::BindOnce(&ExtractDocumentsContent, data),
base::BindOnce(&InvertedIndexSearch::FinalizeAddOrUpdate, base::BindOnce(&InvertedIndexSearch::FinalizeAddOrUpdateSync,
weak_ptr_factory_.GetWeakPtr())); weak_ptr_factory_.GetWeakPtr()));
} }
...@@ -107,7 +127,7 @@ uint32_t InvertedIndexSearch::DeleteSync(const std::vector<std::string>& ids) { ...@@ -107,7 +127,7 @@ uint32_t InvertedIndexSearch::DeleteSync(const std::vector<std::string>& ids) {
++num_queued_index_updates_; ++num_queued_index_updates_;
blocking_task_runner_->PostTaskAndReply( blocking_task_runner_->PostTaskAndReply(
FROM_HERE, base::DoNothing(), FROM_HERE, base::DoNothing(),
base::BindOnce(&InvertedIndexSearch::FinalizeDelete, base::BindOnce(&InvertedIndexSearch::FinalizeDeleteSync,
weak_ptr_factory_.GetWeakPtr(), ids)); weak_ptr_factory_.GetWeakPtr(), ids));
return ids.size(); return ids.size();
...@@ -135,24 +155,9 @@ ResponseStatus InvertedIndexSearch::FindSync(const base::string16& query, ...@@ -135,24 +155,9 @@ ResponseStatus InvertedIndexSearch::FindSync(const base::string16& query,
return status; return status;
} }
// TODO(jiameng): actual input query may not be the same as default locale.
// Need another way to determine actual language of the query.
const TokenizedString::Mode mode =
IsNonLatinLocale(base::i18n::GetConfiguredLocale())
? TokenizedString::Mode::kCamelCase
: TokenizedString::Mode::kWords;
const TokenizedString tokenized_query(query, mode);
std::unordered_set<base::string16> tokens;
for (const auto& token : tokenized_query.tokens()) {
// TODO(jiameng): we are not removing stopword because they shouldn't exist
// in the index. However, for performance reason, it may be worth to be
// removed.
tokens.insert(token);
}
*results = inverted_index_->FindMatchingDocumentsApproximately( *results = inverted_index_->FindMatchingDocumentsApproximately(
tokens, search_params_.prefix_threshold, search_params_.fuzzy_threshold); GetTokenizedQuery(query), search_params_.prefix_threshold,
search_params_.fuzzy_threshold);
if (results->size() > max_results && max_results > 0u) if (results->size() > max_results && max_results > 0u)
results->resize(max_results); results->resize(max_results);
...@@ -163,6 +168,71 @@ ResponseStatus InvertedIndexSearch::FindSync(const base::string16& query, ...@@ -163,6 +168,71 @@ ResponseStatus InvertedIndexSearch::FindSync(const base::string16& query,
return status; return status;
} }
// Reports the number of documents currently held by the inverted index
// through |callback|. The callback is run before this function returns.
void InvertedIndexSearch::GetSize(GetSizeCallback callback) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
  const auto num_documents = inverted_index_->NumberDocuments();
  std::move(callback).Run(num_documents);
}
// Extracts the content of |data| on |blocking_task_runner_|, then hands the
// extracted documents and |callback| to FinalizeAddOrUpdate. The reply is
// bound to a weak pointer to this object. |data| must not be empty.
void InvertedIndexSearch::AddOrUpdate(const std::vector<Data>& data,
                                      AddOrUpdateCallback callback) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
  DCHECK(!data.empty());

  auto extract_content = base::BindOnce(&ExtractDocumentsContent, data);
  auto on_content_extracted =
      base::BindOnce(&InvertedIndexSearch::FinalizeAddOrUpdate,
                     weak_ptr_factory_.GetWeakPtr(), std::move(callback));
  base::PostTaskAndReplyWithResult(blocking_task_runner_.get(), FROM_HERE,
                                   std::move(extract_content),
                                   std::move(on_content_extracted));
}
// Schedules removal of the documents named by |ids|. A no-op task is posted to
// |blocking_task_runner_| and FinalizeDelete runs as its reply with |callback|
// and a copy of |ids|. |ids| must not be empty.
// NOTE(review): the no-op task presumably keeps this delete ordered after
// earlier content-extraction tasks on the same runner - confirm.
void InvertedIndexSearch::Delete(const std::vector<std::string>& ids,
                                 DeleteCallback callback) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
  DCHECK(!ids.empty());

  auto finalize_delete =
      base::BindOnce(&InvertedIndexSearch::FinalizeDelete,
                     weak_ptr_factory_.GetWeakPtr(), std::move(callback), ids);
  blocking_task_runner_->PostTaskAndReply(FROM_HERE, base::DoNothing(),
                                          std::move(finalize_delete));
}
// Extracts the content of |data| on |blocking_task_runner_|, then hands the
// extracted documents and |callback| to FinalizeUpdateDocuments. The reply is
// bound to a weak pointer to this object. |data| must not be empty.
void InvertedIndexSearch::UpdateDocuments(const std::vector<Data>& data,
                                          UpdateDocumentsCallback callback) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
  DCHECK(!data.empty());

  auto extract_content = base::BindOnce(&ExtractDocumentsContent, data);
  auto on_content_extracted =
      base::BindOnce(&InvertedIndexSearch::FinalizeUpdateDocuments,
                     weak_ptr_factory_.GetWeakPtr(), std::move(callback));
  base::PostTaskAndReplyWithResult(blocking_task_runner_.get(), FROM_HERE,
                                   std::move(extract_content),
                                   std::move(on_content_extracted));
}
// Searches the inverted index for documents that approximately match |query|
// and reports the outcome through |callback|, which is run before this
// function returns:
// - Empty |query|: ResponseStatus::kEmptyQuery and no results.
// - Index without documents: ResponseStatus::kEmptyIndex and no results.
// - Otherwise: ResponseStatus::kSuccess and the matching documents, truncated
//   to |max_results| entries when |max_results| is non-zero.
void InvertedIndexSearch::Find(const base::string16& query,
                               uint32_t max_results,
                               FindCallback callback) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
  if (query.empty()) {
    std::move(callback).Run(ResponseStatus::kEmptyQuery, base::nullopt);
    return;
  }
  if (inverted_index_->NumberDocuments() == 0u) {
    std::move(callback).Run(ResponseStatus::kEmptyIndex, base::nullopt);
    return;
  }

  std::vector<Result> results =
      inverted_index_->FindMatchingDocumentsApproximately(
          GetTokenizedQuery(query), search_params_.prefix_threshold,
          search_params_.fuzzy_threshold);

  // A |max_results| of zero means "no limit".
  if (results.size() > max_results && max_results > 0u)
    results.resize(max_results);

  // |results| is not used after this point, so move it into the callback
  // argument instead of copying the whole vector.
  std::move(callback).Run(ResponseStatus::kSuccess, std::move(results));
}
// Clears all data from the inverted index. |callback| is forwarded to
// InvertedIndex::ClearInvertedIndex(callback) instead of being run right after
// the synchronous ClearInvertedIndex() overload, consistent with how the other
// callback-based methods of this class hand their callbacks to the
// InvertedIndex.
void InvertedIndexSearch::ClearIndex(ClearIndexCallback callback) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
  inverted_index_->ClearInvertedIndex(std::move(callback));
}
std::vector<std::pair<std::string, uint32_t>> std::vector<std::pair<std::string, uint32_t>>
InvertedIndexSearch::FindTermForTesting(const base::string16& term) const { InvertedIndexSearch::FindTermForTesting(const base::string16& term) const {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
...@@ -175,7 +245,7 @@ InvertedIndexSearch::FindTermForTesting(const base::string16& term) const { ...@@ -175,7 +245,7 @@ InvertedIndexSearch::FindTermForTesting(const base::string16& term) const {
return doc_with_freq; return doc_with_freq;
} }
void InvertedIndexSearch::FinalizeAddOrUpdate( void InvertedIndexSearch::FinalizeAddOrUpdateSync(
const ExtractedContent& documents) { const ExtractedContent& documents) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
--num_queued_index_updates_; --num_queued_index_updates_;
...@@ -183,13 +253,34 @@ void InvertedIndexSearch::FinalizeAddOrUpdate( ...@@ -183,13 +253,34 @@ void InvertedIndexSearch::FinalizeAddOrUpdate(
MaybeBuildInvertedIndex(); MaybeBuildInvertedIndex();
} }
void InvertedIndexSearch::FinalizeDelete(const std::vector<std::string>& ids) { void InvertedIndexSearch::FinalizeDeleteSync(
const std::vector<std::string>& ids) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
--num_queued_index_updates_; --num_queued_index_updates_;
inverted_index_->RemoveDocuments(ids); inverted_index_->RemoveDocuments(ids);
MaybeBuildInvertedIndex(); MaybeBuildInvertedIndex();
} }
// Reply half of AddOrUpdate: passes the extracted |documents| to the inverted
// index together with |callback|.
void InvertedIndexSearch::FinalizeAddOrUpdate(
    AddOrUpdateCallback callback,
    const ExtractedContent& documents) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
  InvertedIndex& index = *inverted_index_;
  index.AddDocuments(documents, std::move(callback));
}
// Reply half of Delete: asks the inverted index to remove the documents named
// by |ids| and passes |callback| along with the request.
void InvertedIndexSearch::FinalizeDelete(DeleteCallback callback,
                                         const std::vector<std::string>& ids) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
  InvertedIndex& index = *inverted_index_;
  index.RemoveDocuments(ids, std::move(callback));
}
// Reply half of UpdateDocuments: passes the extracted |documents| to the
// inverted index's UpdateDocuments together with |callback|.
void InvertedIndexSearch::FinalizeUpdateDocuments(
    UpdateDocumentsCallback callback,
    const ExtractedContent& documents) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
  InvertedIndex& index = *inverted_index_;
  index.UpdateDocuments(documents, std::move(callback));
}
void InvertedIndexSearch::MaybeBuildInvertedIndex() { void InvertedIndexSearch::MaybeBuildInvertedIndex() {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
if (num_queued_index_updates_ == 0) { if (num_queued_index_updates_ == 0) {
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "base/sequence_checker.h" #include "base/sequence_checker.h"
#include "base/sequenced_task_runner.h" #include "base/sequenced_task_runner.h"
#include "base/strings/string16.h" #include "base/strings/string16.h"
#include "chromeos/components/local_search_service/index.h"
#include "chromeos/components/local_search_service/index_sync.h" #include "chromeos/components/local_search_service/index_sync.h"
#include "chromeos/components/local_search_service/shared_structs.h" #include "chromeos/components/local_search_service/shared_structs.h"
...@@ -24,7 +25,7 @@ class InvertedIndex; ...@@ -24,7 +25,7 @@ class InvertedIndex;
// An implementation of Index. // An implementation of Index.
// A search via the inverted index backend with TF-IDF based document ranking. // A search via the inverted index backend with TF-IDF based document ranking.
class InvertedIndexSearch : public IndexSync { class InvertedIndexSearch : public IndexSync, public Index {
public: public:
InvertedIndexSearch(IndexId index_id, PrefService* local_state); InvertedIndexSearch(IndexId index_id, PrefService* local_state);
~InvertedIndexSearch() override; ~InvertedIndexSearch() override;
...@@ -32,7 +33,7 @@ class InvertedIndexSearch : public IndexSync { ...@@ -32,7 +33,7 @@ class InvertedIndexSearch : public IndexSync {
InvertedIndexSearch(const InvertedIndexSearch&) = delete; InvertedIndexSearch(const InvertedIndexSearch&) = delete;
InvertedIndexSearch& operator=(const InvertedIndexSearch&) = delete; InvertedIndexSearch& operator=(const InvertedIndexSearch&) = delete;
// Index overrides: // IndexSync overrides:
uint64_t GetSizeSync() override; uint64_t GetSizeSync() override;
// TODO(jiameng): we always build the index after documents are updated. May // TODO(jiameng): we always build the index after documents are updated. May
// revise this strategy if there is a different use case. // revise this strategy if there is a different use case.
...@@ -52,19 +53,47 @@ class InvertedIndexSearch : public IndexSync { ...@@ -52,19 +53,47 @@ class InvertedIndexSearch : public IndexSync {
uint32_t max_results, uint32_t max_results,
std::vector<Result>* results) override; std::vector<Result>* results) override;
// Index overrides:
// GetSize is only accurate once the index has finished updating.
void GetSize(GetSizeCallback callback) override;
void AddOrUpdate(const std::vector<Data>& data,
AddOrUpdateCallback callback) override;
void Delete(const std::vector<std::string>& ids,
DeleteCallback callback) override;
void UpdateDocuments(const std::vector<Data>& data,
UpdateDocumentsCallback callback) override;
void Find(const base::string16& query,
uint32_t max_results,
FindCallback callback) override;
void ClearIndex(ClearIndexCallback callback) override;
// Returns document id and number of occurrences of |term|. // Returns document id and number of occurrences of |term|.
// Document ids are sorted in alphabetical order. // Document ids are sorted in alphabetical order.
std::vector<std::pair<std::string, uint32_t>> FindTermForTesting( std::vector<std::pair<std::string, uint32_t>> FindTermForTesting(
const base::string16& term) const; const base::string16& term) const;
private: private:
void FinalizeAddOrUpdateSync(
const std::vector<std::pair<std::string, std::vector<Token>>>& documents);
// FinalizeDeleteSync is called if Delete cannot be immediately done because
// there's another index updating operation before it, i.e.
// |num_queued_index_updates_| is not zero.
void FinalizeDeleteSync(const std::vector<std::string>& ids);
void FinalizeAddOrUpdate( void FinalizeAddOrUpdate(
AddOrUpdateCallback callback,
const std::vector<std::pair<std::string, std::vector<Token>>>& documents); const std::vector<std::pair<std::string, std::vector<Token>>>& documents);
// FinalizeDelete is called if Delete cannot be immediately done because // FinalizeDelete is called if Delete cannot be immediately done because
// there's another index updating operation before it, i.e. // there's another index updating operation before it, i.e.
// |num_queued_index_updates_| is not zero. // |num_queued_index_updates_| is not zero.
void FinalizeDelete(const std::vector<std::string>& ids); void FinalizeDelete(DeleteCallback callback,
const std::vector<std::string>& ids);
void FinalizeUpdateDocuments(
UpdateDocumentsCallback callback,
const std::vector<std::pair<std::string, std::vector<Token>>>& documents);
// In order to reduce unnecessary inverted index building, we only build the // In order to reduce unnecessary inverted index building, we only build the
// index if there's no upcoming modification to the index's document list. // index if there's no upcoming modification to the index's document list.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment