Refactor code to avoid direct dependency upon ICU: spellcheck_worditerator

BUG=367677

Review URL: https://codereview.chromium.org/270203003

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@276869 0039d316-1c4b-4281-b951-d872f2087c98
parent 8094dc81
...@@ -22,6 +22,15 @@ BreakIterator::BreakIterator(const string16& str, BreakType break_type) ...@@ -22,6 +22,15 @@ BreakIterator::BreakIterator(const string16& str, BreakType break_type)
pos_(0) { pos_(0) {
} }
BreakIterator::BreakIterator(const string16& str, const string16& rules)
: iter_(NULL),
string_(str),
rules_(rules),
break_type_(RULE_BASED),
prev_(npos),
pos_(0) {
}
BreakIterator::~BreakIterator() { BreakIterator::~BreakIterator() {
if (iter_) if (iter_)
ubrk_close(static_cast<UBreakIterator*>(iter_)); ubrk_close(static_cast<UBreakIterator*>(iter_));
...@@ -29,6 +38,7 @@ BreakIterator::~BreakIterator() { ...@@ -29,6 +38,7 @@ BreakIterator::~BreakIterator() {
bool BreakIterator::Init() { bool BreakIterator::Init() {
UErrorCode status = U_ZERO_ERROR; UErrorCode status = U_ZERO_ERROR;
UParseError parse_error;
UBreakIteratorType break_type; UBreakIteratorType break_type;
switch (break_type_) { switch (break_type_) {
case BREAK_CHARACTER: case BREAK_CHARACTER:
...@@ -39,19 +49,39 @@ bool BreakIterator::Init() { ...@@ -39,19 +49,39 @@ bool BreakIterator::Init() {
break; break;
case BREAK_LINE: case BREAK_LINE:
case BREAK_NEWLINE: case BREAK_NEWLINE:
case RULE_BASED: // (Keep compiler happy, break_type not used in this case)
break_type = UBRK_LINE; break_type = UBRK_LINE;
break; break;
default: default:
NOTREACHED() << "invalid break_type_"; NOTREACHED() << "invalid break_type_";
return false; return false;
} }
iter_ = ubrk_open(break_type, NULL, if (break_type_ == RULE_BASED) {
string_.data(), static_cast<int32_t>(string_.size()), iter_ = ubrk_openRules(rules_.c_str(),
static_cast<int32_t>(rules_.length()),
string_.data(),
static_cast<int32_t>(string_.size()),
&parse_error,
&status);
if (U_FAILURE(status)) {
NOTREACHED() << "ubrk_openRules failed to parse rule string at line "
<< parse_error.line << ", offset " << parse_error.offset;
}
} else {
iter_ = ubrk_open(break_type,
NULL,
string_.data(),
static_cast<int32_t>(string_.size()),
&status); &status);
if (U_FAILURE(status)) { if (U_FAILURE(status)) {
NOTREACHED() << "ubrk_open failed"; NOTREACHED() << "ubrk_open failed";
}
}
if (U_FAILURE(status)) {
return false; return false;
} }
// Move the iterator to the beginning of the string. // Move the iterator to the beginning of the string.
ubrk_first(static_cast<UBreakIterator*>(iter_)); ubrk_first(static_cast<UBreakIterator*>(iter_));
return true; return true;
...@@ -65,6 +95,7 @@ bool BreakIterator::Advance() { ...@@ -65,6 +95,7 @@ bool BreakIterator::Advance() {
case BREAK_CHARACTER: case BREAK_CHARACTER:
case BREAK_WORD: case BREAK_WORD:
case BREAK_LINE: case BREAK_LINE:
case RULE_BASED:
pos = ubrk_next(static_cast<UBreakIterator*>(iter_)); pos = ubrk_next(static_cast<UBreakIterator*>(iter_));
if (pos == UBRK_DONE) { if (pos == UBRK_DONE) {
pos_ = npos; pos_ = npos;
...@@ -91,13 +122,28 @@ bool BreakIterator::Advance() { ...@@ -91,13 +122,28 @@ bool BreakIterator::Advance() {
} }
} }
bool BreakIterator::SetText(const base::char16* text, const size_t length) {
UErrorCode status = U_ZERO_ERROR;
ubrk_setText(static_cast<UBreakIterator*>(iter_),
text, length, &status);
pos_ = 0; // implicit when ubrk_setText is done
prev_ = npos;
if (U_FAILURE(status)) {
NOTREACHED() << "ubrk_setText failed";
return false;
}
return true;
}
bool BreakIterator::IsWord() const { bool BreakIterator::IsWord() const {
int32_t status = ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_)); int32_t status = ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_));
return (break_type_ == BREAK_WORD && status != UBRK_WORD_NONE); if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
return false;
return status != UBRK_WORD_NONE;
} }
bool BreakIterator::IsEndOfWord(size_t position) const { bool BreakIterator::IsEndOfWord(size_t position) const {
if (break_type_ != BREAK_WORD) if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
return false; return false;
UBreakIterator* iter = static_cast<UBreakIterator*>(iter_); UBreakIterator* iter = static_cast<UBreakIterator*>(iter_);
...@@ -107,7 +153,7 @@ bool BreakIterator::IsEndOfWord(size_t position) const { ...@@ -107,7 +153,7 @@ bool BreakIterator::IsEndOfWord(size_t position) const {
} }
bool BreakIterator::IsStartOfWord(size_t position) const { bool BreakIterator::IsStartOfWord(size_t position) const {
if (break_type_ != BREAK_WORD) if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
return false; return false;
UBreakIterator* iter = static_cast<UBreakIterator*>(iter_); UBreakIterator* iter = static_cast<UBreakIterator*>(iter_);
......
...@@ -66,10 +66,17 @@ class BASE_I18N_EXPORT BreakIterator { ...@@ -66,10 +66,17 @@ class BASE_I18N_EXPORT BreakIterator {
BREAK_SPACE = BREAK_LINE, BREAK_SPACE = BREAK_LINE,
BREAK_NEWLINE, BREAK_NEWLINE,
BREAK_CHARACTER, BREAK_CHARACTER,
// But don't remove this one!
RULE_BASED,
}; };
// Requires |str| to live as long as the BreakIterator does. // Requires |str| to live as long as the BreakIterator does.
BreakIterator(const string16& str, BreakType break_type); BreakIterator(const string16& str, BreakType break_type);
// Make a rule-based iterator. BreakType == RULE_BASED is implied.
// TODO(andrewhayden): This signature could easily be misinterpreted as
// "(const string16& str, const string16& locale)". We should do something
// better.
BreakIterator(const string16& str, const string16& rules);
~BreakIterator(); ~BreakIterator();
// Init() must be called before any of the iterators are valid. // Init() must be called before any of the iterators are valid.
...@@ -82,6 +89,11 @@ class BASE_I18N_EXPORT BreakIterator { ...@@ -82,6 +89,11 @@ class BASE_I18N_EXPORT BreakIterator {
// last time Advance() returns true.) // last time Advance() returns true.)
bool Advance(); bool Advance();
// Updates the text used by the iterator, resetting the iterator as if
// if Init() had been called again. Any old state is lost. Returns true
// unless there is an error setting the text.
bool SetText(const base::char16* text, const size_t length);
// Under BREAK_WORD mode, returns true if the break we just hit is the // Under BREAK_WORD mode, returns true if the break we just hit is the
// end of a word. (Otherwise, the break iterator just skipped over e.g. // end of a word. (Otherwise, the break iterator just skipped over e.g.
// whitespace or punctuation.) Under BREAK_LINE and BREAK_NEWLINE modes, // whitespace or punctuation.) Under BREAK_LINE and BREAK_NEWLINE modes,
...@@ -113,10 +125,13 @@ class BASE_I18N_EXPORT BreakIterator { ...@@ -113,10 +125,13 @@ class BASE_I18N_EXPORT BreakIterator {
// callers from needing access to the ICU public headers directory. // callers from needing access to the ICU public headers directory.
void* iter_; void* iter_;
// The string we're iterating over. // The string we're iterating over. Can be changed with SetText(...)
const string16& string_; const string16& string_;
// The breaking style (word/space/newline). // Rules for our iterator. Mutually exclusive with break_type_.
const string16 rules_;
// The breaking style (word/space/newline). Mutually exclusive with rules_
BreakType break_type_; BreakType break_type_;
// Previous and current iterator positions. // Previous and current iterator positions.
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include <string> #include <string>
#include "base/basictypes.h" #include "base/basictypes.h"
#include "base/i18n/break_iterator.h"
#include "base/logging.h" #include "base/logging.h"
#include "base/strings/stringprintf.h" #include "base/strings/stringprintf.h"
#include "base/strings/utf_string_conversions.h" #include "base/strings/utf_string_conversions.h"
...@@ -299,10 +300,8 @@ bool SpellcheckCharAttribute::OutputDefault(UChar c, ...@@ -299,10 +300,8 @@ bool SpellcheckCharAttribute::OutputDefault(UChar c,
SpellcheckWordIterator::SpellcheckWordIterator() SpellcheckWordIterator::SpellcheckWordIterator()
: text_(NULL), : text_(NULL),
length_(0),
position_(UBRK_DONE),
attribute_(NULL), attribute_(NULL),
iterator_(NULL) { iterator_() {
} }
SpellcheckWordIterator::~SpellcheckWordIterator() { SpellcheckWordIterator::~SpellcheckWordIterator() {
...@@ -315,18 +314,22 @@ bool SpellcheckWordIterator::Initialize( ...@@ -315,18 +314,22 @@ bool SpellcheckWordIterator::Initialize(
// Create a custom ICU break iterator with empty text used in this object. (We // Create a custom ICU break iterator with empty text used in this object. (We
// allow setting text later so we can re-use this iterator.) // allow setting text later so we can re-use this iterator.)
DCHECK(attribute); DCHECK(attribute);
UErrorCode open_status = U_ZERO_ERROR; const base::string16 rule(attribute->GetRuleSet(allow_contraction));
UParseError parse_status;
base::string16 rule(attribute->GetRuleSet(allow_contraction));
// If there is no rule set, the attributes were invalid. // If there is no rule set, the attributes were invalid.
if (rule.empty()) if (rule.empty())
return false; return false;
iterator_ = ubrk_openRules(rule.c_str(), rule.length(), NULL, 0, scoped_ptr<base::i18n::BreakIterator> iterator(
&parse_status, &open_status); new base::i18n::BreakIterator(base::string16(), rule));
if (U_FAILURE(open_status)) if (!iterator->Init()) {
// Since we're not passing in any text, the only reason this could fail
// is if we fail to parse the rules. Since the rules are hardcoded,
// that would be a bug in this class.
NOTREACHED() << "failed to open iterator (broken rules)";
return false; return false;
}
iterator_ = iterator.Pass();
// Set the character attributes so we can normalize the words extracted by // Set the character attributes so we can normalize the words extracted by
// this iterator. // this iterator.
...@@ -335,7 +338,7 @@ bool SpellcheckWordIterator::Initialize( ...@@ -335,7 +338,7 @@ bool SpellcheckWordIterator::Initialize(
} }
bool SpellcheckWordIterator::IsInitialized() const { bool SpellcheckWordIterator::IsInitialized() const {
// Return true if we have an ICU custom iterator. // Return true iff we have an iterator.
return !!iterator_; return !!iterator_;
} }
...@@ -343,66 +346,51 @@ bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) { ...@@ -343,66 +346,51 @@ bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) {
DCHECK(!!iterator_); DCHECK(!!iterator_);
// Set the text to be split by this iterator. // Set the text to be split by this iterator.
UErrorCode status = U_ZERO_ERROR; if (!iterator_->SetText(text, length)) {
ubrk_setText(iterator_, text, length, &status); LOG(ERROR) << "failed to set text";
if (U_FAILURE(status))
return false;
// Retrieve the position to the first word in this text. We return false if
// this text does not have any words. (For example, The input text consists
// only of Chinese characters while the spellchecker language is English.)
position_ = ubrk_first(iterator_);
if (position_ == UBRK_DONE)
return false; return false;
}
text_ = text; text_ = text;
length_ = static_cast<int>(length);
return true; return true;
} }
bool SpellcheckWordIterator::GetNextWord(base::string16* word_string, bool SpellcheckWordIterator::GetNextWord(base::string16* word_string,
int* word_start, int* word_start,
int* word_length) { int* word_length) {
DCHECK(!!text_ && length_ > 0); DCHECK(!!text_);
word_string->clear(); word_string->clear();
*word_start = 0; *word_start = 0;
*word_length = 0; *word_length = 0;
if (!text_ || position_ == UBRK_DONE) if (!text_) {
return false; return false;
}
// Find a word that can be checked for spelling. Our rule sets filter out // Find a word that can be checked for spelling. Our rule sets filter out
// invalid words (e.g. numbers and characters not supported by the // invalid words (e.g. numbers and characters not supported by the
// spellchecker language) so this ubrk_getRuleStatus() call returns // spellchecker language) so this ubrk_getRuleStatus() call returns
// UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such
// words until we can find a valid word or reach the end of the input string. // words until we can find a valid word or reach the end of the input string.
int next = ubrk_next(iterator_); while (iterator_->Advance()) {
while (next != UBRK_DONE) { const size_t start = iterator_->prev();
if (ubrk_getRuleStatus(iterator_) != UBRK_WORD_NONE) { const size_t length = iterator_->pos() - start;
if (Normalize(position_, next - position_, word_string)) { if (iterator_->IsWord()) {
*word_start = position_; if (Normalize(start, length, word_string)) {
*word_length = next - position_; *word_start = start;
position_ = next; *word_length = length;
return true; return true;
} }
} }
position_ = next;
next = ubrk_next(iterator_);
} }
// There aren't any more words in the given text. Set the position to // There aren't any more words in the given text.
// UBRK_DONE to prevent from calling ubrk_next() next time when this function
// is called.
position_ = UBRK_DONE;
return false; return false;
} }
void SpellcheckWordIterator::Reset() { void SpellcheckWordIterator::Reset() {
if (iterator_) { iterator_.reset();
ubrk_close(iterator_);
iterator_ = NULL;
}
} }
bool SpellcheckWordIterator::Normalize(int input_start, bool SpellcheckWordIterator::Normalize(int input_start,
......
...@@ -12,10 +12,16 @@ ...@@ -12,10 +12,16 @@
#include <string> #include <string>
#include "base/basictypes.h" #include "base/basictypes.h"
#include "base/memory/scoped_ptr.h"
#include "base/strings/string16.h" #include "base/strings/string16.h"
#include "third_party/icu/source/common/unicode/ubrk.h"
#include "third_party/icu/source/common/unicode/uscript.h" #include "third_party/icu/source/common/unicode/uscript.h"
namespace base {
namespace i18n {
class BreakIterator;
} // namespace i18n
} // namespace base
// A class which encapsulates language-specific operations used by // A class which encapsulates language-specific operations used by
// SpellcheckWordIterator. When we set the spellchecker language, this class // SpellcheckWordIterator. When we set the spellchecker language, this class
// creates rule sets that filter out the characters not supported by the // creates rule sets that filter out the characters not supported by the
...@@ -156,18 +162,12 @@ class SpellcheckWordIterator { ...@@ -156,18 +162,12 @@ class SpellcheckWordIterator {
// The pointer to the input string from which we are extracting words. // The pointer to the input string from which we are extracting words.
const base::char16* text_; const base::char16* text_;
// The length of the original string.
int length_;
// The current position in the original string.
int position_;
// The language-specific attributes used for filtering out non-word // The language-specific attributes used for filtering out non-word
// characters. // characters.
const SpellcheckCharAttribute* attribute_; const SpellcheckCharAttribute* attribute_;
// The ICU break iterator. // The break iterator.
UBreakIterator* iterator_; scoped_ptr<base::i18n::BreakIterator> iterator_;
DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator); DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator);
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment