Refactor code to avoid direct dependency upon ICU: spellcheck_worditerator

BUG=367677 Review URL: https://codereview.chromium.org/270203003 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@276869 0039d316-1c4b-4281-b951-d872f2087c98

Refactor code to avoid direct dependency upon ICU: spellcheck_worditerator
BUG=367677 Review URL: https://codereview.chromium.org/270203003 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@276869 0039d316-1c4b-4281-b951-d872f2087c98
20430bb3 · andrewhayden@chromium.org · 8094dc81 · 20430bb3 · 20430bb3 · 20430bb3
Commit 20430bb3 authored Jun 13, 2014 by andrewhayden@chromium.org
4 changed files
--- a/base/i18n/break_iterator.cc
+++ b/base/i18n/break_iterator.cc
@@ -22,6 +22,15 @@ BreakIterator::BreakIterator(const string16& str, BreakType break_type)
      pos_(0) {
 }

+BreakIterator::BreakIterator(const string16& str, const string16& rules)
+    : iter_(NULL),
+      string_(str),
+      rules_(rules),
+      break_type_(RULE_BASED),
+      prev_(npos),
+      pos_(0) {
+}
+
 BreakIterator::~BreakIterator() {
  if (iter_)
    ubrk_close(static_cast<UBreakIterator*>(iter_));
@@ -29,6 +38,7 @@ BreakIterator::~BreakIterator() {

 bool BreakIterator::Init() {
  UErrorCode status = U_ZERO_ERROR;
+  UParseError parse_error;
  UBreakIteratorType break_type;
  switch (break_type_) {
    case BREAK_CHARACTER:
@@ -39,19 +49,39 @@ bool BreakIterator::Init() {
      break;
    case BREAK_LINE:
    case BREAK_NEWLINE:
+    case RULE_BASED: // (Keep compiler happy, break_type not used in this case)
      break_type = UBRK_LINE;
      break;
    default:
      NOTREACHED() << "invalid break_type_";
      return false;
  }
-  iter_ = ubrk_open(break_type, NULL,
-                    string_.data(), static_cast<int32_t>(string_.size()),
-                    &status);
+  if (break_type_ == RULE_BASED) {
+    iter_ = ubrk_openRules(rules_.c_str(),
+                           static_cast<int32_t>(rules_.length()),
+                           string_.data(),
+                           static_cast<int32_t>(string_.size()),
+                           &parse_error,
+                           &status);
+    if (U_FAILURE(status)) {
+      NOTREACHED() << "ubrk_openRules failed to parse rule string at line "
+          << parse_error.line << ", offset " << parse_error.offset;
+    }
+  } else {
+    iter_ = ubrk_open(break_type,
+                      NULL,
+                      string_.data(),
+                      static_cast<int32_t>(string_.size()),
+                      &status);
+    if (U_FAILURE(status)) {
+      NOTREACHED() << "ubrk_open failed";
+    }
+  }
+
  if (U_FAILURE(status)) {
-    NOTREACHED() << "ubrk_open failed";
    return false;
  }
+
  // Move the iterator to the beginning of the string.
  ubrk_first(static_cast<UBreakIterator*>(iter_));
  return true;
@@ -65,6 +95,7 @@ bool BreakIterator::Advance() {
    case BREAK_CHARACTER:
    case BREAK_WORD:
    case BREAK_LINE:
+    case RULE_BASED:
      pos = ubrk_next(static_cast<UBreakIterator*>(iter_));
      if (pos == UBRK_DONE) {
        pos_ = npos;
@@ -91,14 +122,29 @@ bool BreakIterator::Advance() {
  }
 }

+bool BreakIterator::SetText(const base::char16* text, const size_t length) {
+  UErrorCode status = U_ZERO_ERROR;
+  ubrk_setText(static_cast<UBreakIterator*>(iter_),
+               text, length, &status);
+  pos_ = 0;  // implicit when ubrk_setText is done
+  prev_ = npos;
+  if (U_FAILURE(status)) {
+    NOTREACHED() << "ubrk_setText failed";
+    return false;
+  }
+  return true;
+}
+
 bool BreakIterator::IsWord() const {
  int32_t status = ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_));
-  return (break_type_ == BREAK_WORD && status != UBRK_WORD_NONE);
+  if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
+      return false;
+  return status != UBRK_WORD_NONE;
 }

 bool BreakIterator::IsEndOfWord(size_t position) const {
-  if (break_type_ != BREAK_WORD)
-    return false;
+  if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
+      return false;

  UBreakIterator* iter = static_cast<UBreakIterator*>(iter_);
  UBool boundary = ubrk_isBoundary(iter, static_cast<int32_t>(position));
@@ -107,8 +153,8 @@ bool BreakIterator::IsEndOfWord(size_t position) const {
 }

 bool BreakIterator::IsStartOfWord(size_t position) const {
-  if (break_type_ != BREAK_WORD)
-    return false;
+  if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
+      return false;

  UBreakIterator* iter = static_cast<UBreakIterator*>(iter_);
  UBool boundary = ubrk_isBoundary(iter, static_cast<int32_t>(position));

--- a/base/i18n/break_iterator.h
+++ b/base/i18n/break_iterator.h
@@ -66,10 +66,17 @@ class BASE_I18N_EXPORT BreakIterator {
    BREAK_SPACE = BREAK_LINE,
    BREAK_NEWLINE,
    BREAK_CHARACTER,
+    // But don't remove this one!
+    RULE_BASED,
  };

  // Requires |str| to live as long as the BreakIterator does.
  BreakIterator(const string16& str, BreakType break_type);
+  // Make a rule-based iterator. BreakType == RULE_BASED is implied.
+  // TODO(andrewhayden): This signature could easily be misinterpreted as
+  // "(const string16& str, const string16& locale)". We should do something
+  // better.
+  BreakIterator(const string16& str, const string16& rules);
  ~BreakIterator();

  // Init() must be called before any of the iterators are valid.
@@ -82,6 +89,11 @@ class BASE_I18N_EXPORT BreakIterator {
  // last time Advance() returns true.)
  bool Advance();

+  // Updates the text used by the iterator, resetting the iterator as if
+  // Init() had been called again. Any old state is lost. Returns true
+  // unless there is an error setting the text.
+  bool SetText(const base::char16* text, const size_t length);
+
  // Under BREAK_WORD mode, returns true if the break we just hit is the
  // end of a word. (Otherwise, the break iterator just skipped over e.g.
  // whitespace or punctuation.)  Under BREAK_LINE and BREAK_NEWLINE modes,
@@ -113,10 +125,13 @@ class BASE_I18N_EXPORT BreakIterator {
  // callers from needing access to the ICU public headers directory.
  void* iter_;

-  // The string we're iterating over.
+  // The string we're iterating over. Can be changed with SetText(...)
  const string16& string_;

-  // The breaking style (word/space/newline).
+  // Rules for our iterator. Mutually exclusive with break_type_.
+  const string16 rules_;
+
+  // The breaking style (word/space/newline); RULE_BASED when rules_ is used.
  BreakType break_type_;

  // Previous and current iterator positions.

--- a/chrome/renderer/spellchecker/spellcheck_worditerator.cc
+++ b/chrome/renderer/spellchecker/spellcheck_worditerator.cc
@@ -10,6 +10,7 @@
 #include <string>

 #include "base/basictypes.h"
+#include "base/i18n/break_iterator.h"
 #include "base/logging.h"
 #include "base/strings/stringprintf.h"
 #include "base/strings/utf_string_conversions.h"
@@ -299,10 +300,8 @@ bool SpellcheckCharAttribute::OutputDefault(UChar c,

 SpellcheckWordIterator::SpellcheckWordIterator()
    : text_(NULL),
-      length_(0),
-      position_(UBRK_DONE),
      attribute_(NULL),
-      iterator_(NULL) {
+      iterator_() {
 }

 SpellcheckWordIterator::~SpellcheckWordIterator() {
@@ -315,18 +314,22 @@ bool SpellcheckWordIterator::Initialize(
  // Create a custom ICU break iterator with empty text used in this object. (We
  // allow setting text later so we can re-use this iterator.)
  DCHECK(attribute);
-  UErrorCode open_status = U_ZERO_ERROR;
-  UParseError parse_status;
-  base::string16 rule(attribute->GetRuleSet(allow_contraction));
+  const base::string16 rule(attribute->GetRuleSet(allow_contraction));

  // If there is no rule set, the attributes were invalid.
  if (rule.empty())
    return false;

-  iterator_ = ubrk_openRules(rule.c_str(), rule.length(), NULL, 0,
-                             &parse_status, &open_status);
-  if (U_FAILURE(open_status))
+  scoped_ptr<base::i18n::BreakIterator> iterator(
+      new base::i18n::BreakIterator(base::string16(), rule));
+  if (!iterator->Init()) {
+    // Since we're not passing in any text, the only reason this could fail
+    // is if we fail to parse the rules. Since the rules are hardcoded,
+    // that would be a bug in this class.
+    NOTREACHED() << "failed to open iterator (broken rules)";
    return false;
+  }
+  iterator_ = iterator.Pass();

  // Set the character attributes so we can normalize the words extracted by
  // this iterator.
@@ -335,7 +338,7 @@ bool SpellcheckWordIterator::Initialize(
 }

 bool SpellcheckWordIterator::IsInitialized() const {
-  // Return true if we have an ICU custom iterator.
+  // Return true iff we have an iterator.
  return !!iterator_;
 }

@@ -343,66 +346,51 @@ bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) {
  DCHECK(!!iterator_);

  // Set the text to be split by this iterator.
-  UErrorCode status = U_ZERO_ERROR;
-  ubrk_setText(iterator_, text, length, &status);
-  if (U_FAILURE(status))
-    return false;
-
-  // Retrieve the position to the first word in this text. We return false if
-  // this text does not have any words. (For example, The input text consists
-  // only of Chinese characters while the spellchecker language is English.)
-  position_ = ubrk_first(iterator_);
-  if (position_ == UBRK_DONE)
+  if (!iterator_->SetText(text, length)) {
+    LOG(ERROR) << "failed to set text";
    return false;
+  }

  text_ = text;
-  length_ = static_cast<int>(length);
  return true;
 }

 bool SpellcheckWordIterator::GetNextWord(base::string16* word_string,
                                         int* word_start,
                                         int* word_length) {
-  DCHECK(!!text_ && length_ > 0);
+  DCHECK(!!text_);

  word_string->clear();
  *word_start = 0;
  *word_length = 0;

-  if (!text_ || position_ == UBRK_DONE)
+  if (!text_) {
    return false;
+  }

  // Find a word that can be checked for spelling. Our rule sets filter out
  // invalid words (e.g. numbers and characters not supported by the
  // spellchecker language) so this ubrk_getRuleStatus() call returns
  // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such
  // words until we can find a valid word or reach the end of the input string.
-  int next = ubrk_next(iterator_);
-  while (next != UBRK_DONE) {
-    if (ubrk_getRuleStatus(iterator_) != UBRK_WORD_NONE) {
-      if (Normalize(position_, next - position_, word_string)) {
-        *word_start = position_;
-        *word_length = next - position_;
-        position_ = next;
+  while (iterator_->Advance()) {
+    const size_t start = iterator_->prev();
+    const size_t length = iterator_->pos() - start;
+    if (iterator_->IsWord()) {
+      if (Normalize(start, length, word_string)) {
+        *word_start = start;
+        *word_length = length;
        return true;
      }
    }
-    position_ = next;
-    next = ubrk_next(iterator_);
  }

-  // There aren't any more words in the given text. Set the position to
-  // UBRK_DONE to prevent from calling ubrk_next() next time when this function
-  // is called.
-  position_ = UBRK_DONE;
+  // There aren't any more words in the given text.
  return false;
 }

 void SpellcheckWordIterator::Reset() {
-  if (iterator_) {
-    ubrk_close(iterator_);
-    iterator_ = NULL;
-  }
+  iterator_.reset();
 }

 bool SpellcheckWordIterator::Normalize(int input_start,

--- a/chrome/renderer/spellchecker/spellcheck_worditerator.h
+++ b/chrome/renderer/spellchecker/spellcheck_worditerator.h
@@ -12,10 +12,16 @@
 #include <string>

 #include "base/basictypes.h"
+#include "base/memory/scoped_ptr.h"
 #include "base/strings/string16.h"
-#include "third_party/icu/source/common/unicode/ubrk.h"
 #include "third_party/icu/source/common/unicode/uscript.h"

+namespace base {
+namespace i18n {
+class BreakIterator;
+} // namespace i18n
+} // namespace base
+
 // A class which encapsulates language-specific operations used by
 // SpellcheckWordIterator. When we set the spellchecker language, this class
 // creates rule sets that filter out the characters not supported by the
@@ -156,18 +162,12 @@ class SpellcheckWordIterator {
  // The pointer to the input string from which we are extracting words.
  const base::char16* text_;

-  // The length of the original string.
-  int length_;
-
-  // The current position in the original string.
-  int position_;
-
  // The language-specific attributes used for filtering out non-word
  // characters.
  const SpellcheckCharAttribute* attribute_;

-  // The ICU break iterator.
-  UBreakIterator* iterator_;
+  // The break iterator.
+  scoped_ptr<base::i18n::BreakIterator> iterator_;

  DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator);
 };