Split CJK full names into name parts correctly.

This is a partial, somewhat naive, fix for a bug where CJK names were split the same way as western names. CJK names have the family name (surname) first, and don't usually have a space in-between the two parts. For a complete fix, we might need to improve Japanese name detection, when there is no space between the two names. It's unclear how often users actually enter their name without spaces in a form field. We might also want to fix the opposite use-case: when the user enters their first & last name in separate fields, we should infer their full name in the right order (with ordering based on the script used). BUG=89111 Review-Url: https://codereview.chromium.org/2132103002 Cr-Commit-Position: refs/heads/master@{#405997}

Split CJK full names into name parts correctly.
This is a partial, somewhat naive, fix for a bug where CJK names were split the same way as western names. CJK names have the family name (surname) first, and don't usually have a space in-between the two parts. For a complete fix, we might need to improve Japanese name detection, when there is no space between the two names. It's unclear how often users actually enter their name without spaces in a form field. We might also want to fix the opposite use-case: when the user enters their first & last name in separate fields, we should infer their full name in the right order (with ordering based on the script used). BUG=89111 Review-Url: https://codereview.chromium.org/2132103002 Cr-Commit-Position: refs/heads/master@{#405997}
4a655982 · nicolaso · Commit bot · df5e915f · 4a655982 · 4a655982
Commit 4a655982 authored Jul 18, 2016 by nicolaso Committed by Commit bot Jul 18, 2016
2 changed files
--- a/components/autofill/core/browser/autofill_data_util.cc
+++ b/components/autofill/core/browser/autofill_data_util.cc
@@ -4,12 +4,14 @@

 #include "components/autofill/core/browser/autofill_data_util.h"

+#include <algorithm>
 #include <vector>

 #include "base/strings/string_split.h"
 #include "base/strings/string_util.h"
 #include "base/strings/utf_string_conversions.h"
 #include "components/autofill/core/browser/field_types.h"
+#include "third_party/icu/source/common/unicode/uscript.h"

 namespace autofill {
 namespace data_util {
@@ -31,6 +33,28 @@ const char* const family_name_prefixes[] = {"d'",  "de",  "del", "der", "di",
                                            "la",  "le",  "mc",  "san", "st",
                                            "ter", "van", "von"};

+// The common and non-ambiguous CJK surnames (last names) that have more than
+// one character. StartsWithAny() decodes the entries as UTF-8 when matching.
+const char* common_cjk_multi_char_surnames[] = {
+  // Korean, taken from the list of surnames:
+  // https://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EC%84%B1%EC%94%A8_%EB%AA%A9%EB%A1%9D
+  "남궁", "사공", "서문", "선우", "제갈", "황보", "독고", "망절",

+  // Chinese, taken from the top 10 Chinese 2-character surnames:
+  // https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93
+  // Simplified Chinese (mostly mainland China)
+  "欧阳", "令狐", "皇甫", "上官", "司徒", "诸葛", "司马", "宇文", "呼延", "端木",
+  // Traditional Chinese (mostly Taiwan)
+  "張簡", "歐陽", "諸葛", "申屠", "尉遲", "司馬", "軒轅", "夏侯"
+};
+
+// All Korean surnames that have more than one character, including rare or
+// ambiguous ones. SplitCJKName() uses it only for longer all-Hangul names.
+const char* korean_multi_char_surnames[] = {
+  "강전", "남궁", "독고", "동방", "망절", "사공", "서문", "선우",
+  "소봉", "어금", "장곡", "제갈", "황목", "황보"
+};
+
 // Returns true if |set| contains |element|, modulo a final period.
 bool ContainsString(const char* const set[],
                    size_t set_size,
@@ -74,6 +98,127 @@ void StripSuffixes(std::vector<base::string16>* name_tokens) {
  }
 }

+// Finds the first entry of |prefixes| (|prefix_count| UTF-8 strings) that
+// |name| starts with, case-sensitively. Returns the matched prefix's length in
+// UTF-16 code units, or 0 if no entry matches.
+size_t StartsWithAny(base::StringPiece16 name, const char** prefixes,
+                     size_t prefix_count) {
+   base::string16 buffer;
+   for (size_t i = 0; i < prefix_count; i++) {
+     buffer.clear();
+     base::UTF8ToUTF16(prefixes[i], strlen(prefixes[i]), &buffer);
+     if (base::StartsWith(name, buffer, base::CompareCase::SENSITIVE)) {
+       return buffer.size();
+     }
+   }
+   return 0;
+}
+
+// Returns true if ICU classifies |c| (one UTF-16 code unit) as Han, Hangul,
+// Katakana, Hiragana or Bopomofo. The ICU error code is not inspected.
+bool IsCJK(base::char16 c) {
+  static const std::set<UScriptCode> kCjkScripts {
+    USCRIPT_HAN, // CJK logographs, used by all 3 (but rarely for Korean)
+    USCRIPT_HANGUL, // Korean alphabet
+    USCRIPT_KATAKANA, // A Japanese syllabary
+    USCRIPT_HIRAGANA, // A Japanese syllabary
+    USCRIPT_BOPOMOFO // Chinese semisyllabary, rarely used
+  };
+  UErrorCode error = U_ZERO_ERROR;
+  UScriptCode script = uscript_getScript(c, &error);
+  return kCjkScripts.find(script) != kCjkScripts.end();
+}
+
+// Returns true if |name| looks like a CJK name (or some kind of mish-mash of
+// the three, at least): every UTF-16 code unit is either a CJK character (see
+// IsCJK()) or Unicode whitespace. Empty/all-whitespace input also passes.
+//
+// Chinese and Japanese names are usually spelled out using the Han characters
+// (logographs), which constitute the "CJK Unified Ideographs" block in Unicode,
+// also referred to as Unihan. Korean names are usually spelled out in the
+// Korean alphabet (Hangul), although they do have a Han equivalent as well.
+bool IsCJKName(const base::string16& name) {
+  for (base::char16 c : name) {
+    if (!IsCJK(c) && !base::IsUnicodeWhitespace(c)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Returns true if ICU reports the script of code unit |c| as USCRIPT_HANGUL.
+bool IsHangul(base::char16 c) {
+  UErrorCode error = U_ZERO_ERROR;
+  return uscript_getScript(c, &error) == USCRIPT_HANGUL;
+}
+
+// Returns true if every code unit of |name| is Hangul or Unicode whitespace,
+// i.e. |name| looks like a Korean name. An empty |name| also returns true.
+bool IsHangulName(const base::string16& name) {
+  for (base::char16 c : name) {
+    if (!IsHangul(c) && !base::IsUnicodeWhitespace(c)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Tries to split a tokenized Chinese, Japanese, or Korean name into surname
+// (|parts->family|) and given name (|parts->given|). Returns false, leaving
+// |parts| untouched, unless |name_tokens| holds exactly 1 or 2 tokens.
+bool SplitCJKName(const std::vector<base::string16>& name_tokens,
+                  NameParts* parts) {
+  // The convention for CJK languages is to put the surname (last name) first,
+  // and the given name (first name) second. In a continuous text, there is
+  // normally no space between the two parts of the name. When entering their
+  // name into a field, though, some people add a space to disambiguate. CJK
+  // names (almost) never have a middle name.
+  //
+  // TODO(crbug.com/89111): Foreign names in Japanese are written in Katakana,
+  // with a '・' (KATAKANA MIDDLE DOT U+30FB) character as a separator, with
+  // the *western* ordering. e.g. "ビル・ゲイツ" ("biru・geitsu" AKA Bill Gates)
+  if (name_tokens.size() == 1) {
+    // There is no space between the surname and given name. Try to infer where
+    // to separate between the two. Most Chinese and Korean surnames have only
+    // one character, but there are a few that have 2. If the name does not
+    // start with a surname from a known list, default to 1 character.
+    //
+    // TODO(crbug.com/89111): Japanese names with no space will be mis-split,
+    // since we don't have a list of Japanese last names. In the Han script,
+    // it might also be difficult for us to differentiate between Chinese &
+    // Japanese names.
+    const base::string16& name = name_tokens.front();
+    const bool is_korean = IsHangulName(name);
+    size_t surname_length = 0;
+    if (is_korean && name.size() > 3) {
+      // Korean names of 4+ characters are more likely to be 2/2 than 1/3, so
+      // use the full list of Korean 2-char surnames (instead of only the
+      // common ones).
+      surname_length = std::max<size_t>(
+          1, StartsWithAny(name, korean_multi_char_surnames,
+                           arraysize(korean_multi_char_surnames)));
+    } else {
+      // Default to 1 character if the surname is not in
+      // |common_cjk_multi_char_surnames|.
+      surname_length = std::max<size_t>(
+          1, StartsWithAny(name, common_cjk_multi_char_surnames,
+                           arraysize(common_cjk_multi_char_surnames)));
+    }
+    parts->family = name.substr(0, surname_length);
+    parts->given = name.substr(surname_length);
+    return true;
+  }
+  if (name_tokens.size() == 2) {
+    // The user entered a space between the two name parts. This makes our job
+    // easier. Family name first, given name second.
+    parts->family = name_tokens[0];
+    parts->given = name_tokens[1];
+    return true;
+  }
+  // We don't know what to do with no tokens or with more than 2 tokens.
+  return false;
+}
+
 }  // namespace

 NameParts SplitName(const base::string16& name) {
@@ -82,12 +227,21 @@ NameParts SplitName(const base::string16& name) {
                        base::SPLIT_WANT_NONEMPTY);
  StripPrefixes(&name_tokens);

+  NameParts parts;
+
+  // TODO(crbug.com/89111): Hungarian, Tamil, Telugu, and Vietnamese also have
+  // the surname before the given name, and should be treated as special cases
+  // too.
+
+  // Treat CJK names differently.
+  if (IsCJKName(name) && SplitCJKName(name_tokens, &parts)) {
+    return parts;
+  }
+
  // Don't assume "Ma" is a suffix in John Ma.
  if (name_tokens.size() > 2)
    StripSuffixes(&name_tokens);

-  NameParts parts;
-
  if (name_tokens.empty()) {
    // Bad things have happened; just assume the whole thing is a given name.
    parts.given = name;

--- a/components/autofill/core/browser/autofill_data_util_unittest.cc
+++ b/components/autofill/core/browser/autofill_data_util_unittest.cc
@@ -32,7 +32,37 @@ TEST(AutofillDataUtilTest, SplitName) {
      // Exception to the name suffix removal.
      {"John Ma", "John", "", "Ma"},
      // Common family name prefixes not considered a middle name.
-      {"Milhouse Van Houten", "Milhouse", "", "Van Houten"}};
+      {"Milhouse Van Houten", "Milhouse", "", "Van Houten"},
+
+      // CJK names have reverse order (surname goes first, given name goes
+      // second).
+      {"홍 길동", "길동", "", "홍"}, // Korean name, Hangul
+      {"孫 德明", "德明", "", "孫"}, // Chinese name, Unihan
+      {"山田 貴洋", "貴洋", "", "山田"}, // Japanese name, Unihan
+
+      // CJK names don't usually have a space in the middle, but most of the
+      // time, the surname is only one character (in Chinese & Korean).
+      {"최성훈", "성훈", "", "최"}, // Korean name, Hangul
+      {"刘翔", "翔", "", "刘"}, // (Simplified) Chinese name, Unihan
+      {"劉翔", "翔", "", "劉"}, // (Traditional) Chinese name, Unihan
+
+      // There are a few exceptions. Occasionally, the surname has two
+      // characters.
+      {"남궁도", "도", "", "남궁"}, // Korean name, Hangul
+      {"황보혜정", "혜정", "", "황보"}, // Korean name, Hangul
+      {"歐陽靖", "靖", "", "歐陽"}, // (Traditional) Chinese name, Unihan
+
+      // In Korean, some 2-character surnames are rare/ambiguous, like "강전":
+      // "강" is a common surname, and "전" can be part of a given name. In
+      // those cases, we assume it's 1/2 for 3-character names, or 2/2 for
+      // 4-character names.
+      {"강전희", "전희", "", "강"}, // Korean name, Hangul
+      {"황목치승", "치승", "", "황목"}, // Korean name, Hangul
+
+      // It occasionally happens that a full name is 2 characters, 1/1.
+      {"이도", "도", "", "이"}, // Korean name, Hangul
+      {"孫文", "文", "", "孫"} // Chinese name, Unihan
+  };

  for (TestCase test_case : test_cases) {
    NameParts name_parts = SplitName(base::UTF8ToUTF16(test_case.full_name));