Update ReplaceIllegalCharactersInPath to handle quirks in HFS+ and VFAT

This change: - Re-introduces U+200C and U+200D as illegal characters since these are ignored on HFS+ and can interfere with filename sanitization. All code points in the Cf general category are now considered illegal in a filename. - Leading and trailing WSpace and '.' characters are now considered illegal. - Due to being confused for short names on VFAT filesystems, the tilde ('~') is now considered illegal. - Prior to this change, only ASCII whitespace were trimmed from filenames on Mac OSX. All UTF-8 encoded WSpace characters are now handled on Mac OSX. - Instead of trimming leading and trailing whitespace, they are now replaced by the replacement character. Trimming could cause a previously hidden extension or basename to be exposed. BUG=444102 BUG=446538 Review URL: https://codereview.chromium.org/869823003 Cr-Commit-Position: refs/heads/master@{#314041}

Update ReplaceIllegalCharactersInPath to handle quirks in HFS+ and VFAT
This change: - Re-introduces U+200C and U+200D as illegal characters since these are ignored on HFS+ and can interfere with filename sanitization. All code points in the Cf general category are now considered illegal in a filename. - Leading and trailing WSpace and '.' characters are now considered illegal. - Due to being confused for short names on VFAT filesystems, the tilde ('~') is now considered illegal. - Prior to this change, only ASCII whitespace were trimmed from filenames on Mac OSX. All UTF-8 encoded WSpace characters are now handled on Mac OSX. - Instead of trimming leading and trailing whitespace, they are now replaced by the replacement character. Trimming could cause a previously hidden extension or basename to be exposed. BUG=444102 BUG=446538 Review URL: https://codereview.chromium.org/869823003 Cr-Commit-Position: refs/heads/master@{#314041}
92a354fc · asanka · Commit bot · 745ff1db · 92a354fc · 92a354fc
Commit 92a354fc authored Jan 30, 2015 by asanka Committed by Commit bot Jan 31, 2015
6 changed files
--- a/base/i18n/file_util_icu.cc
+++ b/base/i18n/file_util_icu.cc
@@ -30,12 +30,19 @@ class IllegalCharacters {
    return Singleton<IllegalCharacters>::get();
  }

-  bool contains(UChar32 ucs4) {
-    return !!set->contains(ucs4);
+  bool DisallowedEverywhere(UChar32 ucs4) {
+    return !!illegal_anywhere_->contains(ucs4);
  }

-  bool containsNone(const string16 &s) {
-    return !!set->containsNone(icu::UnicodeString(s.c_str(), s.size()));
+  bool DisallowedLeadingOrTrailing(UChar32 ucs4) {
+    return !!illegal_at_ends_->contains(ucs4);
+  }
+
+  bool IsAllowedName(const string16& s) {
+    return s.empty() || (!!illegal_anywhere_->containsNone(
+                             icu::UnicodeString(s.c_str(), s.size())) &&
+                         !illegal_at_ends_->contains(*s.begin()) &&
+                         !illegal_at_ends_->contains(*s.rbegin()));
  }

 private:
@@ -45,60 +52,61 @@ class IllegalCharacters {
  IllegalCharacters();
  ~IllegalCharacters() { }

-  scoped_ptr<icu::UnicodeSet> set;
+  // set of characters considered invalid anywhere inside a filename.
+  scoped_ptr<icu::UnicodeSet> illegal_anywhere_;
+
+  // set of characters considered invalid at either end of a filename.
+  scoped_ptr<icu::UnicodeSet> illegal_at_ends_;

  DISALLOW_COPY_AND_ASSIGN(IllegalCharacters);
 };

 IllegalCharacters::IllegalCharacters() {
-  UErrorCode status = U_ZERO_ERROR;
-  // Control characters, formatting characters, non-characters, and
-  // some printable ASCII characters regarded as dangerous ('"*/:<>?\\').
+  UErrorCode everywhere_status = U_ZERO_ERROR;
+  UErrorCode ends_status = U_ZERO_ERROR;
+  // Control characters, formatting characters, non-characters, path separators,
+  // and some printable ASCII characters regarded as dangerous ('"*/:<>?\\').
  // See  http://blogs.msdn.com/michkap/archive/2006/11/03/941420.aspx
  // and http://msdn2.microsoft.com/en-us/library/Aa365247.aspx
-  // TODO(jungshik): Revisit the set. ZWJ and ZWNJ are excluded because they
-  // are legitimate in Arabic and some S/SE Asian scripts. However, when used
-  // elsewhere, they can be confusing/problematic.
-  // Also, consider wrapping the set with our Singleton class to create and
-  // freeze it only once. Note that there's a trade-off between memory and
-  // speed.
-#if defined(WCHAR_T_IS_UTF16)
-  set.reset(new icu::UnicodeSet(icu::UnicodeString(
-      L"[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\u200c\u200d]]"), status));
-#else
-  set.reset(new icu::UnicodeSet(UNICODE_STRING_SIMPLE(
-      "[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\\u200c\\u200d]]").unescape(),
-      status));
-#endif
-  DCHECK(U_SUCCESS(status));
+  // Note that code points in the "Other, Format" (Cf) category are ignored on
+  // HFS+ despite the ZERO_WIDTH_JOINER and ZERO_WIDTH_NON-JOINER being
+  // legitimate in Arabic and some S/SE Asian scripts. In addition tilde (~) is
+  // also excluded due to the possibility of interacting poorly with short
+  // filenames on VFAT. (Related to CVE-2014-9390)
+  illegal_anywhere_.reset(new icu::UnicodeSet(
+      UNICODE_STRING_SIMPLE("[[\"~*/:<>?\\\\|][:Cc:][:Cf:]]"),
+      everywhere_status));
+  illegal_at_ends_.reset(new icu::UnicodeSet(
+      UNICODE_STRING_SIMPLE("[[:WSpace:][.]]"), ends_status));
+  DCHECK(U_SUCCESS(everywhere_status));
+  DCHECK(U_SUCCESS(ends_status));
+
  // Add non-characters. If this becomes a performance bottleneck by
  // any chance, do not add these to |set| and change IsFilenameLegal()
  // to check |ucs4 & 0xFFFEu == 0xFFFEu|, in addiition to calling
-  // containsNone().
-  set->add(0xFDD0, 0xFDEF);
+  // IsAllowedName().
+  illegal_anywhere_->add(0xFDD0, 0xFDEF);
  for (int i = 0; i <= 0x10; ++i) {
    int plane_base = 0x10000 * i;
-    set->add(plane_base + 0xFFFE, plane_base + 0xFFFF);
+    illegal_anywhere_->add(plane_base + 0xFFFE, plane_base + 0xFFFF);
  }
-  set->freeze();
+  illegal_anywhere_->freeze();
+  illegal_at_ends_->freeze();
 }

 }  // namespace

 bool IsFilenameLegal(const string16& file_name) {
-  return IllegalCharacters::GetInstance()->containsNone(file_name);
+  return IllegalCharacters::GetInstance()->IsAllowedName(file_name);
 }

 void ReplaceIllegalCharactersInPath(FilePath::StringType* file_name,
                                    char replace_char) {
-  DCHECK(file_name);
-
-  DCHECK(!(IllegalCharacters::GetInstance()->contains(replace_char)));
+  IllegalCharacters* illegal = IllegalCharacters::GetInstance();

-  // Remove leading and trailing whitespace.
-  TrimWhitespace(*file_name, TRIM_ALL, file_name);
+  DCHECK(!(illegal->DisallowedEverywhere(replace_char)));
+  DCHECK(!(illegal->DisallowedLeadingOrTrailing(replace_char)));

-  IllegalCharacters* illegal = IllegalCharacters::GetInstance();
  int cursor = 0;  // The ICU macros expect an int.
  while (cursor < static_cast<int>(file_name->size())) {
    int char_begin = cursor;
@@ -122,7 +130,9 @@ void ReplaceIllegalCharactersInPath(FilePath::StringType* file_name,
    NOTREACHED();
 #endif

-    if (illegal->contains(code_point)) {
+    if (illegal->DisallowedEverywhere(code_point) ||
+        ((char_begin == 0 || cursor == static_cast<int>(file_name->length())) &&
+         illegal->DisallowedLeadingOrTrailing(code_point))) {
      file_name->replace(char_begin, cursor - char_begin, 1, replace_char);
      // We just made the potentially multi-byte/word char into one that only
      // takes one byte/word, so need to adjust the cursor to point to the next

--- a/base/i18n/file_util_icu.h
+++ b/base/i18n/file_util_icu.h
@@ -18,13 +18,23 @@ namespace i18n {
 // param has the same restriction as that for ReplaceIllegalCharacters.
 BASE_I18N_EXPORT bool IsFilenameLegal(const string16& file_name);

-// Replaces characters in 'file_name' that are illegal for file names with
-// 'replace_char'. 'file_name' must not be a full or relative path, but just the
+// Replaces characters in |file_name| that are illegal for file names with
+// |replace_char|. |file_name| must not be a full or relative path, but just the
 // file name component (since slashes are considered illegal). Any leading or
-// trailing whitespace in 'file_name' is removed.
+// trailing whitespace or periods in |file_name| is also replaced with the
+// |replace_char|.
+//
 // Example:
-//   file_name == "bad:file*name?.txt", changed to: "bad-file-name-.txt" when
-//   'replace_char' is '-'.
+//   "bad:file*name?.txt" will be turned into "bad_file_name_.txt" when
+//   |replace_char| is '_'.
+//
+// Warning: Do not use this function as the sole means of sanitizing a filename.
+//   While the resulting filename itself would be legal, it doesn't necessarily
+//   mean that the file will behave safely. On Windows, certain reserved names
+//   refer to devices rather than files (E.g. LPT1), and some filenames could be
+//   interpreted as shell namespace extensions (E.g. Foo.{<GUID>}).
+//
+// TODO(asanka): Move full filename sanitization logic here.
 BASE_I18N_EXPORT void ReplaceIllegalCharactersInPath(
    FilePath::StringType* file_name,
    char replace_char);

--- a/base/i18n/file_util_icu_unittest.cc
+++ b/base/i18n/file_util_icu_unittest.cc
@@ -20,24 +20,28 @@ class FileUtilICUTest : public PlatformTest {
 #if defined(OS_POSIX) && !defined(OS_MACOSX)

 // Linux disallows some evil ASCII characters, but passes all non-ASCII.
-static const struct goodbad_pair {
+static const struct GoodBadPairLinux {
  const char* bad_name;
  const char* good_name;
-} kIllegalCharacterCases[] = {
-  {"bad*file:name?.jpg", "bad-file-name-.jpg"},
+} kLinuxIllegalCharacterCases[] = {
+  {"bad*\\/file:name?.jpg", "bad---file-name-.jpg"},
  {"**********::::.txt", "--------------.txt"},
  {"\xe9\xf0zzzz.\xff", "\xe9\xf0zzzz.\xff"},
+  {" _ ", "-_-"},
+  {".", "-"},
+  {" .( ). ", "-.( ).-"},
+  {"     ", "-   -"},
 };

 TEST_F(FileUtilICUTest, ReplaceIllegalCharacersInPathLinuxTest) {
-  for (size_t i = 0; i < arraysize(kIllegalCharacterCases); ++i) {
-    std::string bad_name(kIllegalCharacterCases[i].bad_name);
+  for (size_t i = 0; i < arraysize(kLinuxIllegalCharacterCases); ++i) {
+    std::string bad_name(kLinuxIllegalCharacterCases[i].bad_name);
    ReplaceIllegalCharactersInPath(&bad_name, '-');
-    EXPECT_EQ(kIllegalCharacterCases[i].good_name, bad_name);
+    EXPECT_EQ(kLinuxIllegalCharacterCases[i].good_name, bad_name);
  }
 }

-#else
+#endif

 // For Mac & Windows, which both do Unicode validation on filenames. These
 // characters are given as wide strings since its more convenient to specify
@@ -46,29 +50,36 @@ static const struct goodbad_pair {
  const wchar_t* bad_name;
  const wchar_t* good_name;
 } kIllegalCharacterCases[] = {
-  {L"bad*file:name?.jpg", L"bad-file-name-.jpg"},
-  {L"**********::::.txt", L"--------------.txt"},
-  // We can't use UCNs (universal character names) for C0/C1 characters and
-  // U+007F, but \x escape is interpreted by MSVC and gcc as we intend.
-  {L"bad\x0003\x0091 file\u200E\u200Fname.png", L"bad-- file--name.png"},
-#if defined(OS_WIN)
-  {L"bad*file\\name.jpg", L"bad-file-name.jpg"},
-  {L"\t  bad*file\\name/.jpg ", L"bad-file-name-.jpg"},
-#elif defined(OS_MACOSX)
-  {L"bad*file?name.jpg", L"bad-file-name.jpg"},
-  {L"\t  bad*file?name/.jpg ", L"bad-file-name-.jpg"},
-#endif
-  {L"this_file_name is okay!.mp3", L"this_file_name is okay!.mp3"},
-  {L"\u4E00\uAC00.mp3", L"\u4E00\uAC00.mp3"},
-  {L"\u0635\u200C\u0644.mp3", L"\u0635\u200C\u0644.mp3"},
-  {L"\U00010330\U00010331.mp3", L"\U00010330\U00010331.mp3"},
-  // Unassigned codepoints are ok.
-  {L"\u0378\U00040001.mp3", L"\u0378\U00040001.mp3"},
-  // Non-characters are not allowed.
-  {L"bad\uFFFFfile\U0010FFFEname.jpg ", L"bad-file-name.jpg"},
-  {L"bad\uFDD0file\uFDEFname.jpg ", L"bad-file-name.jpg"},
+    {L"bad*file:name?.jpg", L"bad-file-name-.jpg"},
+    {L"**********::::.txt", L"--------------.txt"},
+    // We can't use UCNs (universal character names) for C0/C1 characters and
+    // U+007F, but \x escape is interpreted by MSVC and gcc as we intend.
+    {L"bad\x0003\x0091 file\u200E\u200Fname.png", L"bad-- file--name.png"},
+    {L"bad*file\\?name.jpg", L"bad-file--name.jpg"},
+    {L"\t  bad*file\\name/.jpg", L"-  bad-file-name-.jpg"},
+    {L"this_file_name is okay!.mp3", L"this_file_name is okay!.mp3"},
+    {L"\u4E00\uAC00.mp3", L"\u4E00\uAC00.mp3"},
+    {L"\u0635\u200C\u0644.mp3", L"\u0635-\u0644.mp3"},
+    {L"\U00010330\U00010331.mp3", L"\U00010330\U00010331.mp3"},
+    // Unassigned codepoints are ok.
+    {L"\u0378\U00040001.mp3", L"\u0378\U00040001.mp3"},
+    // Non-characters are not allowed.
+    {L"bad\uFFFFfile\U0010FFFEname.jpg", L"bad-file-name.jpg"},
+    {L"bad\uFDD0file\uFDEFname.jpg", L"bad-file-name.jpg"},
+    // CVE-2014-9390
+    {L"(\u200C.\u200D.\u200E.\u200F.\u202A.\u202B.\u202C.\u202D.\u202E.\u206A."
+     L"\u206B.\u206C.\u206D.\u206F.\uFEFF)",
+     L"(-.-.-.-.-.-.-.-.-.-.-.-.-.-.-)"},
+    {L"config~1", L"config-1"},
+    {L" _ ", L"-_-"},
+    {L" ", L"-"},
+    {L"\u2008.(\u2007).\u3000", L"-.(\u2007).-"},
+    {L"     ", L"-   -"},
+    {L".    ", L"-   -"}
 };

+#if defined(OS_WIN) || defined(OS_MACOSX)
+
 TEST_F(FileUtilICUTest, ReplaceIllegalCharactersInPathTest) {
  for (size_t i = 0; i < arraysize(kIllegalCharacterCases); ++i) {
 #if defined(OS_WIN)
@@ -85,6 +96,19 @@ TEST_F(FileUtilICUTest, ReplaceIllegalCharactersInPathTest) {

 #endif

+TEST_F(FileUtilICUTest, IsFilenameLegalTest) {
+  EXPECT_TRUE(IsFilenameLegal(string16()));
+
+  for (const auto& test_case : kIllegalCharacterCases) {
+    string16 bad_name = WideToUTF16(test_case.bad_name);
+    string16 good_name = WideToUTF16(test_case.good_name);
+
+    EXPECT_TRUE(IsFilenameLegal(good_name)) << good_name;
+    if (good_name != bad_name)
+      EXPECT_FALSE(IsFilenameLegal(bad_name)) << bad_name;
+  }
+}
+
 #if defined(OS_CHROMEOS)
 static const struct normalize_name_encoding_test_cases {
  const char* original_path;

--- a/content/browser/download/save_package.cc
+++ b/content/browser/download/save_package.cc
@@ -1242,7 +1242,7 @@ base::FilePath SavePackage::GetSuggestedNameForSaveAs(
    name_with_proper_ext = EnsureHtmlExtension(name_with_proper_ext);

  base::FilePath::StringType file_name = name_with_proper_ext.value();
-  base::i18n::ReplaceIllegalCharactersInPath(&file_name, ' ');
+  base::i18n::ReplaceIllegalCharactersInPath(&file_name, '_');
  return base::FilePath(file_name);
 }


--- a/content/browser/download/save_package_unittest.cc
+++ b/content/browser/download/save_package_unittest.cc
@@ -382,7 +382,7 @@ static const struct SuggestedSaveNameTestCase {
  // A URL-like title that does not match the title is respected in full.
  { "http://foo.com",
    base::ASCIIToUTF16("http://www.foo.com/path/title.txt"),
-    FPL("http   www.foo.com path title.txt"),
+    FPL("http___www.foo.com_path_title.txt"),
    false
  },
 };

--- a/net/base/filename_util_unittest.cc
+++ b/net/base/filename_util_unittest.cc