Commit 92a354fc authored by asanka's avatar asanka Committed by Commit bot

Update ReplaceIllegalCharactersInPath to handle quirks in HFS+ and VFAT

This change:

- Re-introduces U+200C and U+200D as illegal characters since these are
  ignored on HFS+ and can interfere with filename sanitization. All code
  points in the Cf general category are now considered illegal in a
  filename.

- Leading and trailing WSpace and '.' characters are now considered
  illegal.

- Due to being confused for short names on VFAT filesystems, the tilde
  ('~') is now considered illegal.

- Prior to this change, only ASCII whitespace were trimmed from
  filenames on Mac OSX. All UTF-8 encoded WSpace characters are now
  handled on Mac OSX.

- Instead of trimming leading and trailing whitespace, they are now
  replaced by the replacement character. Trimming could cause a
  previously hidden extension or basename to be exposed.

BUG=444102
BUG=446538

Review URL: https://codereview.chromium.org/869823003

Cr-Commit-Position: refs/heads/master@{#314041}
parent 745ff1db
......@@ -30,12 +30,19 @@ class IllegalCharacters {
return Singleton<IllegalCharacters>::get();
}
bool contains(UChar32 ucs4) {
return !!set->contains(ucs4);
bool DisallowedEverywhere(UChar32 ucs4) {
return !!illegal_anywhere_->contains(ucs4);
}
bool containsNone(const string16 &s) {
return !!set->containsNone(icu::UnicodeString(s.c_str(), s.size()));
bool DisallowedLeadingOrTrailing(UChar32 ucs4) {
return !!illegal_at_ends_->contains(ucs4);
}
bool IsAllowedName(const string16& s) {
return s.empty() || (!!illegal_anywhere_->containsNone(
icu::UnicodeString(s.c_str(), s.size())) &&
!illegal_at_ends_->contains(*s.begin()) &&
!illegal_at_ends_->contains(*s.rbegin()));
}
private:
......@@ -45,60 +52,61 @@ class IllegalCharacters {
IllegalCharacters();
~IllegalCharacters() { }
scoped_ptr<icu::UnicodeSet> set;
// set of characters considered invalid anywhere inside a filename.
scoped_ptr<icu::UnicodeSet> illegal_anywhere_;
// set of characters considered invalid at either end of a filename.
scoped_ptr<icu::UnicodeSet> illegal_at_ends_;
DISALLOW_COPY_AND_ASSIGN(IllegalCharacters);
};
IllegalCharacters::IllegalCharacters() {
UErrorCode status = U_ZERO_ERROR;
// Control characters, formatting characters, non-characters, and
// some printable ASCII characters regarded as dangerous ('"*/:<>?\\').
UErrorCode everywhere_status = U_ZERO_ERROR;
UErrorCode ends_status = U_ZERO_ERROR;
// Control characters, formatting characters, non-characters, path separators,
// and some printable ASCII characters regarded as dangerous ('"*/:<>?\\').
// See http://blogs.msdn.com/michkap/archive/2006/11/03/941420.aspx
// and http://msdn2.microsoft.com/en-us/library/Aa365247.aspx
// TODO(jungshik): Revisit the set. ZWJ and ZWNJ are excluded because they
// are legitimate in Arabic and some S/SE Asian scripts. However, when used
// elsewhere, they can be confusing/problematic.
// Also, consider wrapping the set with our Singleton class to create and
// freeze it only once. Note that there's a trade-off between memory and
// speed.
#if defined(WCHAR_T_IS_UTF16)
set.reset(new icu::UnicodeSet(icu::UnicodeString(
L"[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\u200c\u200d]]"), status));
#else
set.reset(new icu::UnicodeSet(UNICODE_STRING_SIMPLE(
"[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\\u200c\\u200d]]").unescape(),
status));
#endif
DCHECK(U_SUCCESS(status));
// Note that code points in the "Other, Format" (Cf) category are ignored on
// HFS+ despite the ZERO_WIDTH_JOINER and ZERO_WIDTH_NON-JOINER being
// legitimate in Arabic and some S/SE Asian scripts. In addition tilde (~) is
// also excluded due to the possibility of interacting poorly with short
// filenames on VFAT. (Related to CVE-2014-9390)
illegal_anywhere_.reset(new icu::UnicodeSet(
UNICODE_STRING_SIMPLE("[[\"~*/:<>?\\\\|][:Cc:][:Cf:]]"),
everywhere_status));
illegal_at_ends_.reset(new icu::UnicodeSet(
UNICODE_STRING_SIMPLE("[[:WSpace:][.]]"), ends_status));
DCHECK(U_SUCCESS(everywhere_status));
DCHECK(U_SUCCESS(ends_status));
// Add non-characters. If this becomes a performance bottleneck by
// any chance, do not add these to |set| and change IsFilenameLegal()
// to check |ucs4 & 0xFFFEu == 0xFFFEu|, in addiition to calling
// containsNone().
set->add(0xFDD0, 0xFDEF);
// IsAllowedName().
illegal_anywhere_->add(0xFDD0, 0xFDEF);
for (int i = 0; i <= 0x10; ++i) {
int plane_base = 0x10000 * i;
set->add(plane_base + 0xFFFE, plane_base + 0xFFFF);
illegal_anywhere_->add(plane_base + 0xFFFE, plane_base + 0xFFFF);
}
set->freeze();
illegal_anywhere_->freeze();
illegal_at_ends_->freeze();
}
} // namespace
bool IsFilenameLegal(const string16& file_name) {
return IllegalCharacters::GetInstance()->containsNone(file_name);
return IllegalCharacters::GetInstance()->IsAllowedName(file_name);
}
void ReplaceIllegalCharactersInPath(FilePath::StringType* file_name,
char replace_char) {
DCHECK(file_name);
DCHECK(!(IllegalCharacters::GetInstance()->contains(replace_char)));
IllegalCharacters* illegal = IllegalCharacters::GetInstance();
// Remove leading and trailing whitespace.
TrimWhitespace(*file_name, TRIM_ALL, file_name);
DCHECK(!(illegal->DisallowedEverywhere(replace_char)));
DCHECK(!(illegal->DisallowedLeadingOrTrailing(replace_char)));
IllegalCharacters* illegal = IllegalCharacters::GetInstance();
int cursor = 0; // The ICU macros expect an int.
while (cursor < static_cast<int>(file_name->size())) {
int char_begin = cursor;
......@@ -122,7 +130,9 @@ void ReplaceIllegalCharactersInPath(FilePath::StringType* file_name,
NOTREACHED();
#endif
if (illegal->contains(code_point)) {
if (illegal->DisallowedEverywhere(code_point) ||
((char_begin == 0 || cursor == static_cast<int>(file_name->length())) &&
illegal->DisallowedLeadingOrTrailing(code_point))) {
file_name->replace(char_begin, cursor - char_begin, 1, replace_char);
// We just made the potentially multi-byte/word char into one that only
// takes one byte/word, so need to adjust the cursor to point to the next
......
......@@ -18,13 +18,23 @@ namespace i18n {
// param has the same restriction as that for ReplaceIllegalCharacters.
BASE_I18N_EXPORT bool IsFilenameLegal(const string16& file_name);
// Replaces characters in 'file_name' that are illegal for file names with
// 'replace_char'. 'file_name' must not be a full or relative path, but just the
// Replaces characters in |file_name| that are illegal for file names with
// |replace_char|. |file_name| must not be a full or relative path, but just the
// file name component (since slashes are considered illegal). Any leading or
// trailing whitespace in 'file_name' is removed.
// trailing whitespace or periods in |file_name| is also replaced with the
// |replace_char|.
//
// Example:
// file_name == "bad:file*name?.txt", changed to: "bad-file-name-.txt" when
// 'replace_char' is '-'.
// "bad:file*name?.txt" will be turned into "bad_file_name_.txt" when
// |replace_char| is '_'.
//
// Warning: Do not use this function as the sole means of sanitizing a filename.
// While the resulting filename itself would be legal, it doesn't necessarily
// mean that the file will behave safely. On Windows, certain reserved names
// refer to devices rather than files (E.g. LPT1), and some filenames could be
// interpreted as shell namespace extensions (E.g. Foo.{<GUID>}).
//
// TODO(asanka): Move full filename sanitization logic here.
BASE_I18N_EXPORT void ReplaceIllegalCharactersInPath(
FilePath::StringType* file_name,
char replace_char);
......
......@@ -20,24 +20,28 @@ class FileUtilICUTest : public PlatformTest {
#if defined(OS_POSIX) && !defined(OS_MACOSX)
// Linux disallows some evil ASCII characters, but passes all non-ASCII.
static const struct goodbad_pair {
static const struct GoodBadPairLinux {
const char* bad_name;
const char* good_name;
} kIllegalCharacterCases[] = {
{"bad*file:name?.jpg", "bad-file-name-.jpg"},
} kLinuxIllegalCharacterCases[] = {
{"bad*\\/file:name?.jpg", "bad---file-name-.jpg"},
{"**********::::.txt", "--------------.txt"},
{"\xe9\xf0zzzz.\xff", "\xe9\xf0zzzz.\xff"},
{" _ ", "-_-"},
{".", "-"},
{" .( ). ", "-.( ).-"},
{" ", "- -"},
};
TEST_F(FileUtilICUTest, ReplaceIllegalCharacersInPathLinuxTest) {
for (size_t i = 0; i < arraysize(kIllegalCharacterCases); ++i) {
std::string bad_name(kIllegalCharacterCases[i].bad_name);
for (size_t i = 0; i < arraysize(kLinuxIllegalCharacterCases); ++i) {
std::string bad_name(kLinuxIllegalCharacterCases[i].bad_name);
ReplaceIllegalCharactersInPath(&bad_name, '-');
EXPECT_EQ(kIllegalCharacterCases[i].good_name, bad_name);
EXPECT_EQ(kLinuxIllegalCharacterCases[i].good_name, bad_name);
}
}
#else
#endif
// For Mac & Windows, which both do Unicode validation on filenames. These
// characters are given as wide strings since its more convenient to specify
......@@ -46,29 +50,36 @@ static const struct goodbad_pair {
const wchar_t* bad_name;
const wchar_t* good_name;
} kIllegalCharacterCases[] = {
{L"bad*file:name?.jpg", L"bad-file-name-.jpg"},
{L"**********::::.txt", L"--------------.txt"},
// We can't use UCNs (universal character names) for C0/C1 characters and
// U+007F, but \x escape is interpreted by MSVC and gcc as we intend.
{L"bad\x0003\x0091 file\u200E\u200Fname.png", L"bad-- file--name.png"},
#if defined(OS_WIN)
{L"bad*file\\name.jpg", L"bad-file-name.jpg"},
{L"\t bad*file\\name/.jpg ", L"bad-file-name-.jpg"},
#elif defined(OS_MACOSX)
{L"bad*file?name.jpg", L"bad-file-name.jpg"},
{L"\t bad*file?name/.jpg ", L"bad-file-name-.jpg"},
#endif
{L"this_file_name is okay!.mp3", L"this_file_name is okay!.mp3"},
{L"\u4E00\uAC00.mp3", L"\u4E00\uAC00.mp3"},
{L"\u0635\u200C\u0644.mp3", L"\u0635\u200C\u0644.mp3"},
{L"\U00010330\U00010331.mp3", L"\U00010330\U00010331.mp3"},
// Unassigned codepoints are ok.
{L"\u0378\U00040001.mp3", L"\u0378\U00040001.mp3"},
// Non-characters are not allowed.
{L"bad\uFFFFfile\U0010FFFEname.jpg ", L"bad-file-name.jpg"},
{L"bad\uFDD0file\uFDEFname.jpg ", L"bad-file-name.jpg"},
{L"bad*file:name?.jpg", L"bad-file-name-.jpg"},
{L"**********::::.txt", L"--------------.txt"},
// We can't use UCNs (universal character names) for C0/C1 characters and
// U+007F, but \x escape is interpreted by MSVC and gcc as we intend.
{L"bad\x0003\x0091 file\u200E\u200Fname.png", L"bad-- file--name.png"},
{L"bad*file\\?name.jpg", L"bad-file--name.jpg"},
{L"\t bad*file\\name/.jpg", L"- bad-file-name-.jpg"},
{L"this_file_name is okay!.mp3", L"this_file_name is okay!.mp3"},
{L"\u4E00\uAC00.mp3", L"\u4E00\uAC00.mp3"},
{L"\u0635\u200C\u0644.mp3", L"\u0635-\u0644.mp3"},
{L"\U00010330\U00010331.mp3", L"\U00010330\U00010331.mp3"},
// Unassigned codepoints are ok.
{L"\u0378\U00040001.mp3", L"\u0378\U00040001.mp3"},
// Non-characters are not allowed.
{L"bad\uFFFFfile\U0010FFFEname.jpg", L"bad-file-name.jpg"},
{L"bad\uFDD0file\uFDEFname.jpg", L"bad-file-name.jpg"},
// CVE-2014-9390
{L"(\u200C.\u200D.\u200E.\u200F.\u202A.\u202B.\u202C.\u202D.\u202E.\u206A."
L"\u206B.\u206C.\u206D.\u206F.\uFEFF)",
L"(-.-.-.-.-.-.-.-.-.-.-.-.-.-.-)"},
{L"config~1", L"config-1"},
{L" _ ", L"-_-"},
{L" ", L"-"},
{L"\u2008.(\u2007).\u3000", L"-.(\u2007).-"},
{L" ", L"- -"},
{L". ", L"- -"}
};
#if defined(OS_WIN) || defined(OS_MACOSX)
TEST_F(FileUtilICUTest, ReplaceIllegalCharactersInPathTest) {
for (size_t i = 0; i < arraysize(kIllegalCharacterCases); ++i) {
#if defined(OS_WIN)
......@@ -85,6 +96,19 @@ TEST_F(FileUtilICUTest, ReplaceIllegalCharactersInPathTest) {
#endif
TEST_F(FileUtilICUTest, IsFilenameLegalTest) {
EXPECT_TRUE(IsFilenameLegal(string16()));
for (const auto& test_case : kIllegalCharacterCases) {
string16 bad_name = WideToUTF16(test_case.bad_name);
string16 good_name = WideToUTF16(test_case.good_name);
EXPECT_TRUE(IsFilenameLegal(good_name)) << good_name;
if (good_name != bad_name)
EXPECT_FALSE(IsFilenameLegal(bad_name)) << bad_name;
}
}
#if defined(OS_CHROMEOS)
static const struct normalize_name_encoding_test_cases {
const char* original_path;
......
......@@ -1242,7 +1242,7 @@ base::FilePath SavePackage::GetSuggestedNameForSaveAs(
name_with_proper_ext = EnsureHtmlExtension(name_with_proper_ext);
base::FilePath::StringType file_name = name_with_proper_ext.value();
base::i18n::ReplaceIllegalCharactersInPath(&file_name, ' ');
base::i18n::ReplaceIllegalCharactersInPath(&file_name, '_');
return base::FilePath(file_name);
}
......
......@@ -382,7 +382,7 @@ static const struct SuggestedSaveNameTestCase {
// A URL-like title that does not match the title is respected in full.
{ "http://foo.com",
base::ASCIIToUTF16("http://www.foo.com/path/title.txt"),
FPL("http www.foo.com path title.txt"),
FPL("http___www.foo.com_path_title.txt"),
false
},
};
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment