Commit 7e4619a2 authored by Vaclav Brozek's avatar Vaclav Brozek Committed by Commit Bot

CSVTable should only understand CRLF and LF as EOL

So far, CSVTable understood any combination of CR and LF characters as
the end of line. After a discussion linked to in
https://crbug.com/923811, this CL changes that to explicitly only
allowing CRLF and LF.

Moreover, CRLF is no longer converted to LF inside quoted (escaped)
strings.

Performance impact, as measured with the benchmark from
https://crrev.com/c/1459642:
 * Run time of the benchmark decreased from about 10s to about 9.5s
 * Peak memory consumption over 5 runs decreased from about 168700 kB
   to about 168350 kB

Bug: 923811
Change-Id: I979d05e9f296f285db99dc22662bae29c907ea2a
Reviewed-on: https://chromium-review.googlesource.com/c/1461042
Commit-Queue: Vaclav Brozek <vabr@chromium.org>
Reviewed-by: default avatarJan Wilken Dörrie <jdoerrie@chromium.org>
Cr-Commit-Position: refs/heads/master@{#630708}
parent 44827fbd
...@@ -16,24 +16,27 @@ ...@@ -16,24 +16,27 @@
namespace { namespace {
// Returns all the characters from the start of |input| until the first '\n', // Returns all the characters from the start of |input| until the first '\n',
// '\r' (exclusive) or the end of |input|. Cuts the returned part (inclusive the // "\r\n" (exclusive) or the end of |input|. Cuts the returned part (inclusive
// line breaks) from |input|. Skips blocks of matching quotes. Examples: // the line breaks) from |input|. Skips blocks of matching quotes. Examples:
// old input -> returned value, new input // old input -> returned value, new input
// "ab\ncd" -> "ab", "cd" // "ab\ncd" -> "ab", "cd"
// "\r\n" -> "", "\n" // "\r\n" -> "", ""
// "abcd" -> "abcd", "" // "abcd" -> "abcd", ""
// "\r" -> "\r", ""
// "a\"\n\"b" -> "a\"\n\"b", "" // "a\"\n\"b" -> "a\"\n\"b", ""
base::StringPiece ConsumeLine(base::StringPiece* input) { base::StringPiece ConsumeLine(base::StringPiece* input) {
DCHECK(input); DCHECK(input);
DCHECK(!input->empty()); DCHECK(!input->empty());
bool inside_quotes = false; bool inside_quotes = false;
bool last_char_was_CR = false;
for (size_t current = 0; current < input->size(); ++current) { for (size_t current = 0; current < input->size(); ++current) {
switch ((*input)[current]) { char c = (*input)[current];
switch (c) {
case '\n': case '\n':
case '\r':
if (!inside_quotes) { if (!inside_quotes) {
base::StringPiece ret(input->data(), current); const size_t eol_start = last_char_was_CR ? current - 1 : current;
base::StringPiece ret(input->data(), eol_start);
*input = input->substr(current + 1); *input = input->substr(current + 1);
return ret; return ret;
} }
...@@ -44,6 +47,7 @@ base::StringPiece ConsumeLine(base::StringPiece* input) { ...@@ -44,6 +47,7 @@ base::StringPiece ConsumeLine(base::StringPiece* input) {
default: default:
break; break;
} }
last_char_was_CR = (c == '\r');
} }
// The whole |*input| is one line. // The whole |*input| is one line.
...@@ -296,12 +300,8 @@ bool CSVTable::ReadCSV(base::StringPiece csv) { ...@@ -296,12 +300,8 @@ bool CSVTable::ReadCSV(base::StringPiece csv) {
records_.clear(); records_.clear();
column_names_.clear(); column_names_.clear();
// Normalize EOL sequences so that we uniformly use a single LF character.
std::string normalized_csv(csv);
base::ReplaceSubstringsAfterOffset(&normalized_csv, 0, "\r\n", "\n");
// Read header row. // Read header row.
CSVParser parser(normalized_csv); CSVParser parser(csv);
if (!parser.HasMoreRows()) { if (!parser.HasMoreRows()) {
// The empty CSV is a special case. It can be seen as having one row, with a // The empty CSV is a special case. It can be seen as having one row, with a
// single field, which is an empty string. // single field, which is an empty string.
...@@ -316,10 +316,9 @@ bool CSVTable::ReadCSV(base::StringPiece csv) { ...@@ -316,10 +316,9 @@ bool CSVTable::ReadCSV(base::StringPiece csv) {
while (parser.HasMoreRows()) { while (parser.HasMoreRows()) {
if (!parser.ParseNextCSVRow(&fields)) if (!parser.ParseNextCSVRow(&fields))
return false; return false;
// If there are more line-breaking characters ('\r' or '\n') in sequence, // If there are more line-breaking characters in sequence, the row parser
// the row parser will see an empty row in between each successive two of // will see an empty row in between each successive two of those. Discard
// those. Discard such results, because those are useless for importing // such results, because those are useless for importing passwords.
// passwords.
if (fields.size() == 1 && fields[0].empty()) if (fields.size() == 1 && fields[0].empty())
continue; continue;
......
...@@ -25,8 +25,7 @@ class CSVTable { ...@@ -25,8 +25,7 @@ class CSVTable {
// defined in RFC 4180, with the following limitations/relaxations: // defined in RFC 4180, with the following limitations/relaxations:
// * The input should be UTF-8 encoded. No code points should be escaped. // * The input should be UTF-8 encoded. No code points should be escaped.
// * The first line must be a header that contains the column names. // * The first line must be a header that contains the column names.
// * Records may be separated by either LF or CRLF sequences. Each CRLF will // * Records may be separated by either LF or CRLF sequences.
// be converted to LF characters inside quotes.
// * Inconsistent number of fields within records is handled gracefully. // * Inconsistent number of fields within records is handled gracefully.
// Extra fields are ignored. Missing fields will have no corresponding // Extra fields are ignored. Missing fields will have no corresponding
// key-value pair in the record. // key-value pair in the record.
......
...@@ -104,7 +104,7 @@ TEST(CSVReaderTest, Positive) { ...@@ -104,7 +104,7 @@ TEST(CSVReaderTest, Positive) {
"\",\",\",,\"\n", "\",\",\",,\"\n",
{"left", "right"}, {"left", "right"},
{{{"left", "A\rB"}, {"right", "B\nC"}}, {{{"left", "A\rB"}, {"right", "B\nC"}},
{{"left", "C\nD"}, {"right", "D\n"}}, {{"left", "C\r\nD"}, {"right", "D\n"}},
{{"left", ","}, {"right", ",,"}}}, {{"left", ","}, {"right", ",,"}}},
}, },
{ {
...@@ -143,11 +143,19 @@ TEST(CSVReaderTest, Positive) { ...@@ -143,11 +143,19 @@ TEST(CSVReaderTest, Positive) {
{{"left", ""}, {"middle", ""}, {"right", "gamma"}}}, {{"left", ""}, {"middle", ""}, {"right", "gamma"}}},
}, },
{ {
"CRLFTreatedAsAndConvertedToLF", "CRLFTreatedAsLF",
"left,right\r\n" "left,right\r\n"
"\"\r\",\"\r\n\"\r\n", "\"\r\",\"\r\n\"\r\n",
{"left", "right"}, {"left", "right"},
{{{"left", "\r"}, {"right", "\n"}}}, {{{"left", "\r"}, {"right", "\r\n"}}},
},
{
"CRAloneIgnored",
"left,right\r"
"A,B\r\n"
"1,2,3",
{"left", "right\rA", "B"},
{{{"B", "3"}, {"left", "1"}, {"right\rA", "2"}}},
}, },
{ {
"LastValueForRepeatedColumnNamesIsPreserved", "LastValueForRepeatedColumnNamesIsPreserved",
...@@ -167,13 +175,13 @@ TEST(CSVReaderTest, Positive) { ...@@ -167,13 +175,13 @@ TEST(CSVReaderTest, Positive) {
"foo,bar\n" "foo,bar\n"
"\n" "\n"
"a,b\n" "a,b\n"
"\r" "\r\n"
"c,d\r\r\r\r\r\r\r\r\n" "c,d\r\r\r\r\r\r\r\r\n"
"e,f", "e,f",
{"foo", "bar"}, {"foo", "bar"},
{ {
{{"bar", "b"}, {"foo", "a"}}, {{"bar", "b"}, {"foo", "a"}},
{{"bar", "d"}, {"foo", "c"}}, {{"bar", "d\r\r\r\r\r\r\r"}, {"foo", "c"}},
{{"bar", "f"}, {"foo", "e"}}, {{"bar", "f"}, {"foo", "e"}},
}, },
}, },
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment