Commit 7e4619a2 authored by Vaclav Brozek's avatar Vaclav Brozek Committed by Commit Bot

CSVTable should only understand CRLF and LF as EOL

So far, CSVTable understood any combination of CR and LF characters as
the end of line. After a discussion linked to in
https://crbug.com/923811, this CL changes that to explicitly only
allowing CRLF and LF.

Moreover, CRLF is no longer converted to LF inside quoted (escaped)
strings.

Performance impact, as measured with the benchmark from
https://crrev.com/c/1459642:
 * Run time of the benchmark decreased from about 10s to about 9.5s
 * Peak memory consumption over 5 runs decreased from about 168700 kB
   to about 168350 kB

Bug: 923811
Change-Id: I979d05e9f296f285db99dc22662bae29c907ea2a
Reviewed-on: https://chromium-review.googlesource.com/c/1461042
Commit-Queue: Vaclav Brozek <vabr@chromium.org>
Reviewed-by: default avatarJan Wilken Dörrie <jdoerrie@chromium.org>
Cr-Commit-Position: refs/heads/master@{#630708}
parent 44827fbd
......@@ -16,24 +16,27 @@
namespace {
// Returns all the characters from the start of |input| until the first '\n',
// '\r' (exclusive) or the end of |input|. Cuts the returned part (inclusive the
// line breaks) from |input|. Skips blocks of matching quotes. Examples:
// "\r\n" (exclusive) or the end of |input|. Cuts the returned part (inclusive
// the line breaks) from |input|. Skips blocks of matching quotes. Examples:
// old input -> returned value, new input
// "ab\ncd" -> "ab", "cd"
// "\r\n" -> "", "\n"
// "\r\n" -> "", ""
// "abcd" -> "abcd", ""
// "\r" -> "\r", ""
// "a\"\n\"b" -> "a\"\n\"b", ""
base::StringPiece ConsumeLine(base::StringPiece* input) {
DCHECK(input);
DCHECK(!input->empty());
bool inside_quotes = false;
bool last_char_was_CR = false;
for (size_t current = 0; current < input->size(); ++current) {
switch ((*input)[current]) {
char c = (*input)[current];
switch (c) {
case '\n':
case '\r':
if (!inside_quotes) {
base::StringPiece ret(input->data(), current);
const size_t eol_start = last_char_was_CR ? current - 1 : current;
base::StringPiece ret(input->data(), eol_start);
*input = input->substr(current + 1);
return ret;
}
......@@ -44,6 +47,7 @@ base::StringPiece ConsumeLine(base::StringPiece* input) {
default:
break;
}
last_char_was_CR = (c == '\r');
}
// The whole |*input| is one line.
......@@ -296,12 +300,8 @@ bool CSVTable::ReadCSV(base::StringPiece csv) {
records_.clear();
column_names_.clear();
// Normalize EOL sequences so that we uniformly use a single LF character.
std::string normalized_csv(csv);
base::ReplaceSubstringsAfterOffset(&normalized_csv, 0, "\r\n", "\n");
// Read header row.
CSVParser parser(normalized_csv);
CSVParser parser(csv);
if (!parser.HasMoreRows()) {
// The empty CSV is a special case. It can be seen as having one row, with a
// single field, which is an empty string.
......@@ -316,10 +316,9 @@ bool CSVTable::ReadCSV(base::StringPiece csv) {
while (parser.HasMoreRows()) {
if (!parser.ParseNextCSVRow(&fields))
return false;
// If there are more line-breaking characters ('\r' or '\n') in sequence,
// the row parser will see an empty row in between each successive two of
// those. Discard such results, because those are useless for importing
// passwords.
// If there are more line-breaking characters in sequence, the row parser
// will see an empty row in between each successive two of those. Discard
// such results, because those are useless for importing passwords.
if (fields.size() == 1 && fields[0].empty())
continue;
......
......@@ -25,8 +25,7 @@ class CSVTable {
// defined in RFC 4180, with the following limitations/relaxations:
// * The input should be UTF-8 encoded. No code points should be escaped.
// * The first line must be a header that contains the column names.
// * Records may be separated by either LF or CRLF sequences. Each CRLF will
// be converted to LF characters inside quotes.
// * Records may be separated by either LF or CRLF sequences.
// * Inconsistent number of fields within records is handled gracefully.
// Extra fields are ignored. Missing fields will have no corresponding
// key-value pair in the record.
......
......@@ -104,7 +104,7 @@ TEST(CSVReaderTest, Positive) {
"\",\",\",,\"\n",
{"left", "right"},
{{{"left", "A\rB"}, {"right", "B\nC"}},
{{"left", "C\nD"}, {"right", "D\n"}},
{{"left", "C\r\nD"}, {"right", "D\n"}},
{{"left", ","}, {"right", ",,"}}},
},
{
......@@ -143,11 +143,19 @@ TEST(CSVReaderTest, Positive) {
{{"left", ""}, {"middle", ""}, {"right", "gamma"}}},
},
{
"CRLFTreatedAsAndConvertedToLF",
"CRLFTreatedAsLF",
"left,right\r\n"
"\"\r\",\"\r\n\"\r\n",
{"left", "right"},
{{{"left", "\r"}, {"right", "\n"}}},
{{{"left", "\r"}, {"right", "\r\n"}}},
},
{
"CRAloneIgnored",
"left,right\r"
"A,B\r\n"
"1,2,3",
{"left", "right\rA", "B"},
{{{"B", "3"}, {"left", "1"}, {"right\rA", "2"}}},
},
{
"LastValueForRepeatedColumnNamesIsPreserved",
......@@ -167,13 +175,13 @@ TEST(CSVReaderTest, Positive) {
"foo,bar\n"
"\n"
"a,b\n"
"\r"
"\r\n"
"c,d\r\r\r\r\r\r\r\r\n"
"e,f",
{"foo", "bar"},
{
{{"bar", "b"}, {"foo", "a"}},
{{"bar", "d"}, {"foo", "c"}},
{{"bar", "d\r\r\r\r\r\r\r"}, {"foo", "c"}},
{{"bar", "f"}, {"foo", "e"}},
},
},
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment