CSVTable should only understand CRLF and LF as EOL

So far, CSVTable understood any combination of CR and LF characters as the end of line. After a discussion linked to in https://crbug.com/923811, this CL changes that to explicitly only allowing CRLF and LF. Moreover, CRLF is no longer converted to LF inside quoted (escaped) strings. Performance impact, as measured with the benchmark from https://crrev.com/c/1459642: * Run time of the benchmark decreased from about 10s to about 9.5s * Peak memory consumption over 5 runs decreased from about 168700 kB to about 168350 kB Bug: 923811 Change-Id: I979d05e9f296f285db99dc22662bae29c907ea2a Reviewed-on: https://chromium-review.googlesource.com/c/1461042 Commit-Queue: Vaclav Brozek <vabr@chromium.org> Reviewed-by: Jan Wilken Dörrie <jdoerrie@chromium.org> Cr-Commit-Position: refs/heads/master@{#630708}

CSVTable should only understand CRLF and LF as EOL
So far, CSVTable understood any combination of CR and LF characters as the end of line. After a discussion linked to in https://crbug.com/923811, this CL changes that to explicitly only allowing CRLF and LF. Moreover, CRLF is no longer converted to LF inside quoted (escaped) strings. Performance impact, as measured with the benchmark from https://crrev.com/c/1459642: * Run time of the benchmark decreased from about 10s to about 9.5s * Peak memory consumption over 5 runs decreased from about 168700 kB to about 168350 kB Bug: 923811 Change-Id: I979d05e9f296f285db99dc22662bae29c907ea2a Reviewed-on: https://chromium-review.googlesource.com/c/1461042 Commit-Queue: Vaclav Brozek <vabr@chromium.org> Reviewed-by: Jan Wilken Dörrie <jdoerrie@chromium.org> Cr-Commit-Position: refs/heads/master@{#630708}
7e4619a2 · Vaclav Brozek · Commit Bot · 44827fbd · 7e4619a2 · 7e4619a2
Commit 7e4619a2 authored Feb 11, 2019 by Vaclav Brozek Committed by Commit Bot Feb 11, 2019
3 changed files
--- a/components/password_manager/core/browser/import/csv_reader.cc
+++ b/components/password_manager/core/browser/import/csv_reader.cc
@@ -16,24 +16,27 @@
 namespace {
 // Returns all the characters from the start of |input| until the first '\n',
-// '\r' (exclusive) or the end of |input|. Cuts the returned part (inclusive the
+// "\r\n" (exclusive) or the end of |input|. Cuts the returned part (inclusive
-// line breaks) from |input|. Skips blocks of matching quotes. Examples:
+// the line breaks) from |input|. Skips blocks of matching quotes. Examples:
 // old input -> returned value, new input
 // "ab\ncd" -> "ab", "cd"
-// "\r\n" -> "", "\n"
+// "\r\n" -> "", ""
 // "abcd" -> "abcd", ""
+// "\r" -> "\r", ""
 // "a\"\n\"b" -> "a\"\n\"b", ""
 base::StringPiece ConsumeLine(base::StringPiece* input) {
  DCHECK(input);
  DCHECK(!input->empty());
  bool inside_quotes = false;
+  bool last_char_was_CR = false;
  for (size_t current = 0; current < input->size(); ++current) {
-    switch ((*input)[current]) {
+    char c = (*input)[current];
+    switch (c) {
      case '\n':
-      case '\r':
        if (!inside_quotes) {
-          base::StringPiece ret(input->data(), current);
+          const size_t eol_start = last_char_was_CR ? current - 1 : current;
+          base::StringPiece ret(input->data(), eol_start);
          *input = input->substr(current + 1);
          return ret;
        }
@@ -44,6 +47,7 @@ base::StringPiece ConsumeLine(base::StringPiece* input) {
      default:
        break;
    }
+    last_char_was_CR = (c == '\r');
  }
  // The whole |*input| is one line.
@@ -296,12 +300,8 @@ bool CSVTable::ReadCSV(base::StringPiece csv) {
  records_.clear();
  column_names_.clear();
-  // Normalize EOL sequences so that we uniformly use a single LF character.
-  std::string normalized_csv(csv);
-  base::ReplaceSubstringsAfterOffset(&normalized_csv, 0, "\r\n", "\n");
  // Read header row.
-  CSVParser parser(normalized_csv);
+  CSVParser parser(csv);
  if (!parser.HasMoreRows()) {
    // The empty CSV is a special case. It can be seen as having one row, with a
    // single field, which is an empty string.
@@ -316,10 +316,9 @@ bool CSVTable::ReadCSV(base::StringPiece csv) {
  while (parser.HasMoreRows()) {
    if (!parser.ParseNextCSVRow(&fields))
      return false;
-    // If there are more line-breaking characters ('\r' or '\n') in sequence,
+    // If there are more line-breaking characters in sequence, the row parser
-    // the row parser will see an empty row in between each successive two of
+    // will see an empty row in between each successive two of those. Discard
-    // those. Discard such results, because those are useless for importing
+    // such results, because those are useless for importing passwords.
-    // passwords.
    if (fields.size() == 1 && fields[0].empty())
      continue;

--- a/components/password_manager/core/browser/import/csv_reader.h
+++ b/components/password_manager/core/browser/import/csv_reader.h
@@ -25,8 +25,7 @@ class CSVTable {
  // defined in RFC 4180, with the following limitations/relaxations:
  //   * The input should be UTF-8 encoded. No code points should be escaped.
  //   * The first line must be a header that contains the column names.
-  //   * Records may be separated by either LF or CRLF sequences. Each CRLF will
+  //   * Records may be separated by either LF or CRLF sequences.
-  //     be converted to LF characters inside quotes.
  //   * Inconsistent number of fields within records is handled gracefully.
  //     Extra fields are ignored. Missing fields will have no corresponding
  //     key-value pair in the record.

--- a/components/password_manager/core/browser/import/csv_reader_unittest.cc
+++ b/components/password_manager/core/browser/import/csv_reader_unittest.cc
@@ -104,7 +104,7 @@ TEST(CSVReaderTest, Positive) {
          "\",\",\",,\"\n",
          {"left", "right"},
          {{{"left", "A\rB"}, {"right", "B\nC"}},
-           {{"left", "C\nD"}, {"right", "D\n"}},
+           {{"left", "C\r\nD"}, {"right", "D\n"}},
           {{"left", ","}, {"right", ",,"}}},
      },
      {
@@ -143,11 +143,19 @@ TEST(CSVReaderTest, Positive) {
           {{"left", ""}, {"middle", ""}, {"right", "gamma"}}},
      },
      {
-          "CRLFTreatedAsAndConvertedToLF",
+          "CRLFTreatedAsLF",
          "left,right\r\n"
          "\"\r\",\"\r\n\"\r\n",
          {"left", "right"},
-          {{{"left", "\r"}, {"right", "\n"}}},
+          {{{"left", "\r"}, {"right", "\r\n"}}},
+      },
+      {
+          "CRAloneIgnored",
+          "left,right\r"
+          "A,B\r\n"
+          "1,2,3",
+          {"left", "right\rA", "B"},
+          {{{"B", "3"}, {"left", "1"}, {"right\rA", "2"}}},
      },
      {
          "LastValueForRepeatedColumnNamesIsPreserved",
@@ -167,13 +175,13 @@ TEST(CSVReaderTest, Positive) {
          "foo,bar\n"
          "\n"
          "a,b\n"
-          "\r"
+          "\r\n"
          "c,d\r\r\r\r\r\r\r\r\n"
          "e,f",
          {"foo", "bar"},
          {
              {{"bar", "b"}, {"foo", "a"}},
-              {{"bar", "d"}, {"foo", "c"}},
+              {{"bar", "d\r\r\r\r\r\r\r"}, {"foo", "c"}},
              {{"bar", "f"}, {"foo", "e"}},
          },
      },