Commit 04729322 authored by Daniel Hosseinian, committed by Chromium LUCI CQ

Add a function to parse PDF dates

The function converts the string format described in section 7.9.4
"Dates" of the ISO 32000-1 standard to a base::Time, which will be
necessary when localizing dates for display.

Bug: 93619
Change-Id: I9c580ca201e6058032778c5d72a1a713478937cb
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2610624
Commit-Queue: Daniel Hosseinian <dhoss@chromium.org>
Reviewed-by: K. Moon <kmoon@chromium.org>
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Cr-Commit-Position: refs/heads/master@{#843344}
parent 23c110b5
......@@ -103,6 +103,8 @@ if (enable_pdf) {
"pdf_init.h",
"pdf_transform.cc",
"pdf_transform.h",
"pdf_utils/dates.cc",
"pdf_utils/dates.h",
"pdf_view_plugin_base.cc",
"pdf_view_plugin_base.h",
"pdfium/pdfium_api_string_buffer_adapter.cc",
......@@ -319,6 +321,7 @@ if (enable_pdf) {
"out_of_process_instance_unittest.cc",
"page_orientation_unittest.cc",
"pdf_transform_unittest.cc",
"pdf_utils/dates_unittest.cc",
"pdfium/accessibility_unittest.cc",
"pdfium/findtext_unittest.cc",
"pdfium/pdfium_engine_exports_unittest.cc",
......
// Copyright 2021 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "pdf/pdf_utils/dates.h"
#include <stdint.h>
#include "base/optional.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_piece.h"
#include "base/strings/string_util.h"
#include "base/time/time.h"
namespace chrome_pdf {
namespace {
// Consumes a date string from the front, one fixed-width numeric field or one
// non-digit character at a time. A failed numeric read puts the object into a
// "stopped" state in which every read returns `base::nullopt` until `unstop()`
// is called.
class DateDeserializer final {
 public:
  // `base::StringPiece` only refers to its characters, so the data viewed by
  // `parsing` has to remain alive for as long as `this` does.
  explicit DateDeserializer(base::StringPiece parsing)
      : deserializing_(parsing) {}
  ~DateDeserializer() = default;

  // Removes the leading `num_digits` characters and returns them as an int.
  // When fewer than `num_digits` characters remain, or they cannot be
  // converted, nothing is removed, the deserializer becomes stopped, and
  // `base::nullopt` is returned.
  base::Optional<int> PopDigits(size_t num_digits) {
    if (stopped_)
      return base::nullopt;

    // The explicit digit check on the first character is needed because
    // `base::StringToUint()` on its own tolerates a leading sign character.
    uint32_t number;
    const bool converted =
        deserializing_.size() >= num_digits &&
        base::IsAsciiDigit(deserializing_.front()) &&
        base::StringToUint(deserializing_.substr(0, num_digits), &number);
    if (!converted) {
      stopped_ = true;
      return base::nullopt;
    }

    // Drop the characters that were just converted.
    deserializing_.remove_prefix(num_digits);
    return number;
  }

  // Removes and returns the leading character, but only if it is not a digit.
  // Returns `base::nullopt` without modifying anything when the deserializer is
  // stopped, no characters remain, or the leading character is a digit.
  base::Optional<char> TryPopNonDigit() {
    const bool poppable = !stopped_ && !deserializing_.empty() &&
                          !base::IsAsciiDigit(deserializing_.front());
    if (!poppable)
      return base::nullopt;

    const char popped = deserializing_.front();
    deserializing_.remove_prefix(1);
    return popped;
  }

  // Clears the stopped state so that reads are attempted again.
  void unstop() { stopped_ = false; }

 private:
  // The part of the input that has not been consumed yet.
  base::StringPiece deserializing_;

  // Set once a `PopDigits()` call fails; cleared by `unstop()`.
  bool stopped_ = false;
};
// Reads the time offset portion of a PDF date (section 7.9.4 "Dates" of the
// ISO 32000-1 standard) from `deserializer`. The expected shape is a '+' or '-'
// sign followed by "HH'mm", where "HH" is the hour and "mm" is the minute.
// Returns a signed offset, or a zero offset when no '+'/'-' sign is present.
base::TimeDelta ParseOffset(DateDeserializer& deserializer) {
  // Without explicit time zone information, the time is taken to be UTC.
  const base::Optional<char> sign = deserializer.TryPopNonDigit();
  const bool is_signed = sign.has_value() && (*sign == '+' || *sign == '-');
  if (!is_signed)
    return base::TimeDelta();

  const bool is_negative = *sign == '-';
  base::TimeDelta magnitude =
      base::TimeDelta::FromHours(deserializer.PopDigits(2).value_or(0));

  // An apostrophe after the hours is mandated by the spec, but its absence is
  // tolerated. Any other non-digit separator ends the offset after the hours.
  const base::Optional<char> separator = deserializer.TryPopNonDigit();
  const bool is_foreign_separator =
      separator.has_value() && *separator != '\'';
  if (!is_foreign_separator) {
    // Minutes come right after the hours. Whatever follows the minutes is
    // ignored; among other things, that covers the trailing apostrophe that
    // only earlier versions of the spec mention.
    magnitude +=
        base::TimeDelta::FromMinutes(deserializer.PopDigits(2).value_or(0));
  }

  return is_negative ? -magnitude : magnitude;
}
} // namespace
// Converts `date`, a string in the PDF date format, into a UTC `base::Time`.
// Returns a null `base::Time` when no 4-digit year can be read or when the
// fields read do not form a valid date.
base::Time ParsePdfDate(base::StringPiece date) {
  // Although the spec mandates a "D:" prefix, earlier versions were not strict
  // about it, so it is skipped when present and not required otherwise.
  const bool has_prefix = date.substr(0, 2) == "D:";
  DateDeserializer deserializer(has_prefix ? date.substr(2) : date);

  // A date is only valid if it has a year; every other part is optional.
  const base::Optional<int> year = deserializer.PopDigits(4);
  if (!year)
    return base::Time();

  // Missing month and day fall back to 1; missing time-of-day parts fall back
  // to 0. The reads must happen in this order because each one consumes input.
  const int month = deserializer.PopDigits(2).value_or(1);
  const int day_of_month = deserializer.PopDigits(2).value_or(1);
  const int hour = deserializer.PopDigits(2).value_or(0);
  const int minute = deserializer.PopDigits(2).value_or(0);
  const int second = deserializer.PopDigits(2).value_or(0);

  base::Time::Exploded fields = {};
  fields.year = *year;
  fields.month = month;
  fields.day_of_month = day_of_month;
  fields.hour = hour;
  fields.minute = minute;
  fields.second = second;

  base::Time utc_time;
  if (!base::Time::FromUTCExploded(fields, &utc_time))
    return base::Time();

  // A failed optional field above leaves the deserializer stopped; resume it so
  // that the offset can still be read. Since `base::Time` is in UTC, the offset
  // is subtracted to normalize the result.
  deserializer.unstop();
  return utc_time - ParseOffset(deserializer);
}
} // namespace chrome_pdf
// Copyright 2021 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef PDF_PDF_UTILS_DATES_H_
#define PDF_PDF_UTILS_DATES_H_
#include "base/strings/string_piece.h"
namespace base {
class Time;
} // namespace base
namespace chrome_pdf {
// Parses a string in the PDF date format (see section 7.9.4 "Dates" of the ISO
// 32000-1 standard) and returns the corresponding time. If `date` cannot be
// parsed, returns a "null" time (one for which `base::Time::is_null()` returns
// `true`).
base::Time ParsePdfDate(base::StringPiece date);
} // namespace chrome_pdf
#endif // PDF_PDF_UTILS_DATES_H_
// Copyright 2021 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "pdf/pdf_utils/dates.h"
#include "base/strings/string_piece.h"
#include "base/time/time.h"
#include "testing/gtest/include/gtest/gtest.h"
namespace chrome_pdf {
namespace {
// Predicate for `EXPECT_PRED1`: true when `ParsePdfDate()` yields a null time
// for `input`.
bool IsInvalidPdfDate(base::StringPiece input) {
  const base::Time parsed = ParsePdfDate(input);
  return parsed.is_null();
}
} // namespace
// Text that does not start with a year is rejected.
TEST(DatesTest, ParsePdfDateNotADate) {
  EXPECT_TRUE(IsInvalidPdfDate("NotADate"));
}
// Well-formed digits whose field values are out of range are rejected.
TEST(DatesTest, ParsePdfDateInvalidDateValues) {
  EXPECT_TRUE(IsInvalidPdfDate("D:20202460909090"));
}
// A seconds field of 60 is currently treated as invalid.
TEST(DatesTest, ParsePdfDateLeapSeconds) {
  // TODO(dhoss): Explore whether leap seconds should be supported. They are
  // currently not supported by other PDF readers.
  EXPECT_TRUE(IsInvalidPdfDate("D:20150630235960"));
}
// Anything other than an exact "D:" prefix is left in place, so the input no
// longer begins with a year and is rejected.
TEST(DatesTest, ParsePdfDateBadPrefix) {
  static constexpr const char* kBadlyPrefixed[] = {
      "A:20200604214109",
      "D20200604214109",
      "D;20200604214109",
  };

  for (const char* input : kBadlyPrefixed) {
    SCOPED_TRACE(input);
    EXPECT_TRUE(IsInvalidPdfDate(input));
  }
}
// A date without a complete 4-digit year is invalid, whether the input is
// empty, consists of the "D:" prefix alone, or has too few year digits.
TEST(DatesTest, ParsePdfDateNoValidYear) {
  EXPECT_PRED1(IsInvalidPdfDate, "");
  // Prefix only: nothing is left to read a year from once "D:" is stripped.
  EXPECT_PRED1(IsInvalidPdfDate, "D:");
  EXPECT_PRED1(IsInvalidPdfDate, "D:999");
}
// The "D:" prefix may be omitted.
TEST(DatesTest, ParsePdfDateNoPrefix) {
  base::Time expected;
  ASSERT_TRUE(base::Time::FromUTCString("2012-03-30 01:47:52", &expected));

  const base::Time actual = ParsePdfDate("20120330014752");
  EXPECT_EQ(actual, expected);
}
// Dates without an offset are read as UTC, at every supported precision from
// year-only down to seconds.
TEST(DatesTest, ParsePdfDateNoTimeOffset) {
  static constexpr struct {
    const char* utc_string;
    const char* pdf_date;
  } kCases[] = {
      {"1997-01-01 00:00:00", "D:1997"},
      {"1998-12-01 00:00:00", "D:199812"},
      {"2002-10-12 00:00:00", "D:20021012"},
      {"2004-08-10 19:00:00", "D:2004081019"},
      {"2007-03-08 06:52:00", "D:200703080652"},
      {"2020-06-04 21:41:09", "D:20200604214109"},
  };

  for (const auto& test_case : kCases) {
    SCOPED_TRACE(test_case.pdf_date);
    base::Time expected;
    ASSERT_TRUE(base::Time::FromUTCString(test_case.utc_string, &expected));
    EXPECT_EQ(ParsePdfDate(test_case.pdf_date), expected);
  }
}
// A 'Z' marker means UTC; any hours/minutes written after it are ignored.
TEST(DatesTest, ParsePdfDateWithUtcOffset) {
  static constexpr struct {
    const char* utc_string;
    const char* pdf_date;
  } kCases[] = {
      {"2020-01-01 00:00:00", "D:2020Z"},
      {"2021-01-02 03:04:05", "D:20210102030405Z"},
      {"2021-01-02 03:04:05", "D:20210102030405Z08'00"},
  };

  for (const auto& test_case : kCases) {
    SCOPED_TRACE(test_case.pdf_date);
    base::Time expected;
    ASSERT_TRUE(base::Time::FromUTCString(test_case.utc_string, &expected));
    EXPECT_EQ(ParsePdfDate(test_case.pdf_date), expected);
  }
}
// Signed offsets are removed from the local time to obtain UTC. The last case
// has a single-digit hour, which is not a valid 2-digit field, so the whole
// offset is ignored.
TEST(DatesTest, ParsePdfDateWithTimeOffset) {
  static constexpr struct {
    const char* utc_string;
    const char* pdf_date;
  } kCases[] = {
      {"2020-06-05 05:41:20", "D:20200604214120-08'"},
      {"2020-06-04 13:41:20", "D:20200604214120+08'"},
      {"2020-06-05 09:11:20", "D:20200604214120-11'30"},
      {"2020-06-04 10:11:20", "D:20200604214120+11'30"},
      {"2020-06-04 21:41:20", "D:20200604214120+6'45'"},
  };

  for (const auto& test_case : kCases) {
    SCOPED_TRACE(test_case.pdf_date);
    base::Time expected;
    ASSERT_TRUE(base::Time::FromUTCString(test_case.utc_string, &expected));
    EXPECT_EQ(ParsePdfDate(test_case.pdf_date), expected);
  }
}
// Offsets carrying a seconds component are not supported; the time comes back
// without any offset applied.
TEST(DatesTest, ParsePdfDateWithSecondsOffset) {
  base::Time expected;
  ASSERT_TRUE(base::Time::FromUTCString("2020-06-04 21:41:20", &expected));

  const base::Time actual = ParsePdfDate("D:20200604214120+6'45'56'");
  EXPECT_EQ(actual, expected);
}
// The apostrophe after the offset hours is optional, both when the offset ends
// at the hours and when minutes follow immediately.
TEST(DatesTest, ParsePdfDateWithTimeOffsetNoApostrophe) {
  static constexpr struct {
    const char* utc_string;
    const char* pdf_date;
  } kCases[] = {
      {"2004-08-11 03:00:00", "D:2004081019-08"},
      {"2007-03-07 22:52:00", "D:200703080652+08"},
      {"2020-06-04 09:07:09", "D:20200604214109+1234"},
  };

  for (const auto& test_case : kCases) {
    SCOPED_TRACE(test_case.pdf_date);
    base::Time expected;
    ASSERT_TRUE(base::Time::FromUTCString(test_case.utc_string, &expected));
    EXPECT_EQ(ParsePdfDate(test_case.pdf_date), expected);
  }
}
// A non-apostrophe delimiter after the offset hours ends the offset there, so
// only the hours are applied.
TEST(DatesTest, ParsePdfDateWithTimeOffsetNonDigitDelimiter) {
  static constexpr struct {
    const char* utc_string;
    const char* pdf_date;
  } kCases[] = {
      {"2002-01-15 11:00:00", "D:20020115120000+01[23"},
      {"2002-01-15 13:00:00", "D:20020115120000-01]23"},
  };

  for (const auto& test_case : kCases) {
    SCOPED_TRACE(test_case.pdf_date);
    base::Time expected;
    ASSERT_TRUE(base::Time::FromUTCString(test_case.utc_string, &expected));
    EXPECT_EQ(ParsePdfDate(test_case.pdf_date), expected);
  }
}
// A month cut short to a single digit is discarded and the default is used.
TEST(DatesTest, ParsePdfDateTruncatedFields) {
  base::Time expected;
  ASSERT_TRUE(base::Time::FromUTCString("1997-01-01 00:00:00", &expected));

  const base::Time actual = ParsePdfDate("D:19973");
  EXPECT_EQ(actual, expected);
}
// Fields left out of the date take their defaults: 1 for month and day, 0 for
// hour, minute and second.
TEST(DatesTest, ParsePdfDateMissingFields) {
  static constexpr struct {
    const char* utc_string;
    const char* pdf_date;
  } kCases[] = {
      {"1997-01-01 00:00:00", "D:1997"},
      {"1998-12-01 00:00:00", "D:199812"},
      {"2002-10-12 00:00:00", "D:20021012'"},
      {"2004-08-10 19:00:00", "D:2004081019"},
      {"2007-03-08 06:52:00", "D:200703080652"},
      {"2020-06-04 21:41:09", "D:20200604214109"},
  };

  for (const auto& test_case : kCases) {
    SCOPED_TRACE(test_case.pdf_date);
    base::Time expected;
    ASSERT_TRUE(base::Time::FromUTCString(test_case.utc_string, &expected));
    EXPECT_EQ(ParsePdfDate(test_case.pdf_date), expected);
  }
}
// A sign in front of a date field is not read as part of that field.
TEST(DatesTest, ParsePdfDateFieldsWithLeadingSigns) {
  base::Time expected;
  ASSERT_TRUE(base::Time::FromUTCString("2007-01-01 00:00:00", &expected));

  const base::Time actual = ParsePdfDate("D:2007+3");
  EXPECT_EQ(actual, expected);
}
// Date fields stop being read at the first non-numeric field; the fields read
// before it are kept and the remaining ones take their defaults.
TEST(DatesTest, ParsePdfDateNonNumericalFields) {
  static constexpr struct {
    const char* utc_string;
    const char* pdf_date;
  } kCases[] = {
      {"1998-12-01 00:00:00", "D:199812abcd"},
      {"2002-10-12 00:00:00", "D:20021012hello'"},
      {"2004-08-10 19:00:00", "D:2004081019goodbye"},
      {"2007-03-08 06:52:00", "D:200703080652john"},
      {"2020-06-04 21:41:09", "D:20200604214109paul"},
      {"2021-01-01 00:00:00", "D:2021FB20095906"},
      {"2025-07-29 23:26:00", "D:2025073012+1234foo"},
  };

  for (const auto& test_case : kCases) {
    SCOPED_TRACE(test_case.pdf_date);
    base::Time expected;
    ASSERT_TRUE(base::Time::FromUTCString(test_case.utc_string, &expected));
    EXPECT_EQ(ParsePdfDate(test_case.pdf_date), expected);
  }
}
} // namespace chrome_pdf
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment