From e75ececf0453df61dac7abb1365c1e1ea149b355 Mon Sep 17 00:00:00 2001 From: Pearu Peterson Date: Tue, 9 Jun 2026 23:30:54 +0300 Subject: [PATCH] GH-41488: [C++][Python] Apply timestamp_parsers as fallback when parsing CSV date and time columns CSV columns explicitly typed as date32, date64, time32 or time64 could only be parsed from strict ISO-8601 strings; ConvertOptions::timestamp_parsers was consulted only for timestamp columns. Make the user-defined timestamp parsers act as a fallback for these column types: the built-in ISO-8601 parser is tried first (preserving existing behavior), then each configured parser in order. A timestamp produced by a fallback parser is floored to the day boundary for dates and reduced to the time of day for times, consistent with casting a timestamp to a date or time type. Type inference of date and time columns is deliberately unaffected: inference keeps using strict ISO-8601 parsing, otherwise a value with a time-of-day part could be inferred as a date and silently truncated. Also provide C-locale name tables to the vendored musl strptime used on Windows, where nl_langinfo() is unavailable: this makes %a/%A/%b/%B/%h/ %p/%c/%r/%x/%X work on Windows (matching musl's C locale), so that the month-name formats from the original issue reports parse on all platforms. Closes GH-28303. Co-Authored-By: Claude Fable 5 --- cpp/src/arrow/csv/converter.cc | 139 ++++++++++++++++++++++--- cpp/src/arrow/csv/converter_test.cc | 108 +++++++++++++++++++ cpp/src/arrow/csv/inference_internal.h | 23 +++- cpp/src/arrow/csv/options.h | 7 ++ cpp/src/arrow/vendored/musl/strptime.c | 19 +++- docs/source/cpp/csv.rst | 15 +++ python/pyarrow/_csv.pyx | 8 ++ python/pyarrow/tests/test_csv.py | 59 +++++++++++ 8 files changed, 361 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc index bb59d02cd206..fb039d76fed4 100644 --- a/cpp/src/arrow/csv/converter.cc +++ b/cpp/src/arrow/csv/converter.cc @@ -17,6 +17,7 @@ #include "arrow/csv/converter.h" +#include #include #include #include @@ -440,6 +441,13 @@ struct SingleParserTimestampValueDecoder : public ValueDecoder { const TimestampParser& parser_; }; +std::vector GetTimestampParsers(const ConvertOptions& options) { + std::vector parsers(options.timestamp_parsers.size()); + std::ranges::transform(options.timestamp_parsers, parsers.begin(), + [](const auto& parser) { return parser.get(); }); + return parsers; +} + struct MultipleParsersTimestampValueDecoder : public ValueDecoder { using value_type = int64_t; @@ -449,7 +457,7 @@ struct MultipleParsersTimestampValueDecoder : public ValueDecoder { : ValueDecoder(type, options, trie_cache), unit_(checked_cast(*type_).unit()), expect_timezone_(!checked_cast(*type_).timezone().empty()), - parsers_(GetParsers(options_)) {} + parsers_(GetTimestampParsers(options_)) {} Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) { bool zone_offset_present = false; @@ -464,18 +472,93 @@ struct MultipleParsersTimestampValueDecoder : public ValueDecoder { } protected: - using ParserVector = std::vector; + TimeUnit::type unit_; + bool expect_timezone_; + std::vector parsers_; +}; + +// +// Value decoder for dates and times, with fallback to user-defined +// timestamp parsers +// + +// Tries the ISO-8601 format first, then the user-defined timestamp parsers. +// A timestamp produced by a user-defined parser is floored to the day +// boundary for dates, and reduced to the time of day for times (consistent +// with casting a timestamp to date32/date64/time32/time64). +template +struct DateTimeWithParsersValueDecoder : public ValueDecoder { + using value_type = typename T::c_type; + + DateTimeWithParsersValueDecoder(const std::shared_ptr& type, + const ConvertOptions& options, + const TrieCache* trie_cache) + : ValueDecoder(type, options, trie_cache), + concrete_type_(checked_cast(*type)), + parse_unit_(GetParseUnit(concrete_type_)), + ticks_per_day_(TicksPerDay(parse_unit_)), + parsers_(GetTimestampParsers(options_)) {} - static ParserVector GetParsers(const ConvertOptions& options) { - ParserVector parsers(options.timestamp_parsers.size()); - for (size_t i = 0; i < options.timestamp_parsers.size(); ++i) { - parsers[i] = options.timestamp_parsers[i].get(); + Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) { + TrimWhiteSpace(&data, &size); + if (ARROW_PREDICT_TRUE(string_converter_.Convert( + concrete_type_, reinterpret_cast(data), size, out))) { + return Status::OK(); + } + for (const auto& parser : parsers_) { + int64_t timestamp = 0; + bool zone_offset_present = false; + if (parser->operator()(reinterpret_cast(data), size, parse_unit_, + ×tamp, &zone_offset_present) && + !zone_offset_present) { + // Floor division, to handle values before the epoch + int64_t days = timestamp / ticks_per_day_; + days -= (timestamp % ticks_per_day_) < 0; + if constexpr (std::is_same_v) { + *out = static_cast(days); + } else if constexpr (std::is_same_v) { + *out = days * kMillisPerDay; + } else { + static_assert(is_time_type::value); + *out = static_cast(timestamp - days * ticks_per_day_); + } + return Status::OK(); + } } - return parsers; + return GenericConversionError(type_, data, size); } - TimeUnit::type unit_; - bool expect_timezone_; + protected: + static constexpr int64_t kMillisPerDay = 86400000; + + static TimeUnit::type GetParseUnit(const T& type) { + if constexpr (is_time_type::value) { + // Parse in the time type's own unit, so that the time of day can be + // extracted without further conversion + return type.unit(); + } else { + return TimeUnit::SECOND; + } + } + + static int64_t TicksPerDay(TimeUnit::type unit) { + switch (unit) { + case TimeUnit::SECOND: + return 86400LL; + case TimeUnit::MILLI: + return 86400000LL; + case TimeUnit::MICRO: + return 86400000000LL; + case TimeUnit::NANO: + return 86400000000000LL; + } + return -1; // unreachable + } + + const T& concrete_type_; + arrow::internal::StringConverter string_converter_; + const TimeUnit::type parse_unit_; + const int64_t ticks_per_day_; std::vector parsers_; }; @@ -672,6 +755,24 @@ std::shared_ptr MakeTimestampConverter(const std::shared_ptr class ConverterType, typename T> +std::shared_ptr MakeDateTimeConverter(const std::shared_ptr& type, + const ConvertOptions& options, + MemoryPool* pool) { + if (options.timestamp_parsers.empty()) { + // Default to ISO-8601 + return std::make_shared>>(type, options, + pool); + } + // Try ISO-8601 first, then the user-defined timestamp parsers + return std::make_shared>>( + type, options, pool); +} + // // Concrete Converter factory for reals // @@ -743,10 +844,6 @@ Result> Converter::Make(const std::shared_ptr)) CONVERTER_CASE(Type::BINARY, @@ -760,6 +857,22 @@ Result> Converter::Make(const std::shared_ptr(type, options, pool); break; + case Type::DATE32: + ptr = MakeDateTimeConverter(type, options, pool); + break; + + case Type::DATE64: + ptr = MakeDateTimeConverter(type, options, pool); + break; + + case Type::TIME32: + ptr = MakeDateTimeConverter(type, options, pool); + break; + + case Type::TIME64: + ptr = MakeDateTimeConverter(type, options, pool); + break; + case Type::STRING: if (options.check_utf8) { ptr = std::make_shared>>( diff --git a/cpp/src/arrow/csv/converter_test.cc b/cpp/src/arrow/csv/converter_test.cc index 5dc078e7fd80..462d0868da2f 100644 --- a/cpp/src/arrow/csv/converter_test.cc +++ b/cpp/src/arrow/csv/converter_test.cc @@ -472,6 +472,40 @@ TEST(Date32Conversion, Errors) { AssertConversionError(date32(), {"2020-13-01\n"}, {0}); } +TEST(Date32Conversion, UserDefinedParsers) { + auto options = ConvertOptions::Defaults(); + const auto type = date32(); + + // Test a single parser + options.timestamp_parsers = {TimestampParser::MakeStrptime("%d/%m/%y")}; + AssertConversion(type, {"15/10/15,18/06/90\n"}, {{16723}, {7473}}, + options); + + // ISO-8601 values are still accepted when parsers are given + AssertConversion(type, {"2020-03-15,15/10/15\n"}, + {{18336}, {16723}}, options); + + // Test multiple parsers, with a pre-epoch value + options.timestamp_parsers.push_back(TimestampParser::MakeStrptime("%d-%m-%Y")); + AssertConversion(type, {"15/10/15,08-05-1945\n"}, + {{16723}, {-9004}}, options); + + // Test month names, parsed case-insensitively + options.timestamp_parsers = {TimestampParser::MakeStrptime("%d-%b-%y")}; + AssertConversion(type, {"15-OCT-15,18-Jun-90\n"}, + {{16723}, {7473}}, options); + + // Parsed timestamps are floored to the day boundary, also before the epoch + options.timestamp_parsers = {TimestampParser::MakeStrptime("%m/%d/%Y %H:%M")}; + AssertConversion(type, {"03/15/2020 14:30,05/08/1945 14:30\n"}, + {{18336}, {-9004}}, options); + + // Test errors + AssertConversionError(type, {"24-12-2020\n"}, {0}, options); + options.timestamp_parsers = {TimestampParser::MakeStrptime("%m/%d/%Y %z")}; + AssertConversionError(type, {"01/02/1970 +0000\n"}, {0}, options); +} + TEST(Date64Conversion, Basics) { AssertConversion(date64(), {"1945-05-08\n", "2020-03-15\n"}, {{-777945600000LL, 1584230400000LL}}); @@ -487,6 +521,38 @@ TEST(Date64Conversion, Errors) { AssertConversionError(date64(), {"2020-13-01\n"}, {0}); } +TEST(Date64Conversion, UserDefinedParsers) { + auto options = ConvertOptions::Defaults(); + const auto type = date64(); + + // Test a single parser + options.timestamp_parsers = {TimestampParser::MakeStrptime("%d/%m/%y")}; + AssertConversion(type, {"15/10/15,18/06/90\n"}, + {{1444867200000LL}, {645667200000LL}}, options); + + // ISO-8601 values are still accepted when parsers are given + AssertConversion(type, {"2020-03-15,15/10/15\n"}, + {{1584230400000LL}, {1444867200000LL}}, options); + + // Test multiple parsers, with a pre-epoch value + options.timestamp_parsers.push_back(TimestampParser::MakeStrptime("%d-%m-%Y")); + AssertConversion(type, {"15/10/15,08-05-1945\n"}, + {{1444867200000LL}, {-777945600000LL}}, options); + + // Test month names, parsed case-insensitively + options.timestamp_parsers = {TimestampParser::MakeStrptime("%d-%b-%y")}; + AssertConversion(type, {"15-OCT-15,18-Jun-90\n"}, + {{1444867200000LL}, {645667200000LL}}, options); + + // Parsed timestamps are floored to the day boundary, also before the epoch + options.timestamp_parsers = {TimestampParser::MakeStrptime("%m/%d/%Y %H:%M")}; + AssertConversion(type, {"03/15/2020 14:30,05/08/1945 14:30\n"}, + {{1584230400000LL}, {-777945600000LL}}, options); + + // Test errors + AssertConversionError(type, {"24-12-2020\n"}, {0}, options); +} + TEST(Time32Conversion, Seconds) { const auto type = time32(TimeUnit::SECOND); @@ -513,6 +579,30 @@ TEST(Time32Conversion, Millis) { AssertConversionError(type, {"23:59:60\n"}, {0}); } +TEST(Time32Conversion, UserDefinedParsers) { + auto options = ConvertOptions::Defaults(); + + // Test a single parser, with non-zero-padded hours + options.timestamp_parsers = {TimestampParser::MakeStrptime("%H:%M:%S")}; + AssertConversion(time32(TimeUnit::SECOND), {"7:55:00,12:01:02\n"}, + {{28500}, {43262}}, options); + AssertConversion(time32(TimeUnit::MILLI), {"7:55:00\n"}, + {{28500000}}, options); + + // ISO-8601 values are still accepted when parsers are given + AssertConversion(time32(TimeUnit::SECOND), {"07:55:00,7:55:00\n"}, + {{28500}, {28500}}, options); + + // The time of day is extracted from parsed timestamps, also before the epoch + options.timestamp_parsers.push_back(TimestampParser::MakeStrptime("%Y-%m-%d %H:%M")); + AssertConversion(time32(TimeUnit::SECOND), + {"2020-03-15 07:55,1945-05-08 07:55\n"}, + {{28500}, {28500}}, options); + + // Test errors + AssertConversionError(time32(TimeUnit::SECOND), {"24:00:00\n"}, {0}, options); +} + TEST(Time64Conversion, Micros) { const auto type = time64(TimeUnit::MICRO); @@ -539,6 +629,24 @@ TEST(Time64Conversion, Nanos) { AssertConversionError(type, {"23:59:60\n"}, {0}); } +TEST(Time64Conversion, UserDefinedParsers) { + auto options = ConvertOptions::Defaults(); + + // Test a single parser, with non-zero-padded hours + options.timestamp_parsers = {TimestampParser::MakeStrptime("%H:%M:%S")}; + AssertConversion(time64(TimeUnit::MICRO), {"7:55:00\n"}, + {{28500000000LL}}, options); + AssertConversion(time64(TimeUnit::NANO), {"7:55:00\n"}, + {{28500000000000LL}}, options); + + // ISO-8601 values are still accepted when parsers are given + AssertConversion(time64(TimeUnit::MICRO), {"07:55:00.123456\n"}, + {{28500123456LL}}, options); + + // Test errors + AssertConversionError(time64(TimeUnit::MICRO), {"24:00:00\n"}, {0}, options); +} + TEST(TimestampConversion, Basics) { auto type = timestamp(TimeUnit::SECOND); diff --git a/cpp/src/arrow/csv/inference_internal.h b/cpp/src/arrow/csv/inference_internal.h index c08407ff7f6f..94b60e647cb5 100644 --- a/cpp/src/arrow/csv/inference_internal.h +++ b/cpp/src/arrow/csv/inference_internal.h @@ -46,7 +46,15 @@ enum class InferKind { class InferStatus { public: explicit InferStatus(const ConvertOptions& options) - : kind_(InferKind::Null), can_loosen_type_(true), options_(options) {} + : kind_(InferKind::Null), can_loosen_type_(true), options_(options) { + if (!options.timestamp_parsers.empty()) { + // Date and time inference must not use the user-defined timestamp parsers, + // otherwise a value with a time-of-day (resp. date) part could be inferred + // as a date (resp. time) and be silently truncated. + date_time_options_ = std::make_unique(options); + date_time_options_->timestamp_parsers.clear(); + } + } InferKind kind() const { return kind_; } @@ -106,6 +114,12 @@ class InferStatus { return Converter::Make(type, options_, pool); }; + auto make_date_time_converter = + [&](std::shared_ptr type) -> Result> { + return Converter::Make(type, date_time_options_ ? *date_time_options_ : options_, + pool); + }; + auto make_dict_converter = [&](std::shared_ptr type) -> Result> { ARROW_ASSIGN_OR_RAISE(auto dict_converter, @@ -122,9 +136,9 @@ class InferStatus { case InferKind::Boolean: return make_converter(boolean()); case InferKind::Date: - return make_converter(date32()); + return make_date_time_converter(date32()); case InferKind::Time: - return make_converter(time32(TimeUnit::SECOND)); + return make_date_time_converter(time32(TimeUnit::SECOND)); case InferKind::Timestamp: return make_converter(timestamp(TimeUnit::SECOND)); case InferKind::TimestampNS: @@ -159,6 +173,9 @@ class InferStatus { InferKind kind_; bool can_loosen_type_; const ConvertOptions& options_; + // Copy of options_ with timestamp_parsers cleared, for date and time inference. + // Only allocated when custom timestamp parsers are configured. + std::unique_ptr date_time_options_; }; } // namespace csv diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h index f0b923d0f323..b9391d63bbfd 100644 --- a/cpp/src/arrow/csv/options.h +++ b/cpp/src/arrow/csv/options.h @@ -129,6 +129,13 @@ struct ARROW_EXPORT ConvertOptions { /// the CSV conversion logic will try parsing values starting from the /// beginning of this vector. If no parsers are specified, we use the default /// built-in ISO-8601 parser. + /// + /// These parsers are also used as a fallback for columns explicitly typed + /// as date32, date64, time32 or time64, after the built-in ISO-8601 parser + /// failed on a value. A timestamp produced by a fallback parser is floored + /// to the day boundary for dates, and reduced to the time of day for times + /// (like casting a timestamp to a date or time type). Type inference of + /// date and time columns is not affected. std::vector> timestamp_parsers; /// Create conversion options with default values, including conventional diff --git a/cpp/src/arrow/vendored/musl/strptime.c b/cpp/src/arrow/vendored/musl/strptime.c index 41912fd1bbf4..7982dde2dc54 100644 --- a/cpp/src/arrow/vendored/musl/strptime.c +++ b/cpp/src/arrow/vendored/musl/strptime.c @@ -17,12 +17,29 @@ #undef HAVE_LANGINFO -#ifndef _WIN32 +#if !defined(_WIN32) && !defined(ARROW_TEST_FALLBACK_LANGINFO) #define HAVE_LANGINFO 1 #endif #ifdef HAVE_LANGINFO #include +#else +// C-locale replacement for nl_langinfo(), restricted to the items used below. +// The constants follow the glibc/musl layout that the symbolic_range loop +// relies on: full names directly follow the abbreviated ones. +enum { ABDAY_1 = 0, DAY_1 = 7, ABMON_1 = 14, MON_1 = 26, AM_STR = 38, PM_STR = 39, + D_T_FMT = 40, D_FMT = 41, T_FMT = 42, T_FMT_AMPM = 43 }; +static const char *arrow_c_langinfo[] = { + "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", + "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", + "Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", + "January", "February", "March", "April", "May", "June", + "July", "August", "September", "October", "November", "December", + "AM", "PM", + "%a %b %e %T %Y", "%m/%d/%y", "%H:%M:%S", "%I:%M:%S %p"}; +#define nl_langinfo(x) arrow_c_langinfo[x] +#define HAVE_LANGINFO 1 #endif #define strptime arrow_strptime diff --git a/docs/source/cpp/csv.rst b/docs/source/cpp/csv.rst index 74ee0bb4fbbf..cffa7f992c8d 100644 --- a/docs/source/cpp/csv.rst +++ b/docs/source/cpp/csv.rst @@ -348,6 +348,21 @@ values afterwards using the ``assume_timezone`` compute function. | | 2021-01-01T00:00:00Z | | +-------------------+------------------------------+-------------------+ +Date and time parsing +--------------------- + +Columns explicitly typed as date32, date64, time32 or time64 are parsed +from the ISO-8601 formats (e.g. ``2021-01-01`` and ``01:23:45``, +respectively). If a value fails to parse as such, the user-defined parsers +in :member:`ConvertOptions::timestamp_parsers` are then tried in order. +A timestamp produced by such a parser is floored to the day boundary for +dates, and reduced to the time of day for times, like casting a timestamp +to a date or time type. For example, with the parser ``"%m/%d/%Y %H:%M"``, +the value ``03/15/2020 14:30`` in a date32 column yields ``2020-03-15``. + +Type inference of date and time columns always uses the ISO-8601 formats +only, regardless of :member:`ConvertOptions::timestamp_parsers`. + Nulls ----- diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx index f2cefb8ff3fb..1e1c455328a2 100644 --- a/python/pyarrow/_csv.pyx +++ b/python/pyarrow/_csv.pyx @@ -700,6 +700,14 @@ cdef class ConvertOptions(_Weakrefable): value ISO8601() can also be given). By default, a fast built-in ISO-8601 parser is used. + These parsers are also used as a fallback when converting columns + explicitly typed (using `column_types`) as date32, date64, time32 + or time64, after the built-in ISO-8601 parser failed on a value. + A timestamp produced by a fallback parser is floored to the day + boundary for dates, and reduced to the time of day for times (like + casting a timestamp to a date or time type). Type inference of + date and time columns is not affected. + Examples -------- diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index ac9012ebdf63..8fb410be663d 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -1164,6 +1164,65 @@ def test_dates(self): 'b': [datetime(1970, 1, 2), datetime(1971, 1, 2)], } + def test_dates_with_timestamp_parsers(self): + # Custom timestamp parsers are used as a fallback for explicitly-typed + # date columns (GH-28303, GH-41488) + rows = b"a,b\n15/10/2015,1970-01-02\n18/06/1990,15/10/2015\n" + opts = ConvertOptions() + opts.column_types = {'a': pa.date32(), 'b': pa.date64()} + with pytest.raises(pa.ArrowInvalid, + match="CSV conversion error to date"): + self.read_bytes(rows, convert_options=opts) + + opts.timestamp_parsers = ['%d/%m/%Y'] + table = self.read_bytes(rows, convert_options=opts) + schema = pa.schema([('a', pa.date32()), + ('b', pa.date64())]) + assert table.schema == schema + # ISO values keep working alongside custom-parsed ones + assert table.to_pydict() == { + 'a': [date(2015, 10, 15), date(1990, 6, 18)], + 'b': [date(1970, 1, 2), date(2015, 10, 15)], + } + + # Month names are parsed case-insensitively + rows = b"a\n15-OCT-15\n18-Jun-90\n" + opts = ConvertOptions(column_types={'a': pa.date32()}, + timestamp_parsers=['%d-%b-%y']) + table = self.read_bytes(rows, convert_options=opts) + assert table.to_pydict() == { + 'a': [date(2015, 10, 15), date(1990, 6, 18)], + } + + # Inference is unaffected by the fallback: a non-ISO date string + # still infers as timestamp, not date32 + rows = b"a\n15/10/2015\n" + opts = ConvertOptions(timestamp_parsers=['%d/%m/%Y']) + table = self.read_bytes(rows, convert_options=opts) + assert table.schema == pa.schema([('a', pa.timestamp('s'))]) + + def test_times_with_timestamp_parsers(self): + # Custom timestamp parsers are used as a fallback for explicitly-typed + # time columns (GH-41488) + from datetime import time + + rows = b"a,b\n7:55:00,12:01:02\n23:59:59,0:00:01\n" + opts = ConvertOptions() + opts.column_types = {'a': pa.time32('s'), 'b': pa.time64('us')} + with pytest.raises(pa.ArrowInvalid, + match="CSV conversion error to time"): + self.read_bytes(rows, convert_options=opts) + + opts.timestamp_parsers = ['%H:%M:%S'] + table = self.read_bytes(rows, convert_options=opts) + schema = pa.schema([('a', pa.time32('s')), + ('b', pa.time64('us'))]) + assert table.schema == schema + assert table.to_pydict() == { + 'a': [time(7, 55, 0), time(23, 59, 59)], + 'b': [time(12, 1, 2), time(0, 0, 1)], + } + def test_times(self): # Times are inferred as time32[s] by default from datetime import time