Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 126 additions & 13 deletions cpp/src/arrow/csv/converter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include "arrow/csv/converter.h"

#include <algorithm>
#include <array>
#include <cstring>
#include <limits>
Expand Down Expand Up @@ -440,6 +441,13 @@ struct SingleParserTimestampValueDecoder : public ValueDecoder {
const TimestampParser& parser_;
};

std::vector<const TimestampParser*> GetTimestampParsers(const ConvertOptions& options) {
std::vector<const TimestampParser*> parsers(options.timestamp_parsers.size());
std::ranges::transform(options.timestamp_parsers, parsers.begin(),
[](const auto& parser) { return parser.get(); });
return parsers;
}

struct MultipleParsersTimestampValueDecoder : public ValueDecoder {
using value_type = int64_t;

Expand All @@ -449,7 +457,7 @@ struct MultipleParsersTimestampValueDecoder : public ValueDecoder {
: ValueDecoder(type, options, trie_cache),
unit_(checked_cast<const TimestampType&>(*type_).unit()),
expect_timezone_(!checked_cast<const TimestampType&>(*type_).timezone().empty()),
parsers_(GetParsers(options_)) {}
parsers_(GetTimestampParsers(options_)) {}

Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
bool zone_offset_present = false;
Expand All @@ -464,18 +472,93 @@ struct MultipleParsersTimestampValueDecoder : public ValueDecoder {
}

protected:
using ParserVector = std::vector<const TimestampParser*>;
TimeUnit::type unit_;
bool expect_timezone_;
std::vector<const TimestampParser*> parsers_;
};

//
// Value decoder for dates and times, with fallback to user-defined
// timestamp parsers
//

// Tries the ISO-8601 format first, then the user-defined timestamp parsers.
// A timestamp produced by a user-defined parser is floored to the day
// boundary for dates, and reduced to the time of day for times (consistent
// with casting a timestamp to date32/date64/time32/time64).
template <typename T>
struct DateTimeWithParsersValueDecoder : public ValueDecoder {
using value_type = typename T::c_type;

// Builds a decoder for the date or time type `type`.
// The initializers are chained: parse_unit_ is derived from concrete_type_,
// and ticks_per_day_ from parse_unit_, so they must stay in this order (which
// matches the member declaration order).
// NOTE(review): concrete_type_ is a reference to the object `type` points to;
// presumably the base class keeps that object alive -- confirm.
DateTimeWithParsersValueDecoder(const std::shared_ptr<DataType>& type,
const ConvertOptions& options,
const TrieCache* trie_cache)
: ValueDecoder(type, options, trie_cache),
concrete_type_(checked_cast<const T&>(*type)),
parse_unit_(GetParseUnit(concrete_type_)),
ticks_per_day_(TicksPerDay(parse_unit_)),
parsers_(GetTimestampParsers(options_)) {}

static ParserVector GetParsers(const ConvertOptions& options) {
ParserVector parsers(options.timestamp_parsers.size());
for (size_t i = 0; i < options.timestamp_parsers.size(); ++i) {
parsers[i] = options.timestamp_parsers[i].get();
// Decodes one cell (`data`, `size`) into `*out`.
//
// TrimWhiteSpace is applied first, then string_converter_ is tried. Only if
// it fails are the user-defined parsers tried, in order; the first parser
// that succeeds without reporting a zone offset determines the result.
// Returns a conversion error when nothing accepts the value.
// `quoted` is not used by this decoder.
Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
TrimWhiteSpace(&data, &size);
// Fast path: the built-in converter for T
if (ARROW_PREDICT_TRUE(string_converter_.Convert(
concrete_type_, reinterpret_cast<const char*>(data), size, out))) {
return Status::OK();
}
for (const auto& parser : parsers_) {
int64_t timestamp = 0;
bool zone_offset_present = false;
// A parser result that carries a zone offset is not accepted; the next
// parser is tried instead.
if (parser->operator()(reinterpret_cast<const char*>(data), size, parse_unit_,
&timestamp, &zone_offset_present) &&
!zone_offset_present) {
// Floor division, to handle values before the epoch.
// Integer division truncates toward zero, so one day is subtracted
// whenever the remainder is negative.
int64_t days = timestamp / ticks_per_day_;
days -= (timestamp % ticks_per_day_) < 0;
if constexpr (std::is_same_v<T, Date32Type>) {
// date32: number of days
*out = static_cast<value_type>(days);
} else if constexpr (std::is_same_v<T, Date64Type>) {
// date64: milliseconds, always a multiple of a full day
*out = days * kMillisPerDay;
} else {
static_assert(is_time_type<T>::value);
// Time of day: the remainder relative to the floored day, hence
// non-negative and smaller than ticks_per_day_
*out = static_cast<value_type>(timestamp - days * ticks_per_day_);
}
return Status::OK();
}
}
return parsers;
return GenericConversionError(type_, data, size);
}

TimeUnit::type unit_;
bool expect_timezone_;
protected:
static constexpr int64_t kMillisPerDay = 86400000;

// Selects the unit in which the user-defined parsers are asked to produce
// timestamps for a column of type T.
static TimeUnit::type GetParseUnit(const T& type) {
  if constexpr (!is_time_type<T>::value) {
    // Date types: second resolution is used
    return TimeUnit::SECOND;
  } else {
    // Time types: use the column's own unit, so the time of day can be
    // taken from the parsed value without any further unit conversion
    return type.unit();
  }
}

// Returns how many ticks of `unit` make up one 24-hour day.
static int64_t TicksPerDay(TimeUnit::type unit) {
  constexpr int64_t kSecondsPerDay = 86400LL;
  switch (unit) {
    case TimeUnit::SECOND:
      return kSecondsPerDay;
    case TimeUnit::MILLI:
      return kSecondsPerDay * 1000LL;
    case TimeUnit::MICRO:
      return kSecondsPerDay * 1000000LL;
    case TimeUnit::NANO:
      return kSecondsPerDay * 1000000000LL;
  }
  // All enumerators are handled above; this only satisfies the compiler
  return -1;  // unreachable
}

const T& concrete_type_;
arrow::internal::StringConverter<T> string_converter_;
const TimeUnit::type parse_unit_;
const int64_t ticks_per_day_;
std::vector<const TimestampParser*> parsers_;
};

Expand Down Expand Up @@ -672,6 +755,24 @@ std::shared_ptr<Converter> MakeTimestampConverter(const std::shared_ptr<DataType
}
}

//
// Concrete Converter factory for dates and times
//

template <template <typename, typename> class ConverterType, typename T>
std::shared_ptr<Converter> MakeDateTimeConverter(const std::shared_ptr<DataType>& type,
const ConvertOptions& options,
MemoryPool* pool) {
if (options.timestamp_parsers.empty()) {
// Default to ISO-8601
return std::make_shared<ConverterType<T, NumericValueDecoder<T>>>(type, options,
pool);
}
// Try ISO-8601 first, then the user-defined timestamp parsers
return std::make_shared<ConverterType<T, DateTimeWithParsersValueDecoder<T>>>(
type, options, pool);
}

//
// Concrete Converter factory for reals
//
Expand Down Expand Up @@ -743,10 +844,6 @@ Result<std::shared_ptr<Converter>> Converter::Make(const std::shared_ptr<DataTyp
NUMERIC_CONVERTER_CASE(Type::FLOAT, FloatType)
NUMERIC_CONVERTER_CASE(Type::DOUBLE, DoubleType)
REAL_CONVERTER_CASE(Type::DECIMAL, Decimal128Type, DecimalValueDecoder)
NUMERIC_CONVERTER_CASE(Type::DATE32, Date32Type)
NUMERIC_CONVERTER_CASE(Type::DATE64, Date64Type)
NUMERIC_CONVERTER_CASE(Type::TIME32, Time32Type)
NUMERIC_CONVERTER_CASE(Type::TIME64, Time64Type)
NUMERIC_CONVERTER_CASE(Type::DURATION, DurationType)
CONVERTER_CASE(Type::BOOL, (PrimitiveConverter<BooleanType, BooleanValueDecoder>))
CONVERTER_CASE(Type::BINARY,
Expand All @@ -760,6 +857,22 @@ Result<std::shared_ptr<Converter>> Converter::Make(const std::shared_ptr<DataTyp
ptr = MakeTimestampConverter<PrimitiveConverter>(type, options, pool);
break;

case Type::DATE32:
ptr = MakeDateTimeConverter<PrimitiveConverter, Date32Type>(type, options, pool);
break;

case Type::DATE64:
ptr = MakeDateTimeConverter<PrimitiveConverter, Date64Type>(type, options, pool);
break;

case Type::TIME32:
ptr = MakeDateTimeConverter<PrimitiveConverter, Time32Type>(type, options, pool);
break;

case Type::TIME64:
ptr = MakeDateTimeConverter<PrimitiveConverter, Time64Type>(type, options, pool);
break;

case Type::STRING:
if (options.check_utf8) {
ptr = std::make_shared<PrimitiveConverter<StringType, BinaryValueDecoder<true>>>(
Expand Down
108 changes: 108 additions & 0 deletions cpp/src/arrow/csv/converter_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,40 @@ TEST(Date32Conversion, Errors) {
AssertConversionError(date32(), {"2020-13-01\n"}, {0});
}

// Checks date32 conversion when ConvertOptions::timestamp_parsers is set.
//
// Expected values are days since 1970-01-01:
//   16723 = 2015-10-15, 7473 = 1990-06-18, 18336 = 2020-03-15,
//   -9004 = 1945-05-08.
//
// `options.timestamp_parsers` is modified step by step, so every assertion
// depends on the parsers installed by the statements before it.
TEST(Date32Conversion, UserDefinedParsers) {
auto options = ConvertOptions::Defaults();
const auto type = date32();

// Test a single parser (two-digit years: "15" -> 2015, "90" -> 1990)
options.timestamp_parsers = {TimestampParser::MakeStrptime("%d/%m/%y")};
AssertConversion<Date32Type, int32_t>(type, {"15/10/15,18/06/90\n"}, {{16723}, {7473}},
options);

// ISO-8601 values are still accepted when parsers are given
AssertConversion<Date32Type, int32_t>(type, {"2020-03-15,15/10/15\n"},
{{18336}, {16723}}, options);

// Test multiple parsers, with a pre-epoch value
// (the second value only matches the parser added here)
options.timestamp_parsers.push_back(TimestampParser::MakeStrptime("%d-%m-%Y"));
AssertConversion<Date32Type, int32_t>(type, {"15/10/15,08-05-1945\n"},
{{16723}, {-9004}}, options);

// Test month names, parsed case-insensitively
options.timestamp_parsers = {TimestampParser::MakeStrptime("%d-%b-%y")};
AssertConversion<Date32Type, int32_t>(type, {"15-OCT-15,18-Jun-90\n"},
{{16723}, {7473}}, options);

// Parsed timestamps are floored to the day boundary, also before the epoch
// (the 14:30 time-of-day part must not change the resulting day)
options.timestamp_parsers = {TimestampParser::MakeStrptime("%m/%d/%Y %H:%M")};
AssertConversion<Date32Type, int32_t>(type, {"03/15/2020 14:30,05/08/1945 14:30\n"},
{{18336}, {-9004}}, options);

// Test errors
// This value matches neither ISO-8601 nor the "%m/%d/%Y %H:%M" parser set above
AssertConversionError(type, {"24-12-2020\n"}, {0}, options);
// A value carrying a zone offset is expected to be rejected for dates
options.timestamp_parsers = {TimestampParser::MakeStrptime("%m/%d/%Y %z")};
AssertConversionError(type, {"01/02/1970 +0000\n"}, {0}, options);
}

TEST(Date64Conversion, Basics) {
AssertConversion<Date64Type, int64_t>(date64(), {"1945-05-08\n", "2020-03-15\n"},
{{-777945600000LL, 1584230400000LL}});
Expand All @@ -487,6 +521,38 @@ TEST(Date64Conversion, Errors) {
AssertConversionError(date64(), {"2020-13-01\n"}, {0});
}

// Checks date64 conversion when ConvertOptions::timestamp_parsers is set.
//
// Expected values are milliseconds since 1970-01-01, always at midnight:
//   1444867200000 = 2015-10-15, 645667200000 = 1990-06-18,
//   1584230400000 = 2020-03-15, -777945600000 = 1945-05-08.
//
// `options.timestamp_parsers` is modified step by step, so every assertion
// depends on the parsers installed by the statements before it.
TEST(Date64Conversion, UserDefinedParsers) {
auto options = ConvertOptions::Defaults();
const auto type = date64();

// Test a single parser (two-digit years: "15" -> 2015, "90" -> 1990)
options.timestamp_parsers = {TimestampParser::MakeStrptime("%d/%m/%y")};
AssertConversion<Date64Type, int64_t>(type, {"15/10/15,18/06/90\n"},
{{1444867200000LL}, {645667200000LL}}, options);

// ISO-8601 values are still accepted when parsers are given
AssertConversion<Date64Type, int64_t>(type, {"2020-03-15,15/10/15\n"},
{{1584230400000LL}, {1444867200000LL}}, options);

// Test multiple parsers, with a pre-epoch value
// (the second value only matches the parser added here)
options.timestamp_parsers.push_back(TimestampParser::MakeStrptime("%d-%m-%Y"));
AssertConversion<Date64Type, int64_t>(type, {"15/10/15,08-05-1945\n"},
{{1444867200000LL}, {-777945600000LL}}, options);

// Test month names, parsed case-insensitively
options.timestamp_parsers = {TimestampParser::MakeStrptime("%d-%b-%y")};
AssertConversion<Date64Type, int64_t>(type, {"15-OCT-15,18-Jun-90\n"},
{{1444867200000LL}, {645667200000LL}}, options);

// Parsed timestamps are floored to the day boundary, also before the epoch
// (the 14:30 time-of-day part must not leak into the millisecond value)
options.timestamp_parsers = {TimestampParser::MakeStrptime("%m/%d/%Y %H:%M")};
AssertConversion<Date64Type, int64_t>(type, {"03/15/2020 14:30,05/08/1945 14:30\n"},
{{1584230400000LL}, {-777945600000LL}}, options);

// Test errors
// This value matches neither ISO-8601 nor the "%m/%d/%Y %H:%M" parser set above
AssertConversionError(type, {"24-12-2020\n"}, {0}, options);
}

TEST(Time32Conversion, Seconds) {
const auto type = time32(TimeUnit::SECOND);

Expand All @@ -513,6 +579,30 @@ TEST(Time32Conversion, Millis) {
AssertConversionError(type, {"23:59:60\n"}, {0});
}

// Checks time32 conversion when ConvertOptions::timestamp_parsers is set.
//
// Expected values are the time of day in the column's unit:
//   28500 s = 07:55:00, 43262 s = 12:01:02, 28500000 ms = 07:55:00.
//
// `options.timestamp_parsers` is modified step by step, so every assertion
// depends on the parsers installed by the statements before it.
TEST(Time32Conversion, UserDefinedParsers) {
auto options = ConvertOptions::Defaults();

// Test a single parser, with non-zero-padded hours
options.timestamp_parsers = {TimestampParser::MakeStrptime("%H:%M:%S")};
AssertConversion<Time32Type, int32_t>(time32(TimeUnit::SECOND), {"7:55:00,12:01:02\n"},
{{28500}, {43262}}, options);
// Same input with a millisecond column: the result is scaled to the column unit
AssertConversion<Time32Type, int32_t>(time32(TimeUnit::MILLI), {"7:55:00\n"},
{{28500000}}, options);

// ISO-8601 values are still accepted when parsers are given
AssertConversion<Time32Type, int32_t>(time32(TimeUnit::SECOND), {"07:55:00,7:55:00\n"},
{{28500}, {28500}}, options);

// The time of day is extracted from parsed timestamps, also before the epoch
// (the date part must not influence the result)
options.timestamp_parsers.push_back(TimestampParser::MakeStrptime("%Y-%m-%d %H:%M"));
AssertConversion<Time32Type, int32_t>(time32(TimeUnit::SECOND),
{"2020-03-15 07:55,1945-05-08 07:55\n"},
{{28500}, {28500}}, options);

// Test errors: an hour of 24 is expected to be rejected by every configured parser
AssertConversionError(time32(TimeUnit::SECOND), {"24:00:00\n"}, {0}, options);
}

TEST(Time64Conversion, Micros) {
const auto type = time64(TimeUnit::MICRO);

Expand All @@ -539,6 +629,24 @@ TEST(Time64Conversion, Nanos) {
AssertConversionError(type, {"23:59:60\n"}, {0});
}

// Checks time64 conversion when ConvertOptions::timestamp_parsers is set.
//
// Expected values are the time of day in the column's unit:
//   28500000000 us = 07:55:00, 28500000000000 ns = 07:55:00,
//   28500123456 us = 07:55:00.123456.
TEST(Time64Conversion, UserDefinedParsers) {
auto options = ConvertOptions::Defaults();

// Test a single parser, with non-zero-padded hours
options.timestamp_parsers = {TimestampParser::MakeStrptime("%H:%M:%S")};
AssertConversion<Time64Type, int64_t>(time64(TimeUnit::MICRO), {"7:55:00\n"},
{{28500000000LL}}, options);
// Same input with a nanosecond column: the result is scaled to the column unit
AssertConversion<Time64Type, int64_t>(time64(TimeUnit::NANO), {"7:55:00\n"},
{{28500000000000LL}}, options);

// ISO-8601 values are still accepted when parsers are given
// (including fractional seconds, which the "%H:%M:%S" format does not describe)
AssertConversion<Time64Type, int64_t>(time64(TimeUnit::MICRO), {"07:55:00.123456\n"},
{{28500123456LL}}, options);

// Test errors: an hour of 24 is expected to be rejected by every configured parser
AssertConversionError(time64(TimeUnit::MICRO), {"24:00:00\n"}, {0}, options);
}

TEST(TimestampConversion, Basics) {
auto type = timestamp(TimeUnit::SECOND);

Expand Down
23 changes: 20 additions & 3 deletions cpp/src/arrow/csv/inference_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,15 @@ enum class InferKind {
class InferStatus {
public:
explicit InferStatus(const ConvertOptions& options)
: kind_(InferKind::Null), can_loosen_type_(true), options_(options) {}
: kind_(InferKind::Null), can_loosen_type_(true), options_(options) {
if (!options.timestamp_parsers.empty()) {
// Date and time inference must not use the user-defined timestamp parsers,
// otherwise a value with a time-of-day (resp. date) part could be inferred
// as a date (resp. time) and be silently truncated.
date_time_options_ = std::make_unique<ConvertOptions>(options);
date_time_options_->timestamp_parsers.clear();
}
}

InferKind kind() const { return kind_; }

Expand Down Expand Up @@ -106,6 +114,12 @@ class InferStatus {
return Converter::Make(type, options_, pool);
};

auto make_date_time_converter =
[&](std::shared_ptr<DataType> type) -> Result<std::shared_ptr<Converter>> {
return Converter::Make(type, date_time_options_ ? *date_time_options_ : options_,
pool);
};

auto make_dict_converter =
[&](std::shared_ptr<DataType> type) -> Result<std::shared_ptr<Converter>> {
ARROW_ASSIGN_OR_RAISE(auto dict_converter,
Expand All @@ -122,9 +136,9 @@ class InferStatus {
case InferKind::Boolean:
return make_converter(boolean());
case InferKind::Date:
return make_converter(date32());
return make_date_time_converter(date32());
case InferKind::Time:
return make_converter(time32(TimeUnit::SECOND));
return make_date_time_converter(time32(TimeUnit::SECOND));
case InferKind::Timestamp:
return make_converter(timestamp(TimeUnit::SECOND));
case InferKind::TimestampNS:
Expand Down Expand Up @@ -159,6 +173,9 @@ class InferStatus {
InferKind kind_;
bool can_loosen_type_;
const ConvertOptions& options_;
// Copy of options_ with timestamp_parsers cleared, for date and time inference.
// Only allocated when custom timestamp parsers are configured.
std::unique_ptr<ConvertOptions> date_time_options_;
};

} // namespace csv
Expand Down
7 changes: 7 additions & 0 deletions cpp/src/arrow/csv/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,13 @@ struct ARROW_EXPORT ConvertOptions {
/// the CSV conversion logic will try parsing values starting from the
/// beginning of this vector. If no parsers are specified, we use the default
/// built-in ISO-8601 parser.
///
/// These parsers are also used as a fallback for columns explicitly typed
/// as date32, date64, time32 or time64, after the built-in ISO-8601 parser
/// failed on a value. A timestamp produced by a fallback parser is floored
/// to the day boundary for dates, and reduced to the time of day for times
/// (like casting a timestamp to a date or time type). Type inference of
/// date and time columns is not affected.
std::vector<std::shared_ptr<TimestampParser>> timestamp_parsers;

/// Create conversion options with default values, including conventional
Expand Down
Loading
Loading