-
Notifications
You must be signed in to change notification settings - Fork 29.2k
[SPARK-57526][SQL] Add the timestamp_nanos function to create nanosecond-precision timestamps from numeric nanoseconds
#56616
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
8b14b71
e56f3b6
0a83aff
e81da36
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -759,6 +759,98 @@ case class MicrosToTimestamp(child: Expression) | |
| copy(child = newChild) | ||
| } | ||
|
|
||
| // scalastyle:off line.size.limit line.contains.tab | ||
| @ExpressionDescription( | ||
| usage = "_FUNC_(nanoseconds) - Creates timestamp with the local time zone and nanosecond precision (TIMESTAMP_LTZ(9)) from the number of nanoseconds since UTC epoch.", | ||
| examples = """ | ||
| Examples: | ||
| > SET spark.sql.timestampNanosTypes.enabled=true; | ||
| spark.sql.timestampNanosTypes.enabled true | ||
| > SELECT _FUNC_(1230219000123456789); | ||
| 2008-12-25 07:30:00.123456789 | ||
| """, | ||
| group = "datetime_funcs", | ||
| since = "4.3.0") | ||
| // scalastyle:on line.size.limit line.contains.tab | ||
| case class NanosToTimestamp(child: Expression) | ||
| extends UnaryExpression with ExpectsInputTypes { | ||
| override def nullIntolerant: Boolean = true | ||
|
|
||
| // Accepts an integral or DECIMAL nanosecond count only. DECIMAL is required to span the full | ||
| // [0001, 9999] calendar range: nanos for year 9999 (~2.5e20) overflow a 64-bit BIGINT, the same | ||
| // reason the inverse `unix_nanos` returns DECIMAL(21, 0); an integral argument is widened to | ||
| // BigInteger directly. A DECIMAL with a non-zero scale is accepted as well: its fractional part | ||
| // is floored to whole nanoseconds at evaluation (`setScale(0, FLOOR)`), not rejected. | ||
| // FLOAT/DOUBLE/STRING are intentionally rejected at analysis rather than implicitly coerced: a | ||
| // floating-point or string nanosecond count is not meaningful, and the implicit DECIMAL | ||
| // coercion (FLOAT -> DECIMAL(14, 7), DOUBLE -> DECIMAL(30, 15)) would silently overflow for | ||
| // realistic magnitudes. | ||
| override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(IntegralType, DecimalType)) | ||
|
|
||
| override def dataType: DataType = TimestampLTZNanosType(9) | ||
|
|
||
| // Maps the integer nanosecond count to the (epochMicros, nanosWithinMicro) pair with floor | ||
| // semantics, so the sub-microsecond remainder is always in [0, 999] (matching the negative-input | ||
| // behavior of `floorDiv`/`floorMod`). When `epochMicros` overflows 64 bits -- i.e. the input is | ||
| // outside the representable timestamp range -- `longValueExact` throws, which is surfaced as a | ||
| // DATETIME_OVERFLOW error. | ||
| // | ||
| // Like the sibling `timestamp_micros`/`timestamp_millis`/`timestamp_seconds` constructors, the | ||
| // result is not validated against the [0001, 9999] calendar range: only the 64-bit `epochMicros` | ||
| // boundary is guarded, so a count whose `epochMicros` still fits in a long but lands past year | ||
| // 9999 (up to the long-micros maximum, ~year 294247) yields an out-of-range value rather than an | ||
| // error. This is intentional, keeping the nanosecond constructor consistent with its micro peers. | ||
| override def nullSafeEval(input: Any): Any = { | ||
| val n = child.dataType match { | ||
| case _: DecimalType => | ||
| input.asInstanceOf[Decimal].toJavaBigDecimal | ||
| .setScale(0, java.math.RoundingMode.FLOOR).toBigInteger | ||
| case _: IntegralType => | ||
| BigInteger.valueOf(input.asInstanceOf[Number].longValue()) | ||
| } | ||
| val thousand = BigInteger.valueOf(NANOS_PER_MICROS) | ||
| val rem = n.mod(thousand) | ||
| val micros = try { | ||
| n.subtract(rem).divide(thousand).longValueExact() | ||
| } catch { | ||
| case _: ArithmeticException => throw QueryExecutionErrors.timestampNanosOverflowError(n) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. One question here for my curiosity: the overflow guard only catches `epochMicros` not fitting in a 64-bit long, not the documented calendar range. This is consistent with `timestamp_micros` (which also does no calendar-range validation), so I'm wondering: is it intentional? Inputs whose `epochMicros` fits in a long but represents a year > 9999 (or < 0001) — up to ~year 294247 — silently produce an out-of-range `TimestampNanosVal`, since `fromParts` validates only `nanosWithinMicro`.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Intentional. It matches the sibling |
||
| } | ||
| TimestampNanosVal.fromParts(micros, rem.shortValueExact()) | ||
| } | ||
|
|
||
| override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { | ||
| nullSafeCodeGen(ctx, ev, c => { | ||
| val n = ctx.freshName("nanos") | ||
| val thousand = ctx.freshName("thousand") | ||
| val rem = ctx.freshName("rem") | ||
| val micros = ctx.freshName("micros") | ||
| val toBigInteger = child.dataType match { | ||
| case _: DecimalType => | ||
| s"$c.toJavaBigDecimal().setScale(0, java.math.RoundingMode.FLOOR).toBigInteger()" | ||
| case _: IntegralType => | ||
| s"java.math.BigInteger.valueOf((long) $c)" | ||
| } | ||
| val errors = QueryExecutionErrors.getClass.getName.stripSuffix("$") | ||
| s""" | ||
| |java.math.BigInteger $n = $toBigInteger; | ||
| |java.math.BigInteger $thousand = java.math.BigInteger.valueOf(${NANOS_PER_MICROS}L); | ||
| |java.math.BigInteger $rem = $n.mod($thousand); | ||
| |long $micros; | ||
| |try { | ||
| | $micros = $n.subtract($rem).divide($thousand).longValueExact(); | ||
| |} catch (java.lang.ArithmeticException e) { | ||
| | throw $errors.timestampNanosOverflowError($n); | ||
| |} | ||
| |${ev.value} = org.apache.spark.unsafe.types.TimestampNanosVal.fromParts( | ||
| | $micros, $rem.shortValueExact()); | ||
| |""".stripMargin | ||
| }) | ||
| } | ||
|
|
||
| override def prettyName: String = "timestamp_nanos" | ||
|
|
||
| override protected def withNewChildInternal(newChild: Expression): NanosToTimestamp = | ||
| copy(child = newChild) | ||
| } | ||
|
|
||
| abstract class TimestampToLongBase extends UnaryExpression | ||
| with ExpectsInputTypes { | ||
| override def nullIntolerant: Boolean = true | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1743,6 +1743,65 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { | |
| } | ||
| } | ||
|
|
||
| test("SPARK-57526: timestamp_nanos builds a TIMESTAMP_LTZ(9) from nanoseconds") { | ||
| import org.apache.spark.sql.catalyst.util.TimestampNanosTestUtils._ | ||
|
|
||
| // DECIMAL input is accepted as-is; a wide DECIMAL(38, 0) holds every input below. | ||
| def tsNanos(n: BigInt): NanosToTimestamp = | ||
| NanosToTimestamp(Literal.create(Decimal(BigDecimal(n), 38, 0), DecimalType(38, 0))) | ||
|
|
||
| assert(tsNanos(0).dataType === TimestampLTZNanosType(9)) | ||
|
|
||
| // The JIRA example: 1230219000123456789 ns -> 1230219000123456 micros + 789 ns. | ||
| checkEvaluation(tsNanos(BigInt("1230219000123456789")), nanosVal(1230219000123456L, 789)) | ||
|
|
||
| // An integral argument is accepted directly (widened to BigInteger), exercising the | ||
| // IntegralType eval/codegen path rather than the DECIMAL one. Cover every integral width | ||
| // (TINYINT/SMALLINT/INT/BIGINT) so the `(long)` codegen cast is checked for each. | ||
| checkEvaluation(NanosToTimestamp(Literal(2.toByte)), nanosVal(0L, 2)) | ||
| checkEvaluation(NanosToTimestamp(Literal(1000.toShort)), nanosVal(1L, 0)) | ||
| checkEvaluation(NanosToTimestamp(Literal(1000)), nanosVal(1L, 0)) | ||
| checkEvaluation( | ||
| NanosToTimestamp(Literal(1230219000123456789L)), nanosVal(1230219000123456L, 789)) | ||
| checkEvaluation(NanosToTimestamp(Literal(-1L)), nanosVal(-1L, 999)) | ||
|
|
||
| // FLOAT/DOUBLE/STRING are rejected at analysis: a fractional or string nanosecond count is not | ||
| // meaningful, and the implicit DECIMAL coercion would silently overflow for realistic values. | ||
| Seq(Literal(1.0f), Literal(1.0d), Literal("1")).foreach { lit => | ||
| val mismatch = NanosToTimestamp(lit).checkInputDataTypes().asInstanceOf[DataTypeMismatch] | ||
| assert(mismatch.errorSubClass == "UNEXPECTED_INPUT_TYPE") | ||
| } | ||
|
|
||
| // Pre-epoch / negative inputs use floor semantics, so nanosWithinMicro stays in [0, 999]: | ||
| // -1 ns floors to epochMicros = -1 with a 999 ns remainder. | ||
| checkEvaluation(tsNanos(BigInt(-1)), nanosVal(-1L, 999)) | ||
| checkEvaluation(tsNanos(BigInt(-1000)), nanosVal(-1L, 0)) | ||
| checkEvaluation(tsNanos(BigInt(-1500)), nanosVal(-2L, 500)) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The |
||
|
|
||
| // NULL input. | ||
| checkEvaluation( | ||
| NanosToTimestamp(Literal.create(null, DecimalType(38, 0))), null) | ||
|
|
||
| // Full [0001, 9999] range: a DECIMAL nanosecond count far beyond a 64-bit BIGINT decodes | ||
| // losslessly back to the original value (proving the function spans the whole calendar range). | ||
| Seq( | ||
| localDateTimeToNanosVal(timestampNTZ(9999, 12, 31, 23, 59, 59, 999999999)), | ||
| localDateTimeToNanosVal(timestampNTZ(1, 1, 1, 0, 0, 0, 1)) | ||
| ).foreach { v => | ||
| val n = BigInt(v.epochMicros) * NANOS_PER_MICROS + v.nanosWithinMicro.toInt | ||
| checkEvaluation(tsNanos(n), v) | ||
| // Round-trips with the inverse unix_nanos for the same full-range values. | ||
| checkEvaluation(UnixNanos(tsNanos(n)), Decimal(BigDecimal(n), 21, 0)) | ||
| } | ||
|
|
||
| // Out-of-range input: epochMicros overflows a 64-bit long, surfaced as DATETIME_OVERFLOW. | ||
| checkErrorInExpression[SparkArithmeticException]( | ||
| tsNanos(BigInt("10000000000000000000000000")), | ||
| condition = "DATETIME_OVERFLOW", | ||
| parameters = Map("operation" -> | ||
| "create a TIMESTAMP_LTZ(9) from 10000000000000000000000000 nanoseconds since the epoch")) | ||
| } | ||
|
|
||
| test("TIMESTAMP_SECONDS") { | ||
| def testIntegralFunc(value: Number): Unit = { | ||
| checkEvaluation( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`TypeCollection(IntegralType, DecimalType)` accepts a DECIMAL of any scale (no implicit coercion, since this mixes in `ExpectsInputTypes`), so `timestamp_nanos(CAST(x AS DECIMAL(p, s>0)))` passes analysis and the fraction is silently floored by `setScale(0, FLOOR)` below. The rationale comment justifies rejecting fractional FLOAT/DOUBLE as "not meaningful" — could we extend it to say a fractional DECIMAL is accepted and floored to whole nanoseconds, so the asymmetry is deliberate? (Note `timestamp_seconds`'s DECIMAL branch errors on sub-resolution input instead — a sentence on why nanos floors would help.)