apache · akshatshenoi-eng · May 29, 2026 · May 29, 2026
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -2695,6 +2695,16 @@ object SQLConf {
     .bytesConf(ByteUnit.BYTE)
     .createWithDefaultString("128MB") // parquet.block.size
 
+  // Opt-in switch for reading tar archives (.tar, .tar.gz, .tgz) through the CSV data source.
+  // Registered under the key "spark.sql.files.archive.enabled"; the default is false, so archive
+  // reading stays off unless a user enables it.
+  val ARCHIVE_FORMAT_ENABLED = buildConf("spark.sql.files.archive.enabled")
+    .doc("When true, the CSV data source can read tar archives (.tar, .tar.gz, .tgz): each " +
+      "archive is read as a single split and its entries are streamed through the CSV parser " +
+      "(never unpacked to disk), as if the entries were separate CSV files. Only the CSV data " +
+      "source supports reading archives.")
+    .version("4.3.0")
+    .withBindingPolicy(ConfigBindingPolicy.SESSION)
+    .booleanConf
+    .createWithDefault(false)
+
   val FILES_OPEN_COST_IN_BYTES = buildConf("spark.sql.files.openCostInBytes")
     .internal()
     .doc("The estimated cost to open a file, measured by the number of bytes could be scanned in" +

diff --git a/sql/core/pom.xml b/sql/core/pom.xml
@@ -293,6 +293,10 @@
       <artifactId>bcprov-jdk18on</artifactId>
       <scope>test</scope>
     </dependency>
+    <!-- Tar reading (TarArchiveInputStream) used by ArchiveReader in main code: no test scope. -->
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-compress</artifactId>
+    </dependency>
     <dependency>
       <groupId>org.bouncycastle</groupId>
       <artifactId>bcpkix-jdk18on</artifactId>

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ArchiveReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ArchiveReader.scala
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import java.io.{Closeable, FilterInputStream, InputStream}
+import java.util.Locale
+import java.util.zip.GZIPInputStream
+
+import scala.util.control.NonFatal
+
+import org.apache.commons.compress.archivers.tar.{TarArchiveEntry, TarArchiveInputStream}
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
+
+import org.apache.spark.TaskContext
+
+/**
+ * Streaming reader for tar archives: plain `.tar`, gzipped `.tar.gz`, and `.tgz`.
+ *
+ * The archive is opened once and decompressed/unpacked as a stream -- entries are never
+ * materialized to local disk. [[readEntries]] hands each entry's bytes to a caller-supplied
+ * parse function as a bounded [[InputStream]] and concatenates the per-entry results into a
+ * single iterator, advancing to the next entry only once the current one is fully consumed. At
+ * most one entry is in flight at a time, so memory stays bounded regardless of archive size.
+ *
+ * This is format-agnostic: a `FileFormat` whose per-file reader can consume an `InputStream`
+ * (e.g. CSV via `UnivocityParser`) wires up archive support by calling [[readEntries]] from its
+ * read/inference paths and supplying a `parseEntry` that turns one entry stream into rows (or
+ * tokens). Formats that need random access within a file (Parquet/ORC footers) cannot use this
+ * streaming path.
+ *
+ * Gzip handling: Hadoop's `CompressionCodecFactory` matches the trailing `.gz` extension and
+ * auto-decompresses `.tar.gz` via `CodecStreams`, so we just wrap that stream in
+ * `TarArchiveInputStream`. `.tgz` is not a registered Hadoop codec extension, so the gzip layer
+ * is unwrapped explicitly here.
+ */
+object ArchiveReader {
+
+  /**
+   * Tells whether `path` names a supported tar archive, judging only by its file name.
+   *
+   * @param path the file to classify; it is not opened
+   * @return true if the name ends, ignoring case, with `.tar`, `.tar.gz` or `.tgz`
+   */
+  def isArchivePath(path: Path): Boolean = {
+    val name = path.getName.toLowerCase(Locale.ROOT)
+    name.endsWith(".tar") || name.endsWith(".tar.gz") || name.endsWith(".tgz")
+  }
+
+  // Paths Hadoop's codec factory won't auto-decompress: we apply the gzip layer here.
+  private def needsExplicitGunzip(path: Path): Boolean =
+    path.getName.toLowerCase(Locale.ROOT).endsWith(".tgz")
+
+  /**
+   * True for entries that are never handed to the parser: directories, and entries whose
+   * basename (the part after the last `/`) starts with `.`.
+   */
+  private def shouldSkipEntry(entry: TarArchiveEntry): Boolean = {
+    def basename: String = {
+      val name = entry.getName
+      // `lastIndexOf` yields -1 when there is no '/', so the whole name is the basename.
+      name.substring(name.lastIndexOf('/') + 1)
+    }
+    entry.isDirectory || basename.startsWith(".")
+  }
+
+  /** Best-effort close: non-fatal failures raised while closing are deliberately ignored. */
+  private def closeQuietly(resource: Closeable): Unit =
+    try resource.close() catch { case NonFatal(_) => }
+
+  /**
+   * Opens `path` as a tar stream, transparently decompressing `.tar.gz` / `.tgz`.
+   *
+   * If wrapping the raw stream fails, the raw stream is closed before the error is rethrown so
+   * that it is not leaked.
+   */
+  private def openTarStream(conf: Configuration, path: Path): TarArchiveInputStream = {
+    val base = CodecStreams.createInputStreamWithCloseResource(conf, path)
+    try {
+      val tarBytes = if (needsExplicitGunzip(path)) new GZIPInputStream(base) else base
+      new TarArchiveInputStream(tarBytes)
+    } catch {
+      case NonFatal(e) =>
+        // `GZIPInputStream` reads the gzip header in its constructor and throws when the file is
+        // not gzip data; nothing else holds `base` at that point, so release it here.
+        closeQuietly(base)
+        throw e
+    }
+  }
+
+  /**
+   * A view over the shared tar stream that reads exactly the current entry's bytes
+   * (`TarArchiveInputStream.read` returns -1 at the entry boundary) and ignores `close()`, so a
+   * parser closing its input does not close the underlying archive. Any unread remainder of an
+   * entry is skipped by `getNextEntry()` when advancing.
+   */
+  private final class EntryInputStream(tar: TarArchiveInputStream)
+    extends FilterInputStream(tar) {
+    override def close(): Unit = ()
+  }
+
+  /**
+   * Streams `path` entry by entry, applying `parseEntry` to each non-skipped entry's
+   * `(name, stream)` and concatenating the results into a single iterator. Directories and OS
+   * sidecar dotfiles (basename starting with `.`, e.g. macOS `._x` / `.DS_Store`) are skipped.
+   *
+   * The next entry is opened only once the current entry's iterator is exhausted, so nothing is
+   * buffered to disk and at most one entry's bytes are read at a time. The archive stream is
+   * closed when the returned iterator is exhausted, when [[Closeable.close]] is called on it, and
+   * (defensively) on task completion. It is also closed if positioning on the first entry fails,
+   * because in that case the caller never receives an iterator it could close.
+   *
+   * @param path the archive to read
+   * @param conf Hadoop configuration used to open `path`
+   * @param parseEntry turns one entry's name and byte stream into that entry's results; the
+   *                   stream it receives ignores `close()`
+   * @tparam T the element type produced by `parseEntry`
+   * @return an iterator over all entries' results, in archive order, that is also a
+   *         [[Closeable]]
+   */
+  def readEntries[T](
+      path: Path,
+      conf: Configuration)(
+      parseEntry: (String, InputStream) => Iterator[T]): Iterator[T] = {
+    val tar = openTarStream(conf, path)
+    var closed = false
+
+    // Idempotent: exhaustion, an explicit close() and the task listener may all reach this.
+    def cleanup(): Unit = {
+      if (!closed) {
+        closed = true
+        closeQuietly(tar)
+      }
+    }
+
+    Option(TaskContext.get()).foreach(_.addTaskCompletionListener[Unit](_ => cleanup()))
+
+    new Iterator[T] with Closeable {
+      private var currentIter: Iterator[T] = Iterator.empty
+      private var done = false
+
+      // Closes the current entry's iterator when it is Closeable, then forgets it so that it can
+      // never be closed a second time (e.g. when a later `getNextEntry` fails and is retried).
+      private def releaseCurrent(): Unit = {
+        currentIter match {
+          case c: Closeable => closeQuietly(c)
+          case _ =>
+        }
+        currentIter = Iterator.empty
+      }
+
+      // Move to the next entry whose iterator has elements (releasing each exhausted entry's
+      // reader and skipping any unread bytes), or mark the stream done once entries run out.
+      // Advancing here -- driven by `hasNext` -- rather than eagerly after producing a row in
+      // `next` is essential for parsers like `UnivocityParser` that reuse a single mutable row and
+      // look ahead on `hasNext`: probing the current entry right after returning a row would
+      // overwrite that row's contents before the caller has copied it.
+      private def advance(): Unit = {
+        while (!done && !currentIter.hasNext) {
+          releaseCurrent()
+          var entry = tar.getNextEntry
+          while (entry != null && shouldSkipEntry(entry)) entry = tar.getNextEntry
+          if (entry == null) {
+            done = true
+            cleanup()
+          } else {
+            currentIter = parseEntry(entry.getName, new EntryInputStream(tar))
+          }
+        }
+      }
+
+      // Open the first entry eagerly so construction reflects the archive's first entry. A
+      // failure here escapes the constructor, so release everything before rethrowing.
+      try advance() catch {
+        case NonFatal(e) =>
+          releaseCurrent()
+          cleanup()
+          throw e
+      }
+
+      override def hasNext: Boolean = {
+        advance()
+        !done && currentIter.hasNext
+      }
+
+      override def next(): T = {
+        if (!hasNext) throw new NoSuchElementException
+        currentIter.next()
+      }
+
+      // Stops iteration early: releases the in-flight entry's reader first, then the archive.
+      override def close(): Unit = {
+        done = true
+        releaseCurrent()
+        cleanup()
+      }
+    }
+  }
+}