-
Notifications
You must be signed in to change notification settings - Fork 86
#805 Enable the option to cache VRL indexes for better performance when same files are processed multiple times #806
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -34,6 +34,7 @@ import za.co.absa.cobrix.spark.cobol.source.streaming.FileStreamer | |
| import za.co.absa.cobrix.spark.cobol.source.types.FileWithOrder | ||
| import za.co.absa.cobrix.spark.cobol.utils.{HDFSUtils, SparkUtils} | ||
|
|
||
| import java.util.concurrent.ConcurrentHashMap | ||
| import scala.collection.mutable.ArrayBuffer | ||
|
|
||
| /** | ||
|
|
@@ -45,18 +46,24 @@ import scala.collection.mutable.ArrayBuffer | |
| * In a nutshell, ideally, there will be as many partitions as are there are indexes. | ||
| */ | ||
| private[cobol] object IndexBuilder extends Logging { | ||
| private[cobol] val indexCache = new ConcurrentHashMap[String, Array[SparseIndexEntry]]() | ||
|
|
||
| def buildIndex(filesList: Array[FileWithOrder], | ||
| cobolReader: Reader, | ||
| sqlContext: SQLContext) | ||
| sqlContext: SQLContext, | ||
| cachingAllowed: Boolean) | ||
| (localityParams: LocalityParameters): RDD[SparseIndexEntry] = { | ||
| val fs = new Path(filesList.head.filePath).getFileSystem(sqlContext.sparkSession.sparkContext.hadoopConfiguration) | ||
|
|
||
| cobolReader match { | ||
| case reader: VarLenReader if reader.isIndexGenerationNeeded && localityParams.improveLocality && isDataLocalitySupported(fs) => | ||
| logger.info("Building indexes with data locality...") | ||
| buildIndexForVarLenReaderWithFullLocality(filesList, reader, sqlContext, localityParams.optimizeAllocation) | ||
| case reader: VarLenReader => | ||
| buildIndexForVarLenReader(filesList, reader, sqlContext) | ||
| logger.info("Building indexes for variable record length input files...") | ||
| buildIndexForVarLenReader(filesList, reader, sqlContext, cachingAllowed) | ||
| case _ => | ||
| logger.info("Generating indexes for full files...") | ||
| buildIndexForFullFiles(filesList, sqlContext) | ||
| } | ||
| } | ||
|
|
@@ -112,24 +119,58 @@ private[cobol] object IndexBuilder extends Logging { | |
| */ | ||
| private[cobol] def buildIndexForVarLenReader(filesList: Array[FileWithOrder], | ||
| reader: VarLenReader, | ||
| sqlContext: SQLContext): RDD[SparseIndexEntry] = { | ||
| sqlContext: SQLContext, | ||
| cachingAllowed: Boolean): RDD[SparseIndexEntry] = { | ||
| val conf = sqlContext.sparkContext.hadoopConfiguration | ||
| val sconf = new SerializableConfiguration(conf) | ||
|
|
||
| if (reader.getReaderProperties.enableSelfChecks && filesList.nonEmpty) { | ||
| selfCheckForIndexCompatibility(reader, filesList.head.filePath, conf) | ||
| // Splitting between files for which indexes are cached and the list of files for which indexes are not cached | ||
| val cachedFiles = if (cachingAllowed) { | ||
| filesList.filter(f => indexCache.containsKey(f.filePath)) | ||
| } else { | ||
| Array.empty[FileWithOrder] | ||
| } | ||
|
|
||
| val filesRDD = sqlContext.sparkContext.parallelize(filesList, filesList.length) | ||
| val nonCachedFiles = filesList.diff(cachedFiles) | ||
|
|
||
| val indexRDD = filesRDD.mapPartitions( | ||
| partition => { | ||
| partition.flatMap(row => { | ||
| generateIndexEntry(row, sconf.value, reader) | ||
| }) | ||
| }).cache() | ||
| // Getting indexes for files for which indexes are not in the cache | ||
| val newIndexes = if (nonCachedFiles.length > 0) { | ||
| if (reader.getReaderProperties.enableSelfChecks) { | ||
| selfCheckForIndexCompatibility(reader, nonCachedFiles.head.filePath, conf) | ||
| } | ||
|
|
||
| repartitionIndexes(indexRDD) | ||
| val filesRDD = sqlContext.sparkContext.parallelize(nonCachedFiles, nonCachedFiles.length) | ||
| filesRDD.mapPartitions( | ||
| partition => { | ||
| partition.flatMap(row => { | ||
| generateIndexEntry(row, sconf.value, reader) | ||
| }) | ||
| }).collect() | ||
| } else { | ||
| Array.empty[SparseIndexEntry] | ||
| } | ||
|
|
||
| // Storing new indexes in the cache | ||
| if (cachingAllowed && newIndexes.length > 0) { | ||
| newIndexes.groupBy(_.fileId).foreach { case (fileId, indexEntries) => | ||
| val filePathOpt = filesList.find(_.order == fileId).map(_.filePath) | ||
|
|
||
| filePathOpt.foreach { filePath => | ||
| logger.info(s"Index stored to cache for file: $filePath.") | ||
| indexCache.put(filePath, indexEntries.sortBy(_.offsetFrom)) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // Getting indexes for files for which indexes are in the cache | ||
| val cachedIndexes = cachedFiles.flatMap { f => | ||
| logger.info("Index fetched from cache for file: " + f.filePath) | ||
| indexCache.get(f.filePath) | ||
| .map(ind => ind.copy(fileId = f.order)) | ||
| } | ||
|
|
||
| // Creating the final RDD with all indexes | ||
| createIndexRDD(cachedIndexes ++ newIndexes, sqlContext) | ||
| } | ||
|
Comment on lines
+127
to
174
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

🧩 Analysis chain: Verify that the cache key includes file modification tracking. The caching logic filters files using only the file path as the cache key. Run the following script to check whether file modification time is tracked elsewhere in the codebase. If modification time is not tracked, the cache key should be changed from just the file path to include modification metadata (e.g., the file's modification timestamp).

🏁 Script executed: #!/bin/bash
# Search for file modification time tracking in the codebase
rg -n "getModificationTime|lastModified|FileStatus" --type scala -C 3

Length of output: 8054

Now I'll examine the IndexBuilder.scala file to understand the cache structure and identify whether modification time is tracked:

🏁 Script executed: #!/bin/bash
# Examine the IndexBuilder.scala file around the cache logic
cat spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/index/IndexBuilder.scala | head -180 | tail -80

Length of output: 3010

🏁 Script executed: #!/bin/bash
# Find FileWithOrder class definition
fd -t f "FileWithOrder" spark-cobol/src/main/scala/

Length of output: 89

🏁 Script executed: #!/bin/bash
# Examine FileWithOrder class definition
cat spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/types/FileWithOrder.scala

Length of output: 778

🏁 Script executed: #!/bin/bash
# Find indexCache declaration and initialization
rg -n "indexCache" spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/index/IndexBuilder.scala -B 5 -A 2 | head -50

Length of output: 1494

🏁 Script executed: #!/bin/bash
# Search for any file validation or modification checks in the index building process
rg -n "getModificationTime|lastModified|getLen|FileStatus" spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/index/ -C 3

Length of output: 867

Add file modification time tracking to the cache key to prevent stale index usage. The cache uses only the file path (`f.filePath`) as the key, so a file that is modified between runs would incorrectly reuse its stale cached index. Suggested fix:

// Line 129 - change from:
filesList.filter(f => indexCache.containsKey(f.filePath))
// to:
filesList.filter(f => {
val fileStatus = fileSystem.getFileStatus(new Path(f.filePath))
val cacheKey = s"${f.filePath}_${fileStatus.getModificationTime}"
indexCache.containsKey(cacheKey)
})
// Line 160 - update cache storage to use same key format:
val fileStatus = fileSystem.getFileStatus(new Path(filePath))
val cacheKey = s"${filePath}_${fileStatus.getModificationTime}"
indexCache.put(cacheKey, indexEntries.sortBy(_.offsetFrom))
// Line 168 - update cache retrieval to use same key format:
val fileStatus = fileSystem.getFileStatus(new Path(f.filePath))
val cacheKey = s"${f.filePath}_${fileStatus.getModificationTime}"
indexCache.get(cacheKey)

Alternatively, store the modification time alongside the cached data and validate it on retrieval.

🤖 Prompt for AI Agents |
||
|
|
||
| /** | ||
|
|
@@ -336,4 +377,13 @@ private[cobol] object IndexBuilder extends Logging { | |
| logger.info(s"Index elements count: $indexCount, number of partitions = $numPartitions") | ||
| indexRDD.repartition(numPartitions).cache() | ||
| } | ||
|
|
||
| private def createIndexRDD(indexes: Array[SparseIndexEntry], sqlContext: SQLContext): RDD[SparseIndexEntry] = { | ||
| val indexCount = indexes.length | ||
|
|
||
| val numPartitions = Math.max(1, Math.min(indexCount, Constants.maxNumPartitions)) | ||
| logger.info(s"Index elements count: ${indexes.length}, number of partitions = $numPartitions") | ||
|
|
||
| sqlContext.sparkContext.parallelize(indexes, numPartitions) | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.