@@ -159,14 +159,16 @@ def decompress_portfolio(self, portfolio, max_workers=None):
159159 output_dir .mkdir (exist_ok = True )
160160
161161 # Get all files for this accession
162- accession_files = [m for m in tar .getmembers ()
163- if m .name .startswith (f'{ accession_dir } /' ) and m .isfile ()]
162+ accession_files = [
163+ m .name for m in tar .getmembers ()
164+ if m .name .startswith (f'{ accession_dir } /' ) and m .isfile ()
165+ ]
164166
165167 # Parallel file extraction
166168 with ThreadPoolExecutor (max_workers = max_workers ) as executor :
167169 file_futures = [
168- executor .submit (self ._extract_file , member , tar , accession_dir , output_dir )
169- for member in accession_files
170+ executor .submit (self ._extract_file , member_name , batch_tar , accession_dir , output_dir )
171+ for member_name in accession_files
170172 ]
171173
172174 # Wait for all files to be processed
@@ -209,12 +211,17 @@ def _process_document(self, doc, compression, threshold, compression_level):
209211
210212 return content , compression_type
211213
212- def _extract_file (self , member , tar , accession_dir , output_dir ):
214+ def _extract_file (self , member_name , batch_tar_path , accession_dir , output_dir ):
213215 """Extract and decompress a single file from tar."""
214- relative_path = member . name [len (accession_dir )+ 1 :] # Remove accession prefix
216+ relative_path = member_name [len (accession_dir )+ 1 :] # Remove accession prefix
215217 output_path = output_dir / relative_path
216218
217- content = tar .extractfile (member ).read ()
219+ # Open a separate TarFile handle on every call (one per extracted file, not one per thread) so worker threads never read concurrently from a shared handle.
220+ with tarfile .open (batch_tar_path , 'r' ) as tar :
221+ extracted = tar .extractfile (member_name )
222+ if extracted is None :
223+ return
224+ content = extracted .read ()
218225
219226 # Handle decompression based on filename
220227 if relative_path .endswith ('.gz' ):
@@ -285,4 +292,4 @@ def _write_submission_to_tar(self, tar_handle, submission, documents, compressio
285292
286293 tarinfo = tarfile .TarInfo (name = f'{ accession_prefix } /{ filename } ' )
287294 tarinfo .size = len (content )
288- tar_handle .addfile (tarinfo , io .BytesIO (content ))
295+ tar_handle .addfile (tarinfo , io .BytesIO (content ))
0 commit comments