diff --git a/src/main/java/org/jadice/filetype/Analyzer.java b/src/main/java/org/jadice/filetype/Analyzer.java index 3eb7018..464923e 100644 --- a/src/main/java/org/jadice/filetype/Analyzer.java +++ b/src/main/java/org/jadice/filetype/Analyzer.java @@ -228,17 +228,22 @@ public Map analyze(final InputStream sis, final AnalysisListener public Map analyze(final InputStream is, final AnalysisListener listener, final String fileName) throws IOException { Map result = new HashMap<>(); - - - // POI (3.1-Final) closes the stream during analyszs of office files - use an uncloseable stream wrapper - final UncloseableInputStream uis = new UncloseableInputStream(is); - final UncloseableSeekableInputStreamWrapper usis = new UncloseableSeekableInputStreamWrapper(new MemoryInputStream(uis)); - usis.lockClose(); // and don't unlock later as POI attempts to close asynchronously! - + final AnalysisListener effectiveListener = listener != null ? listener : DEFAULT_LISTENER; + // POI may close streams during analysis; shield callers by using an uncloseable, seekable wrapper. + // If the input is already seekable, avoid buffering the full stream in memory. + final SeekableInputStream baseStream; + if (is instanceof SeekableInputStream) { + baseStream = (SeekableInputStream) is; + } else { + final UncloseableInputStream uis = new UncloseableInputStream(is); + baseStream = new MemoryInputStream(uis); + } + final UncloseableSeekableInputStreamWrapper usis = new UncloseableSeekableInputStreamWrapper(baseStream); + usis.lockClose(); // do not unlock later as POI may attempt to close asynchronously final String sanitizedFileName = fileName != null ? fileName.replaceAll("[:\\\\/*?|<>]", "_") : null; String extension = FilenameUtils.getExtension(sanitizedFileName); - Context ctx = new Context(usis, result, listener, locale, extension); + Context ctx = new Context(usis, result, effectiveListener, locale, extension); database.analyze(ctx); @@ -267,12 +272,13 @@ public Map analyze(final File file, final AnalysisListener liste SeekableInputStream sis = new RandomAccessFileInputStream(file); try { String fileName = file.getName(); - return analyze(sis, null, fileName); + return analyze(sis, listener, fileName); } finally { try { sis.close(); } catch (IOException e) { - listener.error(this, "Exception closing RandomAccessFileInputStream", e); + final AnalysisListener effectiveListener = listener != null ? listener : DEFAULT_LISTENER; + effectiveListener.error(this, "Exception closing RandomAccessFileInputStream", e); } } } @@ -289,13 +295,13 @@ public Map analyze(final SeekableInputStream sis) throws IOExcep } - public Map analyzeWithFilename(final SeekableInputStream sis,final String fileName) throws IOException { + public Map analyzeWithFilename(final SeekableInputStream sis, final String fileName) throws IOException { return analyze(sis, DEFAULT_LISTENER, fileName); } /** * Analyze the stream supplied via an {@link InputStream}.
- * Caveat: the data will be buffered in memory. If you don't like this, supply a + * Caveat: non-seekable streams may be buffered in memory. If you don't like this, supply a * {@link SeekableInputStream} implementation or a {@link File} instead. * * @param is diff --git a/src/main/java/org/jadice/filetype/matchers/OfficeOpenXMLMatcher.java b/src/main/java/org/jadice/filetype/matchers/OfficeOpenXMLMatcher.java index 9e1a9f6..301cf2d 100644 --- a/src/main/java/org/jadice/filetype/matchers/OfficeOpenXMLMatcher.java +++ b/src/main/java/org/jadice/filetype/matchers/OfficeOpenXMLMatcher.java @@ -237,19 +237,13 @@ public boolean matches(final Context context) { SeekableInputStream sis = context.getStream(); try { sis.seek(0); - - ZipFile archive = ZipUtil.createZipFile(sis); - try { - detect(context, archive); - } finally { - archive.close(); - try { - Files.delete(archive.getFile().toPath()); - } catch (IOException ioe) { - LOGGER.debug("failed to delete temporary zip file", ioe); - } - } - + ZipFile archive = ZipUtil.createZipFile(sis); + try { + detect(context, archive); + } finally { + // Close releases file handles; any temp cleanup is handled by the ZipUtil ZipFile implementation. + archive.close(); + } return context.getProperty(MimeTypeAction.KEY) != null; } catch (IOException e) { context.error(this, "Exception analyzing Office Open XML Container", e); @@ -486,23 +480,39 @@ private InputStream getSafeInputStream(String fileName, final ZipFile archive) t if (fileName.startsWith("/")) { fileName = fileName.substring(1); } + // Support both entry layouts: root entries and entries prefixed with "/". + final String prefixed = archive.getFile().getName() + File.separator + fileName; + InputStream direct = tryGetSingleEntry(fileName, archive); + if (direct != null) { + return direct; + } + InputStream prefixedDirect = tryGetSingleEntry(prefixed, archive); + if (prefixedDirect != null) { + return prefixedDirect; + } + InputStream pieces = tryGetPieceStream(fileName, archive); + if (pieces != null) { + return pieces; + } + return tryGetPieceStream(prefixed, archive); + } - // consider the uuid directory name - fileName = archive.getFile().getName() + File.separator + fileName; - - final FileHeader entry = archive.getFileHeader(fileName); + private InputStream tryGetSingleEntry(final String name, final ZipFile archive) throws IOException { + final FileHeader entry = archive.getFileHeader(name); if (entry != null && !entry.isDirectory()) { - LOGGER.debug("Get '{}' from 1 piece", fileName); + LOGGER.debug("Get '{}' from 1 piece", name); return archive.getInputStream(entry); } + return null; + } - // try directory browsing: + private InputStream tryGetPieceStream(final String baseName, final ZipFile archive) throws IOException { // Assemble stream from "[0].piece"..."[$n].last.piece"; // see Office Open XML, Part 2: Open Packaging Conventions, sec 9.1.3.1 Logical Item Names List streams = new LinkedList<>(); int i = 0; FileHeader piece; - while ((piece = archive.getFileHeader(fileName + "/[" + i + "].piece")) != null) { + while ((piece = archive.getFileHeader(baseName + "/[" + i + "].piece")) != null) { final InputStream is = archive.getInputStream(piece); if (is == null) { break; @@ -512,7 +522,7 @@ private InputStream getSafeInputStream(String fileName, final ZipFile archive) t i++; } - final FileHeader last = archive.getFileHeader(fileName + "/[" + i + "].last.piece"); + final FileHeader last = archive.getFileHeader(baseName + "/[" + i + "].last.piece"); if (last == null) { return null; } @@ -523,7 +533,7 @@ private InputStream getSafeInputStream(String fileName, final ZipFile archive) t } streams.add(is); - LOGGER.debug("Get '{}' from {} pieces", fileName, streams.size()); + LOGGER.debug("Get '{}' from {} pieces", baseName, streams.size()); return new SequenceInputStream(Collections.enumeration(streams)); } } diff --git a/src/main/java/org/jadice/filetype/ziputil/ZipUtil.java b/src/main/java/org/jadice/filetype/ziputil/ZipUtil.java index 628d9f0..678468a 100644 --- a/src/main/java/org/jadice/filetype/ziputil/ZipUtil.java +++ b/src/main/java/org/jadice/filetype/ziputil/ZipUtil.java @@ -4,15 +4,12 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; -import java.util.ArrayList; -import java.util.List; -import java.util.UUID; +import java.nio.file.Files; +import java.nio.file.Path; import org.jadice.filetype.io.SeekableInputStream; import net.lingala.zip4j.ZipFile; -import net.lingala.zip4j.io.inputstream.ZipInputStream; -import net.lingala.zip4j.model.LocalFileHeader; public class ZipUtil { @@ -22,35 +19,41 @@ private ZipUtil() { // utility class } + /** Caller must close the returned {@link ZipFile} so the temporary ZIP file can be deleted. */ public static ZipFile createZipFile(SeekableInputStream sis) throws IOException { + final long fp = sis.getStreamPosition(); + final Path baseDir = TEMP_DIRECTORY.toPath(); + final Path tmpZip = Files.createTempFile(baseDir, "jadice-filetype-", ".zip"); + tmpZip.toFile().deleteOnExit(); + try (OutputStream os = new FileOutputStream(tmpZip.toFile())) { + final byte[] buffer = new byte[128 * 1024]; + int read; + while ((read = sis.read(buffer)) != -1) { + os.write(buffer, 0, read); + } + } finally { + sis.seek(fp); + } + return new AutoDeletingZipFile(tmpZip.toFile()); + } - final UUID uuid = UUID.randomUUID(); - final File tmpDir = new File(TEMP_DIRECTORY + File.separator + uuid); - - long fp = sis.getStreamPosition(); - LocalFileHeader localFileHeader; - int readLen; - byte[] readBuffer = new byte[4096]; - - try (ZipInputStream zipInputStream = new ZipInputStream(sis); ZipFile zipFile = new ZipFile(uuid.toString())) { - List files = new ArrayList<>(); - while ((localFileHeader = zipInputStream.getNextEntry()) != null) { - if (!localFileHeader.isDirectory()) { - final File extractedFile = new File( - tmpDir.getAbsolutePath() + File.separator + localFileHeader.getFileName()); - File parentFolder = new File(extractedFile.getParent()); - parentFolder.mkdirs(); - try (OutputStream outputStream = new FileOutputStream(extractedFile)) { - while ((readLen = zipInputStream.read(readBuffer)) != -1) { - outputStream.write(readBuffer, 0, readLen); - } - } - files.add(extractedFile); + private static final class AutoDeletingZipFile extends ZipFile { + private final File tmpFile; + private AutoDeletingZipFile(File tmpFile) { + super(tmpFile); + this.tmpFile = tmpFile; + } + @Override + public void close() throws IOException { + try { + super.close(); + } finally { + try { + Files.deleteIfExists(tmpFile.toPath()); + } catch (IOException ignore) { + // best-effort cleanup } } - sis.seek(fp); - zipFile.addFolder(tmpDir); - return zipFile; } } }