From ee0dc1ba0ba038eb33d7d3ca3d092e864f1ed337 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Tue, 14 Apr 2026 15:15:38 +0200 Subject: [PATCH 01/11] :sparkles: add support for crop & split file operations for v2 --- src/Mindee/Extraction/ImageExtractor.cs | 172 +++++++++++++++++++++ src/Mindee/Extraction/PdfExtractor.cs | 114 ++++++++++++++ src/Mindee/Image/ExtractedImage.cs | 36 +++-- src/Mindee/V1/Extraction/ImageExtractor.cs | 152 ++---------------- src/Mindee/V1/Extraction/PdfExtractor.cs | 89 +---------- src/Mindee/V2/FileOperations/Crop.cs | 55 +++++++ src/Mindee/V2/FileOperations/CropFiles.cs | 34 ++++ src/Mindee/V2/FileOperations/Split.cs | 64 ++++++++ src/Mindee/V2/FileOperations/SplitFiles.cs | 33 ++++ 9 files changed, 514 insertions(+), 235 deletions(-) create mode 100644 src/Mindee/Extraction/ImageExtractor.cs create mode 100644 src/Mindee/Extraction/PdfExtractor.cs create mode 100644 src/Mindee/V2/FileOperations/Crop.cs create mode 100644 src/Mindee/V2/FileOperations/CropFiles.cs create mode 100644 src/Mindee/V2/FileOperations/Split.cs create mode 100644 src/Mindee/V2/FileOperations/SplitFiles.cs diff --git a/src/Mindee/Extraction/ImageExtractor.cs b/src/Mindee/Extraction/ImageExtractor.cs new file mode 100644 index 00000000..d9f52e73 --- /dev/null +++ b/src/Mindee/Extraction/ImageExtractor.cs @@ -0,0 +1,172 @@ +using System; +using System.Collections.Generic; +using System.IO; +using Docnet.Core; +using Docnet.Core.Models; +using Mindee.Geometry; +using Mindee.Image; +using Mindee.Input; +using SkiaSharp; + +namespace Mindee.Extraction +{ + /// + /// Extract sub-images from an image. + /// + public class ImageExtractor + { + /// + /// Name of the file. + /// + protected readonly string _filename; + /// + /// List of SKBitmap representing the pages of the file. + /// + private readonly List _pageImages; + /// + /// Format to save the resulting images as. + /// + protected readonly string SaveFormat; + + /// + /// LocalInputSource object used by the ImageExtractor. + /// + public readonly LocalInputSource LocalInput; + + /// + /// Init from a Local Input Source. + /// + /// Locally loaded resource. + /// Format to save the resulting images as. + public ImageExtractor(LocalInputSource localInput, string saveFormat = null) + { + _filename = localInput.Filename; + _pageImages = []; + LocalInput = localInput; + if (saveFormat == null) + { + var extension = Path.GetExtension(localInput.Filename)?.Substring(1); + if (extension != null && !extension.Equals("pdf", StringComparison.CurrentCultureIgnoreCase)) + { + SaveFormat = extension; + } + else + { + SaveFormat = "jpg"; + } + } + else + { + SaveFormat = saveFormat; + } + + if (localInput.IsPdf()) + { + var pdfPageImages = PdfToImages(localInput.FileBytes); + _pageImages.AddRange(pdfPageImages); + } + else + { + _pageImages.Add(SKBitmap.Decode(localInput.FileBytes)); + } + } + + /// + /// Init from a path. + /// + /// Path to the file. + public ImageExtractor(string filePath) : this(new LocalInputSource(filePath)) + { + } + + /// + /// Renders the input Pdf's pages as individual images. + /// + /// Input pdf. + /// A list of pages, as SKBitmap. + private static List PdfToImages(byte[] fileBytes) + { + var images = new List(); + lock (DocLib.Instance) + { + using var docReader = DocLib.Instance.GetDocReader(fileBytes, new PageDimensions(1)); + for (var i = 0; i < docReader.GetPageCount(); i++) + { + using var pageReader = docReader.GetPageReader(i); + var width = pageReader.GetPageWidth(); + var height = pageReader.GetPageHeight(); + var bytes = pageReader.GetImage(); + var bmp = ImageUtils.ArrayToImage(ImageUtils.ConvertTo3DArray(bytes, width, height)); + images.Add(bmp); + } + + return images; + } + } + + /// + /// Splits the filename into name and extension. + /// + protected static string[] SplitNameStrict(string filename) + { + return + [ + Path.GetFileNameWithoutExtension(filename), + Path.GetExtension(filename).TrimStart('.') + ]; + } + + /// + /// Gets the number of pages in the file. + /// + /// The number of pages in the file. + public int GetPageCount() + { + return _pageImages.Count; + } + + /// + /// Extracts a single image from a field having position data. + /// + /// Bounding box of the field. + /// Index of the page containing the field. + /// Extracted image as an SKBitmap. + protected SKBitmap ExtractImage(Bbox bbox, int pageIndex) + { + var image = _pageImages[pageIndex]; + var width = image.Width; + var height = image.Height; + var minX = (int)Math.Round(bbox.MinX * width); + var maxX = (int)Math.Round(bbox.MaxX * width); + var minY = (int)Math.Round(bbox.MinY * height); + var maxY = (int)Math.Round(bbox.MaxY * height); + + var croppedBitmap = new SKBitmap(maxX - minX, maxY - minY); + using var canvas = new SKCanvas(croppedBitmap); + var destRect = new SKRect(0, 0, croppedBitmap.Width, croppedBitmap.Height); + var sourceRect = new SKRect(minX, minY, maxX, maxY); + canvas.DrawBitmap(image, sourceRect, destRect); + + return croppedBitmap; + } + + /// + /// Extracts multiple images from a field having position data. + /// + /// The page index to extract, begins at 0. + /// The list of polygons representing the position data. + /// A list of extracted images. + public List ExtractMultipleImagesFromSource(int pageId, List polygons) + { + var filename = this.LocalInput.Filename; + var extractedImages = new List(); + foreach (var polygon in polygons) + { + var bbox = Utils.BboxFromPolygon(polygon); + var fieldFilename = $"{filename}_{pageId:D3}_{polygons.IndexOf(polygon):D3}.{SaveFormat}"; + extractedImages.Add(new ExtractedImage(ExtractImage(bbox, pageId), fieldFilename, SaveFormat)); + } + return extractedImages; + } + } +} diff --git a/src/Mindee/Extraction/PdfExtractor.cs b/src/Mindee/Extraction/PdfExtractor.cs new file mode 100644 index 00000000..a1408ac0 --- /dev/null +++ b/src/Mindee/Extraction/PdfExtractor.cs @@ -0,0 +1,114 @@ +using System; +using System.Collections.Generic; +using System.IO; +using Docnet.Core; +using Microsoft.Extensions.Logging.Abstractions; +using Mindee.Input; +using Mindee.Pdf; +using SkiaSharp; + +namespace Mindee.Extraction +{ + /// + /// PDF extraction class. + /// + public class PdfExtractor + { + /// + /// Local input source. + /// + protected readonly LocalInputSource LocalInput; + + /// + /// Source PDF bytes. + /// + protected byte[] SourcePdf; + + /// + /// Initializes a new instance of the class. + /// + /// Instance of a LocalInputSource, provided by the user. + public PdfExtractor(LocalInputSource localInput) + { + LocalInput = localInput; + } + + /// + /// Wrapper for PDF GetPageCount(); + /// + /// The number of pages in the file. + public int GetPageCount() + { + return LocalInput.GetPageCount(); + } + + /// + /// Extract the PDF bytes. + /// + /// + protected byte[] PdfBytes() + { + if (SourcePdf != null) + { + return this.SourcePdf; + } + if (LocalInput.IsPdf()) + { + SourcePdf = LocalInput.FileBytes; + } + else + { + var memoryStream = new MemoryStream(); + using var image = SKImage.FromEncodedData(LocalInput.FileBytes); + using var bmp = SKBitmap.FromImage(image); + var pageSize = new SKSize(bmp.Width, bmp.Height); + using (var document = SKDocument.CreatePdf(memoryStream)) + { + var canvas = document.BeginPage(pageSize.Width, pageSize.Height); + canvas.DrawBitmap(bmp, SKPoint.Empty); + document.EndPage(); + } + + SourcePdf = memoryStream.ToArray(); + } + + return SourcePdf; + } + + /// + /// Extracts sub-documents from the source document using list of page indexes. + /// + /// List of sub-lists of pages to keep. + /// Extracted documents. + /// + public List ExtractSubDocuments(List> pageIndexes) + { + var extractedPdfs = new List(); + + foreach (var pageIndexElem in pageIndexes) + { + if (pageIndexElem.Count == 0) + { + throw new ArgumentException("Empty indexes not allowed for extraction."); + } + + var extension = Path.GetExtension(LocalInput.Filename); + var prefix = Path.GetFileNameWithoutExtension(LocalInput.Filename); + var fieldFilename = + $"{prefix}_{pageIndexElem[0] + 1:D3}-{pageIndexElem[pageIndexElem.Count - 1] + 1:D3}{extension}"; + + var splitQuery = new SplitQuery( + PdfBytes(), + new PageOptions(pageIndexElem.ConvertAll(item => (short)item).ToArray())); + lock (DocLib.Instance) + { + var pdfOperation = new DocNetApi(new NullLogger()); + var mergedPdfBytes = pdfOperation.Split(splitQuery).File; + extractedPdfs.Add(new ExtractedPdf(mergedPdfBytes, fieldFilename)); + } + } + + return extractedPdfs; + } + } +} diff --git a/src/Mindee/Image/ExtractedImage.cs b/src/Mindee/Image/ExtractedImage.cs index d71c4879..bee8cd7c 100644 --- a/src/Mindee/Image/ExtractedImage.cs +++ b/src/Mindee/Image/ExtractedImage.cs @@ -42,17 +42,24 @@ public ExtractedImage(SKBitmap image, string filename, string saveFormat) /// Uses the default image format and filename. /// /// The output directory (must exist). - public void WriteToFile(string outputPath) + /// + public void WriteToFile(string outputPath, string fileFormat = null) { - var imagePath = Path.Combine(outputPath, Filename); - var format = GetEncodedImageFormat(_saveFormat); + var targetFormat = fileFormat ?? _saveFormat; + var format = GetEncodedImageFormat(targetFormat); - using (var image = SKImage.FromBitmap(Image)) - using (var data = image.Encode(format, 100)) - using (var stream = File.OpenWrite(imagePath)) + var finalFilename = Filename; + if (!string.IsNullOrWhiteSpace(fileFormat)) { - data.SaveTo(stream); + var nameWithoutExtension = Path.GetFileNameWithoutExtension(Filename); + finalFilename = $"{nameWithoutExtension}.{targetFormat.ToLower()}"; } + var imagePath = Path.Combine(outputPath, finalFilename); + + using var image = SKImage.FromBitmap(Image); + using var data = image.Encode(format, 100); + using var stream = File.OpenWrite(imagePath); + data.SaveTo(stream); } /// @@ -61,19 +68,18 @@ public void WriteToFile(string outputPath) /// An instance of . public LocalInputSource AsInputSource() { - using (var image = SKImage.FromBitmap(Image)) - using (var data = image.Encode(GetEncodedImageFormat(_saveFormat), 100)) - using (var output = new MemoryStream()) - { - data.SaveTo(output); - return new LocalInputSource(output.ToArray(), Filename); - } + using var image = SKImage.FromBitmap(Image); + using var data = image.Encode(GetEncodedImageFormat(_saveFormat), 100); + using var output = new MemoryStream(); + data.SaveTo(output); + return new LocalInputSource(output.ToArray(), Filename); } - private SKEncodedImageFormat GetEncodedImageFormat(string saveFormat) + private static SKEncodedImageFormat GetEncodedImageFormat(string saveFormat) { return saveFormat.ToLower() switch { + "jpg" or "jpeg" => SKEncodedImageFormat.Jpeg, "png" => SKEncodedImageFormat.Png, "bmp" => SKEncodedImageFormat.Bmp, "gif" => SKEncodedImageFormat.Gif, diff --git a/src/Mindee/V1/Extraction/ImageExtractor.cs b/src/Mindee/V1/Extraction/ImageExtractor.cs index b6f6576b..aec7f4b1 100644 --- a/src/Mindee/V1/Extraction/ImageExtractor.cs +++ b/src/Mindee/V1/Extraction/ImageExtractor.cs @@ -1,122 +1,35 @@ -using System; using System.Collections.Generic; -using System.IO; -using Docnet.Core; -using Docnet.Core.Models; +using System.Linq; using Mindee.Exceptions; using Mindee.Geometry; using Mindee.Image; using Mindee.Input; using Mindee.V1.Parsing.Standard; -using SkiaSharp; namespace Mindee.V1.Extraction { /// - /// Extract sub-images from an image. + /// Legacy V1 Wrapper for ImageExtractor. /// - public class ImageExtractor + public sealed class ImageExtractor : Mindee.Extraction.ImageExtractor { - private readonly string _filename; - private readonly List _pageImages; - private readonly string _saveFormat; - - /// - /// LocalInputSource object used by the ImageExtractor. - /// - public readonly LocalInputSource LocalInput; - /// /// Init from a Local Input Source. /// /// Locally loaded resource. /// Format to save the resulting images as. public ImageExtractor(LocalInputSource localInput, string saveFormat = null) - { - _filename = localInput.Filename; - _pageImages = new List(); - LocalInput = localInput; - if (saveFormat == null) - { - var extension = Path.GetExtension(localInput.Filename)?.Substring(1); - if (extension != null && !extension.Equals("pdf", StringComparison.CurrentCultureIgnoreCase)) - { - _saveFormat = extension; - } - else - { - _saveFormat = "jpg"; - } - } - else - { - _saveFormat = saveFormat; - } - - if (localInput.IsPdf()) - { - var pdfPageImages = PdfToImages(localInput.FileBytes); - _pageImages.AddRange(pdfPageImages); - } - else - { - _pageImages.Add(SKBitmap.Decode(localInput.FileBytes)); - } - } + : base(localInput, saveFormat) + { } /// /// Init from a path. /// /// Path to the file. - public ImageExtractor(string filePath) : this(new LocalInputSource(filePath)) - { - } - - /// - /// Renders the input Pdf's pages as individual images. - /// - /// Input pdf. - /// A list of pages, as SKBitmap. - private static List PdfToImages(byte[] fileBytes) - { - var images = new List(); - lock (DocLib.Instance) - { - using var docReader = DocLib.Instance.GetDocReader(fileBytes, new PageDimensions(1)); - for (var i = 0; i < docReader.GetPageCount(); i++) - { - using var pageReader = docReader.GetPageReader(i); - var width = pageReader.GetPageWidth(); - var height = pageReader.GetPageHeight(); - var bytes = pageReader.GetImage(); - var bmp = ImageUtils.ArrayToImage(ImageUtils.ConvertTo3DArray(bytes, width, height)); - images.Add(bmp); - } + public ImageExtractor(string filePath) + : base(filePath) + { } - return images; - } - } - - /// - /// Splits the filename into name and extension. - /// - private static string[] SplitNameStrict(string filename) - { - return - [ - Path.GetFileNameWithoutExtension(filename), - Path.GetExtension(filename).TrimStart('.') - ]; - } - - /// - /// Gets the number of pages in the file. - /// - /// The number of pages in the file. - public int GetPageCount() - { - return _pageImages.Count; - } /// /// Extract multiple images on a given page from a list of fields having position data. @@ -143,7 +56,7 @@ public IList ExtractImagesFromPage(IList if (GetPageCount() > 1) { var splitName = SplitNameStrict(outputName); - filename = $"{splitName[0]}.{_saveFormat}"; + filename = $"{splitName[0]}.{SaveFormat}"; } else { @@ -167,7 +80,7 @@ public IList ExtractImagesFromPage(IList fields, if (GetPageCount() > 1) { var splitName = SplitNameStrict(outputName); - filename = $"{splitName[0]}.{_saveFormat}"; + filename = $"{splitName[0]}.{SaveFormat}"; } else { @@ -188,25 +101,15 @@ private List ExtractFromPage(IList field string outputName) where TBaseField : BaseField { var splitName = SplitNameStrict(outputName); - var filename = $"{splitName[0]}_page-{pageIndex + 1:D3}.{_saveFormat}"; - - var extractedImages = new List(); - for (var i = 0; i < fields.Count; i++) - { - var extractedImage = ExtractImage(fields[i], pageIndex, i + 1, filename); - if (extractedImage != null) - { - extractedImages.Add(extractedImage); - } - } + var filename = $"{splitName[0]}_page-{pageIndex + 1:D3}.{SaveFormat}"; - return extractedImages; + return fields.Select((t, i) => ExtractImage(t, pageIndex, i + 1, filename)).Where(extractedImage => extractedImage != null).ToList(); } private List ExtractFromPage(IList fields, int pageIndex, string outputName) { var splitName = SplitNameStrict(outputName); - var filename = $"{splitName[0]}_page-{pageIndex + 1:D3}.{_saveFormat}"; + var filename = $"{splitName[0]}_page-{pageIndex + 1:D3}.{SaveFormat}"; var extractedImages = new List(); for (var i = 0; i < fields.Count; i++) @@ -256,8 +159,8 @@ public ExtractedImage ExtractImage(PositionField field, int pageIndex, int index } var bbox = Utils.BboxFromPolygon(boundingBox); - var fieldFilename = $"{splitName[0]}_{index:D3}.{_saveFormat}"; - return new ExtractedImage(ExtractImage(bbox, pageIndex), fieldFilename, _saveFormat); + var fieldFilename = $"{splitName[0]}_{index:D3}.{SaveFormat}"; + return new ExtractedImage(ExtractImage(bbox, pageIndex), fieldFilename, SaveFormat); } /// @@ -283,29 +186,8 @@ public ExtractedImage ExtractImage(BaseField field, int pageIndex, int index, st } var bbox = Utils.BboxFromPolygon(boundingBox); - var fieldFilename = $"{splitName[0]}_{index:D3}.{_saveFormat}"; - return new ExtractedImage(ExtractImage(bbox, pageIndex), fieldFilename, _saveFormat); - } - - private SKBitmap ExtractImage(Bbox bbox, int pageIndex) - { - var image = _pageImages[pageIndex]; - var width = image.Width; - var height = image.Height; - var minX = (int)Math.Round(bbox.MinX * width); - var maxX = (int)Math.Round(bbox.MaxX * width); - var minY = (int)Math.Round(bbox.MinY * height); - var maxY = (int)Math.Round(bbox.MaxY * height); - - var croppedBitmap = new SKBitmap(maxX - minX, maxY - minY); - using (var canvas = new SKCanvas(croppedBitmap)) - { - var destRect = new SKRect(0, 0, croppedBitmap.Width, croppedBitmap.Height); - var sourceRect = new SKRect(minX, minY, maxX, maxY); - canvas.DrawBitmap(image, sourceRect, destRect); - } - - return croppedBitmap; + var fieldFilename = $"{splitName[0]}_{index:D3}.{SaveFormat}"; + return new ExtractedImage(ExtractImage(bbox, pageIndex), fieldFilename, SaveFormat); } } } diff --git a/src/Mindee/V1/Extraction/PdfExtractor.cs b/src/Mindee/V1/Extraction/PdfExtractor.cs index 4c1bb1be..6206e253 100644 --- a/src/Mindee/V1/Extraction/PdfExtractor.cs +++ b/src/Mindee/V1/Extraction/PdfExtractor.cs @@ -1,102 +1,21 @@ using System; using System.Collections.Generic; using System.Diagnostics; -using System.IO; using System.Linq; -using Docnet.Core; -using Docnet.Core.Models; -using Microsoft.Extensions.Logging.Abstractions; using Mindee.Input; using Mindee.Pdf; using Mindee.V1.Product.InvoiceSplitter; -using SkiaSharp; namespace Mindee.V1.Extraction { /// - /// PDF extraction class. + /// V1 wrapper for the PDF extraction class. /// - public class PdfExtractor + public class PdfExtractor : Mindee.Extraction.PdfExtractor { - private readonly string Filename; - private readonly byte[] SourcePdf; - - /// - /// Initializes a new instance of the class. - /// - /// Instance of a LocalInputSource, provided by the user. - public PdfExtractor(LocalInputSource localInput) - { - Filename = localInput.Filename; - - if (localInput.IsPdf()) - { - SourcePdf = localInput.FileBytes; - } - else - { - var memoryStream = new MemoryStream(); - using var image = SKImage.FromEncodedData(localInput.FileBytes); - using var bmp = SKBitmap.FromImage(image); - var pageSize = new SKSize(bmp.Width, bmp.Height); - using (var document = SKDocument.CreatePdf(memoryStream)) - { - var canvas = document.BeginPage(pageSize.Width, pageSize.Height); - canvas.DrawBitmap(bmp, SKPoint.Empty); - document.EndPage(); - } - - SourcePdf = memoryStream.ToArray(); - } - } - - /// - /// Wrapper for pdf GetPageCount(); - /// - /// The number of pages in the file. - public int GetPageCount() - { - lock (DocLib.Instance) - { - using var docInstance = DocLib.Instance.GetDocReader(SourcePdf, new PageDimensions(1, 1)); - return docInstance.GetPageCount(); - } - } - - /// - /// Extracts sub-documents from the source document using list of page indexes. - /// - /// List of sub-lists of pages to keep. - /// Extracted documents. - /// - public List ExtractSubDocuments(List> pageIndexes) + /// + public PdfExtractor(LocalInputSource localInput) : base(localInput) { - var extractedPdfs = new List(); - - foreach (var pageIndexElem in pageIndexes) - { - if (!pageIndexElem.Any()) - { - throw new ArgumentException("Empty indexes not allowed for extraction."); - } - - var extension = Path.GetExtension(Filename); - var prefix = Path.GetFileNameWithoutExtension(Filename); - var fieldFilename = - $"{prefix}_{pageIndexElem[0] + 1:D3}-{pageIndexElem[pageIndexElem.Count - 1] + 1:D3}{extension}"; - - var splitQuery = new SplitQuery( - SourcePdf, - new PageOptions(pageIndexElem.ConvertAll(item => (short)item).ToArray())); - lock (DocLib.Instance) - { - var pdfOperation = new DocNetApi(new NullLogger()); - var mergedPdfBytes = pdfOperation.Split(splitQuery).File; - extractedPdfs.Add(new ExtractedPdf(mergedPdfBytes, fieldFilename)); - } - } - - return extractedPdfs; } /// diff --git a/src/Mindee/V2/FileOperations/Crop.cs b/src/Mindee/V2/FileOperations/Crop.cs new file mode 100644 index 00000000..0160dfc6 --- /dev/null +++ b/src/Mindee/V2/FileOperations/Crop.cs @@ -0,0 +1,55 @@ +using System.Collections.Generic; +using System.Linq; +using Mindee.Extraction; +using Mindee.Geometry; +using Mindee.Image; +using Mindee.Input; +using Mindee.V2.Product.Crop; + +namespace Mindee.V2.FileOperations +{ + /// + /// V2 Crop operation utility. + /// + public sealed class Crop + { + /// + /// LocalInputSource object used by the ImageExtractor. + /// + private readonly LocalInputSource _localInput; + + /// + /// + /// + /// + public Crop(LocalInputSource inputSource) + { + this._localInput = inputSource; + } + + /// + /// + /// + /// + /// + public ExtractedImage ExtractSingleCrop(CropItem crop) + { + var polygons = new List() { crop.Location.Polygon }; + var imageExtractor = new ImageExtractor(this._localInput); + return imageExtractor.ExtractMultipleImagesFromSource(crop.Location.Page, polygons)[0]; + } + + /// + /// Extracts multiple crop zones from an image. + /// + /// List of crops. + /// + public CropFiles ExtractCrops(List crops) + { + var imageExtractor = new ImageExtractor(this._localInput); + var extractedImages = imageExtractor.ExtractMultipleImagesFromSource(crops[0].Location.Page, crops.Select(c => c.Location.Polygon).ToList()); + + return (CropFiles)extractedImages; + } + } +} diff --git a/src/Mindee/V2/FileOperations/CropFiles.cs b/src/Mindee/V2/FileOperations/CropFiles.cs new file mode 100644 index 00000000..f344bf57 --- /dev/null +++ b/src/Mindee/V2/FileOperations/CropFiles.cs @@ -0,0 +1,34 @@ +using System.Collections.Generic; +using System.IO; +using Mindee.Image; + +namespace Mindee.V2.FileOperations +{ + /// + /// Collection of cropped files. + /// + public class CropFiles : List + { + /// + /// Saves all cropped files to disk. + /// + /// Path for all files + /// Prefix for file names + /// File format for saving (default: null) + public void SaveAllToDisk(string path, string prefix = "crop", string fileFormat = null) + { + Directory.CreateDirectory(path); + + int index = 1; + foreach (var crop in this) + { + string fileName = $"{prefix}_{index:D3}.jpg"; + string filePath = Path.Combine(path, fileName); + + crop.WriteToFile(filePath, fileFormat); + + index++; + } + } + } +} diff --git a/src/Mindee/V2/FileOperations/Split.cs b/src/Mindee/V2/FileOperations/Split.cs new file mode 100644 index 00000000..88e6d5b2 --- /dev/null +++ b/src/Mindee/V2/FileOperations/Split.cs @@ -0,0 +1,64 @@ +using System.Collections.Generic; +using System.Linq; +using Mindee.Exceptions; +using Mindee.Extraction; +using Mindee.Input; + +namespace Mindee.V2.FileOperations +{ + /// + /// V2 Split operation utility. + /// + public sealed class Split + { + + /// + /// LocalInputSource object used by the ImageExtractor. + /// + private readonly LocalInputSource _localInput; + + /// + /// Expands a range of pages into a list of page indexes. + /// + /// Start of the range. + /// End of the range. + /// An array of page indexes. + public static List ExpandRange(int start, int end) + { + if (start > end) + { + throw new MindeeInputException("Invalid page range provided."); + } + + int count = end - start + 1; + return Enumerable.Range(start, count).ToList(); + } + + /// + /// + /// + /// + public Split(LocalInputSource inputSource) + { + this._localInput = inputSource; + } + + /// + /// Extracts the splits from the input file. + /// + /// List of subpage indexes to keep. + /// + public SplitFiles ExtractSplits(List> splits) + { + var pdfExtractor = new PdfExtractor(this._localInput); + if (splits.Count == 0) + { + throw new MindeeInputException("No splits provided for extraction."); + } + + List> expandedPageIndexes = []; + expandedPageIndexes.AddRange(splits.Select(split => ExpandRange(split[0], split[1]))); + return (SplitFiles)pdfExtractor.ExtractSubDocuments(expandedPageIndexes); + } + } +} diff --git a/src/Mindee/V2/FileOperations/SplitFiles.cs b/src/Mindee/V2/FileOperations/SplitFiles.cs new file mode 100644 index 00000000..5b26af21 --- /dev/null +++ b/src/Mindee/V2/FileOperations/SplitFiles.cs @@ -0,0 +1,33 @@ +using System.Collections.Generic; +using System.IO; +using Mindee.Pdf; + +namespace Mindee.V2.FileOperations +{ + /// + /// Collection of split PDFs. + /// + public sealed class SplitFiles : List + { + /// + /// Saves all the extracted pages to disk. + /// + /// Path for all files + /// Prefix for file names + public void SaveAllToDisk(string path, string prefix = "split") + { + Directory.CreateDirectory(path); + + int index = 1; + foreach (var crop in this) + { + string fileName = $"{prefix}_{index:D3}.pdf"; + string filePath = Path.Combine(path, fileName); + + crop.WriteToFile(filePath); + + index++; + } + } + } +} From 9590686d1b8c22b346da30e8ccbd9c49ee167e73 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Tue, 14 Apr 2026 17:19:57 +0200 Subject: [PATCH 02/11] add tests --- Directory.Build.props | 2 +- src/Mindee/Extraction/ImageExtractor.cs | 6 +- src/Mindee/Image/ExtractedImage.cs | 64 +++++++++--- src/Mindee/Pdf/ExtractedPdf.cs | 62 ++++++++---- src/Mindee/Pdf/PdfUtils.cs | 30 ++++++ src/Mindee/V1/Extraction/ImageExtractor.cs | 4 +- src/Mindee/V2/FileOperations/Crop.cs | 3 +- src/Mindee/V2/FileOperations/CropFiles.cs | 20 +++- src/Mindee/V2/FileOperations/Split.cs | 18 +++- src/Mindee/V2/FileOperations/SplitFiles.cs | 15 +++ .../V2/FileOperations/CropTest.cs | 89 +++++++++++++++++ .../V2/FileOperations/SplitTest.cs | 97 +++++++++++++++++++ .../V2/FileOperations/CropTest.cs | 67 +++++++++++++ .../V2/FileOperations/SplitTest.cs | 55 +++++++++++ 14 files changed, 487 insertions(+), 45 deletions(-) create mode 100644 tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs create mode 100644 tests/Mindee.IntegrationTests/V2/FileOperations/SplitTest.cs create mode 100644 tests/Mindee.UnitTests/V2/FileOperations/CropTest.cs create mode 100644 tests/Mindee.UnitTests/V2/FileOperations/SplitTest.cs diff --git a/Directory.Build.props b/Directory.Build.props index c1dff72d..9d41defe 100644 --- a/Directory.Build.props +++ b/Directory.Build.props @@ -46,7 +46,7 @@ - + diff --git a/src/Mindee/Extraction/ImageExtractor.cs b/src/Mindee/Extraction/ImageExtractor.cs index d9f52e73..fb7afb27 100644 --- a/src/Mindee/Extraction/ImageExtractor.cs +++ b/src/Mindee/Extraction/ImageExtractor.cs @@ -160,11 +160,13 @@ public List ExtractMultipleImagesFromSource(int pageId, List(); + int i = 0; foreach (var polygon in polygons) { var bbox = Utils.BboxFromPolygon(polygon); - var fieldFilename = $"{filename}_{pageId:D3}_{polygons.IndexOf(polygon):D3}.{SaveFormat}"; - extractedImages.Add(new ExtractedImage(ExtractImage(bbox, pageId), fieldFilename, SaveFormat)); + var fieldFilename = $"{filename}_page{pageId}-{polygons.IndexOf(polygon)}.{SaveFormat}"; + extractedImages.Add(new ExtractedImage(ExtractImage(bbox, pageId), fieldFilename, SaveFormat, pageId, i)); + i++; } return extractedImages; } diff --git a/src/Mindee/Image/ExtractedImage.cs b/src/Mindee/Image/ExtractedImage.cs index bee8cd7c..c160522a 100644 --- a/src/Mindee/Image/ExtractedImage.cs +++ b/src/Mindee/Image/ExtractedImage.cs @@ -14,17 +14,31 @@ public class ExtractedImage /// private readonly string _saveFormat; + /// + /// Page number the image was extracted from. + /// + public int PageId; + + /// + /// ID of the image. + /// + public int ElementId; + /// /// Initializes a new instance of the class. /// /// The extracted image. /// The filename for the image. /// The format to save the image. - public ExtractedImage(SKBitmap image, string filename, string saveFormat) + /// The page number the image was extracted from. + /// The ID of the image. + public ExtractedImage(SKBitmap image, string filename, string saveFormat, int pageId, int elementId) { Image = image; Filename = filename; _saveFormat = saveFormat; + PageId = pageId; + ElementId = elementId; } /// @@ -35,29 +49,48 @@ public ExtractedImage(SKBitmap image, string filename, string saveFormat) /// /// Name of the file. /// - private string Filename { get; } + public string Filename { get; } /// /// Writes the image to a file. - /// Uses the default image format and filename. + /// If outputPath has an extension, it is treated as a full file path. + /// Otherwise, it is treated as a directory and uses the default filename. /// - /// The output directory (must exist). - /// - public void WriteToFile(string outputPath, string fileFormat = null) + /// The output directory (must exist) or full file path. + /// The quality of the image. Defaults to 100. + /// The desired format. If null, inferred from extension or default. + public void WriteToFile(string outputPath, int quality = 100, string fileFormat = null) { + string imagePath; var targetFormat = fileFormat ?? _saveFormat; - var format = GetEncodedImageFormat(targetFormat); - var finalFilename = Filename; - if (!string.IsNullOrWhiteSpace(fileFormat)) + if (Path.HasExtension(outputPath)) + { + imagePath = outputPath; + if (string.IsNullOrWhiteSpace(fileFormat)) + { + var extension = Path.GetExtension(outputPath).TrimStart('.'); + if (!string.IsNullOrWhiteSpace(extension)) + { + targetFormat = extension.ToLower(); + } + } + } + else { - var nameWithoutExtension = Path.GetFileNameWithoutExtension(Filename); - finalFilename = $"{nameWithoutExtension}.{targetFormat.ToLower()}"; + var finalFilename = Filename; + if (!string.IsNullOrWhiteSpace(fileFormat)) + { + var nameWithoutExtension = Path.GetFileNameWithoutExtension(Filename); + finalFilename = $"{nameWithoutExtension}.{targetFormat.ToLower()}"; + } + imagePath = Path.Combine(outputPath, finalFilename); } - var imagePath = Path.Combine(outputPath, finalFilename); + + var format = GetEncodedImageFormat(targetFormat); using var image = SKImage.FromBitmap(Image); - using var data = image.Encode(format, 100); + using var data = image.Encode(format, quality); using var stream = File.OpenWrite(imagePath); data.SaveTo(stream); } @@ -65,11 +98,12 @@ public void WriteToFile(string outputPath, string fileFormat = null) /// /// Returns the image in a format suitable for sending to a client for parsing. /// + /// The quality of the image. Defaults to 100. /// An instance of . - public LocalInputSource AsInputSource() + public LocalInputSource AsInputSource(int quality = 100) { using var image = SKImage.FromBitmap(Image); - using var data = image.Encode(GetEncodedImageFormat(_saveFormat), 100); + using var data = image.Encode(GetEncodedImageFormat(_saveFormat), quality); using var output = new MemoryStream(); data.SaveTo(output); return new LocalInputSource(output.ToArray(), Filename); diff --git a/src/Mindee/Pdf/ExtractedPdf.cs b/src/Mindee/Pdf/ExtractedPdf.cs index 8286d74e..38086e53 100644 --- a/src/Mindee/Pdf/ExtractedPdf.cs +++ b/src/Mindee/Pdf/ExtractedPdf.cs @@ -1,6 +1,5 @@ using System.IO; -using Docnet.Core; -using Docnet.Core.Models; +using Mindee.Exceptions; using Mindee.Input; namespace Mindee.Pdf @@ -11,24 +10,55 @@ namespace Mindee.Pdf public class ExtractedPdf { /// - /// Name of the original file. + /// Loca /// - public readonly string Filename; + public readonly LocalInputSource LocalInput; /// - /// File object for an ExtractedPdf. + /// Page count. /// - public readonly byte[] PdfBytes; + public int PageCount { get; set; } + + /// + /// Original filename. + /// + public readonly string Filename; /// /// Initializes a new instance of the class. /// - /// A byte array representation of the Pdf. + /// A byte array representation of the Pdf. /// Name of the original file. - public ExtractedPdf(byte[] pdfBytes, string filename) + public ExtractedPdf(byte[] fileBytes, string filename) { - PdfBytes = pdfBytes; - Filename = filename; + var tmpInput = new LocalInputSource(fileBytes, filename); + if (tmpInput.IsPdf()) + { + LocalInput = tmpInput; + } + else + { + byte[] pdfBytes = PdfUtils.ConvertImageToPdf(fileBytes, filename); + string newFilename = Path.ChangeExtension(filename, ".pdf"); + LocalInput = new LocalInputSource(pdfBytes, newFilename); + } + PageCount = LocalInput.GetPageCount(); + Filename = LocalInput.Filename; + } + + /// + /// Initializes a new instance of the class. + /// + /// LocalInputSource containing the Pdf bytes and filename. + public ExtractedPdf(LocalInputSource localInput) + { + LocalInput = localInput; + if (!localInput.IsPdf()) + { + throw new MindeeInputException("The input file is not a PDF."); + } + PageCount = LocalInput.GetPageCount(); + Filename = LocalInput.Filename; } /// @@ -37,11 +67,7 @@ public ExtractedPdf(byte[] pdfBytes, string filename) /// The number of pages in the file. public int GetPageCount() { - lock (DocLib.Instance) - { - using var docInstance = DocLib.Instance.GetDocReader(PdfBytes, new PageDimensions(1, 1)); - return docInstance.GetPageCount(); - } + return LocalInput.GetPageCount(); } /// @@ -50,13 +76,13 @@ public int GetPageCount() /// the output directory (must exist). public void WriteToFile(string outputPath) { - var pdfPath = Path.Combine(outputPath, Filename); + var pdfPath = Path.Combine(outputPath, LocalInput.Filename); if (Path.GetFileName(outputPath) != string.Empty) { pdfPath = Path.GetFullPath(outputPath); } - File.WriteAllBytes(pdfPath, PdfBytes); + File.WriteAllBytes(pdfPath, LocalInput.FileBytes); } /// @@ -65,7 +91,7 @@ public void WriteToFile(string outputPath) /// an instance of public LocalInputSource AsInputSource() { - return new LocalInputSource(PdfBytes, Filename); + return LocalInput; } } } diff --git a/src/Mindee/Pdf/PdfUtils.cs b/src/Mindee/Pdf/PdfUtils.cs index 1aff8156..a44a1406 100644 --- a/src/Mindee/Pdf/PdfUtils.cs +++ b/src/Mindee/Pdf/PdfUtils.cs @@ -1,4 +1,5 @@ using System; +using System.IO; using System.Linq; using Docnet.Core; using Docnet.Core.Models; @@ -178,5 +179,34 @@ public static bool HasSourceText(byte[] fileBytes) return false; } + + /// + /// Converts an image to a PDF. + /// + /// Raw image bytes. + /// Name of the file. + /// + /// + public static byte[] ConvertImageToPdf(byte[] imageBytes, string filename) + { + using var ms = new MemoryStream(); + using var bitmap = SKBitmap.Decode(imageBytes); + if (bitmap == null) + { + throw new MindeeInputException($"The file {filename} is not a valid image."); + } + + using (var document = SKDocument.CreatePdf(ms)) + { + using (var canvas = document.BeginPage(bitmap.Width, bitmap.Height)) + { + canvas.DrawBitmap(bitmap, 0, 0); + document.EndPage(); + } + document.Close(); + } + + return ms.ToArray(); + } } } diff --git a/src/Mindee/V1/Extraction/ImageExtractor.cs b/src/Mindee/V1/Extraction/ImageExtractor.cs index aec7f4b1..f98f2de7 100644 --- a/src/Mindee/V1/Extraction/ImageExtractor.cs +++ b/src/Mindee/V1/Extraction/ImageExtractor.cs @@ -160,7 +160,7 @@ public ExtractedImage ExtractImage(PositionField field, int pageIndex, int index var bbox = Utils.BboxFromPolygon(boundingBox); var fieldFilename = $"{splitName[0]}_{index:D3}.{SaveFormat}"; - return new ExtractedImage(ExtractImage(bbox, pageIndex), fieldFilename, SaveFormat); + return new ExtractedImage(ExtractImage(bbox, pageIndex), fieldFilename, SaveFormat, pageIndex, index); } /// @@ -187,7 +187,7 @@ public ExtractedImage ExtractImage(BaseField field, int pageIndex, int index, st var bbox = Utils.BboxFromPolygon(boundingBox); var fieldFilename = $"{splitName[0]}_{index:D3}.{SaveFormat}"; - return new ExtractedImage(ExtractImage(bbox, pageIndex), fieldFilename, SaveFormat); + return new ExtractedImage(ExtractImage(bbox, pageIndex), fieldFilename, SaveFormat, pageIndex, index); } } } diff --git a/src/Mindee/V2/FileOperations/Crop.cs b/src/Mindee/V2/FileOperations/Crop.cs index 0160dfc6..7882d959 100644 --- a/src/Mindee/V2/FileOperations/Crop.cs +++ b/src/Mindee/V2/FileOperations/Crop.cs @@ -48,8 +48,7 @@ public CropFiles ExtractCrops(List crops) { var imageExtractor = new ImageExtractor(this._localInput); var extractedImages = imageExtractor.ExtractMultipleImagesFromSource(crops[0].Location.Page, crops.Select(c => c.Location.Polygon).ToList()); - - return (CropFiles)extractedImages; + return new CropFiles(extractedImages); } } } diff --git a/src/Mindee/V2/FileOperations/CropFiles.cs b/src/Mindee/V2/FileOperations/CropFiles.cs index f344bf57..f547761a 100644 --- a/src/Mindee/V2/FileOperations/CropFiles.cs +++ b/src/Mindee/V2/FileOperations/CropFiles.cs @@ -9,13 +9,29 @@ namespace Mindee.V2.FileOperations /// public class CropFiles : List { + /// + /// + /// + /// + public CropFiles(IEnumerable collection) : base(collection) + { + } + + /// + /// + /// + public CropFiles() : base() + { + } + /// /// Saves all cropped files to disk. /// /// Path for all files /// Prefix for file names + /// Quality of the output image /// File format for saving (default: null) - public void SaveAllToDisk(string path, string prefix = "crop", string fileFormat = null) + public void SaveAllToDisk(string path, int quality = 100, string prefix = "crop", string fileFormat = null) { Directory.CreateDirectory(path); @@ -25,7 +41,7 @@ public void SaveAllToDisk(string path, string prefix = "crop", string fileFormat string fileName = $"{prefix}_{index:D3}.jpg"; string filePath = Path.Combine(path, fileName); - crop.WriteToFile(filePath, fileFormat); + crop.WriteToFile(filePath, quality, fileFormat); index++; } diff --git a/src/Mindee/V2/FileOperations/Split.cs b/src/Mindee/V2/FileOperations/Split.cs index 88e6d5b2..3bc992e4 100644 --- a/src/Mindee/V2/FileOperations/Split.cs +++ b/src/Mindee/V2/FileOperations/Split.cs @@ -1,8 +1,10 @@ using System.Collections.Generic; +using System.IO; using System.Linq; using Mindee.Exceptions; using Mindee.Extraction; using Mindee.Input; +using Mindee.Pdf; namespace Mindee.V2.FileOperations { @@ -35,12 +37,22 @@ public static List ExpandRange(int start, int end) } /// - /// + /// Initializes an instance of a Split operation. + /// Transforms images to PDFs if necessary. /// /// public Split(LocalInputSource inputSource) { - this._localInput = inputSource; + if (inputSource.IsPdf()) + { + _localInput = inputSource; + } + else + { + byte[] pdfBytes = PdfUtils.ConvertImageToPdf(inputSource.FileBytes, inputSource.Filename); + string newFilename = Path.ChangeExtension(inputSource.Filename, ".pdf"); + _localInput = new LocalInputSource(pdfBytes, newFilename); + } } /// @@ -58,7 +70,7 @@ public SplitFiles ExtractSplits(List> splits) List> expandedPageIndexes = []; expandedPageIndexes.AddRange(splits.Select(split => ExpandRange(split[0], split[1]))); - return (SplitFiles)pdfExtractor.ExtractSubDocuments(expandedPageIndexes); + return new SplitFiles(pdfExtractor.ExtractSubDocuments(expandedPageIndexes)); } } } diff --git a/src/Mindee/V2/FileOperations/SplitFiles.cs b/src/Mindee/V2/FileOperations/SplitFiles.cs index 5b26af21..443f0847 100644 --- a/src/Mindee/V2/FileOperations/SplitFiles.cs +++ b/src/Mindee/V2/FileOperations/SplitFiles.cs @@ -9,6 +9,21 @@ namespace Mindee.V2.FileOperations /// public sealed class SplitFiles : List { + /// + /// + /// + /// + public SplitFiles(IEnumerable collection) : base(collection) + { + } + + /// + /// + /// + public SplitFiles() : base() + { + } + /// /// Saves all the extracted pages to disk. /// diff --git a/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs b/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs new file mode 100644 index 00000000..8c66c7be --- /dev/null +++ b/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs @@ -0,0 +1,89 @@ +using Mindee.Input; +using Mindee.V2; +using Mindee.V2.FileOperations; +using Mindee.V2.Product.Crop; +using Mindee.V2.Product.Crop.Params; +using Mindee.V2.Product.Extraction; +using Mindee.V2.Product.Extraction.Params; + +namespace Mindee.IntegrationTests.V2.FileOperations +{ + [Trait("Category", "V2")] + [Trait("Category", "FileOperations")] + public class CropTest : IDisposable + { + private readonly string? _cropModelId; + private readonly string? _findocModelId; + private readonly Client _client; + private readonly string _outputDir; + + public CropTest() + { + var apiKey = Environment.GetEnvironmentVariable("MindeeV2__ApiKey"); + _client = TestingUtilities.GetOrGenerateMindeeClientV2(apiKey); + _cropModelId = Environment.GetEnvironmentVariable("MindeeV2__Crop__Model__Id"); + _findocModelId = Environment.GetEnvironmentVariable("MindeeV2__Findoc__Model__Id"); + + _outputDir = Path.Combine(Directory.GetCurrentDirectory(), "output"); + if (!Directory.Exists(_outputDir)) + { + Directory.CreateDirectory(_outputDir); + } + } + + public void Dispose() + { + var file1 = Path.Combine(_outputDir, "crop_001.jpg"); + var file2 = Path.Combine(_outputDir, "crop_002.jpg"); + + if (File.Exists(file1)) File.Delete(file1); + if (File.Exists(file2)) File.Delete(file2); + } + + private void CheckFindocReturn(ExtractionResponse findocResponse) + { + Assert.True(findocResponse.Inference.Model.Id.Length > 0); + + var totalAmount = findocResponse.Inference.Result.Fields["total_amount"].SimpleField; + Assert.NotNull(totalAmount); + Assert.True(totalAmount.Value > 0); + } + + [Fact(Timeout = 180000)] + public async Task Extract_Crops_From_Image_Correctly() + { + var inputSource = new LocalInputSource( + Constants.V2ProductDir + "crop/default_sample.jpg"); + var cropParams = new CropParameters(_cropModelId); + + var response = await _client.EnqueueAndGetResultAsync( + inputSource, cropParams); + + Assert.NotNull(response); + Assert.Equal(2, response.Inference.Result.Crops.Count); + + var cropOperation = new Crop(inputSource); + var extractedImages = cropOperation.ExtractCrops(response.Inference.Result.Crops); + + Assert.Equal(2, extractedImages.Count); + Assert.Equal("default_sample.jpg_page0-0.jpg", extractedImages[0].Filename); + Assert.Equal("default_sample.jpg_page0-1.jpg", extractedImages[1].Filename); + + var extractionInput = extractedImages[0].AsInputSource(); + var findocParams = new ExtractionParameters(_findocModelId); + + var invoice0 = await _client.EnqueueAndGetResultAsync( + extractionInput, findocParams); + + CheckFindocReturn(invoice0); + + extractedImages.SaveAllToDisk(_outputDir, 50); + + var file1Info = new FileInfo(Path.Combine(_outputDir, "crop_001.jpg")); + Assert.InRange(file1Info.Length, 100000, 110000); + + var file2Info = new FileInfo(Path.Combine(_outputDir, "crop_002.jpg")); + Assert.InRange(file2Info.Length, 100000, 110000); + } + } +} diff --git a/tests/Mindee.IntegrationTests/V2/FileOperations/SplitTest.cs b/tests/Mindee.IntegrationTests/V2/FileOperations/SplitTest.cs new file mode 100644 index 00000000..1801d90d --- /dev/null +++ b/tests/Mindee.IntegrationTests/V2/FileOperations/SplitTest.cs @@ -0,0 +1,97 @@ +using Mindee.Input; +using Mindee.V2; +using Mindee.V2.FileOperations; +using Mindee.V2.Product.Extraction; +using Mindee.V2.Product.Extraction.Params; +using Mindee.V2.Product.Split; +using Mindee.V2.Product.Split.Params; + +namespace Mindee.IntegrationTests.V2.FileOperations +{ + [Trait("Category", "V2")] + [Trait("Category", "FileOperations")] + public class SplitTest : IDisposable + { + private readonly string? _splitModelId; + private readonly string? _findocModelId; + private readonly Client _client; + private readonly string _outputDir; + + public SplitTest() + { + var apiKey = Environment.GetEnvironmentVariable("MindeeV2__ApiKey"); + _client = TestingUtilities.GetOrGenerateMindeeClientV2(apiKey); + _splitModelId = Environment.GetEnvironmentVariable("MindeeV2__Split__Model__Id"); + _findocModelId = Environment.GetEnvironmentVariable("MindeeV2__Findoc__Model__Id"); + + _outputDir = Path.Combine(Directory.GetCurrentDirectory(), "output"); + if (!Directory.Exists(_outputDir)) + { + Directory.CreateDirectory(_outputDir); + } + } + + public void Dispose() + { + var file1 = Path.Combine(_outputDir, "split_001.pdf"); + var file2 = Path.Combine(_outputDir, "split_002.pdf"); + + if (File.Exists(file1)) File.Delete(file1); + if (File.Exists(file2)) File.Delete(file2); + } + + private void CheckFindocReturn(ExtractionResponse findocResponse) + { + Assert.True(findocResponse.Inference.Model.Id.Length > 0); + + var totalAmount = findocResponse.Inference.Result.Fields["total_amount"].SimpleField; + Assert.NotNull(totalAmount); + Assert.True(totalAmount.Value > 0); + } + + [Fact(Timeout = 180000)] + public async Task Extract_Splits_From_Pdf_Correctly() + { + var inputSource = new LocalInputSource( + Constants.V2ProductDir + "split/default_sample.pdf"); + var splitParams = new SplitParameters(_splitModelId); + + var response = await _client.EnqueueAndGetResultAsync( + inputSource, splitParams); + + Assert.NotNull(response); + Assert.Equal(2, response.Inference.Result.Splits.Count); + + var splitOperation = new Split(inputSource); + var extractedSplits = splitOperation.ExtractSplits( + response.Inference.Result.Splits.Select(s => s.PageRange).ToList()); + + Assert.Equal(2, extractedSplits.Count); + Assert.Equal("default_sample_001-001.pdf", extractedSplits[0].Filename); + Assert.Equal("default_sample_002-002.pdf", extractedSplits[1].Filename); + + var extractionInput = extractedSplits[0].AsInputSource(); + var findocParams = new ExtractionParameters(_findocModelId); + + var invoice0 = await _client.EnqueueAndGetResultAsync( + extractionInput, findocParams); + + CheckFindocReturn(invoice0); + + extractedSplits.SaveAllToDisk(_outputDir); + + for (int i = 0; i < extractedSplits.Count; i++) + { + var fileName = $"split_{i + 1:D3}.pdf"; + var filePath = Path.Combine(_outputDir, fileName); + var fileInfo = new FileInfo(filePath); + + Assert.True(fileInfo.Exists); + Assert.True(fileInfo.Length > 0); + + var localInput = new LocalInputSource(fileInfo); + Assert.Equal(extractedSplits[i].PageCount, localInput.GetPageCount()); + } + } + } +} diff --git a/tests/Mindee.UnitTests/V2/FileOperations/CropTest.cs b/tests/Mindee.UnitTests/V2/FileOperations/CropTest.cs new file mode 100644 index 00000000..5a2fb359 --- /dev/null +++ b/tests/Mindee.UnitTests/V2/FileOperations/CropTest.cs @@ -0,0 +1,67 @@ +using Mindee.Input; +using Mindee.V2.FileOperations; +using Mindee.V2.Parsing; +using Mindee.V2.Product.Crop; + +namespace Mindee.UnitTests.V2.FileOperations +{ + [Trait("Category", "V2")] + [Trait("Category", "FileOperations")] + public class CropTest + { + private readonly string _cropDataDir = Path.Combine(Constants.V2RootDir, "products", "crop"); + + [Fact] + public void Processes_SinglePage_CropSplit_Correctly() + { + var inputSample = new LocalInputSource( + new FileInfo(Path.Combine(_cropDataDir, "default_sample.jpg"))); + + var localResponse = new LocalResponse( + new FileInfo(Path.Combine(_cropDataDir, "crop_single.json"))); + var doc = localResponse.DeserializeResponse(); + + var cropOperation = new Crop(inputSample); + var extractedCrops = cropOperation.ExtractCrops(doc.Inference.Result.Crops); + + Assert.Single(extractedCrops); + + Assert.Equal(0, extractedCrops[0].PageId); + Assert.Equal(0, extractedCrops[0].ElementId); + + using var bitmap0 = extractedCrops[0].Image; + Assert.Equal(2822, bitmap0.Width); + Assert.Equal(1572, bitmap0.Height); + } + + [Fact] + public void Processes_MultiPage_ReceiptSplit_Correctly() + { + var inputSample = new LocalInputSource( + new FileInfo(Path.Combine(_cropDataDir, "multipage_sample.pdf"))); + + var localResponse = new LocalResponse( + new FileInfo(Path.Combine(_cropDataDir, "crop_multiple.json"))); + var doc = localResponse.DeserializeResponse(); + + var cropOperation = new Crop(inputSample); + var extractedCrops = cropOperation.ExtractCrops(doc.Inference.Result.Crops); + + Assert.Equal(2, extractedCrops.Count); + + Assert.Equal(0, extractedCrops[0].PageId); + Assert.Equal(0, extractedCrops[0].ElementId); + + using var bitmap0 = extractedCrops[0].Image; + Assert.Equal(156, bitmap0.Width); + Assert.Equal(757, bitmap0.Height); + + Assert.Equal(0, extractedCrops[1].PageId); + Assert.Equal(1, extractedCrops[1].ElementId); + + using var bitmap1 = extractedCrops[1].Image; + Assert.Equal(188, bitmap1.Width); + Assert.Equal(691, bitmap1.Height); + } + } +} diff --git a/tests/Mindee.UnitTests/V2/FileOperations/SplitTest.cs b/tests/Mindee.UnitTests/V2/FileOperations/SplitTest.cs new file mode 100644 index 00000000..88093be3 --- /dev/null +++ b/tests/Mindee.UnitTests/V2/FileOperations/SplitTest.cs @@ -0,0 +1,55 @@ +using Mindee.Input; +using Mindee.V2.FileOperations; +using Mindee.V2.Parsing; +using Mindee.V2.Product.Split; + +namespace Mindee.UnitTests.V2.FileOperations +{ + [Trait("Category", "V2")] + [Trait("Category", "FileOperations")] + public class SplitTest + { + private readonly string _splitDataDir = Path.Combine(Constants.V2RootDir, "products", "split"); + private readonly string _finDocDataDir = Path.Combine(Constants.V2RootDir, "products", "extraction", "financial_document"); + + [Fact] + public void Processes_SinglePage_Split_Correctly() + { + var inputSample = new LocalInputSource( + new FileInfo(Path.Combine(_finDocDataDir, "default_sample.jpg"))); + + var localResponse = new LocalResponse( + new FileInfo(Path.Combine(_splitDataDir, "split_single.json"))); + var doc = localResponse.DeserializeResponse(); + + var splitOperation = new Split(inputSample); + List splits = doc.Inference.Result.Splits; + var extractedSplits = splitOperation.ExtractSplits(splits.Select(s => s.PageRange).ToList()); + + Assert.Single(extractedSplits); + + Assert.Equal(1, extractedSplits[0].PageCount); + } + + [Fact] + public void Processes_MultiPage_ReceiptSplit_Correctly() + { + var inputSample = new LocalInputSource( + new FileInfo(Path.Combine(_splitDataDir, "invoice_5p.pdf"))); + + var localResponse = new LocalResponse( + new FileInfo(Path.Combine(_splitDataDir, "split_multiple.json"))); + var doc = localResponse.DeserializeResponse(); + + var splitOperation = new Split(inputSample); + List splits = doc.Inference.Result.Splits; + var extractedSplits = splitOperation.ExtractSplits(splits.Select(s => s.PageRange).ToList()); + + Assert.Equal(3, extractedSplits.Count); + + Assert.Equal(1, extractedSplits[0].PageCount); + Assert.Equal(3, extractedSplits[1].PageCount); + Assert.Equal(1, extractedSplits[2].PageCount); + } + } +} From 56dd678e03f16dfc313aa67e0439609a8ed73a91 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Tue, 14 Apr 2026 17:45:55 +0200 Subject: [PATCH 03/11] uniformize exception throws --- src/Mindee/Extraction/PdfExtractor.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Mindee/Extraction/PdfExtractor.cs b/src/Mindee/Extraction/PdfExtractor.cs index a1408ac0..625745f2 100644 --- a/src/Mindee/Extraction/PdfExtractor.cs +++ b/src/Mindee/Extraction/PdfExtractor.cs @@ -3,6 +3,7 @@ using System.IO; using Docnet.Core; using Microsoft.Extensions.Logging.Abstractions; +using Mindee.Exceptions; using Mindee.Input; using Mindee.Pdf; using SkiaSharp; @@ -89,7 +90,7 @@ public List ExtractSubDocuments(List> pageIndexes) { if (pageIndexElem.Count == 0) { - throw new ArgumentException("Empty indexes not allowed for extraction."); + throw new MindeeInputException("Empty indexes not allowed for extraction."); } var extension = Path.GetExtension(LocalInput.Filename); From 2c2021c6f450a00dfb447e0344960e445ffcb937 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Wed, 15 Apr 2026 10:20:17 +0200 Subject: [PATCH 04/11] fix typo --- src/Mindee/Pdf/ExtractedPdf.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Mindee/Pdf/ExtractedPdf.cs b/src/Mindee/Pdf/ExtractedPdf.cs index 38086e53..6191b1e4 100644 --- a/src/Mindee/Pdf/ExtractedPdf.cs +++ b/src/Mindee/Pdf/ExtractedPdf.cs @@ -10,7 +10,7 @@ namespace Mindee.Pdf public class ExtractedPdf { /// - /// Loca + /// Local input source. /// public readonly LocalInputSource LocalInput; From 358bb451603899a3245bd005c032ad68d107c9e1 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Thu, 16 Apr 2026 10:09:04 +0200 Subject: [PATCH 05/11] add missing save methods --- src/Mindee/V2/FileOperations/Crop.cs | 2 +- src/Mindee/V2/FileOperations/Split.cs | 11 +++++++++++ src/Mindee/V2/Product/Crop/CropItem.cs | 16 ++++++++++++++++ src/Mindee/V2/Product/Split/SplitRange.cs | 13 +++++++++++++ 4 files changed, 41 insertions(+), 1 deletion(-) diff --git a/src/Mindee/V2/FileOperations/Crop.cs b/src/Mindee/V2/FileOperations/Crop.cs index 7882d959..56dc54f6 100644 --- a/src/Mindee/V2/FileOperations/Crop.cs +++ b/src/Mindee/V2/FileOperations/Crop.cs @@ -34,7 +34,7 @@ public Crop(LocalInputSource inputSource) /// public ExtractedImage ExtractSingleCrop(CropItem crop) { - var polygons = new List() { crop.Location.Polygon }; + var polygons = new List { crop.Location.Polygon }; var imageExtractor = new ImageExtractor(this._localInput); return imageExtractor.ExtractMultipleImagesFromSource(crop.Location.Page, polygons)[0]; } diff --git a/src/Mindee/V2/FileOperations/Split.cs b/src/Mindee/V2/FileOperations/Split.cs index 3bc992e4..bd9931bc 100644 --- a/src/Mindee/V2/FileOperations/Split.cs +++ b/src/Mindee/V2/FileOperations/Split.cs @@ -5,6 +5,7 @@ using Mindee.Extraction; using Mindee.Input; using Mindee.Pdf; +using Mindee.V2.Product.Split; namespace Mindee.V2.FileOperations { @@ -55,6 +56,16 @@ public Split(LocalInputSource inputSource) } } + /// + /// Extracts a single split from the input file. + /// + /// + /// + public ExtractedPdf ExtractSingleSplit(SplitRange splitRange) + { + return ExtractSplits([splitRange.PageRange])[0]; + } + /// /// Extracts the splits from the input file. /// diff --git a/src/Mindee/V2/Product/Crop/CropItem.cs b/src/Mindee/V2/Product/Crop/CropItem.cs index 923811cf..d703f28e 100644 --- a/src/Mindee/V2/Product/Crop/CropItem.cs +++ b/src/Mindee/V2/Product/Crop/CropItem.cs @@ -1,4 +1,9 @@ +using System.Collections.Generic; using System.Text.Json.Serialization; +using Mindee.Extraction; +using Mindee.Geometry; +using Mindee.Image; +using Mindee.Input; using Mindee.V2.Parsing.Inference.Field; namespace Mindee.V2.Product.Crop @@ -28,5 +33,16 @@ public override string ToString() { return $"* :Location: {Location}\n :Object Type: {ObjectType}"; } + + /// + /// Extract the crop from the source document. + /// + /// + /// + public ExtractedImage ExtractFromFile(LocalInputSource inputSource) + { + var crop = new FileOperations.Crop(inputSource); + return crop.ExtractSingleCrop(this); + } } } diff --git a/src/Mindee/V2/Product/Split/SplitRange.cs b/src/Mindee/V2/Product/Split/SplitRange.cs index 6d0cb704..7b93519d 100644 --- a/src/Mindee/V2/Product/Split/SplitRange.cs +++ b/src/Mindee/V2/Product/Split/SplitRange.cs @@ -1,5 +1,7 @@ using System.Collections.Generic; using System.Text.Json.Serialization; +using Mindee.Input; +using Mindee.Pdf; namespace Mindee.V2.Product.Split { @@ -29,5 +31,16 @@ public override string ToString() string pageRange = string.Join(",", PageRange); return $"* :Page Range: {pageRange}\n :Document Type: {DocumentType}"; } + + /// + /// Extracts the split from the source document. + /// + /// + /// + public ExtractedPdf ExtractFromFile(LocalInputSource inputSource) + { + var split = new FileOperations.Split(inputSource); + return split.ExtractSingleSplit(this); + } } } From 603922c49eccce12541b0a5716e4f1d896f4876b Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Fri, 17 Apr 2026 10:08:44 +0200 Subject: [PATCH 06/11] add complementary test for crop page count --- src/Mindee/V2/FileOperations/Crop.cs | 15 ++++++++---- src/Mindee/V2/FileOperations/Split.cs | 2 +- .../V2/FileOperations/CropTest.cs | 23 +++++++++++++++++-- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/src/Mindee/V2/FileOperations/Crop.cs b/src/Mindee/V2/FileOperations/Crop.cs index 56dc54f6..59601115 100644 --- a/src/Mindee/V2/FileOperations/Crop.cs +++ b/src/Mindee/V2/FileOperations/Crop.cs @@ -14,7 +14,7 @@ namespace Mindee.V2.FileOperations public sealed class Crop { /// - /// LocalInputSource object used by the ImageExtractor. + /// LocalInputSource object. /// private readonly LocalInputSource _localInput; @@ -28,7 +28,7 @@ public Crop(LocalInputSource inputSource) } /// - /// + /// Extract a single crop item from a file. /// /// /// @@ -40,15 +40,20 @@ public ExtractedImage ExtractSingleCrop(CropItem crop) } /// - /// Extracts multiple crop zones from an image. + /// Extracts multiple crop zones from a file. /// /// List of crops. /// public CropFiles ExtractCrops(List crops) { var imageExtractor = new ImageExtractor(this._localInput); - var extractedImages = imageExtractor.ExtractMultipleImagesFromSource(crops[0].Location.Page, crops.Select(c => c.Location.Polygon).ToList()); - return new CropFiles(extractedImages); + CropFiles extractedImages = []; + var cropsPerPage = crops.GroupBy(c => c.Location.Page).ToList(); + foreach (var pageCrops in cropsPerPage) + { + extractedImages.AddRange(imageExtractor.ExtractMultipleImagesFromSource(pageCrops.Key, pageCrops.Select(c => c.Location.Polygon).ToList())); + } + return extractedImages; } } } diff --git a/src/Mindee/V2/FileOperations/Split.cs b/src/Mindee/V2/FileOperations/Split.cs index bd9931bc..101b4004 100644 --- a/src/Mindee/V2/FileOperations/Split.cs +++ b/src/Mindee/V2/FileOperations/Split.cs @@ -16,7 +16,7 @@ public sealed class Split { /// - /// LocalInputSource object used by the ImageExtractor. + /// LocalInputSource object. /// private readonly LocalInputSource _localInput; diff --git a/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs b/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs index 8c66c7be..1fa0b796 100644 --- a/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs +++ b/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs @@ -52,8 +52,8 @@ private void CheckFindocReturn(ExtractionResponse findocResponse) [Fact(Timeout = 180000)] public async Task Extract_Crops_From_Image_Correctly() { - var inputSource = new LocalInputSource( - Constants.V2ProductDir + "crop/default_sample.jpg"); + var inputSource = new LocalInputSource(Path.Combine( + Constants.V2ProductDir, "crop/default_sample.jpg")); var cropParams = new CropParameters(_cropModelId); var response = await _client.EnqueueAndGetResultAsync( @@ -85,5 +85,24 @@ public async Task Extract_Crops_From_Image_Correctly() var file2Info = new FileInfo(Path.Combine(_outputDir, "crop_002.jpg")); Assert.InRange(file2Info.Length, 100000, 110000); } + + [Fact(Timeout = 180000)] + public async Task Extract_Crops_From_Each_Pdf_Page_Correctly() + { + + var inputSource = new LocalInputSource( + new FileInfo(Path.Combine(Constants.V2ProductDir, "multipage_sample.pdf"))); + + var cropParams = new CropParameters(_cropModelId); + + var response = await _client.EnqueueAndGetResultAsync( + inputSource, cropParams); + var cropOperation = new Crop(inputSource); + var extractedImages = cropOperation.ExtractCrops(response.Inference.Result.Crops); + + Assert.Equal(2, extractedImages.Count); + Assert.Equal("default_sample.jpg_page0-0.jpg", extractedImages[0].Filename); + Assert.Equal("default_sample.jpg_page1-0.jpg", extractedImages[1].Filename); + } } } From 610b7c37d25cde45f0474fa925d2aa16b504b796 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Fri, 17 Apr 2026 10:21:29 +0200 Subject: [PATCH 07/11] fix syntax --- tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs b/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs index 1fa0b796..1f630b2d 100644 --- a/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs +++ b/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs @@ -91,7 +91,7 @@ public async Task Extract_Crops_From_Each_Pdf_Page_Correctly() { var inputSource = new LocalInputSource( - new FileInfo(Path.Combine(Constants.V2ProductDir, "multipage_sample.pdf"))); + new FileInfo(Path.Combine(Constants.V2ProductDir, "crop/multipage_sample.pdf"))); var cropParams = new CropParameters(_cropModelId); From 410fd6f883882d1640ec632ab4f608b5f952bb46 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Fri, 17 Apr 2026 10:52:48 +0200 Subject: [PATCH 08/11] fix naming for V1 extraction namespace --- src/Mindee/V1/{Extraction => Image}/ImageExtractor.cs | 2 +- src/Mindee/V1/{Extraction => Image}/PdfExtractor.cs | 2 +- .../V1/InvoiceSplitterAutoExtractionTest.cs | 2 +- tests/Mindee.UnitTests/Extraction/ImageExtractorTest.cs | 2 +- tests/Mindee.UnitTests/Extraction/PdfExtractorTest.cs | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) rename src/Mindee/V1/{Extraction => Image}/ImageExtractor.cs (99%) rename src/Mindee/V1/{Extraction => Image}/PdfExtractor.cs (99%) diff --git a/src/Mindee/V1/Extraction/ImageExtractor.cs b/src/Mindee/V1/Image/ImageExtractor.cs similarity index 99% rename from src/Mindee/V1/Extraction/ImageExtractor.cs rename to src/Mindee/V1/Image/ImageExtractor.cs index f98f2de7..91f544fe 100644 --- a/src/Mindee/V1/Extraction/ImageExtractor.cs +++ b/src/Mindee/V1/Image/ImageExtractor.cs @@ -6,7 +6,7 @@ using Mindee.Input; using Mindee.V1.Parsing.Standard; -namespace Mindee.V1.Extraction +namespace Mindee.V1.Image { /// /// Legacy V1 Wrapper for ImageExtractor. diff --git a/src/Mindee/V1/Extraction/PdfExtractor.cs b/src/Mindee/V1/Image/PdfExtractor.cs similarity index 99% rename from src/Mindee/V1/Extraction/PdfExtractor.cs rename to src/Mindee/V1/Image/PdfExtractor.cs index 6206e253..359852ba 100644 --- a/src/Mindee/V1/Extraction/PdfExtractor.cs +++ b/src/Mindee/V1/Image/PdfExtractor.cs @@ -6,7 +6,7 @@ using Mindee.Pdf; using Mindee.V1.Product.InvoiceSplitter; -namespace Mindee.V1.Extraction +namespace Mindee.V1.Image { /// /// V1 wrapper for the PDF extraction class. diff --git a/tests/Mindee.IntegrationTests/V1/InvoiceSplitterAutoExtractionTest.cs b/tests/Mindee.IntegrationTests/V1/InvoiceSplitterAutoExtractionTest.cs index 98234901..616d3c16 100644 --- a/tests/Mindee.IntegrationTests/V1/InvoiceSplitterAutoExtractionTest.cs +++ b/tests/Mindee.IntegrationTests/V1/InvoiceSplitterAutoExtractionTest.cs @@ -1,6 +1,6 @@ using Mindee.Input; using Mindee.Pdf; -using Mindee.V1.Extraction; +using Mindee.V1.Image; using Mindee.V1.Parsing.Common; using Mindee.V1.Product.Invoice; using Mindee.V1.Product.InvoiceSplitter; diff --git a/tests/Mindee.UnitTests/Extraction/ImageExtractorTest.cs b/tests/Mindee.UnitTests/Extraction/ImageExtractorTest.cs index e8564025..e3503329 100644 --- a/tests/Mindee.UnitTests/Extraction/ImageExtractorTest.cs +++ b/tests/Mindee.UnitTests/Extraction/ImageExtractorTest.cs @@ -1,5 +1,5 @@ using Mindee.Input; -using Mindee.V1.Extraction; +using Mindee.V1.Image; using Mindee.V1.Parsing.Common; using Mindee.V1.Product.BarcodeReader; using Mindee.V1.Product.MultiReceiptsDetector; diff --git a/tests/Mindee.UnitTests/Extraction/PdfExtractorTest.cs b/tests/Mindee.UnitTests/Extraction/PdfExtractorTest.cs index b88fd3d5..a6207273 100644 --- a/tests/Mindee.UnitTests/Extraction/PdfExtractorTest.cs +++ b/tests/Mindee.UnitTests/Extraction/PdfExtractorTest.cs @@ -1,5 +1,5 @@ using Mindee.Input; -using Mindee.V1.Extraction; +using Mindee.V1.Image; using Mindee.V1.Parsing.Common; using Mindee.V1.Product.InvoiceSplitter; From 937da1e296ddd4cfcb056d33b8b5b0a3f31bbb76 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Fri, 17 Apr 2026 11:42:08 +0200 Subject: [PATCH 09/11] fix test --- tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs b/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs index 1f630b2d..4b633a83 100644 --- a/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs +++ b/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs @@ -100,9 +100,9 @@ public async Task Extract_Crops_From_Each_Pdf_Page_Correctly() var cropOperation = new Crop(inputSource); var extractedImages = cropOperation.ExtractCrops(response.Inference.Result.Crops); - Assert.Equal(2, extractedImages.Count); - Assert.Equal("default_sample.jpg_page0-0.jpg", extractedImages[0].Filename); - Assert.Equal("default_sample.jpg_page1-0.jpg", extractedImages[1].Filename); + Assert.Equal(5, extractedImages.Count); + Assert.Equal("multipage_sample.pdf_page0-0.jpg", extractedImages[0].Filename); + Assert.Equal("multipage_sample.pdf_page1-0.jpg", extractedImages[3].Filename); } } } From f4bd446092dcb14eb3fa18bcba052812a9e03b22 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Fri, 17 Apr 2026 16:06:34 +0200 Subject: [PATCH 10/11] fix test --- tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs b/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs index 4b633a83..e851e459 100644 --- a/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs +++ b/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs @@ -80,10 +80,10 @@ public async Task Extract_Crops_From_Image_Correctly() extractedImages.SaveAllToDisk(_outputDir, 50); var file1Info = new FileInfo(Path.Combine(_outputDir, "crop_001.jpg")); - Assert.InRange(file1Info.Length, 100000, 110000); + Assert.InRange(file1Info.Length, 99000, 110000); var file2Info = new FileInfo(Path.Combine(_outputDir, "crop_002.jpg")); - Assert.InRange(file2Info.Length, 100000, 110000); + Assert.InRange(file2Info.Length, 99000, 110000); } [Fact(Timeout = 180000)] From 1221ba1453427ff548e11a5eea163eafd731b27c Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Fri, 17 Apr 2026 16:19:55 +0200 Subject: [PATCH 11/11] remove unnecessary safety for splits --- src/Mindee/V2/FileOperations/Split.cs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/Mindee/V2/FileOperations/Split.cs b/src/Mindee/V2/FileOperations/Split.cs index 101b4004..25ca0cd4 100644 --- a/src/Mindee/V2/FileOperations/Split.cs +++ b/src/Mindee/V2/FileOperations/Split.cs @@ -74,10 +74,6 @@ public ExtractedPdf ExtractSingleSplit(SplitRange splitRange) public SplitFiles ExtractSplits(List> splits) { var pdfExtractor = new PdfExtractor(this._localInput); - if (splits.Count == 0) - { - throw new MindeeInputException("No splits provided for extraction."); - } List> expandedPageIndexes = []; expandedPageIndexes.AddRange(splits.Select(split => ExpandRange(split[0], split[1])));