From ee0dc1ba0ba038eb33d7d3ca3d092e864f1ed337 Mon Sep 17 00:00:00 2001
From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com>
Date: Tue, 14 Apr 2026 15:15:38 +0200
Subject: [PATCH 01/11] :sparkles: add support for crop & split file operations
for v2
---
src/Mindee/Extraction/ImageExtractor.cs | 172 +++++++++++++++++++++
src/Mindee/Extraction/PdfExtractor.cs | 114 ++++++++++++++
src/Mindee/Image/ExtractedImage.cs | 36 +++--
src/Mindee/V1/Extraction/ImageExtractor.cs | 152 ++----------------
src/Mindee/V1/Extraction/PdfExtractor.cs | 89 +----------
src/Mindee/V2/FileOperations/Crop.cs | 55 +++++++
src/Mindee/V2/FileOperations/CropFiles.cs | 34 ++++
src/Mindee/V2/FileOperations/Split.cs | 64 ++++++++
src/Mindee/V2/FileOperations/SplitFiles.cs | 33 ++++
9 files changed, 514 insertions(+), 235 deletions(-)
create mode 100644 src/Mindee/Extraction/ImageExtractor.cs
create mode 100644 src/Mindee/Extraction/PdfExtractor.cs
create mode 100644 src/Mindee/V2/FileOperations/Crop.cs
create mode 100644 src/Mindee/V2/FileOperations/CropFiles.cs
create mode 100644 src/Mindee/V2/FileOperations/Split.cs
create mode 100644 src/Mindee/V2/FileOperations/SplitFiles.cs
diff --git a/src/Mindee/Extraction/ImageExtractor.cs b/src/Mindee/Extraction/ImageExtractor.cs
new file mode 100644
index 00000000..d9f52e73
--- /dev/null
+++ b/src/Mindee/Extraction/ImageExtractor.cs
@@ -0,0 +1,172 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using Docnet.Core;
+using Docnet.Core.Models;
+using Mindee.Geometry;
+using Mindee.Image;
+using Mindee.Input;
+using SkiaSharp;
+
+namespace Mindee.Extraction
+{
+ ///
+ /// Extract sub-images from an image.
+ ///
+ public class ImageExtractor
+ {
+ ///
+ /// Name of the file.
+ ///
+ protected readonly string _filename;
+ ///
+ /// List of SKBitmap representing the pages of the file.
+ ///
+ private readonly List _pageImages;
+ ///
+ /// Format to save the resulting images as.
+ ///
+ protected readonly string SaveFormat;
+
+ ///
+ /// LocalInputSource object used by the ImageExtractor.
+ ///
+ public readonly LocalInputSource LocalInput;
+
+        /// <summary>
+        /// Init from a Local Input Source.
+        /// </summary>
+        /// <param name="localInput">Locally loaded resource.</param>
+        /// <param name="saveFormat">Format to save the resulting images as. When null, the input's extension is used, or "jpg" for PDFs and extension-less names.</param>
+        public ImageExtractor(LocalInputSource localInput, string saveFormat = null)
+        {
+            _filename = localInput.Filename;
+            _pageImages = [];
+            LocalInput = localInput;
+            if (saveFormat == null)
+            {
+                // GetExtension returns "" for extension-less names; Substring(1) would throw on it, TrimStart does not.
+                var extension = Path.GetExtension(localInput.Filename)?.TrimStart('.');
+                if (!string.IsNullOrEmpty(extension) && !extension.Equals("pdf", StringComparison.OrdinalIgnoreCase))
+                {
+                    SaveFormat = extension;
+                }
+                else
+                {
+                    SaveFormat = "jpg";
+                }
+            }
+            else
+            {
+                SaveFormat = saveFormat;
+            }
+
+            if (localInput.IsPdf())
+            {
+                _pageImages.AddRange(PdfToImages(localInput.FileBytes));
+            }
+            else
+            {
+                _pageImages.Add(SKBitmap.Decode(localInput.FileBytes));
+            }
+        }
+
+ ///
+ /// Init from a path.
+ ///
+ /// Path to the file.
+ public ImageExtractor(string filePath) : this(new LocalInputSource(filePath))
+ {
+ }
+
+ ///
+ /// Renders the input Pdf's pages as individual images.
+ ///
+ /// Input pdf.
+ /// A list of pages, as SKBitmap.
+ private static List PdfToImages(byte[] fileBytes)
+ {
+ var images = new List();
+ lock (DocLib.Instance)
+ {
+ using var docReader = DocLib.Instance.GetDocReader(fileBytes, new PageDimensions(1));
+ for (var i = 0; i < docReader.GetPageCount(); i++)
+ {
+ using var pageReader = docReader.GetPageReader(i);
+ var width = pageReader.GetPageWidth();
+ var height = pageReader.GetPageHeight();
+ var bytes = pageReader.GetImage();
+ var bmp = ImageUtils.ArrayToImage(ImageUtils.ConvertTo3DArray(bytes, width, height));
+ images.Add(bmp);
+ }
+
+ return images;
+ }
+ }
+
+ ///
+ /// Splits the filename into name and extension.
+ ///
+ protected static string[] SplitNameStrict(string filename)
+ {
+ return
+ [
+ Path.GetFileNameWithoutExtension(filename),
+ Path.GetExtension(filename).TrimStart('.')
+ ];
+ }
+
+ ///
+ /// Gets the number of pages in the file.
+ ///
+ /// The number of pages in the file.
+ public int GetPageCount()
+ {
+ return _pageImages.Count;
+ }
+
+ ///
+ /// Extracts a single image from a field having position data.
+ ///
+ /// Bounding box of the field.
+ /// Index of the page containing the field.
+ /// Extracted image as an SKBitmap.
+ protected SKBitmap ExtractImage(Bbox bbox, int pageIndex)
+ {
+ var image = _pageImages[pageIndex];
+ var width = image.Width;
+ var height = image.Height;
+ var minX = (int)Math.Round(bbox.MinX * width);
+ var maxX = (int)Math.Round(bbox.MaxX * width);
+ var minY = (int)Math.Round(bbox.MinY * height);
+ var maxY = (int)Math.Round(bbox.MaxY * height);
+
+ var croppedBitmap = new SKBitmap(maxX - minX, maxY - minY);
+ using var canvas = new SKCanvas(croppedBitmap);
+ var destRect = new SKRect(0, 0, croppedBitmap.Width, croppedBitmap.Height);
+ var sourceRect = new SKRect(minX, minY, maxX, maxY);
+ canvas.DrawBitmap(image, sourceRect, destRect);
+
+ return croppedBitmap;
+ }
+
+ ///
+ /// Extracts multiple images from a field having position data.
+ ///
+ /// The page index to extract, begins at 0.
+ /// The list of polygons representing the position data.
+ /// A list of extracted images.
+ public List ExtractMultipleImagesFromSource(int pageId, List polygons)
+ {
+ var filename = this.LocalInput.Filename;
+ var extractedImages = new List();
+ foreach (var polygon in polygons)
+ {
+ var bbox = Utils.BboxFromPolygon(polygon);
+ var fieldFilename = $"{filename}_{pageId:D3}_{polygons.IndexOf(polygon):D3}.{SaveFormat}";
+ extractedImages.Add(new ExtractedImage(ExtractImage(bbox, pageId), fieldFilename, SaveFormat));
+ }
+ return extractedImages;
+ }
+ }
+}
diff --git a/src/Mindee/Extraction/PdfExtractor.cs b/src/Mindee/Extraction/PdfExtractor.cs
new file mode 100644
index 00000000..a1408ac0
--- /dev/null
+++ b/src/Mindee/Extraction/PdfExtractor.cs
@@ -0,0 +1,114 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using Docnet.Core;
+using Microsoft.Extensions.Logging.Abstractions;
+using Mindee.Input;
+using Mindee.Pdf;
+using SkiaSharp;
+
+namespace Mindee.Extraction
+{
+ ///
+ /// PDF extraction class.
+ ///
+ public class PdfExtractor
+ {
+ ///
+ /// Local input source.
+ ///
+ protected readonly LocalInputSource LocalInput;
+
+ ///
+ /// Source PDF bytes.
+ ///
+ protected byte[] SourcePdf;
+
+ ///
+ /// Initializes a new instance of the class.
+ ///
+ /// Instance of a LocalInputSource, provided by the user.
+ public PdfExtractor(LocalInputSource localInput)
+ {
+ LocalInput = localInput;
+ }
+
+ ///
+ /// Wrapper for PDF GetPageCount();
+ ///
+ /// The number of pages in the file.
+ public int GetPageCount()
+ {
+ return LocalInput.GetPageCount();
+ }
+
+ ///
+ /// Extract the PDF bytes.
+ ///
+ ///
+ protected byte[] PdfBytes()
+ {
+ if (SourcePdf != null)
+ {
+ return this.SourcePdf;
+ }
+ if (LocalInput.IsPdf())
+ {
+ SourcePdf = LocalInput.FileBytes;
+ }
+ else
+ {
+ var memoryStream = new MemoryStream();
+ using var image = SKImage.FromEncodedData(LocalInput.FileBytes);
+ using var bmp = SKBitmap.FromImage(image);
+ var pageSize = new SKSize(bmp.Width, bmp.Height);
+ using (var document = SKDocument.CreatePdf(memoryStream))
+ {
+ var canvas = document.BeginPage(pageSize.Width, pageSize.Height);
+ canvas.DrawBitmap(bmp, SKPoint.Empty);
+ document.EndPage();
+ }
+
+ SourcePdf = memoryStream.ToArray();
+ }
+
+ return SourcePdf;
+ }
+
+ ///
+ /// Extracts sub-documents from the source document using list of page indexes.
+ ///
+ /// List of sub-lists of pages to keep.
+ /// Extracted documents.
+ ///
+ public List ExtractSubDocuments(List> pageIndexes)
+ {
+ var extractedPdfs = new List();
+
+ foreach (var pageIndexElem in pageIndexes)
+ {
+ if (pageIndexElem.Count == 0)
+ {
+ throw new ArgumentException("Empty indexes not allowed for extraction.");
+ }
+
+ var extension = Path.GetExtension(LocalInput.Filename);
+ var prefix = Path.GetFileNameWithoutExtension(LocalInput.Filename);
+ var fieldFilename =
+ $"{prefix}_{pageIndexElem[0] + 1:D3}-{pageIndexElem[pageIndexElem.Count - 1] + 1:D3}{extension}";
+
+ var splitQuery = new SplitQuery(
+ PdfBytes(),
+ new PageOptions(pageIndexElem.ConvertAll(item => (short)item).ToArray()));
+ lock (DocLib.Instance)
+ {
+ var pdfOperation = new DocNetApi(new NullLogger());
+ var mergedPdfBytes = pdfOperation.Split(splitQuery).File;
+ extractedPdfs.Add(new ExtractedPdf(mergedPdfBytes, fieldFilename));
+ }
+ }
+
+ return extractedPdfs;
+ }
+ }
+}
diff --git a/src/Mindee/Image/ExtractedImage.cs b/src/Mindee/Image/ExtractedImage.cs
index d71c4879..bee8cd7c 100644
--- a/src/Mindee/Image/ExtractedImage.cs
+++ b/src/Mindee/Image/ExtractedImage.cs
@@ -42,17 +42,24 @@ public ExtractedImage(SKBitmap image, string filename, string saveFormat)
/// Uses the default image format and filename.
///
/// The output directory (must exist).
- public void WriteToFile(string outputPath)
+ ///
+ public void WriteToFile(string outputPath, string fileFormat = null)
{
- var imagePath = Path.Combine(outputPath, Filename);
- var format = GetEncodedImageFormat(_saveFormat);
+ var targetFormat = fileFormat ?? _saveFormat;
+ var format = GetEncodedImageFormat(targetFormat);
- using (var image = SKImage.FromBitmap(Image))
- using (var data = image.Encode(format, 100))
- using (var stream = File.OpenWrite(imagePath))
+ var finalFilename = Filename;
+ if (!string.IsNullOrWhiteSpace(fileFormat))
{
- data.SaveTo(stream);
+ var nameWithoutExtension = Path.GetFileNameWithoutExtension(Filename);
+ finalFilename = $"{nameWithoutExtension}.{targetFormat.ToLower()}";
}
+ var imagePath = Path.Combine(outputPath, finalFilename);
+
+ using var image = SKImage.FromBitmap(Image);
+ using var data = image.Encode(format, 100);
+ using var stream = File.OpenWrite(imagePath);
+ data.SaveTo(stream);
}
///
@@ -61,19 +68,18 @@ public void WriteToFile(string outputPath)
/// An instance of .
public LocalInputSource AsInputSource()
{
- using (var image = SKImage.FromBitmap(Image))
- using (var data = image.Encode(GetEncodedImageFormat(_saveFormat), 100))
- using (var output = new MemoryStream())
- {
- data.SaveTo(output);
- return new LocalInputSource(output.ToArray(), Filename);
- }
+ using var image = SKImage.FromBitmap(Image);
+ using var data = image.Encode(GetEncodedImageFormat(_saveFormat), 100);
+ using var output = new MemoryStream();
+ data.SaveTo(output);
+ return new LocalInputSource(output.ToArray(), Filename);
}
- private SKEncodedImageFormat GetEncodedImageFormat(string saveFormat)
+ private static SKEncodedImageFormat GetEncodedImageFormat(string saveFormat)
{
return saveFormat.ToLower() switch
{
+ "jpg" or "jpeg" => SKEncodedImageFormat.Jpeg,
"png" => SKEncodedImageFormat.Png,
"bmp" => SKEncodedImageFormat.Bmp,
"gif" => SKEncodedImageFormat.Gif,
diff --git a/src/Mindee/V1/Extraction/ImageExtractor.cs b/src/Mindee/V1/Extraction/ImageExtractor.cs
index b6f6576b..aec7f4b1 100644
--- a/src/Mindee/V1/Extraction/ImageExtractor.cs
+++ b/src/Mindee/V1/Extraction/ImageExtractor.cs
@@ -1,122 +1,35 @@
-using System;
using System.Collections.Generic;
-using System.IO;
-using Docnet.Core;
-using Docnet.Core.Models;
+using System.Linq;
using Mindee.Exceptions;
using Mindee.Geometry;
using Mindee.Image;
using Mindee.Input;
using Mindee.V1.Parsing.Standard;
-using SkiaSharp;
namespace Mindee.V1.Extraction
{
///
- /// Extract sub-images from an image.
+ /// Legacy V1 Wrapper for ImageExtractor.
///
- public class ImageExtractor
+ public sealed class ImageExtractor : Mindee.Extraction.ImageExtractor
{
- private readonly string _filename;
- private readonly List _pageImages;
- private readonly string _saveFormat;
-
- ///
- /// LocalInputSource object used by the ImageExtractor.
- ///
- public readonly LocalInputSource LocalInput;
-
///
/// Init from a Local Input Source.
///
/// Locally loaded resource.
/// Format to save the resulting images as.
public ImageExtractor(LocalInputSource localInput, string saveFormat = null)
- {
- _filename = localInput.Filename;
- _pageImages = new List();
- LocalInput = localInput;
- if (saveFormat == null)
- {
- var extension = Path.GetExtension(localInput.Filename)?.Substring(1);
- if (extension != null && !extension.Equals("pdf", StringComparison.CurrentCultureIgnoreCase))
- {
- _saveFormat = extension;
- }
- else
- {
- _saveFormat = "jpg";
- }
- }
- else
- {
- _saveFormat = saveFormat;
- }
-
- if (localInput.IsPdf())
- {
- var pdfPageImages = PdfToImages(localInput.FileBytes);
- _pageImages.AddRange(pdfPageImages);
- }
- else
- {
- _pageImages.Add(SKBitmap.Decode(localInput.FileBytes));
- }
- }
+ : base(localInput, saveFormat)
+ { }
///
/// Init from a path.
///
/// Path to the file.
- public ImageExtractor(string filePath) : this(new LocalInputSource(filePath))
- {
- }
-
- ///
- /// Renders the input Pdf's pages as individual images.
- ///
- /// Input pdf.
- /// A list of pages, as SKBitmap.
- private static List PdfToImages(byte[] fileBytes)
- {
- var images = new List();
- lock (DocLib.Instance)
- {
- using var docReader = DocLib.Instance.GetDocReader(fileBytes, new PageDimensions(1));
- for (var i = 0; i < docReader.GetPageCount(); i++)
- {
- using var pageReader = docReader.GetPageReader(i);
- var width = pageReader.GetPageWidth();
- var height = pageReader.GetPageHeight();
- var bytes = pageReader.GetImage();
- var bmp = ImageUtils.ArrayToImage(ImageUtils.ConvertTo3DArray(bytes, width, height));
- images.Add(bmp);
- }
+ public ImageExtractor(string filePath)
+ : base(filePath)
+ { }
- return images;
- }
- }
-
- ///
- /// Splits the filename into name and extension.
- ///
- private static string[] SplitNameStrict(string filename)
- {
- return
- [
- Path.GetFileNameWithoutExtension(filename),
- Path.GetExtension(filename).TrimStart('.')
- ];
- }
-
- ///
- /// Gets the number of pages in the file.
- ///
- /// The number of pages in the file.
- public int GetPageCount()
- {
- return _pageImages.Count;
- }
///
/// Extract multiple images on a given page from a list of fields having position data.
@@ -143,7 +56,7 @@ public IList ExtractImagesFromPage(IList
if (GetPageCount() > 1)
{
var splitName = SplitNameStrict(outputName);
- filename = $"{splitName[0]}.{_saveFormat}";
+ filename = $"{splitName[0]}.{SaveFormat}";
}
else
{
@@ -167,7 +80,7 @@ public IList ExtractImagesFromPage(IList fields,
if (GetPageCount() > 1)
{
var splitName = SplitNameStrict(outputName);
- filename = $"{splitName[0]}.{_saveFormat}";
+ filename = $"{splitName[0]}.{SaveFormat}";
}
else
{
@@ -188,25 +101,15 @@ private List ExtractFromPage(IList field
string outputName) where TBaseField : BaseField
{
var splitName = SplitNameStrict(outputName);
- var filename = $"{splitName[0]}_page-{pageIndex + 1:D3}.{_saveFormat}";
-
- var extractedImages = new List();
- for (var i = 0; i < fields.Count; i++)
- {
- var extractedImage = ExtractImage(fields[i], pageIndex, i + 1, filename);
- if (extractedImage != null)
- {
- extractedImages.Add(extractedImage);
- }
- }
+ var filename = $"{splitName[0]}_page-{pageIndex + 1:D3}.{SaveFormat}";
- return extractedImages;
+ return fields.Select((t, i) => ExtractImage(t, pageIndex, i + 1, filename)).Where(extractedImage => extractedImage != null).ToList();
}
private List ExtractFromPage(IList fields, int pageIndex, string outputName)
{
var splitName = SplitNameStrict(outputName);
- var filename = $"{splitName[0]}_page-{pageIndex + 1:D3}.{_saveFormat}";
+ var filename = $"{splitName[0]}_page-{pageIndex + 1:D3}.{SaveFormat}";
var extractedImages = new List();
for (var i = 0; i < fields.Count; i++)
@@ -256,8 +159,8 @@ public ExtractedImage ExtractImage(PositionField field, int pageIndex, int index
}
var bbox = Utils.BboxFromPolygon(boundingBox);
- var fieldFilename = $"{splitName[0]}_{index:D3}.{_saveFormat}";
- return new ExtractedImage(ExtractImage(bbox, pageIndex), fieldFilename, _saveFormat);
+ var fieldFilename = $"{splitName[0]}_{index:D3}.{SaveFormat}";
+ return new ExtractedImage(ExtractImage(bbox, pageIndex), fieldFilename, SaveFormat);
}
///
@@ -283,29 +186,8 @@ public ExtractedImage ExtractImage(BaseField field, int pageIndex, int index, st
}
var bbox = Utils.BboxFromPolygon(boundingBox);
- var fieldFilename = $"{splitName[0]}_{index:D3}.{_saveFormat}";
- return new ExtractedImage(ExtractImage(bbox, pageIndex), fieldFilename, _saveFormat);
- }
-
- private SKBitmap ExtractImage(Bbox bbox, int pageIndex)
- {
- var image = _pageImages[pageIndex];
- var width = image.Width;
- var height = image.Height;
- var minX = (int)Math.Round(bbox.MinX * width);
- var maxX = (int)Math.Round(bbox.MaxX * width);
- var minY = (int)Math.Round(bbox.MinY * height);
- var maxY = (int)Math.Round(bbox.MaxY * height);
-
- var croppedBitmap = new SKBitmap(maxX - minX, maxY - minY);
- using (var canvas = new SKCanvas(croppedBitmap))
- {
- var destRect = new SKRect(0, 0, croppedBitmap.Width, croppedBitmap.Height);
- var sourceRect = new SKRect(minX, minY, maxX, maxY);
- canvas.DrawBitmap(image, sourceRect, destRect);
- }
-
- return croppedBitmap;
+ var fieldFilename = $"{splitName[0]}_{index:D3}.{SaveFormat}";
+ return new ExtractedImage(ExtractImage(bbox, pageIndex), fieldFilename, SaveFormat);
}
}
}
diff --git a/src/Mindee/V1/Extraction/PdfExtractor.cs b/src/Mindee/V1/Extraction/PdfExtractor.cs
index 4c1bb1be..6206e253 100644
--- a/src/Mindee/V1/Extraction/PdfExtractor.cs
+++ b/src/Mindee/V1/Extraction/PdfExtractor.cs
@@ -1,102 +1,21 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
-using System.IO;
using System.Linq;
-using Docnet.Core;
-using Docnet.Core.Models;
-using Microsoft.Extensions.Logging.Abstractions;
using Mindee.Input;
using Mindee.Pdf;
using Mindee.V1.Product.InvoiceSplitter;
-using SkiaSharp;
namespace Mindee.V1.Extraction
{
///
- /// PDF extraction class.
+ /// V1 wrapper for the PDF extraction class.
///
- public class PdfExtractor
+ public class PdfExtractor : Mindee.Extraction.PdfExtractor
{
- private readonly string Filename;
- private readonly byte[] SourcePdf;
-
- ///
- /// Initializes a new instance of the class.
- ///
- /// Instance of a LocalInputSource, provided by the user.
- public PdfExtractor(LocalInputSource localInput)
- {
- Filename = localInput.Filename;
-
- if (localInput.IsPdf())
- {
- SourcePdf = localInput.FileBytes;
- }
- else
- {
- var memoryStream = new MemoryStream();
- using var image = SKImage.FromEncodedData(localInput.FileBytes);
- using var bmp = SKBitmap.FromImage(image);
- var pageSize = new SKSize(bmp.Width, bmp.Height);
- using (var document = SKDocument.CreatePdf(memoryStream))
- {
- var canvas = document.BeginPage(pageSize.Width, pageSize.Height);
- canvas.DrawBitmap(bmp, SKPoint.Empty);
- document.EndPage();
- }
-
- SourcePdf = memoryStream.ToArray();
- }
- }
-
- ///
- /// Wrapper for pdf GetPageCount();
- ///
- /// The number of pages in the file.
- public int GetPageCount()
- {
- lock (DocLib.Instance)
- {
- using var docInstance = DocLib.Instance.GetDocReader(SourcePdf, new PageDimensions(1, 1));
- return docInstance.GetPageCount();
- }
- }
-
- ///
- /// Extracts sub-documents from the source document using list of page indexes.
- ///
- /// List of sub-lists of pages to keep.
- /// Extracted documents.
- ///
- public List ExtractSubDocuments(List> pageIndexes)
+ ///
+ public PdfExtractor(LocalInputSource localInput) : base(localInput)
{
- var extractedPdfs = new List();
-
- foreach (var pageIndexElem in pageIndexes)
- {
- if (!pageIndexElem.Any())
- {
- throw new ArgumentException("Empty indexes not allowed for extraction.");
- }
-
- var extension = Path.GetExtension(Filename);
- var prefix = Path.GetFileNameWithoutExtension(Filename);
- var fieldFilename =
- $"{prefix}_{pageIndexElem[0] + 1:D3}-{pageIndexElem[pageIndexElem.Count - 1] + 1:D3}{extension}";
-
- var splitQuery = new SplitQuery(
- SourcePdf,
- new PageOptions(pageIndexElem.ConvertAll(item => (short)item).ToArray()));
- lock (DocLib.Instance)
- {
- var pdfOperation = new DocNetApi(new NullLogger());
- var mergedPdfBytes = pdfOperation.Split(splitQuery).File;
- extractedPdfs.Add(new ExtractedPdf(mergedPdfBytes, fieldFilename));
- }
- }
-
- return extractedPdfs;
}
///
diff --git a/src/Mindee/V2/FileOperations/Crop.cs b/src/Mindee/V2/FileOperations/Crop.cs
new file mode 100644
index 00000000..0160dfc6
--- /dev/null
+++ b/src/Mindee/V2/FileOperations/Crop.cs
@@ -0,0 +1,55 @@
+using System.Collections.Generic;
+using System.Linq;
+using Mindee.Extraction;
+using Mindee.Geometry;
+using Mindee.Image;
+using Mindee.Input;
+using Mindee.V2.Product.Crop;
+
+namespace Mindee.V2.FileOperations
+{
+ ///
+ /// V2 Crop operation utility.
+ ///
+ public sealed class Crop
+ {
+ ///
+ /// LocalInputSource object used by the ImageExtractor.
+ ///
+ private readonly LocalInputSource _localInput;
+
+ ///
+ ///
+ ///
+ ///
+ public Crop(LocalInputSource inputSource)
+ {
+ this._localInput = inputSource;
+ }
+
+ ///
+ ///
+ ///
+ ///
+ ///
+ public ExtractedImage ExtractSingleCrop(CropItem crop)
+ {
+ var polygons = new List() { crop.Location.Polygon };
+ var imageExtractor = new ImageExtractor(this._localInput);
+ return imageExtractor.ExtractMultipleImagesFromSource(crop.Location.Page, polygons)[0];
+ }
+
+        /// <summary>Extracts multiple crop zones from an image.</summary>
+        /// <param name="crops">List of crops. The page index of the first crop is used for every crop.</param>
+        /// <returns>A CropFiles collection holding one extracted image per crop.</returns>
+        public CropFiles ExtractCrops(List<CropItem> crops)
+        {
+            var imageExtractor = new ImageExtractor(this._localInput);
+            var extractedImages = imageExtractor.ExtractMultipleImagesFromSource(crops[0].Location.Page, crops.Select(c => c.Location.Polygon).ToList());
+            // The extractor builds a plain List, which is not a CropFiles: a downcast would throw InvalidCastException.
+            var cropFiles = new CropFiles();
+            cropFiles.AddRange(extractedImages);
+            return cropFiles;
+        }
+ }
+}
diff --git a/src/Mindee/V2/FileOperations/CropFiles.cs b/src/Mindee/V2/FileOperations/CropFiles.cs
new file mode 100644
index 00000000..f344bf57
--- /dev/null
+++ b/src/Mindee/V2/FileOperations/CropFiles.cs
@@ -0,0 +1,34 @@
+using System.Collections.Generic;
+using System.IO;
+using Mindee.Image;
+
+namespace Mindee.V2.FileOperations
+{
+ ///
+ /// Collection of cropped files.
+ ///
+ public class CropFiles : List
+ {
+ ///
+ /// Saves all cropped files to disk.
+ ///
+ /// Path for all files
+ /// Prefix for file names
+ /// File format for saving (default: null)
+ public void SaveAllToDisk(string path, string prefix = "crop", string fileFormat = null)
+ {
+ Directory.CreateDirectory(path);
+
+ int index = 1;
+ foreach (var crop in this)
+ {
+ string fileName = $"{prefix}_{index:D3}.jpg";
+ string filePath = Path.Combine(path, fileName);
+
+ crop.WriteToFile(filePath, fileFormat);
+
+ index++;
+ }
+ }
+ }
+}
diff --git a/src/Mindee/V2/FileOperations/Split.cs b/src/Mindee/V2/FileOperations/Split.cs
new file mode 100644
index 00000000..88e6d5b2
--- /dev/null
+++ b/src/Mindee/V2/FileOperations/Split.cs
@@ -0,0 +1,64 @@
+using System.Collections.Generic;
+using System.Linq;
+using Mindee.Exceptions;
+using Mindee.Extraction;
+using Mindee.Input;
+
+namespace Mindee.V2.FileOperations
+{
+ ///
+ /// V2 Split operation utility.
+ ///
+ public sealed class Split
+ {
+
+ ///
+ /// LocalInputSource object used by the PdfExtractor.
+ ///
+ private readonly LocalInputSource _localInput;
+
+ ///
+ /// Expands a range of pages into a list of page indexes.
+ ///
+ /// Start of the range.
+ /// End of the range.
+ /// A list of page indexes.
+ public static List ExpandRange(int start, int end)
+ {
+ if (start > end)
+ {
+ throw new MindeeInputException("Invalid page range provided.");
+ }
+
+ int count = end - start + 1;
+ return Enumerable.Range(start, count).ToList();
+ }
+
+ ///
+ ///
+ ///
+ ///
+ public Split(LocalInputSource inputSource)
+ {
+ this._localInput = inputSource;
+ }
+
+        /// <summary>
+        /// Extracts the splits from the input file.
+        /// </summary>
+        /// <param name="splits">List of [start, end] page-index pairs; each pair is expanded inclusively.</param>
+        /// <returns>A SplitFiles collection holding one extracted PDF per split.</returns>
+        public SplitFiles ExtractSplits(List<List<int>> splits)
+        {
+            if (splits.Count == 0)
+            {
+                throw new MindeeInputException("No splits provided for extraction.");
+            }
+
+            var expandedPageIndexes = splits.Select(split => ExpandRange(split[0], split[1])).ToList();
+            // ExtractSubDocuments builds a plain List, which is not a SplitFiles: a downcast would throw InvalidCastException.
+            var splitFiles = new SplitFiles();
+            splitFiles.AddRange(new PdfExtractor(this._localInput).ExtractSubDocuments(expandedPageIndexes));
+            return splitFiles;
+        }
+ }
+}
diff --git a/src/Mindee/V2/FileOperations/SplitFiles.cs b/src/Mindee/V2/FileOperations/SplitFiles.cs
new file mode 100644
index 00000000..5b26af21
--- /dev/null
+++ b/src/Mindee/V2/FileOperations/SplitFiles.cs
@@ -0,0 +1,33 @@
+using System.Collections.Generic;
+using System.IO;
+using Mindee.Pdf;
+
+namespace Mindee.V2.FileOperations
+{
+ ///
+ /// Collection of split PDFs.
+ ///
+ public sealed class SplitFiles : List
+ {
+ ///
+ /// Saves all the extracted pages to disk.
+ ///
+ /// Path for all files
+ /// Prefix for file names
+ public void SaveAllToDisk(string path, string prefix = "split")
+ {
+ Directory.CreateDirectory(path);
+
+ int index = 1;
+ foreach (var crop in this)
+ {
+ string fileName = $"{prefix}_{index:D3}.pdf";
+ string filePath = Path.Combine(path, fileName);
+
+ crop.WriteToFile(filePath);
+
+ index++;
+ }
+ }
+ }
+}
From 9590686d1b8c22b346da30e8ccbd9c49ee167e73 Mon Sep 17 00:00:00 2001
From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com>
Date: Tue, 14 Apr 2026 17:19:57 +0200
Subject: [PATCH 02/11] add tests
---
Directory.Build.props | 2 +-
src/Mindee/Extraction/ImageExtractor.cs | 6 +-
src/Mindee/Image/ExtractedImage.cs | 64 +++++++++---
src/Mindee/Pdf/ExtractedPdf.cs | 62 ++++++++----
src/Mindee/Pdf/PdfUtils.cs | 30 ++++++
src/Mindee/V1/Extraction/ImageExtractor.cs | 4 +-
src/Mindee/V2/FileOperations/Crop.cs | 3 +-
src/Mindee/V2/FileOperations/CropFiles.cs | 20 +++-
src/Mindee/V2/FileOperations/Split.cs | 18 +++-
src/Mindee/V2/FileOperations/SplitFiles.cs | 15 +++
.../V2/FileOperations/CropTest.cs | 89 +++++++++++++++++
.../V2/FileOperations/SplitTest.cs | 97 +++++++++++++++++++
.../V2/FileOperations/CropTest.cs | 67 +++++++++++++
.../V2/FileOperations/SplitTest.cs | 55 +++++++++++
14 files changed, 487 insertions(+), 45 deletions(-)
create mode 100644 tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs
create mode 100644 tests/Mindee.IntegrationTests/V2/FileOperations/SplitTest.cs
create mode 100644 tests/Mindee.UnitTests/V2/FileOperations/CropTest.cs
create mode 100644 tests/Mindee.UnitTests/V2/FileOperations/SplitTest.cs
diff --git a/Directory.Build.props b/Directory.Build.props
index c1dff72d..9d41defe 100644
--- a/Directory.Build.props
+++ b/Directory.Build.props
@@ -46,7 +46,7 @@
-
+
diff --git a/src/Mindee/Extraction/ImageExtractor.cs b/src/Mindee/Extraction/ImageExtractor.cs
index d9f52e73..fb7afb27 100644
--- a/src/Mindee/Extraction/ImageExtractor.cs
+++ b/src/Mindee/Extraction/ImageExtractor.cs
@@ -160,11 +160,13 @@ public List ExtractMultipleImagesFromSource(int pageId, List();
+ int i = 0;
foreach (var polygon in polygons)
{
var bbox = Utils.BboxFromPolygon(polygon);
- var fieldFilename = $"{filename}_{pageId:D3}_{polygons.IndexOf(polygon):D3}.{SaveFormat}";
- extractedImages.Add(new ExtractedImage(ExtractImage(bbox, pageId), fieldFilename, SaveFormat));
+ var fieldFilename = $"{filename}_page{pageId}-{polygons.IndexOf(polygon)}.{SaveFormat}";
+ extractedImages.Add(new ExtractedImage(ExtractImage(bbox, pageId), fieldFilename, SaveFormat, pageId, i));
+ i++;
}
return extractedImages;
}
diff --git a/src/Mindee/Image/ExtractedImage.cs b/src/Mindee/Image/ExtractedImage.cs
index bee8cd7c..c160522a 100644
--- a/src/Mindee/Image/ExtractedImage.cs
+++ b/src/Mindee/Image/ExtractedImage.cs
@@ -14,17 +14,31 @@ public class ExtractedImage
///
private readonly string _saveFormat;
+ ///
+ /// Page number the image was extracted from.
+ ///
+ public int PageId;
+
+ ///
+ /// ID of the image.
+ ///
+ public int ElementId;
+
///
/// Initializes a new instance of the class.
///
/// The extracted image.
/// The filename for the image.
/// The format to save the image.
- public ExtractedImage(SKBitmap image, string filename, string saveFormat)
+ /// The page number the image was extracted from.
+ /// The ID of the image.
+ public ExtractedImage(SKBitmap image, string filename, string saveFormat, int pageId, int elementId)
{
Image = image;
Filename = filename;
_saveFormat = saveFormat;
+ PageId = pageId;
+ ElementId = elementId;
}
///
@@ -35,29 +49,48 @@ public ExtractedImage(SKBitmap image, string filename, string saveFormat)
///
/// Name of the file.
///
- private string Filename { get; }
+ public string Filename { get; }
///
/// Writes the image to a file.
- /// Uses the default image format and filename.
+ /// If outputPath has an extension, it is treated as a full file path.
+ /// Otherwise, it is treated as a directory and uses the default filename.
///
- /// The output directory (must exist).
- ///
- public void WriteToFile(string outputPath, string fileFormat = null)
+ /// The output directory (must exist) or full file path.
+ /// The quality of the image. Defaults to 100.
+ /// The desired format. If null, inferred from extension or default.
+ public void WriteToFile(string outputPath, int quality = 100, string fileFormat = null)
{
+ string imagePath;
var targetFormat = fileFormat ?? _saveFormat;
- var format = GetEncodedImageFormat(targetFormat);
- var finalFilename = Filename;
- if (!string.IsNullOrWhiteSpace(fileFormat))
+ if (Path.HasExtension(outputPath))
+ {
+ imagePath = outputPath;
+ if (string.IsNullOrWhiteSpace(fileFormat))
+ {
+ var extension = Path.GetExtension(outputPath).TrimStart('.');
+ if (!string.IsNullOrWhiteSpace(extension))
+ {
+ targetFormat = extension.ToLower();
+ }
+ }
+ }
+ else
{
- var nameWithoutExtension = Path.GetFileNameWithoutExtension(Filename);
- finalFilename = $"{nameWithoutExtension}.{targetFormat.ToLower()}";
+ var finalFilename = Filename;
+ if (!string.IsNullOrWhiteSpace(fileFormat))
+ {
+ var nameWithoutExtension = Path.GetFileNameWithoutExtension(Filename);
+ finalFilename = $"{nameWithoutExtension}.{targetFormat.ToLower()}";
+ }
+ imagePath = Path.Combine(outputPath, finalFilename);
}
- var imagePath = Path.Combine(outputPath, finalFilename);
+
+ var format = GetEncodedImageFormat(targetFormat);
using var image = SKImage.FromBitmap(Image);
- using var data = image.Encode(format, 100);
+ using var data = image.Encode(format, quality);
using var stream = File.OpenWrite(imagePath);
data.SaveTo(stream);
}
@@ -65,11 +98,12 @@ public void WriteToFile(string outputPath, string fileFormat = null)
///
/// Returns the image in a format suitable for sending to a client for parsing.
///
+ /// The quality of the image. Defaults to 100.
/// An instance of .
- public LocalInputSource AsInputSource()
+ public LocalInputSource AsInputSource(int quality = 100)
{
using var image = SKImage.FromBitmap(Image);
- using var data = image.Encode(GetEncodedImageFormat(_saveFormat), 100);
+ using var data = image.Encode(GetEncodedImageFormat(_saveFormat), quality);
using var output = new MemoryStream();
data.SaveTo(output);
return new LocalInputSource(output.ToArray(), Filename);
diff --git a/src/Mindee/Pdf/ExtractedPdf.cs b/src/Mindee/Pdf/ExtractedPdf.cs
index 8286d74e..38086e53 100644
--- a/src/Mindee/Pdf/ExtractedPdf.cs
+++ b/src/Mindee/Pdf/ExtractedPdf.cs
@@ -1,6 +1,5 @@
using System.IO;
-using Docnet.Core;
-using Docnet.Core.Models;
+using Mindee.Exceptions;
using Mindee.Input;
namespace Mindee.Pdf
@@ -11,24 +10,55 @@ namespace Mindee.Pdf
public class ExtractedPdf
{
///
- /// Name of the original file.
+ /// Local input source holding the PDF bytes and filename.
///
- public readonly string Filename;
+ public readonly LocalInputSource LocalInput;
///
- /// File object for an ExtractedPdf.
+ /// Page count.
///
- public readonly byte[] PdfBytes;
+ public int PageCount { get; set; }
+
+ ///
+ /// Original filename.
+ ///
+ public readonly string Filename;
///
/// Initializes a new instance of the class.
///
- /// A byte array representation of the Pdf.
+ /// A byte array representation of the Pdf.
/// Name of the original file.
- public ExtractedPdf(byte[] pdfBytes, string filename)
+ public ExtractedPdf(byte[] fileBytes, string filename)
{
- PdfBytes = pdfBytes;
- Filename = filename;
+ var tmpInput = new LocalInputSource(fileBytes, filename);
+ if (tmpInput.IsPdf())
+ {
+ LocalInput = tmpInput;
+ }
+ else
+ {
+ byte[] pdfBytes = PdfUtils.ConvertImageToPdf(fileBytes, filename);
+ string newFilename = Path.ChangeExtension(filename, ".pdf");
+ LocalInput = new LocalInputSource(pdfBytes, newFilename);
+ }
+ PageCount = LocalInput.GetPageCount();
+ Filename = LocalInput.Filename;
+ }
+
+ ///
+ /// Initializes a new instance of the class.
+ ///
+ /// LocalInputSource containing the Pdf bytes and filename.
+ public ExtractedPdf(LocalInputSource localInput)
+ {
+ LocalInput = localInput;
+ if (!localInput.IsPdf())
+ {
+ throw new MindeeInputException("The input file is not a PDF.");
+ }
+ PageCount = LocalInput.GetPageCount();
+ Filename = LocalInput.Filename;
}
///
@@ -37,11 +67,7 @@ public ExtractedPdf(byte[] pdfBytes, string filename)
/// The number of pages in the file.
public int GetPageCount()
{
- lock (DocLib.Instance)
- {
- using var docInstance = DocLib.Instance.GetDocReader(PdfBytes, new PageDimensions(1, 1));
- return docInstance.GetPageCount();
- }
+ return LocalInput.GetPageCount();
}
///
@@ -50,13 +76,13 @@ public int GetPageCount()
/// the output directory (must exist).
public void WriteToFile(string outputPath)
{
- var pdfPath = Path.Combine(outputPath, Filename);
+ var pdfPath = Path.Combine(outputPath, LocalInput.Filename);
if (Path.GetFileName(outputPath) != string.Empty)
{
pdfPath = Path.GetFullPath(outputPath);
}
- File.WriteAllBytes(pdfPath, PdfBytes);
+ File.WriteAllBytes(pdfPath, LocalInput.FileBytes);
}
///
@@ -65,7 +91,7 @@ public void WriteToFile(string outputPath)
/// an instance of
public LocalInputSource AsInputSource()
{
- return new LocalInputSource(PdfBytes, Filename);
+ return LocalInput;
}
}
}
diff --git a/src/Mindee/Pdf/PdfUtils.cs b/src/Mindee/Pdf/PdfUtils.cs
index 1aff8156..a44a1406 100644
--- a/src/Mindee/Pdf/PdfUtils.cs
+++ b/src/Mindee/Pdf/PdfUtils.cs
@@ -1,4 +1,5 @@
using System;
+using System.IO;
using System.Linq;
using Docnet.Core;
using Docnet.Core.Models;
@@ -178,5 +179,34 @@ public static bool HasSourceText(byte[] fileBytes)
return false;
}
+
+ ///
+ /// Converts an image to a single-page PDF whose page size matches the image's pixel dimensions.
+ ///
+ /// Raw image bytes.
+ /// Name of the file.
+ ///
+ ///
+ public static byte[] ConvertImageToPdf(byte[] imageBytes, string filename)
+ {
+ using var ms = new MemoryStream();
+ using var bitmap = SKBitmap.Decode(imageBytes);
+ if (bitmap == null)
+ {
+ throw new MindeeInputException($"The file {filename} is not a valid image.");
+ }
+
+ using (var document = SKDocument.CreatePdf(ms))
+ {
+ using (var canvas = document.BeginPage(bitmap.Width, bitmap.Height))
+ {
+ canvas.DrawBitmap(bitmap, 0, 0);
+ document.EndPage();
+ }
+ document.Close();
+ }
+
+ return ms.ToArray();
+ }
}
}
diff --git a/src/Mindee/V1/Extraction/ImageExtractor.cs b/src/Mindee/V1/Extraction/ImageExtractor.cs
index aec7f4b1..f98f2de7 100644
--- a/src/Mindee/V1/Extraction/ImageExtractor.cs
+++ b/src/Mindee/V1/Extraction/ImageExtractor.cs
@@ -160,7 +160,7 @@ public ExtractedImage ExtractImage(PositionField field, int pageIndex, int index
var bbox = Utils.BboxFromPolygon(boundingBox);
var fieldFilename = $"{splitName[0]}_{index:D3}.{SaveFormat}";
- return new ExtractedImage(ExtractImage(bbox, pageIndex), fieldFilename, SaveFormat);
+ return new ExtractedImage(ExtractImage(bbox, pageIndex), fieldFilename, SaveFormat, pageIndex, index);
}
///
@@ -187,7 +187,7 @@ public ExtractedImage ExtractImage(BaseField field, int pageIndex, int index, st
var bbox = Utils.BboxFromPolygon(boundingBox);
var fieldFilename = $"{splitName[0]}_{index:D3}.{SaveFormat}";
- return new ExtractedImage(ExtractImage(bbox, pageIndex), fieldFilename, SaveFormat);
+ return new ExtractedImage(ExtractImage(bbox, pageIndex), fieldFilename, SaveFormat, pageIndex, index);
}
}
}
diff --git a/src/Mindee/V2/FileOperations/Crop.cs b/src/Mindee/V2/FileOperations/Crop.cs
index 0160dfc6..7882d959 100644
--- a/src/Mindee/V2/FileOperations/Crop.cs
+++ b/src/Mindee/V2/FileOperations/Crop.cs
@@ -48,8 +48,7 @@ public CropFiles ExtractCrops(List crops)
{
var imageExtractor = new ImageExtractor(this._localInput);
var extractedImages = imageExtractor.ExtractMultipleImagesFromSource(crops[0].Location.Page, crops.Select(c => c.Location.Polygon).ToList());
-
- return (CropFiles)extractedImages;
+ return new CropFiles(extractedImages);
}
}
}
diff --git a/src/Mindee/V2/FileOperations/CropFiles.cs b/src/Mindee/V2/FileOperations/CropFiles.cs
index f344bf57..f547761a 100644
--- a/src/Mindee/V2/FileOperations/CropFiles.cs
+++ b/src/Mindee/V2/FileOperations/CropFiles.cs
@@ -9,13 +9,29 @@ namespace Mindee.V2.FileOperations
///
public class CropFiles : List
{
+ ///
+ ///
+ ///
+ ///
+ public CropFiles(IEnumerable collection) : base(collection)
+ {
+ }
+
+ ///
+ ///
+ ///
+ public CropFiles() : base()
+ {
+ }
+
///
/// Saves all cropped files to disk.
///
/// Path for all files
/// Prefix for file names
+ /// Encoding quality forwarded to each crop's WriteToFile call (default: 100)
/// File format for saving (default: null)
- public void SaveAllToDisk(string path, string prefix = "crop", string fileFormat = null)
+ public void SaveAllToDisk(string path, int quality = 100, string prefix = "crop", string fileFormat = null)
{
Directory.CreateDirectory(path);
@@ -25,7 +41,7 @@ public void SaveAllToDisk(string path, string prefix = "crop", string fileFormat
string fileName = $"{prefix}_{index:D3}.jpg";
string filePath = Path.Combine(path, fileName);
- crop.WriteToFile(filePath, fileFormat);
+ crop.WriteToFile(filePath, quality, fileFormat);
index++;
}
diff --git a/src/Mindee/V2/FileOperations/Split.cs b/src/Mindee/V2/FileOperations/Split.cs
index 88e6d5b2..3bc992e4 100644
--- a/src/Mindee/V2/FileOperations/Split.cs
+++ b/src/Mindee/V2/FileOperations/Split.cs
@@ -1,8 +1,10 @@
using System.Collections.Generic;
+using System.IO;
using System.Linq;
using Mindee.Exceptions;
using Mindee.Extraction;
using Mindee.Input;
+using Mindee.Pdf;
namespace Mindee.V2.FileOperations
{
@@ -35,12 +37,22 @@ public static List ExpandRange(int start, int end)
}
///
- ///
+ /// Initializes an instance of a Split operation.
+ /// Transforms images to PDFs if necessary.
///
///
public Split(LocalInputSource inputSource)
{
- this._localInput = inputSource;
+ if (inputSource.IsPdf())
+ {
+ _localInput = inputSource;
+ }
+ else
+ {
+ byte[] pdfBytes = PdfUtils.ConvertImageToPdf(inputSource.FileBytes, inputSource.Filename);
+ string newFilename = Path.ChangeExtension(inputSource.Filename, ".pdf");
+ _localInput = new LocalInputSource(pdfBytes, newFilename);
+ }
}
///
@@ -58,7 +70,7 @@ public SplitFiles ExtractSplits(List> splits)
List> expandedPageIndexes = [];
expandedPageIndexes.AddRange(splits.Select(split => ExpandRange(split[0], split[1])));
- return (SplitFiles)pdfExtractor.ExtractSubDocuments(expandedPageIndexes);
+ return new SplitFiles(pdfExtractor.ExtractSubDocuments(expandedPageIndexes));
}
}
}
diff --git a/src/Mindee/V2/FileOperations/SplitFiles.cs b/src/Mindee/V2/FileOperations/SplitFiles.cs
index 5b26af21..443f0847 100644
--- a/src/Mindee/V2/FileOperations/SplitFiles.cs
+++ b/src/Mindee/V2/FileOperations/SplitFiles.cs
@@ -9,6 +9,21 @@ namespace Mindee.V2.FileOperations
///
public sealed class SplitFiles : List
{
+ ///
+ ///
+ ///
+ ///
+ public SplitFiles(IEnumerable collection) : base(collection)
+ {
+ }
+
+ ///
+ ///
+ ///
+ public SplitFiles() : base()
+ {
+ }
+
///
/// Saves all the extracted pages to disk.
///
diff --git a/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs b/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs
new file mode 100644
index 00000000..8c66c7be
--- /dev/null
+++ b/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs
@@ -0,0 +1,89 @@
+using Mindee.Input;
+using Mindee.V2;
+using Mindee.V2.FileOperations;
+using Mindee.V2.Product.Crop;
+using Mindee.V2.Product.Crop.Params;
+using Mindee.V2.Product.Extraction;
+using Mindee.V2.Product.Extraction.Params;
+
+namespace Mindee.IntegrationTests.V2.FileOperations
+{
+ [Trait("Category", "V2")]
+ [Trait("Category", "FileOperations")]
+ public class CropTest : IDisposable
+ {
+ private readonly string? _cropModelId;
+ private readonly string? _findocModelId;
+ private readonly Client _client;
+ private readonly string _outputDir;
+
+ public CropTest()
+ {
+ var apiKey = Environment.GetEnvironmentVariable("MindeeV2__ApiKey");
+ _client = TestingUtilities.GetOrGenerateMindeeClientV2(apiKey);
+ _cropModelId = Environment.GetEnvironmentVariable("MindeeV2__Crop__Model__Id");
+ _findocModelId = Environment.GetEnvironmentVariable("MindeeV2__Findoc__Model__Id");
+
+ _outputDir = Path.Combine(Directory.GetCurrentDirectory(), "output");
+ if (!Directory.Exists(_outputDir))
+ {
+ Directory.CreateDirectory(_outputDir);
+ }
+ }
+
+ public void Dispose()
+ {
+ var file1 = Path.Combine(_outputDir, "crop_001.jpg");
+ var file2 = Path.Combine(_outputDir, "crop_002.jpg");
+
+ if (File.Exists(file1)) File.Delete(file1);
+ if (File.Exists(file2)) File.Delete(file2);
+ }
+
+ private void CheckFindocReturn(ExtractionResponse findocResponse)
+ {
+ Assert.True(findocResponse.Inference.Model.Id.Length > 0);
+
+ var totalAmount = findocResponse.Inference.Result.Fields["total_amount"].SimpleField;
+ Assert.NotNull(totalAmount);
+ Assert.True(totalAmount.Value > 0);
+ }
+
+ [Fact(Timeout = 180000)]
+ public async Task Extract_Crops_From_Image_Correctly()
+ {
+ var inputSource = new LocalInputSource(
+ Constants.V2ProductDir + "crop/default_sample.jpg");
+ var cropParams = new CropParameters(_cropModelId);
+
+ var response = await _client.EnqueueAndGetResultAsync(
+ inputSource, cropParams);
+
+ Assert.NotNull(response);
+ Assert.Equal(2, response.Inference.Result.Crops.Count);
+
+ var cropOperation = new Crop(inputSource);
+ var extractedImages = cropOperation.ExtractCrops(response.Inference.Result.Crops);
+
+ Assert.Equal(2, extractedImages.Count);
+ Assert.Equal("default_sample.jpg_page0-0.jpg", extractedImages[0].Filename);
+ Assert.Equal("default_sample.jpg_page0-1.jpg", extractedImages[1].Filename);
+
+ var extractionInput = extractedImages[0].AsInputSource();
+ var findocParams = new ExtractionParameters(_findocModelId);
+
+ var invoice0 = await _client.EnqueueAndGetResultAsync(
+ extractionInput, findocParams);
+
+ CheckFindocReturn(invoice0);
+
+ extractedImages.SaveAllToDisk(_outputDir, 50);
+
+ var file1Info = new FileInfo(Path.Combine(_outputDir, "crop_001.jpg"));
+ Assert.InRange(file1Info.Length, 100000, 110000);
+
+ var file2Info = new FileInfo(Path.Combine(_outputDir, "crop_002.jpg"));
+ Assert.InRange(file2Info.Length, 100000, 110000);
+ }
+ }
+}
diff --git a/tests/Mindee.IntegrationTests/V2/FileOperations/SplitTest.cs b/tests/Mindee.IntegrationTests/V2/FileOperations/SplitTest.cs
new file mode 100644
index 00000000..1801d90d
--- /dev/null
+++ b/tests/Mindee.IntegrationTests/V2/FileOperations/SplitTest.cs
@@ -0,0 +1,97 @@
+using Mindee.Input;
+using Mindee.V2;
+using Mindee.V2.FileOperations;
+using Mindee.V2.Product.Extraction;
+using Mindee.V2.Product.Extraction.Params;
+using Mindee.V2.Product.Split;
+using Mindee.V2.Product.Split.Params;
+
+namespace Mindee.IntegrationTests.V2.FileOperations
+{
+ [Trait("Category", "V2")]
+ [Trait("Category", "FileOperations")]
+ public class SplitTest : IDisposable
+ {
+ private readonly string? _splitModelId;
+ private readonly string? _findocModelId;
+ private readonly Client _client;
+ private readonly string _outputDir;
+
+ public SplitTest()
+ {
+ var apiKey = Environment.GetEnvironmentVariable("MindeeV2__ApiKey");
+ _client = TestingUtilities.GetOrGenerateMindeeClientV2(apiKey);
+ _splitModelId = Environment.GetEnvironmentVariable("MindeeV2__Split__Model__Id");
+ _findocModelId = Environment.GetEnvironmentVariable("MindeeV2__Findoc__Model__Id");
+
+ _outputDir = Path.Combine(Directory.GetCurrentDirectory(), "output");
+ if (!Directory.Exists(_outputDir))
+ {
+ Directory.CreateDirectory(_outputDir);
+ }
+ }
+
+ public void Dispose()
+ {
+ var file1 = Path.Combine(_outputDir, "split_001.pdf");
+ var file2 = Path.Combine(_outputDir, "split_002.pdf");
+
+ if (File.Exists(file1)) File.Delete(file1);
+ if (File.Exists(file2)) File.Delete(file2);
+ }
+
+ private void CheckFindocReturn(ExtractionResponse findocResponse)
+ {
+ Assert.True(findocResponse.Inference.Model.Id.Length > 0);
+
+ var totalAmount = findocResponse.Inference.Result.Fields["total_amount"].SimpleField;
+ Assert.NotNull(totalAmount);
+ Assert.True(totalAmount.Value > 0);
+ }
+
+ [Fact(Timeout = 180000)]
+ public async Task Extract_Splits_From_Pdf_Correctly()
+ {
+ var inputSource = new LocalInputSource(
+ Constants.V2ProductDir + "split/default_sample.pdf");
+ var splitParams = new SplitParameters(_splitModelId);
+
+ var response = await _client.EnqueueAndGetResultAsync(
+ inputSource, splitParams);
+
+ Assert.NotNull(response);
+ Assert.Equal(2, response.Inference.Result.Splits.Count);
+
+ var splitOperation = new Split(inputSource);
+ var extractedSplits = splitOperation.ExtractSplits(
+ response.Inference.Result.Splits.Select(s => s.PageRange).ToList());
+
+ Assert.Equal(2, extractedSplits.Count);
+ Assert.Equal("default_sample_001-001.pdf", extractedSplits[0].Filename);
+ Assert.Equal("default_sample_002-002.pdf", extractedSplits[1].Filename);
+
+ var extractionInput = extractedSplits[0].AsInputSource();
+ var findocParams = new ExtractionParameters(_findocModelId);
+
+ var invoice0 = await _client.EnqueueAndGetResultAsync(
+ extractionInput, findocParams);
+
+ CheckFindocReturn(invoice0);
+
+ extractedSplits.SaveAllToDisk(_outputDir);
+
+ for (int i = 0; i < extractedSplits.Count; i++)
+ {
+ var fileName = $"split_{i + 1:D3}.pdf";
+ var filePath = Path.Combine(_outputDir, fileName);
+ var fileInfo = new FileInfo(filePath);
+
+ Assert.True(fileInfo.Exists);
+ Assert.True(fileInfo.Length > 0);
+
+ var localInput = new LocalInputSource(fileInfo);
+ Assert.Equal(extractedSplits[i].PageCount, localInput.GetPageCount());
+ }
+ }
+ }
+}
diff --git a/tests/Mindee.UnitTests/V2/FileOperations/CropTest.cs b/tests/Mindee.UnitTests/V2/FileOperations/CropTest.cs
new file mode 100644
index 00000000..5a2fb359
--- /dev/null
+++ b/tests/Mindee.UnitTests/V2/FileOperations/CropTest.cs
@@ -0,0 +1,67 @@
+using Mindee.Input;
+using Mindee.V2.FileOperations;
+using Mindee.V2.Parsing;
+using Mindee.V2.Product.Crop;
+
+namespace Mindee.UnitTests.V2.FileOperations
+{
+ [Trait("Category", "V2")]
+ [Trait("Category", "FileOperations")]
+ public class CropTest
+ {
+ private readonly string _cropDataDir = Path.Combine(Constants.V2RootDir, "products", "crop");
+
+ [Fact]
+ public void Processes_SinglePage_CropSplit_Correctly()
+ {
+ var inputSample = new LocalInputSource(
+ new FileInfo(Path.Combine(_cropDataDir, "default_sample.jpg")));
+
+ var localResponse = new LocalResponse(
+ new FileInfo(Path.Combine(_cropDataDir, "crop_single.json")));
+ var doc = localResponse.DeserializeResponse();
+
+ var cropOperation = new Crop(inputSample);
+ var extractedCrops = cropOperation.ExtractCrops(doc.Inference.Result.Crops);
+
+ Assert.Single(extractedCrops);
+
+ Assert.Equal(0, extractedCrops[0].PageId);
+ Assert.Equal(0, extractedCrops[0].ElementId);
+
+ using var bitmap0 = extractedCrops[0].Image;
+ Assert.Equal(2822, bitmap0.Width);
+ Assert.Equal(1572, bitmap0.Height);
+ }
+
+ [Fact]
+ public void Processes_MultiPage_ReceiptSplit_Correctly()
+ {
+ var inputSample = new LocalInputSource(
+ new FileInfo(Path.Combine(_cropDataDir, "multipage_sample.pdf")));
+
+ var localResponse = new LocalResponse(
+ new FileInfo(Path.Combine(_cropDataDir, "crop_multiple.json")));
+ var doc = localResponse.DeserializeResponse();
+
+ var cropOperation = new Crop(inputSample);
+ var extractedCrops = cropOperation.ExtractCrops(doc.Inference.Result.Crops);
+
+ Assert.Equal(2, extractedCrops.Count);
+
+ Assert.Equal(0, extractedCrops[0].PageId);
+ Assert.Equal(0, extractedCrops[0].ElementId);
+
+ using var bitmap0 = extractedCrops[0].Image;
+ Assert.Equal(156, bitmap0.Width);
+ Assert.Equal(757, bitmap0.Height);
+
+ Assert.Equal(0, extractedCrops[1].PageId);
+ Assert.Equal(1, extractedCrops[1].ElementId);
+
+ using var bitmap1 = extractedCrops[1].Image;
+ Assert.Equal(188, bitmap1.Width);
+ Assert.Equal(691, bitmap1.Height);
+ }
+ }
+}
diff --git a/tests/Mindee.UnitTests/V2/FileOperations/SplitTest.cs b/tests/Mindee.UnitTests/V2/FileOperations/SplitTest.cs
new file mode 100644
index 00000000..88093be3
--- /dev/null
+++ b/tests/Mindee.UnitTests/V2/FileOperations/SplitTest.cs
@@ -0,0 +1,55 @@
+using Mindee.Input;
+using Mindee.V2.FileOperations;
+using Mindee.V2.Parsing;
+using Mindee.V2.Product.Split;
+
+namespace Mindee.UnitTests.V2.FileOperations
+{
+ [Trait("Category", "V2")]
+ [Trait("Category", "FileOperations")]
+ public class SplitTest
+ {
+ private readonly string _splitDataDir = Path.Combine(Constants.V2RootDir, "products", "split");
+ private readonly string _finDocDataDir = Path.Combine(Constants.V2RootDir, "products", "extraction", "financial_document");
+
+ [Fact]
+ public void Processes_SinglePage_Split_Correctly()
+ {
+ var inputSample = new LocalInputSource(
+ new FileInfo(Path.Combine(_finDocDataDir, "default_sample.jpg")));
+
+ var localResponse = new LocalResponse(
+ new FileInfo(Path.Combine(_splitDataDir, "split_single.json")));
+ var doc = localResponse.DeserializeResponse();
+
+ var splitOperation = new Split(inputSample);
+ List splits = doc.Inference.Result.Splits;
+ var extractedSplits = splitOperation.ExtractSplits(splits.Select(s => s.PageRange).ToList());
+
+ Assert.Single(extractedSplits);
+
+ Assert.Equal(1, extractedSplits[0].PageCount);
+ }
+
+ [Fact]
+ public void Processes_MultiPage_ReceiptSplit_Correctly()
+ {
+ var inputSample = new LocalInputSource(
+ new FileInfo(Path.Combine(_splitDataDir, "invoice_5p.pdf")));
+
+ var localResponse = new LocalResponse(
+ new FileInfo(Path.Combine(_splitDataDir, "split_multiple.json")));
+ var doc = localResponse.DeserializeResponse();
+
+ var splitOperation = new Split(inputSample);
+ List splits = doc.Inference.Result.Splits;
+ var extractedSplits = splitOperation.ExtractSplits(splits.Select(s => s.PageRange).ToList());
+
+ Assert.Equal(3, extractedSplits.Count);
+
+ Assert.Equal(1, extractedSplits[0].PageCount);
+ Assert.Equal(3, extractedSplits[1].PageCount);
+ Assert.Equal(1, extractedSplits[2].PageCount);
+ }
+ }
+}
From 56dd678e03f16dfc313aa67e0439609a8ed73a91 Mon Sep 17 00:00:00 2001
From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com>
Date: Tue, 14 Apr 2026 17:45:55 +0200
Subject: [PATCH 03/11] uniformize exception throws
---
src/Mindee/Extraction/PdfExtractor.cs | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/src/Mindee/Extraction/PdfExtractor.cs b/src/Mindee/Extraction/PdfExtractor.cs
index a1408ac0..625745f2 100644
--- a/src/Mindee/Extraction/PdfExtractor.cs
+++ b/src/Mindee/Extraction/PdfExtractor.cs
@@ -3,6 +3,7 @@
using System.IO;
using Docnet.Core;
using Microsoft.Extensions.Logging.Abstractions;
+using Mindee.Exceptions;
using Mindee.Input;
using Mindee.Pdf;
using SkiaSharp;
@@ -89,7 +90,7 @@ public List ExtractSubDocuments(List> pageIndexes)
{
if (pageIndexElem.Count == 0)
{
- throw new ArgumentException("Empty indexes not allowed for extraction.");
+ throw new MindeeInputException("Empty indexes not allowed for extraction.");
}
var extension = Path.GetExtension(LocalInput.Filename);
From 2c2021c6f450a00dfb447e0344960e445ffcb937 Mon Sep 17 00:00:00 2001
From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com>
Date: Wed, 15 Apr 2026 10:20:17 +0200
Subject: [PATCH 04/11] fix typo
---
src/Mindee/Pdf/ExtractedPdf.cs | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/Mindee/Pdf/ExtractedPdf.cs b/src/Mindee/Pdf/ExtractedPdf.cs
index 38086e53..6191b1e4 100644
--- a/src/Mindee/Pdf/ExtractedPdf.cs
+++ b/src/Mindee/Pdf/ExtractedPdf.cs
@@ -10,7 +10,7 @@ namespace Mindee.Pdf
public class ExtractedPdf
{
///
- /// Loca
+ /// Local input source.
///
public readonly LocalInputSource LocalInput;
From 358bb451603899a3245bd005c032ad68d107c9e1 Mon Sep 17 00:00:00 2001
From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com>
Date: Thu, 16 Apr 2026 10:09:04 +0200
Subject: [PATCH 05/11] add missing save methods
---
src/Mindee/V2/FileOperations/Crop.cs | 2 +-
src/Mindee/V2/FileOperations/Split.cs | 11 +++++++++++
src/Mindee/V2/Product/Crop/CropItem.cs | 16 ++++++++++++++++
src/Mindee/V2/Product/Split/SplitRange.cs | 13 +++++++++++++
4 files changed, 41 insertions(+), 1 deletion(-)
diff --git a/src/Mindee/V2/FileOperations/Crop.cs b/src/Mindee/V2/FileOperations/Crop.cs
index 7882d959..56dc54f6 100644
--- a/src/Mindee/V2/FileOperations/Crop.cs
+++ b/src/Mindee/V2/FileOperations/Crop.cs
@@ -34,7 +34,7 @@ public Crop(LocalInputSource inputSource)
///
public ExtractedImage ExtractSingleCrop(CropItem crop)
{
- var polygons = new List() { crop.Location.Polygon };
+ var polygons = new List { crop.Location.Polygon };
var imageExtractor = new ImageExtractor(this._localInput);
return imageExtractor.ExtractMultipleImagesFromSource(crop.Location.Page, polygons)[0];
}
diff --git a/src/Mindee/V2/FileOperations/Split.cs b/src/Mindee/V2/FileOperations/Split.cs
index 3bc992e4..bd9931bc 100644
--- a/src/Mindee/V2/FileOperations/Split.cs
+++ b/src/Mindee/V2/FileOperations/Split.cs
@@ -5,6 +5,7 @@
using Mindee.Extraction;
using Mindee.Input;
using Mindee.Pdf;
+using Mindee.V2.Product.Split;
namespace Mindee.V2.FileOperations
{
@@ -55,6 +56,16 @@ public Split(LocalInputSource inputSource)
}
}
+ ///
+ /// Extracts a single split from the input file.
+ ///
+ ///
+ ///
+ public ExtractedPdf ExtractSingleSplit(SplitRange splitRange)
+ {
+ return ExtractSplits([splitRange.PageRange])[0];
+ }
+
///
/// Extracts the splits from the input file.
///
diff --git a/src/Mindee/V2/Product/Crop/CropItem.cs b/src/Mindee/V2/Product/Crop/CropItem.cs
index 923811cf..d703f28e 100644
--- a/src/Mindee/V2/Product/Crop/CropItem.cs
+++ b/src/Mindee/V2/Product/Crop/CropItem.cs
@@ -1,4 +1,9 @@
+using System.Collections.Generic;
using System.Text.Json.Serialization;
+using Mindee.Extraction;
+using Mindee.Geometry;
+using Mindee.Image;
+using Mindee.Input;
using Mindee.V2.Parsing.Inference.Field;
namespace Mindee.V2.Product.Crop
@@ -28,5 +33,16 @@ public override string ToString()
{
return $"* :Location: {Location}\n :Object Type: {ObjectType}";
}
+
+ ///
+ /// Extracts the crop from the source document.
+ ///
+ ///
+ ///
+ public ExtractedImage ExtractFromFile(LocalInputSource inputSource)
+ {
+ var crop = new FileOperations.Crop(inputSource);
+ return crop.ExtractSingleCrop(this);
+ }
}
}
diff --git a/src/Mindee/V2/Product/Split/SplitRange.cs b/src/Mindee/V2/Product/Split/SplitRange.cs
index 6d0cb704..7b93519d 100644
--- a/src/Mindee/V2/Product/Split/SplitRange.cs
+++ b/src/Mindee/V2/Product/Split/SplitRange.cs
@@ -1,5 +1,7 @@
using System.Collections.Generic;
using System.Text.Json.Serialization;
+using Mindee.Input;
+using Mindee.Pdf;
namespace Mindee.V2.Product.Split
{
@@ -29,5 +31,16 @@ public override string ToString()
string pageRange = string.Join(",", PageRange);
return $"* :Page Range: {pageRange}\n :Document Type: {DocumentType}";
}
+
+ ///
+ /// Extracts the split from the source document.
+ ///
+ ///
+ ///
+ public ExtractedPdf ExtractFromFile(LocalInputSource inputSource)
+ {
+ var split = new FileOperations.Split(inputSource);
+ return split.ExtractSingleSplit(this);
+ }
}
}
From 603922c49eccce12541b0a5716e4f1d896f4876b Mon Sep 17 00:00:00 2001
From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com>
Date: Fri, 17 Apr 2026 10:08:44 +0200
Subject: [PATCH 06/11] add complementary test for crop page count
---
src/Mindee/V2/FileOperations/Crop.cs | 15 ++++++++----
src/Mindee/V2/FileOperations/Split.cs | 2 +-
.../V2/FileOperations/CropTest.cs | 23 +++++++++++++++++--
3 files changed, 32 insertions(+), 8 deletions(-)
diff --git a/src/Mindee/V2/FileOperations/Crop.cs b/src/Mindee/V2/FileOperations/Crop.cs
index 56dc54f6..59601115 100644
--- a/src/Mindee/V2/FileOperations/Crop.cs
+++ b/src/Mindee/V2/FileOperations/Crop.cs
@@ -14,7 +14,7 @@ namespace Mindee.V2.FileOperations
public sealed class Crop
{
///
- /// LocalInputSource object used by the ImageExtractor.
+ /// LocalInputSource object.
///
private readonly LocalInputSource _localInput;
@@ -28,7 +28,7 @@ public Crop(LocalInputSource inputSource)
}
///
- ///
+ /// Extracts a single crop item from a file.
///
///
///
@@ -40,15 +40,20 @@ public ExtractedImage ExtractSingleCrop(CropItem crop)
}
///
- /// Extracts multiple crop zones from an image.
+ /// Extracts multiple crop zones from a file.
///
/// List of crops.
///
public CropFiles ExtractCrops(List crops)
{
var imageExtractor = new ImageExtractor(this._localInput);
- var extractedImages = imageExtractor.ExtractMultipleImagesFromSource(crops[0].Location.Page, crops.Select(c => c.Location.Polygon).ToList());
- return new CropFiles(extractedImages);
+ CropFiles extractedImages = [];
+ var cropsPerPage = crops.GroupBy(c => c.Location.Page).ToList();
+ foreach (var pageCrops in cropsPerPage)
+ {
+ extractedImages.AddRange(imageExtractor.ExtractMultipleImagesFromSource(pageCrops.Key, pageCrops.Select(c => c.Location.Polygon).ToList()));
+ }
+ return extractedImages;
}
}
}
diff --git a/src/Mindee/V2/FileOperations/Split.cs b/src/Mindee/V2/FileOperations/Split.cs
index bd9931bc..101b4004 100644
--- a/src/Mindee/V2/FileOperations/Split.cs
+++ b/src/Mindee/V2/FileOperations/Split.cs
@@ -16,7 +16,7 @@ public sealed class Split
{
///
- /// LocalInputSource object used by the ImageExtractor.
+ /// LocalInputSource object.
///
private readonly LocalInputSource _localInput;
diff --git a/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs b/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs
index 8c66c7be..1fa0b796 100644
--- a/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs
+++ b/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs
@@ -52,8 +52,8 @@ private void CheckFindocReturn(ExtractionResponse findocResponse)
[Fact(Timeout = 180000)]
public async Task Extract_Crops_From_Image_Correctly()
{
- var inputSource = new LocalInputSource(
- Constants.V2ProductDir + "crop/default_sample.jpg");
+ var inputSource = new LocalInputSource(Path.Combine(
+ Constants.V2ProductDir, "crop/default_sample.jpg"));
var cropParams = new CropParameters(_cropModelId);
var response = await _client.EnqueueAndGetResultAsync(
@@ -85,5 +85,24 @@ public async Task Extract_Crops_From_Image_Correctly()
var file2Info = new FileInfo(Path.Combine(_outputDir, "crop_002.jpg"));
Assert.InRange(file2Info.Length, 100000, 110000);
}
+
+ [Fact(Timeout = 180000)]
+ public async Task Extract_Crops_From_Each_Pdf_Page_Correctly()
+ {
+
+ var inputSource = new LocalInputSource(
+ new FileInfo(Path.Combine(Constants.V2ProductDir, "multipage_sample.pdf")));
+
+ var cropParams = new CropParameters(_cropModelId);
+
+ var response = await _client.EnqueueAndGetResultAsync(
+ inputSource, cropParams);
+ var cropOperation = new Crop(inputSource);
+ var extractedImages = cropOperation.ExtractCrops(response.Inference.Result.Crops);
+
+ Assert.Equal(2, extractedImages.Count);
+ Assert.Equal("default_sample.jpg_page0-0.jpg", extractedImages[0].Filename);
+ Assert.Equal("default_sample.jpg_page1-0.jpg", extractedImages[1].Filename);
+ }
}
}
From 610b7c37d25cde45f0474fa925d2aa16b504b796 Mon Sep 17 00:00:00 2001
From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com>
Date: Fri, 17 Apr 2026 10:21:29 +0200
Subject: [PATCH 07/11] fix syntax
---
tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs b/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs
index 1fa0b796..1f630b2d 100644
--- a/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs
+++ b/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs
@@ -91,7 +91,7 @@ public async Task Extract_Crops_From_Each_Pdf_Page_Correctly()
{
var inputSource = new LocalInputSource(
- new FileInfo(Path.Combine(Constants.V2ProductDir, "multipage_sample.pdf")));
+ new FileInfo(Path.Combine(Constants.V2ProductDir, "crop/multipage_sample.pdf")));
var cropParams = new CropParameters(_cropModelId);
From 410fd6f883882d1640ec632ab4f608b5f952bb46 Mon Sep 17 00:00:00 2001
From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com>
Date: Fri, 17 Apr 2026 10:52:48 +0200
Subject: [PATCH 08/11] fix naming for V1 extraction namespace
---
src/Mindee/V1/{Extraction => Image}/ImageExtractor.cs | 2 +-
src/Mindee/V1/{Extraction => Image}/PdfExtractor.cs | 2 +-
.../V1/InvoiceSplitterAutoExtractionTest.cs | 2 +-
tests/Mindee.UnitTests/Extraction/ImageExtractorTest.cs | 2 +-
tests/Mindee.UnitTests/Extraction/PdfExtractorTest.cs | 2 +-
5 files changed, 5 insertions(+), 5 deletions(-)
rename src/Mindee/V1/{Extraction => Image}/ImageExtractor.cs (99%)
rename src/Mindee/V1/{Extraction => Image}/PdfExtractor.cs (99%)
diff --git a/src/Mindee/V1/Extraction/ImageExtractor.cs b/src/Mindee/V1/Image/ImageExtractor.cs
similarity index 99%
rename from src/Mindee/V1/Extraction/ImageExtractor.cs
rename to src/Mindee/V1/Image/ImageExtractor.cs
index f98f2de7..91f544fe 100644
--- a/src/Mindee/V1/Extraction/ImageExtractor.cs
+++ b/src/Mindee/V1/Image/ImageExtractor.cs
@@ -6,7 +6,7 @@
using Mindee.Input;
using Mindee.V1.Parsing.Standard;
-namespace Mindee.V1.Extraction
+namespace Mindee.V1.Image
{
///
/// Legacy V1 Wrapper for ImageExtractor.
diff --git a/src/Mindee/V1/Extraction/PdfExtractor.cs b/src/Mindee/V1/Image/PdfExtractor.cs
similarity index 99%
rename from src/Mindee/V1/Extraction/PdfExtractor.cs
rename to src/Mindee/V1/Image/PdfExtractor.cs
index 6206e253..359852ba 100644
--- a/src/Mindee/V1/Extraction/PdfExtractor.cs
+++ b/src/Mindee/V1/Image/PdfExtractor.cs
@@ -6,7 +6,7 @@
using Mindee.Pdf;
using Mindee.V1.Product.InvoiceSplitter;
-namespace Mindee.V1.Extraction
+namespace Mindee.V1.Image
{
///
/// V1 wrapper for the PDF extraction class.
diff --git a/tests/Mindee.IntegrationTests/V1/InvoiceSplitterAutoExtractionTest.cs b/tests/Mindee.IntegrationTests/V1/InvoiceSplitterAutoExtractionTest.cs
index 98234901..616d3c16 100644
--- a/tests/Mindee.IntegrationTests/V1/InvoiceSplitterAutoExtractionTest.cs
+++ b/tests/Mindee.IntegrationTests/V1/InvoiceSplitterAutoExtractionTest.cs
@@ -1,6 +1,6 @@
using Mindee.Input;
using Mindee.Pdf;
-using Mindee.V1.Extraction;
+using Mindee.V1.Image;
using Mindee.V1.Parsing.Common;
using Mindee.V1.Product.Invoice;
using Mindee.V1.Product.InvoiceSplitter;
diff --git a/tests/Mindee.UnitTests/Extraction/ImageExtractorTest.cs b/tests/Mindee.UnitTests/Extraction/ImageExtractorTest.cs
index e8564025..e3503329 100644
--- a/tests/Mindee.UnitTests/Extraction/ImageExtractorTest.cs
+++ b/tests/Mindee.UnitTests/Extraction/ImageExtractorTest.cs
@@ -1,5 +1,5 @@
using Mindee.Input;
-using Mindee.V1.Extraction;
+using Mindee.V1.Image;
using Mindee.V1.Parsing.Common;
using Mindee.V1.Product.BarcodeReader;
using Mindee.V1.Product.MultiReceiptsDetector;
diff --git a/tests/Mindee.UnitTests/Extraction/PdfExtractorTest.cs b/tests/Mindee.UnitTests/Extraction/PdfExtractorTest.cs
index b88fd3d5..a6207273 100644
--- a/tests/Mindee.UnitTests/Extraction/PdfExtractorTest.cs
+++ b/tests/Mindee.UnitTests/Extraction/PdfExtractorTest.cs
@@ -1,5 +1,5 @@
using Mindee.Input;
-using Mindee.V1.Extraction;
+using Mindee.V1.Image;
using Mindee.V1.Parsing.Common;
using Mindee.V1.Product.InvoiceSplitter;
From 937da1e296ddd4cfcb056d33b8b5b0a3f31bbb76 Mon Sep 17 00:00:00 2001
From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com>
Date: Fri, 17 Apr 2026 11:42:08 +0200
Subject: [PATCH 09/11] fix test
---
tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs b/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs
index 1f630b2d..4b633a83 100644
--- a/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs
+++ b/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs
@@ -100,9 +100,9 @@ public async Task Extract_Crops_From_Each_Pdf_Page_Correctly()
var cropOperation = new Crop(inputSource);
var extractedImages = cropOperation.ExtractCrops(response.Inference.Result.Crops);
- Assert.Equal(2, extractedImages.Count);
- Assert.Equal("default_sample.jpg_page0-0.jpg", extractedImages[0].Filename);
- Assert.Equal("default_sample.jpg_page1-0.jpg", extractedImages[1].Filename);
+ Assert.Equal(5, extractedImages.Count);
+ Assert.Equal("multipage_sample.pdf_page0-0.jpg", extractedImages[0].Filename);
+ Assert.Equal("multipage_sample.pdf_page1-0.jpg", extractedImages[3].Filename);
}
}
}
From f4bd446092dcb14eb3fa18bcba052812a9e03b22 Mon Sep 17 00:00:00 2001
From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com>
Date: Fri, 17 Apr 2026 16:06:34 +0200
Subject: [PATCH 10/11] fix test
---
tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs b/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs
index 4b633a83..e851e459 100644
--- a/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs
+++ b/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs
@@ -80,10 +80,10 @@ public async Task Extract_Crops_From_Image_Correctly()
extractedImages.SaveAllToDisk(_outputDir, 50);
var file1Info = new FileInfo(Path.Combine(_outputDir, "crop_001.jpg"));
- Assert.InRange(file1Info.Length, 100000, 110000);
+ Assert.InRange(file1Info.Length, 99000, 110000);
var file2Info = new FileInfo(Path.Combine(_outputDir, "crop_002.jpg"));
- Assert.InRange(file2Info.Length, 100000, 110000);
+ Assert.InRange(file2Info.Length, 99000, 110000);
}
[Fact(Timeout = 180000)]
From 1221ba1453427ff548e11a5eea163eafd731b27c Mon Sep 17 00:00:00 2001
From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com>
Date: Fri, 17 Apr 2026 16:19:55 +0200
Subject: [PATCH 11/11] remove unnecessary safety for splits
---
src/Mindee/V2/FileOperations/Split.cs | 4 ----
1 file changed, 4 deletions(-)
diff --git a/src/Mindee/V2/FileOperations/Split.cs b/src/Mindee/V2/FileOperations/Split.cs
index 101b4004..25ca0cd4 100644
--- a/src/Mindee/V2/FileOperations/Split.cs
+++ b/src/Mindee/V2/FileOperations/Split.cs
@@ -74,10 +74,6 @@ public ExtractedPdf ExtractSingleSplit(SplitRange splitRange)
public SplitFiles ExtractSplits(List> splits)
{
var pdfExtractor = new PdfExtractor(this._localInput);
- if (splits.Count == 0)
- {
- throw new MindeeInputException("No splits provided for extraction.");
- }
List> expandedPageIndexes = [];
expandedPageIndexes.AddRange(splits.Select(split => ExpandRange(split[0], split[1])));