diff --git a/Directory.Build.props b/Directory.Build.props
index c1dff72d1..9d41defe6 100644
--- a/Directory.Build.props
+++ b/Directory.Build.props
@@ -46,7 +46,7 @@
-
+
diff --git a/src/Mindee/Extraction/ImageExtractor.cs b/src/Mindee/Extraction/ImageExtractor.cs
new file mode 100644
index 000000000..fb7afb27d
--- /dev/null
+++ b/src/Mindee/Extraction/ImageExtractor.cs
@@ -0,0 +1,174 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using Docnet.Core;
+using Docnet.Core.Models;
+using Mindee.Geometry;
+using Mindee.Image;
+using Mindee.Input;
+using SkiaSharp;
+
+namespace Mindee.Extraction
+{
+ ///
+ /// Extract sub-images from an image.
+ ///
+ public class ImageExtractor
+ {
+ /// <summary>
+ /// Name of the file, copied from the input source's Filename by the constructor.
+ /// </summary>
+ protected readonly string _filename;
+ /// <summary>
+ /// List of SKBitmap representing the pages of the file (rendered PDF pages, or the single decoded image).
+ /// </summary>
+ private readonly List _pageImages;
+ /// <summary>
+ /// Format to save the resulting images as.
+ /// </summary>
+ protected readonly string SaveFormat;
+
+ /// <summary>
+ /// LocalInputSource object used by the ImageExtractor.
+ /// </summary>
+ public readonly LocalInputSource LocalInput;
+
+ /// <summary>
+ /// Init from a Local Input Source: resolves the save format, then loads every page as a bitmap.
+ /// </summary>
+ /// <param name="localInput">Locally loaded resource.</param>
+ /// <param name="saveFormat">Format to save the resulting images as. When null, the file extension is used, or "jpg" for PDFs and extension-less names.</param>
+ public ImageExtractor(LocalInputSource localInput, string saveFormat = null)
+ {
+ _filename = localInput.Filename;
+ _pageImages = [];
+ LocalInput = localInput;
+ if (saveFormat == null)
+ {
+ var extension = Path.GetExtension(localInput.Filename)?.TrimStart('.'); // Substring(1) threw when the name had no extension
+ if (!string.IsNullOrEmpty(extension) && !extension.Equals("pdf", StringComparison.OrdinalIgnoreCase))
+ {
+ SaveFormat = extension;
+ }
+ else
+ {
+ SaveFormat = "jpg";
+ }
+ }
+ else
+ {
+ SaveFormat = saveFormat;
+ }
+
+ if (localInput.IsPdf())
+ {
+ var pdfPageImages = PdfToImages(localInput.FileBytes);
+ _pageImages.AddRange(pdfPageImages);
+ }
+ else
+ {
+ _pageImages.Add(SKBitmap.Decode(localInput.FileBytes));
+ }
+ }
+
+ /// <summary>
+ /// Init from a path, by wrapping it in a new LocalInputSource.
+ /// </summary>
+ /// <param name="filePath">Path to the file.</param>
+ public ImageExtractor(string filePath) : this(new LocalInputSource(filePath))
+ {
+ }
+
+ /// <summary>
+ /// Renders the input Pdf's pages as individual images.
+ /// </summary>
+ /// <param name="fileBytes">Input pdf.</param>
+ /// <returns>A list of pages, as SKBitmap, in page order.</returns>
+ private static List PdfToImages(byte[] fileBytes)
+ {
+ var images = new List();
+ lock (DocLib.Instance) // one thread at a time goes through the shared DocLib instance
+ {
+ using var docReader = DocLib.Instance.GetDocReader(fileBytes, new PageDimensions(1));
+ for (var i = 0; i < docReader.GetPageCount(); i++)
+ {
+ using var pageReader = docReader.GetPageReader(i);
+ var width = pageReader.GetPageWidth();
+ var height = pageReader.GetPageHeight();
+ var bytes = pageReader.GetImage();
+ var bmp = ImageUtils.ArrayToImage(ImageUtils.ConvertTo3DArray(bytes, width, height)); // raw page bytes -> 3D array -> bitmap
+ images.Add(bmp);
+ }
+
+ return images;
+ }
+ }
+
+ /// <summary>
+ /// Splits the filename into a two-item array: name without extension, then extension without its leading dot.
+ /// </summary>
+ protected static string[] SplitNameStrict(string filename)
+ {
+ return
+ [
+ Path.GetFileNameWithoutExtension(filename),
+ Path.GetExtension(filename).TrimStart('.')
+ ];
+ }
+
+ /// <summary>
+ /// Gets the number of pages in the file, i.e. the number of page bitmaps loaded by the constructor.
+ /// </summary>
+ /// <returns>The number of pages in the file.</returns>
+ public int GetPageCount()
+ {
+ return _pageImages.Count;
+ }
+
+ /// <summary>
+ /// Extracts a single image from a field having position data.
+ /// </summary>
+ /// <param name="bbox">Bounding box of the field; its coordinates are multiplied by the page width and height.</param>
+ /// <param name="pageIndex">Index of the page containing the field.</param>
+ /// <returns>Extracted image as an SKBitmap.</returns>
+ protected SKBitmap ExtractImage(Bbox bbox, int pageIndex)
+ {
+ var image = _pageImages[pageIndex];
+ var width = image.Width;
+ var height = image.Height;
+ var minX = (int)Math.Round(bbox.MinX * width); // scale the bbox edges to pixel positions on this page
+ var maxX = (int)Math.Round(bbox.MaxX * width);
+ var minY = (int)Math.Round(bbox.MinY * height);
+ var maxY = (int)Math.Round(bbox.MaxY * height);
+
+ var croppedBitmap = new SKBitmap(maxX - minX, maxY - minY);
+ using var canvas = new SKCanvas(croppedBitmap);
+ var destRect = new SKRect(0, 0, croppedBitmap.Width, croppedBitmap.Height);
+ var sourceRect = new SKRect(minX, minY, maxX, maxY);
+ canvas.DrawBitmap(image, sourceRect, destRect); // draw the source region over the whole cropped bitmap
+
+ return croppedBitmap;
+ }
+
+ /// <summary>
+ /// Extracts multiple images from a field having position data.
+ /// </summary>
+ /// <param name="pageId">The page index to extract, begins at 0.</param>
+ /// <param name="polygons">The list of polygons representing the position data.</param>
+ /// <returns>A list of extracted images, one per polygon, in input order.</returns>
+ public List ExtractMultipleImagesFromSource(int pageId, List polygons)
+ {
+ var filename = this.LocalInput.Filename;
+ var extractedImages = new List();
+ int i = 0;
+ foreach (var polygon in polygons)
+ {
+ var bbox = Utils.BboxFromPolygon(polygon);
+ var fieldFilename = $"{filename}_page{pageId}-{i}.{SaveFormat}"; // loop position, not IndexOf: duplicate polygons must not share a name
+ extractedImages.Add(new ExtractedImage(ExtractImage(bbox, pageId), fieldFilename, SaveFormat, pageId, i));
+ i++;
+ }
+ return extractedImages;
+ }
+ }
+}
diff --git a/src/Mindee/Extraction/PdfExtractor.cs b/src/Mindee/Extraction/PdfExtractor.cs
new file mode 100644
index 000000000..625745f2d
--- /dev/null
+++ b/src/Mindee/Extraction/PdfExtractor.cs
@@ -0,0 +1,115 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using Docnet.Core;
+using Microsoft.Extensions.Logging.Abstractions;
+using Mindee.Exceptions;
+using Mindee.Input;
+using Mindee.Pdf;
+using SkiaSharp;
+
+namespace Mindee.Extraction
+{
+ ///
+ /// PDF extraction class.
+ ///
+ public class PdfExtractor
+ {
+ /// <summary>
+ /// Local input source, as passed to the constructor.
+ /// </summary>
+ protected readonly LocalInputSource LocalInput;
+
+ /// <summary>
+ /// Source PDF bytes; null until PdfBytes() has been called once.
+ /// </summary>
+ protected byte[] SourcePdf;
+
+ /// <summary>
+ /// Initializes a new instance of the PdfExtractor class.
+ /// </summary>
+ /// <param name="localInput">Instance of a LocalInputSource, provided by the user.</param>
+ public PdfExtractor(LocalInputSource localInput)
+ {
+ LocalInput = localInput;
+ }
+
+ /// <summary>
+ /// Wrapper for the input source's GetPageCount().
+ /// </summary>
+ /// <returns>The number of pages in the file.</returns>
+ public int GetPageCount()
+ {
+ return LocalInput.GetPageCount();
+ }
+
+ /// <summary>
+ /// Returns the source as PDF bytes, converting an image input to a single-page PDF on first use.
+ /// </summary>
+ /// <returns>The PDF bytes, cached in SourcePdf after the first call.</returns>
+ protected byte[] PdfBytes()
+ {
+ if (SourcePdf != null)
+ {
+ return this.SourcePdf;
+ }
+ if (LocalInput.IsPdf())
+ {
+ SourcePdf = LocalInput.FileBytes;
+ }
+ else
+ {
+ using var memoryStream = new MemoryStream(); // was never disposed
+ using var image = SKImage.FromEncodedData(LocalInput.FileBytes) ?? throw new MindeeInputException($"The file {LocalInput.Filename} is not a valid image.");
+ using var bmp = SKBitmap.FromImage(image);
+ var pageSize = new SKSize(bmp.Width, bmp.Height);
+ using (var document = SKDocument.CreatePdf(memoryStream))
+ {
+ var canvas = document.BeginPage(pageSize.Width, pageSize.Height);
+ canvas.DrawBitmap(bmp, SKPoint.Empty);
+ document.EndPage();
+ }
+
+ SourcePdf = memoryStream.ToArray(); // read only once the document's using block has ended
+ }
+
+ return SourcePdf;
+ }
+
+ /// <summary>
+ /// Extracts sub-documents from the source document using list of page indexes.
+ /// </summary>
+ /// <param name="pageIndexes">List of sub-lists of pages to keep.</param>
+ /// <returns>Extracted documents, one per sub-list, in input order.</returns>
+ /// <exception cref="MindeeInputException">Thrown when one of the sub-lists is empty.</exception>
+ public List ExtractSubDocuments(List> pageIndexes)
+ {
+ var extractedPdfs = new List();
+
+ foreach (var pageIndexElem in pageIndexes)
+ {
+ if (pageIndexElem.Count == 0)
+ {
+ throw new MindeeInputException("Empty indexes not allowed for extraction.");
+ }
+
+ var extension = Path.GetExtension(LocalInput.Filename);
+ var prefix = Path.GetFileNameWithoutExtension(LocalInput.Filename);
+ var fieldFilename =
+ $"{prefix}_{pageIndexElem[0] + 1:D3}-{pageIndexElem[pageIndexElem.Count - 1] + 1:D3}{extension}"; // first and last index, shown 1-based and zero-padded to 3 digits
+
+ var splitQuery = new SplitQuery(
+ PdfBytes(),
+ new PageOptions(pageIndexElem.ConvertAll(item => (short)item).ToArray())); // NOTE(review): the cast to short is unchecked
+ lock (DocLib.Instance) // one thread at a time goes through the shared DocLib instance
+ {
+ var pdfOperation = new DocNetApi(new NullLogger());
+ var mergedPdfBytes = pdfOperation.Split(splitQuery).File;
+ extractedPdfs.Add(new ExtractedPdf(mergedPdfBytes, fieldFilename));
+ }
+ }
+
+ return extractedPdfs;
+ }
+ }
+}
diff --git a/src/Mindee/Image/ExtractedImage.cs b/src/Mindee/Image/ExtractedImage.cs
index d71c48798..c160522a7 100644
--- a/src/Mindee/Image/ExtractedImage.cs
+++ b/src/Mindee/Image/ExtractedImage.cs
@@ -14,17 +14,31 @@ public class ExtractedImage
///
private readonly string _saveFormat;
+ /// <summary>
+ /// Page number the image was extracted from, as given to the constructor.
+ /// </summary>
+ public int PageId;
+
+ /// <summary>
+ /// ID of the image, as given to the constructor.
+ /// </summary>
+ public int ElementId;
+
///
/// Initializes a new instance of the class.
///
/// The extracted image.
/// The filename for the image.
/// The format to save the image.
- public ExtractedImage(SKBitmap image, string filename, string saveFormat)
+ /// <param name="pageId">The page number the image was extracted from.</param>
+ /// <param name="elementId">The ID of the image.</param>
+ public ExtractedImage(SKBitmap image, string filename, string saveFormat, int pageId, int elementId)
{
Image = image;
Filename = filename;
_saveFormat = saveFormat;
+ PageId = pageId;
+ ElementId = elementId;
}
///
@@ -35,45 +49,71 @@ public ExtractedImage(SKBitmap image, string filename, string saveFormat)
///
/// Name of the file.
///
- private string Filename { get; }
+ public string Filename { get; }
///
/// Writes the image to a file.
- /// Uses the default image format and filename.
+ /// If outputPath has an extension, it is treated as a full file path.
+ /// Otherwise, it is treated as a directory and uses the default filename. An existing file is overwritten.
///
- /// The output directory (must exist).
- public void WriteToFile(string outputPath)
+ /// <param name="outputPath">The output directory (must exist) or full file path.</param>
+ /// <param name="quality">The quality of the image. Defaults to 100.</param>
+ /// <param name="fileFormat">The desired format. If null or blank, inferred from extension or default.</param>
+ public void WriteToFile(string outputPath, int quality = 100, string fileFormat = null)
{
- var imagePath = Path.Combine(outputPath, Filename);
- var format = GetEncodedImageFormat(_saveFormat);
+ string imagePath;
+ var targetFormat = string.IsNullOrWhiteSpace(fileFormat) ? _saveFormat : fileFormat; // blank counts as "not given", like the checks below
- using (var image = SKImage.FromBitmap(Image))
- using (var data = image.Encode(format, 100))
- using (var stream = File.OpenWrite(imagePath))
+ if (Path.HasExtension(outputPath))
{
- data.SaveTo(stream);
+ imagePath = outputPath;
+ if (string.IsNullOrWhiteSpace(fileFormat))
+ {
+ var extension = Path.GetExtension(outputPath).TrimStart('.');
+ if (!string.IsNullOrWhiteSpace(extension))
+ {
+ targetFormat = extension.ToLowerInvariant();
+ }
+ }
}
+ else
+ {
+ var finalFilename = Filename;
+ if (!string.IsNullOrWhiteSpace(fileFormat))
+ {
+ var nameWithoutExtension = Path.GetFileNameWithoutExtension(Filename);
+ finalFilename = $"{nameWithoutExtension}.{targetFormat.ToLowerInvariant()}";
+ }
+ imagePath = Path.Combine(outputPath, finalFilename);
+ }
+
+ var format = GetEncodedImageFormat(targetFormat);
+
+ using var image = SKImage.FromBitmap(Image);
+ using var data = image.Encode(format, quality);
+ using var stream = File.Create(imagePath); // OpenWrite does not truncate: a longer existing file kept stale trailing bytes
+ data.SaveTo(stream);
}
///
/// Returns the image in a format suitable for sending to a client for parsing.
///
+ /// <param name="quality">The quality of the image. Defaults to 100.</param>
/// An instance of .
- public LocalInputSource AsInputSource()
+ public LocalInputSource AsInputSource(int quality = 100)
{
- using (var image = SKImage.FromBitmap(Image))
- using (var data = image.Encode(GetEncodedImageFormat(_saveFormat), 100))
- using (var output = new MemoryStream())
- {
- data.SaveTo(output);
- return new LocalInputSource(output.ToArray(), Filename);
- }
+ using var image = SKImage.FromBitmap(Image);
+ using var data = image.Encode(GetEncodedImageFormat(_saveFormat), quality); // always encoded in the construction-time format
+ using var output = new MemoryStream();
+ data.SaveTo(output);
+ return new LocalInputSource(output.ToArray(), Filename);
}
- private SKEncodedImageFormat GetEncodedImageFormat(string saveFormat)
+ private static SKEncodedImageFormat GetEncodedImageFormat(string saveFormat)
{
return saveFormat.ToLower() switch
{
+ "jpg" or "jpeg" => SKEncodedImageFormat.Jpeg,
"png" => SKEncodedImageFormat.Png,
"bmp" => SKEncodedImageFormat.Bmp,
"gif" => SKEncodedImageFormat.Gif,
diff --git a/src/Mindee/Pdf/ExtractedPdf.cs b/src/Mindee/Pdf/ExtractedPdf.cs
index 8286d74eb..6191b1e4c 100644
--- a/src/Mindee/Pdf/ExtractedPdf.cs
+++ b/src/Mindee/Pdf/ExtractedPdf.cs
@@ -1,6 +1,5 @@
using System.IO;
-using Docnet.Core;
-using Docnet.Core.Models;
+using Mindee.Exceptions;
using Mindee.Input;
namespace Mindee.Pdf
@@ -11,24 +10,55 @@ namespace Mindee.Pdf
public class ExtractedPdf
{
///
- /// Name of the original file.
+ /// Local input source.
///
- public readonly string Filename;
+ public readonly LocalInputSource LocalInput;
///
- /// File object for an ExtractedPdf.
+ /// Page count.
///
- public readonly byte[] PdfBytes;
+ public int PageCount { get; set; }
+
+ ///
+ /// Original filename.
+ ///
+ public readonly string Filename;
///
/// Initializes a new instance of the class.
///
- /// A byte array representation of the Pdf.
+ /// <param name="fileBytes">Bytes of a Pdf, or of an image that is first converted to a Pdf.</param>
/// Name of the original file.
- public ExtractedPdf(byte[] pdfBytes, string filename)
+ public ExtractedPdf(byte[] fileBytes, string filename)
{
- PdfBytes = pdfBytes;
- Filename = filename;
+ var tmpInput = new LocalInputSource(fileBytes, filename);
+ if (tmpInput.IsPdf())
+ {
+ LocalInput = tmpInput;
+ }
+ else
+ {
+ byte[] pdfBytes = PdfUtils.ConvertImageToPdf(fileBytes, filename);
+ string newFilename = Path.ChangeExtension(filename, ".pdf"); // converted input is renamed with a .pdf extension
+ LocalInput = new LocalInputSource(pdfBytes, newFilename);
+ }
+ PageCount = LocalInput.GetPageCount();
+ Filename = LocalInput.Filename; // taken from the stored input source
+ }
+
+ /// <summary>
+ /// Initializes a new instance of the class from an input source that must already be a Pdf; throws MindeeInputException otherwise.
+ /// </summary>
+ /// <param name="localInput">LocalInputSource containing the Pdf bytes and filename.</param>
+ public ExtractedPdf(LocalInputSource localInput)
+ {
+ LocalInput = localInput;
+ if (!localInput.IsPdf())
+ {
+ throw new MindeeInputException("The input file is not a PDF.");
+ }
+ PageCount = LocalInput.GetPageCount();
+ Filename = LocalInput.Filename;
}
///
@@ -37,11 +67,7 @@ public ExtractedPdf(byte[] pdfBytes, string filename)
/// The number of pages in the file.
public int GetPageCount()
{
- lock (DocLib.Instance)
- {
- using var docInstance = DocLib.Instance.GetDocReader(PdfBytes, new PageDimensions(1, 1));
- return docInstance.GetPageCount();
- }
+ return LocalInput.GetPageCount();
}
///
@@ -50,13 +76,13 @@ public int GetPageCount()
/// the output directory (must exist).
public void WriteToFile(string outputPath)
{
- var pdfPath = Path.Combine(outputPath, Filename);
+ var pdfPath = Path.Combine(outputPath, LocalInput.Filename);
if (Path.GetFileName(outputPath) != string.Empty)
{
pdfPath = Path.GetFullPath(outputPath);
}
- File.WriteAllBytes(pdfPath, PdfBytes);
+ File.WriteAllBytes(pdfPath, LocalInput.FileBytes);
}
///
@@ -65,7 +91,7 @@ public void WriteToFile(string outputPath)
/// an instance of
public LocalInputSource AsInputSource()
{
- return new LocalInputSource(PdfBytes, Filename);
+ return LocalInput;
}
}
}
diff --git a/src/Mindee/Pdf/PdfUtils.cs b/src/Mindee/Pdf/PdfUtils.cs
index 1aff81561..a44a1406f 100644
--- a/src/Mindee/Pdf/PdfUtils.cs
+++ b/src/Mindee/Pdf/PdfUtils.cs
@@ -1,4 +1,5 @@
using System;
+using System.IO;
using System.Linq;
using Docnet.Core;
using Docnet.Core.Models;
@@ -178,5 +179,34 @@ public static bool HasSourceText(byte[] fileBytes)
return false;
}
+
+ /// <summary>
+ /// Converts an image to a PDF with a single page sized to the image.
+ /// </summary>
+ /// <param name="imageBytes">Raw image bytes.</param>
+ /// <param name="filename">Name of the file; only used in the error message.</param>
+ /// <returns>The bytes of the generated PDF.</returns>
+ /// <exception cref="MindeeInputException">Thrown when the bytes cannot be decoded as an image.</exception>
+ public static byte[] ConvertImageToPdf(byte[] imageBytes, string filename)
+ {
+ using var ms = new MemoryStream();
+ using var bitmap = SKBitmap.Decode(imageBytes);
+ if (bitmap == null)
+ {
+ throw new MindeeInputException($"The file {filename} is not a valid image.");
+ }
+
+ using (var document = SKDocument.CreatePdf(ms))
+ {
+ using (var canvas = document.BeginPage(bitmap.Width, bitmap.Height))
+ {
+ canvas.DrawBitmap(bitmap, 0, 0); // image drawn at the page origin
+ document.EndPage();
+ }
+ document.Close();
+ }
+
+ return ms.ToArray(); // read after the document has been closed
+ }
}
}
diff --git a/src/Mindee/V1/Extraction/PdfExtractor.cs b/src/Mindee/V1/Extraction/PdfExtractor.cs
deleted file mode 100644
index 4c1bb1be7..000000000
--- a/src/Mindee/V1/Extraction/PdfExtractor.cs
+++ /dev/null
@@ -1,165 +0,0 @@
-using System;
-using System.Collections.Generic;
-using System.Diagnostics;
-using System.IO;
-using System.Linq;
-using Docnet.Core;
-using Docnet.Core.Models;
-using Microsoft.Extensions.Logging.Abstractions;
-using Mindee.Input;
-using Mindee.Pdf;
-using Mindee.V1.Product.InvoiceSplitter;
-using SkiaSharp;
-
-namespace Mindee.V1.Extraction
-{
- ///
- /// PDF extraction class.
- ///
- public class PdfExtractor
- {
- private readonly string Filename;
- private readonly byte[] SourcePdf;
-
- ///
- /// Initializes a new instance of the class.
- ///
- /// Instance of a LocalInputSource, provided by the user.
- public PdfExtractor(LocalInputSource localInput)
- {
- Filename = localInput.Filename;
-
- if (localInput.IsPdf())
- {
- SourcePdf = localInput.FileBytes;
- }
- else
- {
- var memoryStream = new MemoryStream();
- using var image = SKImage.FromEncodedData(localInput.FileBytes);
- using var bmp = SKBitmap.FromImage(image);
- var pageSize = new SKSize(bmp.Width, bmp.Height);
- using (var document = SKDocument.CreatePdf(memoryStream))
- {
- var canvas = document.BeginPage(pageSize.Width, pageSize.Height);
- canvas.DrawBitmap(bmp, SKPoint.Empty);
- document.EndPage();
- }
-
- SourcePdf = memoryStream.ToArray();
- }
- }
-
- ///
- /// Wrapper for pdf GetPageCount();
- ///
- /// The number of pages in the file.
- public int GetPageCount()
- {
- lock (DocLib.Instance)
- {
- using var docInstance = DocLib.Instance.GetDocReader(SourcePdf, new PageDimensions(1, 1));
- return docInstance.GetPageCount();
- }
- }
-
- ///
- /// Extracts sub-documents from the source document using list of page indexes.
- ///
- /// List of sub-lists of pages to keep.
- /// Extracted documents.
- ///
- public List ExtractSubDocuments(List> pageIndexes)
- {
- var extractedPdfs = new List();
-
- foreach (var pageIndexElem in pageIndexes)
- {
- if (!pageIndexElem.Any())
- {
- throw new ArgumentException("Empty indexes not allowed for extraction.");
- }
-
- var extension = Path.GetExtension(Filename);
- var prefix = Path.GetFileNameWithoutExtension(Filename);
- var fieldFilename =
- $"{prefix}_{pageIndexElem[0] + 1:D3}-{pageIndexElem[pageIndexElem.Count - 1] + 1:D3}{extension}";
-
- var splitQuery = new SplitQuery(
- SourcePdf,
- new PageOptions(pageIndexElem.ConvertAll(item => (short)item).ToArray()));
- lock (DocLib.Instance)
- {
- var pdfOperation = new DocNetApi(new NullLogger());
- var mergedPdfBytes = pdfOperation.Split(splitQuery).File;
- extractedPdfs.Add(new ExtractedPdf(mergedPdfBytes, fieldFilename));
- }
- }
-
- return extractedPdfs;
- }
-
- ///
- /// Extracts invoices as complete PDFs from the document. Include cuts for confidence scores below 1.0.
- ///
- /// List of sub-lists of pages to keep.
- /// A list of extracted invoices.
- public List ExtractInvoices(List pageIndexes)
- {
- var indexes = pageIndexes.Select(pi => pi.PageIndexes.ToList()).ToList();
- return ExtractSubDocuments(indexes.ToList());
- }
-
- ///
- /// Extracts invoices as complete PDFs from the document.
- ///
- /// List of sub-lists of pages to keep.
- /// Whether to trust confidence scores of 1.0 only or not.
- /// A list of extracted invoices.
- public List ExtractInvoices(IList pageIndexes, bool strict)
- {
- if (!strict)
- {
- return ExtractInvoices(pageIndexes.ToList());
- }
-
- var correctPageIndexes = new List>();
- var iterator = pageIndexes.GetEnumerator();
- using var iterator1 = (IDisposable)iterator;
- var currentList = new List();
- double? previousConfidence = null;
-
- while (iterator.MoveNext())
- {
- var pageIndex = iterator.Current;
- Debug.Assert(pageIndex != null, nameof(pageIndex) + " != null");
- var confidence = pageIndex.Confidence ?? 0.0;
- var pageList = pageIndex.PageIndexes;
-
- if (Math.Abs(confidence - 1.0) < 0.01 && previousConfidence == null)
- {
- currentList = new List(pageList);
- }
- else if (Math.Abs(confidence - 1.0) < 0.01)
- {
- correctPageIndexes.Add(currentList);
- currentList = new List(pageList);
- }
- else if (confidence == 0.0 && !iterator.MoveNext())
- {
- currentList.AddRange(pageList);
- correctPageIndexes.Add(currentList);
- }
- else
- {
- correctPageIndexes.Add(currentList);
- correctPageIndexes.Add(pageList.ToList());
- }
-
- previousConfidence = confidence;
- }
-
- return ExtractSubDocuments(correctPageIndexes);
- }
- }
-}
diff --git a/src/Mindee/V1/Extraction/ImageExtractor.cs b/src/Mindee/V1/Image/ImageExtractor.cs
similarity index 59%
rename from src/Mindee/V1/Extraction/ImageExtractor.cs
rename to src/Mindee/V1/Image/ImageExtractor.cs
index b6f6576bd..91f544fe8 100644
--- a/src/Mindee/V1/Extraction/ImageExtractor.cs
+++ b/src/Mindee/V1/Image/ImageExtractor.cs
@@ -1,122 +1,35 @@
-using System;
using System.Collections.Generic;
-using System.IO;
-using Docnet.Core;
-using Docnet.Core.Models;
+using System.Linq;
using Mindee.Exceptions;
using Mindee.Geometry;
using Mindee.Image;
using Mindee.Input;
using Mindee.V1.Parsing.Standard;
-using SkiaSharp;
-namespace Mindee.V1.Extraction
+namespace Mindee.V1.Image
{
///
- /// Extract sub-images from an image.
+ /// Legacy V1 Wrapper for ImageExtractor.
///
- public class ImageExtractor
+ public sealed class ImageExtractor : Mindee.Extraction.ImageExtractor
{
- private readonly string _filename;
- private readonly List _pageImages;
- private readonly string _saveFormat;
-
- ///
- /// LocalInputSource object used by the ImageExtractor.
- ///
- public readonly LocalInputSource LocalInput;
-
///
/// Init from a Local Input Source.
///
/// Locally loaded resource.
/// Format to save the resulting images as.
public ImageExtractor(LocalInputSource localInput, string saveFormat = null)
- {
- _filename = localInput.Filename;
- _pageImages = new List();
- LocalInput = localInput;
- if (saveFormat == null)
- {
- var extension = Path.GetExtension(localInput.Filename)?.Substring(1);
- if (extension != null && !extension.Equals("pdf", StringComparison.CurrentCultureIgnoreCase))
- {
- _saveFormat = extension;
- }
- else
- {
- _saveFormat = "jpg";
- }
- }
- else
- {
- _saveFormat = saveFormat;
- }
-
- if (localInput.IsPdf())
- {
- var pdfPageImages = PdfToImages(localInput.FileBytes);
- _pageImages.AddRange(pdfPageImages);
- }
- else
- {
- _pageImages.Add(SKBitmap.Decode(localInput.FileBytes));
- }
- }
+ : base(localInput, saveFormat)
+ { }
///
/// Init from a path.
///
/// Path to the file.
- public ImageExtractor(string filePath) : this(new LocalInputSource(filePath))
- {
- }
-
- ///
- /// Renders the input Pdf's pages as individual images.
- ///
- /// Input pdf.
- /// A list of pages, as SKBitmap.
- private static List PdfToImages(byte[] fileBytes)
- {
- var images = new List();
- lock (DocLib.Instance)
- {
- using var docReader = DocLib.Instance.GetDocReader(fileBytes, new PageDimensions(1));
- for (var i = 0; i < docReader.GetPageCount(); i++)
- {
- using var pageReader = docReader.GetPageReader(i);
- var width = pageReader.GetPageWidth();
- var height = pageReader.GetPageHeight();
- var bytes = pageReader.GetImage();
- var bmp = ImageUtils.ArrayToImage(ImageUtils.ConvertTo3DArray(bytes, width, height));
- images.Add(bmp);
- }
+ public ImageExtractor(string filePath)
+ : base(filePath)
+ { }
- return images;
- }
- }
-
- ///
- /// Splits the filename into name and extension.
- ///
- private static string[] SplitNameStrict(string filename)
- {
- return
- [
- Path.GetFileNameWithoutExtension(filename),
- Path.GetExtension(filename).TrimStart('.')
- ];
- }
-
- ///
- /// Gets the number of pages in the file.
- ///
- /// The number of pages in the file.
- public int GetPageCount()
- {
- return _pageImages.Count;
- }
///
/// Extract multiple images on a given page from a list of fields having position data.
@@ -143,7 +56,7 @@ public IList ExtractImagesFromPage(IList
if (GetPageCount() > 1)
{
var splitName = SplitNameStrict(outputName);
- filename = $"{splitName[0]}.{_saveFormat}";
+ filename = $"{splitName[0]}.{SaveFormat}";
}
else
{
@@ -167,7 +80,7 @@ public IList ExtractImagesFromPage(IList fields,
if (GetPageCount() > 1)
{
var splitName = SplitNameStrict(outputName);
- filename = $"{splitName[0]}.{_saveFormat}";
+ filename = $"{splitName[0]}.{SaveFormat}";
}
else
{
@@ -188,25 +101,15 @@ private List ExtractFromPage(IList field
string outputName) where TBaseField : BaseField
{
var splitName = SplitNameStrict(outputName);
- var filename = $"{splitName[0]}_page-{pageIndex + 1:D3}.{_saveFormat}";
-
- var extractedImages = new List();
- for (var i = 0; i < fields.Count; i++)
- {
- var extractedImage = ExtractImage(fields[i], pageIndex, i + 1, filename);
- if (extractedImage != null)
- {
- extractedImages.Add(extractedImage);
- }
- }
+ var filename = $"{splitName[0]}_page-{pageIndex + 1:D3}.{SaveFormat}";
- return extractedImages;
+ return fields.Select((t, i) => ExtractImage(t, pageIndex, i + 1, filename)).Where(extractedImage => extractedImage != null).ToList();
}
private List ExtractFromPage(IList fields, int pageIndex, string outputName)
{
var splitName = SplitNameStrict(outputName);
- var filename = $"{splitName[0]}_page-{pageIndex + 1:D3}.{_saveFormat}";
+ var filename = $"{splitName[0]}_page-{pageIndex + 1:D3}.{SaveFormat}";
var extractedImages = new List();
for (var i = 0; i < fields.Count; i++)
@@ -256,8 +159,8 @@ public ExtractedImage ExtractImage(PositionField field, int pageIndex, int index
}
var bbox = Utils.BboxFromPolygon(boundingBox);
- var fieldFilename = $"{splitName[0]}_{index:D3}.{_saveFormat}";
- return new ExtractedImage(ExtractImage(bbox, pageIndex), fieldFilename, _saveFormat);
+ var fieldFilename = $"{splitName[0]}_{index:D3}.{SaveFormat}";
+ return new ExtractedImage(ExtractImage(bbox, pageIndex), fieldFilename, SaveFormat, pageIndex, index);
}
///
@@ -283,29 +186,8 @@ public ExtractedImage ExtractImage(BaseField field, int pageIndex, int index, st
}
var bbox = Utils.BboxFromPolygon(boundingBox);
- var fieldFilename = $"{splitName[0]}_{index:D3}.{_saveFormat}";
- return new ExtractedImage(ExtractImage(bbox, pageIndex), fieldFilename, _saveFormat);
- }
-
- private SKBitmap ExtractImage(Bbox bbox, int pageIndex)
- {
- var image = _pageImages[pageIndex];
- var width = image.Width;
- var height = image.Height;
- var minX = (int)Math.Round(bbox.MinX * width);
- var maxX = (int)Math.Round(bbox.MaxX * width);
- var minY = (int)Math.Round(bbox.MinY * height);
- var maxY = (int)Math.Round(bbox.MaxY * height);
-
- var croppedBitmap = new SKBitmap(maxX - minX, maxY - minY);
- using (var canvas = new SKCanvas(croppedBitmap))
- {
- var destRect = new SKRect(0, 0, croppedBitmap.Width, croppedBitmap.Height);
- var sourceRect = new SKRect(minX, minY, maxX, maxY);
- canvas.DrawBitmap(image, sourceRect, destRect);
- }
-
- return croppedBitmap;
+ var fieldFilename = $"{splitName[0]}_{index:D3}.{SaveFormat}";
+ return new ExtractedImage(ExtractImage(bbox, pageIndex), fieldFilename, SaveFormat, pageIndex, index);
}
}
}
diff --git a/src/Mindee/V1/Image/PdfExtractor.cs b/src/Mindee/V1/Image/PdfExtractor.cs
new file mode 100644
index 000000000..359852ba3
--- /dev/null
+++ b/src/Mindee/V1/Image/PdfExtractor.cs
@@ -0,0 +1,84 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Linq;
+using Mindee.Input;
+using Mindee.Pdf;
+using Mindee.V1.Product.InvoiceSplitter;
+
+namespace Mindee.V1.Image
+{
+ ///
+ /// V1 wrapper for the PDF extraction class.
+ ///
+ public class PdfExtractor : Mindee.Extraction.PdfExtractor
+ {
+ /// <summary>Initializes the V1 extractor by forwarding the input source to the base PdfExtractor.</summary>
+ public PdfExtractor(LocalInputSource localInput) : base(localInput)
+ {
+ }
+
+ /// <summary>
+ /// Extracts invoices as complete PDFs from the document. Include cuts for confidence scores below 1.0.
+ /// </summary>
+ /// <param name="pageIndexes">List of sub-lists of pages to keep.</param>
+ /// <returns>A list of extracted invoices.</returns>
+ public List ExtractInvoices(List pageIndexes)
+ {
+ var indexes = pageIndexes.Select(pi => pi.PageIndexes.ToList()).ToList();
+ return ExtractSubDocuments(indexes); // already a fresh list; the second ToList() copy was redundant
+ }
+
+ /// <summary>
+ /// Extracts invoices as complete PDFs from the document.
+ /// </summary>
+ /// <param name="pageIndexes">List of sub-lists of pages to keep.</param>
+ /// <param name="strict">Whether to trust confidence scores of 1.0 only or not.</param>
+ /// <returns>A list of extracted invoices.</returns>
+ public List ExtractInvoices(IList pageIndexes, bool strict)
+ {
+ if (!strict)
+ {
+ return ExtractInvoices(pageIndexes.ToList()); // non-strict: every group is extracted as given
+ }
+
+ var correctPageIndexes = new List>();
+ var iterator = pageIndexes.GetEnumerator();
+ using var iterator1 = (IDisposable)iterator; // only there to dispose the enumerator at the end of the method
+ var currentList = new List();
+ double? previousConfidence = null;
+
+ while (iterator.MoveNext())
+ {
+ var pageIndex = iterator.Current;
+ Debug.Assert(pageIndex != null, nameof(pageIndex) + " != null");
+ var confidence = pageIndex.Confidence ?? 0.0; // a missing confidence counts as 0.0
+ var pageList = pageIndex.PageIndexes;
+
+ if (Math.Abs(confidence - 1.0) < 0.01 && previousConfidence == null) // first group, confidence within 0.01 of 1.0
+ {
+ currentList = new List(pageList);
+ }
+ else if (Math.Abs(confidence - 1.0) < 0.01) // later confident group: flush the current list and start a new one
+ {
+ correctPageIndexes.Add(currentList);
+ currentList = new List(pageList);
+ }
+ else if (confidence == 0.0 && !iterator.MoveNext()) // NOTE(review): MoveNext() advances here; when it returns true the loop advances again and that element is never read
+ {
+ currentList.AddRange(pageList);
+ correctPageIndexes.Add(currentList);
+ }
+ else
+ {
+ correctPageIndexes.Add(currentList);
+ correctPageIndexes.Add(pageList.ToList());
+ }
+
+ previousConfidence = confidence;
+ }
+
+ return ExtractSubDocuments(correctPageIndexes);
+ }
+ }
+}
diff --git a/src/Mindee/V2/FileOperations/Crop.cs b/src/Mindee/V2/FileOperations/Crop.cs
new file mode 100644
index 000000000..596011152
--- /dev/null
+++ b/src/Mindee/V2/FileOperations/Crop.cs
@@ -0,0 +1,59 @@
+using System.Collections.Generic;
+using System.Linq;
+using Mindee.Extraction;
+using Mindee.Geometry;
+using Mindee.Image;
+using Mindee.Input;
+using Mindee.V2.Product.Crop;
+
+namespace Mindee.V2.FileOperations
+{
+ /// <summary>
+ /// V2 Crop operation utility.
+ /// </summary>
+ public sealed class Crop
+ {
+ /// <summary>
+ /// LocalInputSource object.
+ /// </summary>
+ private readonly LocalInputSource _localInput;
+
+ /// <summary>
+ /// Initializes a Crop operation on the given input source.
+ /// </summary>
+ /// <param name="inputSource">Input source the crops are extracted from.</param>
+ public Crop(LocalInputSource inputSource)
+ {
+ this._localInput = inputSource;
+ }
+
+ /// <summary>
+ /// Extract a single crop item from a file.
+ /// </summary>
+ /// <param name="crop">Crop item whose location (page and polygon) is extracted.</param>
+ /// <returns>The extracted image for that crop.</returns>
+ public ExtractedImage ExtractSingleCrop(CropItem crop)
+ {
+ var polygons = new List { crop.Location.Polygon };
+ var imageExtractor = new ImageExtractor(this._localInput);
+ return imageExtractor.ExtractMultipleImagesFromSource(crop.Location.Page, polygons)[0];
+ }
+
+ /// <summary>
+ /// Extracts multiple crop zones from a file.
+ /// </summary>
+ /// <param name="crops">List of crops.</param>
+ /// <returns>The extracted images, added one page group at a time.</returns>
+ public CropFiles ExtractCrops(List crops)
+ {
+ var imageExtractor = new ImageExtractor(this._localInput);
+ CropFiles extractedImages = [];
+ var cropsPerPage = crops.GroupBy(c => c.Location.Page).ToList(); // one extraction call per page
+ foreach (var pageCrops in cropsPerPage)
+ {
+ extractedImages.AddRange(imageExtractor.ExtractMultipleImagesFromSource(pageCrops.Key, pageCrops.Select(c => c.Location.Polygon).ToList()));
+ }
+ return extractedImages;
+ }
+ }
+}
diff --git a/src/Mindee/V2/FileOperations/CropFiles.cs b/src/Mindee/V2/FileOperations/CropFiles.cs
new file mode 100644
index 000000000..f547761a4
--- /dev/null
+++ b/src/Mindee/V2/FileOperations/CropFiles.cs
@@ -0,0 +1,50 @@
+using System.Collections.Generic;
+using System.IO;
+using Mindee.Image;
+
+namespace Mindee.V2.FileOperations
+{
+ /// <summary>
+ /// Collection of cropped files.
+ /// </summary>
+ public class CropFiles : List
+ {
+ /// <summary>
+ /// Creates a collection holding the items of an existing collection.
+ /// </summary>
+ /// <param name="collection">Items copied into the new collection.</param>
+ public CropFiles(IEnumerable collection) : base(collection)
+ {
+ }
+
+ /// <summary>
+ /// Creates an empty collection.
+ /// </summary>
+ public CropFiles() : base()
+ {
+ }
+
+ /// <summary>
+ /// Saves all cropped files to disk as prefix_001, prefix_002, ... with an extension matching the encoding.
+ /// </summary>
+ /// <param name="path">Path for all files; the directory is created when missing.</param>
+ /// <param name="quality">Quality of the output image</param>
+ /// <param name="prefix">Prefix for file names</param>
+ /// <param name="fileFormat">File format for saving (default: null, which saves as jpg)</param>
+ public void SaveAllToDisk(string path, int quality = 100, string prefix = "crop", string fileFormat = null)
+ {
+ Directory.CreateDirectory(path);
+ string extension = string.IsNullOrWhiteSpace(fileFormat) ? "jpg" : fileFormat.ToLowerInvariant(); // was always "jpg", even for other formats
+ int index = 1;
+ foreach (var crop in this)
+ {
+ string fileName = $"{prefix}_{index:D3}.{extension}";
+ string filePath = Path.Combine(path, fileName);
+
+ crop.WriteToFile(filePath, quality, fileFormat);
+
+ index++;
+ }
+ }
+ }
+}
diff --git a/src/Mindee/V2/FileOperations/Split.cs b/src/Mindee/V2/FileOperations/Split.cs
new file mode 100644
index 000000000..25ca0cd47
--- /dev/null
+++ b/src/Mindee/V2/FileOperations/Split.cs
@@ -0,0 +1,83 @@
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using Mindee.Exceptions;
+using Mindee.Extraction;
+using Mindee.Input;
+using Mindee.Pdf;
+using Mindee.V2.Product.Split;
+
+namespace Mindee.V2.FileOperations
+{
+ /// <summary>
+ /// V2 Split operation utility.
+ /// </summary>
+ public sealed class Split
+ {
+ /// <summary>
+ /// LocalInputSource object.
+ /// </summary>
+ private readonly LocalInputSource _localInput;
+
+ /// <summary>
+ /// Expands an inclusive range of pages into a list of page indexes.
+ /// </summary>
+ /// <param name="start">Start of the range.</param>
+ /// <param name="end">End of the range.</param>
+ /// <returns>A list of page indexes, from start to end inclusive.</returns>
+ /// <exception cref="MindeeInputException">Thrown when start is greater than end.</exception>
+ public static List<int> ExpandRange(int start, int end)
+ {
+ if (start > end)
+ {
+ throw new MindeeInputException("Invalid page range provided.");
+ }
+ return Enumerable.Range(start, end - start + 1).ToList();
+ }
+
+ /// <summary>
+ /// Initializes an instance of a Split operation, transforming images to PDFs if necessary.
+ /// </summary>
+ /// <param name="inputSource">Local file to split.</param>
+ public Split(LocalInputSource inputSource)
+ {
+ if (inputSource.IsPdf())
+ {
+ _localInput = inputSource;
+ }
+ else
+ {
+ byte[] pdfBytes = PdfUtils.ConvertImageToPdf(inputSource.FileBytes, inputSource.Filename);
+ string newFilename = Path.ChangeExtension(inputSource.Filename, ".pdf");
+ _localInput = new LocalInputSource(pdfBytes, newFilename);
+ }
+ }
+
+ /// <summary>
+ /// Extracts a single split from the input file.
+ /// </summary>
+ /// <param name="splitRange">Split whose page range is extracted.</param>
+ /// <returns>The first PDF produced for that page range.</returns>
+ public ExtractedPdf ExtractSingleSplit(SplitRange splitRange)
+ {
+ return ExtractSplits([splitRange.PageRange])[0];
+ }
+
+ /// <summary>
+ /// Extracts the splits from the input file.
+ /// </summary>
+ /// <param name="splits">List of subpage indexes to keep.</param>
+ /// <returns>The extracted PDFs, wrapped in a SplitFiles collection.</returns>
+ public SplitFiles ExtractSplits(List<List<int>> splits)
+ {
+ // Each entry is read as [start, end]; reject short or null entries with the same error as an inverted range.
+ if (splits.Any(split => split == null || split.Count < 2))
+ {
+ throw new MindeeInputException("Invalid page range provided.");
+ }
+ var pdfExtractor = new PdfExtractor(this._localInput);
+ List<List<int>> expandedPageIndexes = splits.Select(split => ExpandRange(split[0], split[1])).ToList();
+ return new SplitFiles(pdfExtractor.ExtractSubDocuments(expandedPageIndexes));
+ }
+ }
+}
diff --git a/src/Mindee/V2/FileOperations/SplitFiles.cs b/src/Mindee/V2/FileOperations/SplitFiles.cs
new file mode 100644
index 000000000..443f08478
--- /dev/null
+++ b/src/Mindee/V2/FileOperations/SplitFiles.cs
@@ -0,0 +1,48 @@
+using System.Collections.Generic;
+using System.IO;
+using Mindee.Pdf;
+
+namespace Mindee.V2.FileOperations
+{
+ /// <summary>
+ /// Collection of split PDFs.
+ /// </summary>
+ public sealed class SplitFiles : List<ExtractedPdf>
+ {
+ /// <summary>
+ /// Creates a collection holding the given extracted PDFs.
+ /// </summary>
+ /// <param name="collection">PDFs copied into the new collection.</param>
+ public SplitFiles(IEnumerable<ExtractedPdf> collection) : base(collection)
+ {
+ }
+
+ /// <summary>
+ /// Creates an empty collection.
+ /// </summary>
+ public SplitFiles() : base()
+ {
+ }
+
+ /// <summary>
+ /// Saves all the extracted pages to disk.
+ /// </summary>
+ /// <param name="path">Path for all files</param>
+ /// <param name="prefix">Prefix for file names</param>
+ public void SaveAllToDisk(string path, string prefix = "split")
+ {
+ Directory.CreateDirectory(path);
+
+ // Files are numbered from 001 in collection order, e.g. split_001.pdf with the default prefix.
+ int index = 1;
+ foreach (var split in this)
+ {
+ string fileName = $"{prefix}_{index:D3}.pdf";
+ string filePath = Path.Combine(path, fileName);
+ split.WriteToFile(filePath);
+
+ index++;
+ }
+ }
+ }
+}
diff --git a/src/Mindee/V2/Product/Crop/CropItem.cs b/src/Mindee/V2/Product/Crop/CropItem.cs
index 923811cfd..d703f28e7 100644
--- a/src/Mindee/V2/Product/Crop/CropItem.cs
+++ b/src/Mindee/V2/Product/Crop/CropItem.cs
@@ -1,4 +1,9 @@
+using System.Collections.Generic;
using System.Text.Json.Serialization;
+using Mindee.Extraction;
+using Mindee.Geometry;
+using Mindee.Image;
+using Mindee.Input;
using Mindee.V2.Parsing.Inference.Field;
namespace Mindee.V2.Product.Crop
@@ -28,5 +33,16 @@ public override string ToString()
{
return $"* :Location: {Location}\n :Object Type: {ObjectType}";
}
+
+ /// <summary>
+ /// Extract the crop from the source document.
+ /// </summary>
+ /// <param name="inputSource">Local source document the crop is taken from.</param>
+ /// <returns>The image extracted for this crop.</returns>
+ public ExtractedImage ExtractFromFile(LocalInputSource inputSource)
+ {
+ // Delegates to a V2 crop file operation built for this single input.
+ return new FileOperations.Crop(inputSource).ExtractSingleCrop(this);
+ }
}
}
diff --git a/src/Mindee/V2/Product/Split/SplitRange.cs b/src/Mindee/V2/Product/Split/SplitRange.cs
index 6d0cb704c..7b93519d7 100644
--- a/src/Mindee/V2/Product/Split/SplitRange.cs
+++ b/src/Mindee/V2/Product/Split/SplitRange.cs
@@ -1,5 +1,7 @@
using System.Collections.Generic;
using System.Text.Json.Serialization;
+using Mindee.Input;
+using Mindee.Pdf;
namespace Mindee.V2.Product.Split
{
@@ -29,5 +31,16 @@ public override string ToString()
string pageRange = string.Join(",", PageRange);
return $"* :Page Range: {pageRange}\n :Document Type: {DocumentType}";
}
+
+ /// <summary>
+ /// Extracts the split from the source document.
+ /// </summary>
+ /// <param name="inputSource">Local source document the split is taken from.</param>
+ /// <returns>The PDF extracted for this split.</returns>
+ public ExtractedPdf ExtractFromFile(LocalInputSource inputSource)
+ {
+ // Delegates to a V2 split file operation built for this single input.
+ return new FileOperations.Split(inputSource).ExtractSingleSplit(this);
+ }
}
}
diff --git a/tests/Mindee.IntegrationTests/V1/InvoiceSplitterAutoExtractionTest.cs b/tests/Mindee.IntegrationTests/V1/InvoiceSplitterAutoExtractionTest.cs
index 98234901b..616d3c16f 100644
--- a/tests/Mindee.IntegrationTests/V1/InvoiceSplitterAutoExtractionTest.cs
+++ b/tests/Mindee.IntegrationTests/V1/InvoiceSplitterAutoExtractionTest.cs
@@ -1,6 +1,6 @@
using Mindee.Input;
using Mindee.Pdf;
-using Mindee.V1.Extraction;
+using Mindee.V1.Image;
using Mindee.V1.Parsing.Common;
using Mindee.V1.Product.Invoice;
using Mindee.V1.Product.InvoiceSplitter;
diff --git a/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs b/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs
new file mode 100644
index 000000000..e851e459e
--- /dev/null
+++ b/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs
@@ -0,0 +1,108 @@
+using Mindee.Input;
+using Mindee.V2;
+using Mindee.V2.FileOperations;
+using Mindee.V2.Product.Crop;
+using Mindee.V2.Product.Crop.Params;
+using Mindee.V2.Product.Extraction;
+using Mindee.V2.Product.Extraction.Params;
+
+namespace Mindee.IntegrationTests.V2.FileOperations
+{
+ /// <summary>
+ /// Integration tests for the V2 crop file operation; API key and model ids come from environment variables.
+ /// </summary>
+ [Trait("Category", "V2")]
+ [Trait("Category", "FileOperations")]
+ public class CropTest : IDisposable
+ {
+ private readonly string? _cropModelId;
+ private readonly string? _findocModelId;
+ private readonly Client _client;
+ private readonly string _outputDir;
+
+ public CropTest()
+ {
+ var apiKey = Environment.GetEnvironmentVariable("MindeeV2__ApiKey");
+ _client = TestingUtilities.GetOrGenerateMindeeClientV2(apiKey);
+ _cropModelId = Environment.GetEnvironmentVariable("MindeeV2__Crop__Model__Id");
+ _findocModelId = Environment.GetEnvironmentVariable("MindeeV2__Findoc__Model__Id");
+
+ _outputDir = Path.Combine(Directory.GetCurrentDirectory(), "output");
+ Directory.CreateDirectory(_outputDir);
+ }
+
+ public void Dispose()
+ {
+ var file1 = Path.Combine(_outputDir, "crop_001.jpg");
+ var file2 = Path.Combine(_outputDir, "crop_002.jpg");
+
+ if (File.Exists(file1)) File.Delete(file1);
+ if (File.Exists(file2)) File.Delete(file2);
+ }
+
+ private void CheckFindocReturn(ExtractionResponse findocResponse)
+ {
+ Assert.True(findocResponse.Inference.Model.Id.Length > 0);
+
+ var totalAmount = findocResponse.Inference.Result.Fields["total_amount"].SimpleField;
+ Assert.NotNull(totalAmount);
+ Assert.True(totalAmount.Value > 0);
+ }
+
+ [Fact(Timeout = 180000)]
+ public async Task Extract_Crops_From_Image_Correctly()
+ {
+ var inputSource = new LocalInputSource(Path.Combine(
+ Constants.V2ProductDir, "crop/default_sample.jpg"));
+ var cropParams = new CropParameters(_cropModelId);
+
+ var response = await _client.EnqueueAndGetResultAsync<CropResponse>(
+ inputSource, cropParams);
+
+ Assert.NotNull(response);
+ Assert.Equal(2, response.Inference.Result.Crops.Count);
+
+ var cropOperation = new Crop(inputSource);
+ var extractedImages = cropOperation.ExtractCrops(response.Inference.Result.Crops);
+
+ Assert.Equal(2, extractedImages.Count);
+ Assert.Equal("default_sample.jpg_page0-0.jpg", extractedImages[0].Filename);
+ Assert.Equal("default_sample.jpg_page0-1.jpg", extractedImages[1].Filename);
+
+ var extractionInput = extractedImages[0].AsInputSource();
+ var findocParams = new ExtractionParameters(_findocModelId);
+
+ var invoice0 = await _client.EnqueueAndGetResultAsync<ExtractionResponse>(
+ extractionInput, findocParams);
+
+ CheckFindocReturn(invoice0);
+
+ extractedImages.SaveAllToDisk(_outputDir, 50);
+
+ var file1Info = new FileInfo(Path.Combine(_outputDir, "crop_001.jpg"));
+ Assert.InRange(file1Info.Length, 99000, 110000);
+
+ var file2Info = new FileInfo(Path.Combine(_outputDir, "crop_002.jpg"));
+ Assert.InRange(file2Info.Length, 99000, 110000);
+ }
+
+ [Fact(Timeout = 180000)]
+ public async Task Extract_Crops_From_Each_Pdf_Page_Correctly()
+ {
+ var inputSource = new LocalInputSource(
+ new FileInfo(Path.Combine(Constants.V2ProductDir, "crop/multipage_sample.pdf")));
+
+ var cropParams = new CropParameters(_cropModelId);
+
+ var response = await _client.EnqueueAndGetResultAsync<CropResponse>(
+ inputSource, cropParams);
+ Assert.NotNull(response);
+ var cropOperation = new Crop(inputSource);
+ var extractedImages = cropOperation.ExtractCrops(response.Inference.Result.Crops);
+
+ Assert.Equal(5, extractedImages.Count);
+ Assert.Equal("multipage_sample.pdf_page0-0.jpg", extractedImages[0].Filename);
+ Assert.Equal("multipage_sample.pdf_page1-0.jpg", extractedImages[3].Filename);
+ }
+ }
+}
diff --git a/tests/Mindee.IntegrationTests/V2/FileOperations/SplitTest.cs b/tests/Mindee.IntegrationTests/V2/FileOperations/SplitTest.cs
new file mode 100644
index 000000000..1801d90d1
--- /dev/null
+++ b/tests/Mindee.IntegrationTests/V2/FileOperations/SplitTest.cs
@@ -0,0 +1,97 @@
+using Mindee.Input;
+using Mindee.V2;
+using Mindee.V2.FileOperations;
+using Mindee.V2.Product.Extraction;
+using Mindee.V2.Product.Extraction.Params;
+using Mindee.V2.Product.Split;
+using Mindee.V2.Product.Split.Params;
+
+namespace Mindee.IntegrationTests.V2.FileOperations
+{
+ /// <summary>
+ /// Integration tests for the V2 split file operation; API key and model ids come from environment variables.
+ /// </summary>
+ [Trait("Category", "V2")]
+ [Trait("Category", "FileOperations")]
+ public class SplitTest : IDisposable
+ {
+ private readonly string? _splitModelId;
+ private readonly string? _findocModelId;
+ private readonly Client _client;
+ private readonly string _outputDir;
+
+ public SplitTest()
+ {
+ var apiKey = Environment.GetEnvironmentVariable("MindeeV2__ApiKey");
+ _client = TestingUtilities.GetOrGenerateMindeeClientV2(apiKey);
+ _splitModelId = Environment.GetEnvironmentVariable("MindeeV2__Split__Model__Id");
+ _findocModelId = Environment.GetEnvironmentVariable("MindeeV2__Findoc__Model__Id");
+
+ _outputDir = Path.Combine(Directory.GetCurrentDirectory(), "output");
+ Directory.CreateDirectory(_outputDir);
+ }
+
+ public void Dispose()
+ {
+ var file1 = Path.Combine(_outputDir, "split_001.pdf");
+ var file2 = Path.Combine(_outputDir, "split_002.pdf");
+
+ if (File.Exists(file1)) File.Delete(file1);
+ if (File.Exists(file2)) File.Delete(file2);
+ }
+
+ private void CheckFindocReturn(ExtractionResponse findocResponse)
+ {
+ Assert.True(findocResponse.Inference.Model.Id.Length > 0);
+
+ var totalAmount = findocResponse.Inference.Result.Fields["total_amount"].SimpleField;
+ Assert.NotNull(totalAmount);
+ Assert.True(totalAmount.Value > 0);
+ }
+
+ [Fact(Timeout = 180000)]
+ public async Task Extract_Splits_From_Pdf_Correctly()
+ {
+ var inputSource = new LocalInputSource(
+ Path.Combine(Constants.V2ProductDir, "split/default_sample.pdf"));
+ var splitParams = new SplitParameters(_splitModelId);
+
+ var response = await _client.EnqueueAndGetResultAsync<SplitResponse>(
+ inputSource, splitParams);
+
+ Assert.NotNull(response);
+ Assert.Equal(2, response.Inference.Result.Splits.Count);
+
+ var splitOperation = new Split(inputSource);
+ var extractedSplits = splitOperation.ExtractSplits(
+ response.Inference.Result.Splits.Select(s => s.PageRange).ToList());
+
+ Assert.Equal(2, extractedSplits.Count);
+ Assert.Equal("default_sample_001-001.pdf", extractedSplits[0].Filename);
+ Assert.Equal("default_sample_002-002.pdf", extractedSplits[1].Filename);
+
+ var extractionInput = extractedSplits[0].AsInputSource();
+ var findocParams = new ExtractionParameters(_findocModelId);
+
+ var invoice0 = await _client.EnqueueAndGetResultAsync<ExtractionResponse>(
+ extractionInput, findocParams);
+
+ CheckFindocReturn(invoice0);
+
+ extractedSplits.SaveAllToDisk(_outputDir);
+
+ for (int i = 0; i < extractedSplits.Count; i++)
+ {
+ var fileName = $"split_{i + 1:D3}.pdf";
+ var filePath = Path.Combine(_outputDir, fileName);
+ var fileInfo = new FileInfo(filePath);
+
+ Assert.True(fileInfo.Exists);
+ Assert.True(fileInfo.Length > 0);
+
+ var localInput = new LocalInputSource(fileInfo);
+ Assert.Equal(extractedSplits[i].PageCount, localInput.GetPageCount());
+ }
+ }
+ }
+}
diff --git a/tests/Mindee.UnitTests/Extraction/ImageExtractorTest.cs b/tests/Mindee.UnitTests/Extraction/ImageExtractorTest.cs
index e85640256..e3503329f 100644
--- a/tests/Mindee.UnitTests/Extraction/ImageExtractorTest.cs
+++ b/tests/Mindee.UnitTests/Extraction/ImageExtractorTest.cs
@@ -1,5 +1,5 @@
using Mindee.Input;
-using Mindee.V1.Extraction;
+using Mindee.V1.Image;
using Mindee.V1.Parsing.Common;
using Mindee.V1.Product.BarcodeReader;
using Mindee.V1.Product.MultiReceiptsDetector;
diff --git a/tests/Mindee.UnitTests/Extraction/PdfExtractorTest.cs b/tests/Mindee.UnitTests/Extraction/PdfExtractorTest.cs
index b88fd3d54..a62072732 100644
--- a/tests/Mindee.UnitTests/Extraction/PdfExtractorTest.cs
+++ b/tests/Mindee.UnitTests/Extraction/PdfExtractorTest.cs
@@ -1,5 +1,5 @@
using Mindee.Input;
-using Mindee.V1.Extraction;
+using Mindee.V1.Image;
using Mindee.V1.Parsing.Common;
using Mindee.V1.Product.InvoiceSplitter;
diff --git a/tests/Mindee.UnitTests/V2/FileOperations/CropTest.cs b/tests/Mindee.UnitTests/V2/FileOperations/CropTest.cs
new file mode 100644
index 000000000..5a2fb359a
--- /dev/null
+++ b/tests/Mindee.UnitTests/V2/FileOperations/CropTest.cs
@@ -0,0 +1,67 @@
+using Mindee.Input;
+using Mindee.V2.FileOperations;
+using Mindee.V2.Parsing;
+using Mindee.V2.Product.Crop;
+
+namespace Mindee.UnitTests.V2.FileOperations
+{
+ [Trait("Category", "V2")]
+ [Trait("Category", "FileOperations")]
+ public class CropTest
+ {
+ private readonly string _cropDataDir = Path.Combine(Constants.V2RootDir, "products", "crop");
+
+ [Fact]
+ public void Processes_SinglePage_CropSplit_Correctly()
+ {
+ var inputSample = new LocalInputSource(
+ new FileInfo(Path.Combine(_cropDataDir, "default_sample.jpg")));
+
+ var localResponse = new LocalResponse(
+ new FileInfo(Path.Combine(_cropDataDir, "crop_single.json")));
+ var doc = localResponse.DeserializeResponse<CropResponse>();
+ // Cut the crops described by the stored response out of the local sample file.
+ var cropOperation = new Crop(inputSample);
+ var extractedCrops = cropOperation.ExtractCrops(doc.Inference.Result.Crops);
+
+ Assert.Single(extractedCrops);
+
+ Assert.Equal(0, extractedCrops[0].PageId);
+ Assert.Equal(0, extractedCrops[0].ElementId);
+
+ using var bitmap0 = extractedCrops[0].Image;
+ Assert.Equal(2822, bitmap0.Width);
+ Assert.Equal(1572, bitmap0.Height);
+ }
+
+ [Fact]
+ public void Processes_MultiPage_ReceiptSplit_Correctly()
+ {
+ var inputSample = new LocalInputSource(
+ new FileInfo(Path.Combine(_cropDataDir, "multipage_sample.pdf")));
+
+ var localResponse = new LocalResponse(
+ new FileInfo(Path.Combine(_cropDataDir, "crop_multiple.json")));
+ var doc = localResponse.DeserializeResponse<CropResponse>();
+ // Cut the crops described by the stored response out of the local sample file.
+ var cropOperation = new Crop(inputSample);
+ var extractedCrops = cropOperation.ExtractCrops(doc.Inference.Result.Crops);
+
+ Assert.Equal(2, extractedCrops.Count);
+
+ Assert.Equal(0, extractedCrops[0].PageId);
+ Assert.Equal(0, extractedCrops[0].ElementId);
+
+ using var bitmap0 = extractedCrops[0].Image;
+ Assert.Equal(156, bitmap0.Width);
+ Assert.Equal(757, bitmap0.Height);
+
+ Assert.Equal(0, extractedCrops[1].PageId);
+ Assert.Equal(1, extractedCrops[1].ElementId);
+
+ using var bitmap1 = extractedCrops[1].Image;
+ Assert.Equal(188, bitmap1.Width);
+ Assert.Equal(691, bitmap1.Height);
+ }
+ }
+}
diff --git a/tests/Mindee.UnitTests/V2/FileOperations/SplitTest.cs b/tests/Mindee.UnitTests/V2/FileOperations/SplitTest.cs
new file mode 100644
index 000000000..88093be39
--- /dev/null
+++ b/tests/Mindee.UnitTests/V2/FileOperations/SplitTest.cs
@@ -0,0 +1,55 @@
+using Mindee.Input;
+using Mindee.V2.FileOperations;
+using Mindee.V2.Parsing;
+using Mindee.V2.Product.Split;
+
+namespace Mindee.UnitTests.V2.FileOperations
+{
+ [Trait("Category", "V2")]
+ [Trait("Category", "FileOperations")]
+ public class SplitTest
+ {
+ private readonly string _splitDataDir = Path.Combine(Constants.V2RootDir, "products", "split");
+ private readonly string _finDocDataDir = Path.Combine(Constants.V2RootDir, "products", "extraction", "financial_document");
+
+ [Fact]
+ public void Processes_SinglePage_Split_Correctly()
+ {
+ var inputSample = new LocalInputSource(
+ new FileInfo(Path.Combine(_finDocDataDir, "default_sample.jpg")));
+
+ var localResponse = new LocalResponse(
+ new FileInfo(Path.Combine(_splitDataDir, "split_single.json")));
+ var doc = localResponse.DeserializeResponse<SplitResponse>();
+ // The sample is an image, so Split converts it to a PDF before extracting.
+ var splitOperation = new Split(inputSample);
+ List<SplitRange> splits = doc.Inference.Result.Splits;
+ var extractedSplits = splitOperation.ExtractSplits(splits.Select(s => s.PageRange).ToList());
+
+ Assert.Single(extractedSplits);
+
+ Assert.Equal(1, extractedSplits[0].PageCount);
+ }
+
+ [Fact]
+ public void Processes_MultiPage_ReceiptSplit_Correctly()
+ {
+ var inputSample = new LocalInputSource(
+ new FileInfo(Path.Combine(_splitDataDir, "invoice_5p.pdf")));
+
+ var localResponse = new LocalResponse(
+ new FileInfo(Path.Combine(_splitDataDir, "split_multiple.json")));
+ var doc = localResponse.DeserializeResponse<SplitResponse>();
+ // Extract one PDF per page range listed in the stored response.
+ var splitOperation = new Split(inputSample);
+ List<SplitRange> splits = doc.Inference.Result.Splits;
+ var extractedSplits = splitOperation.ExtractSplits(splits.Select(s => s.PageRange).ToList());
+
+ Assert.Equal(3, extractedSplits.Count);
+
+ Assert.Equal(1, extractedSplits[0].PageCount);
+ Assert.Equal(3, extractedSplits[1].PageCount);
+ Assert.Equal(1, extractedSplits[2].PageCount);
+ }
+ }
+}