diff --git a/Directory.Build.props b/Directory.Build.props
index c1dff72d1..9d41defe6 100644
--- a/Directory.Build.props
+++ b/Directory.Build.props
@@ -46,7 +46,7 @@
-
+
diff --git a/src/Mindee/Extraction/ImageExtractor.cs b/src/Mindee/Extraction/ImageExtractor.cs
new file mode 100644
index 000000000..fb7afb27d
--- /dev/null
+++ b/src/Mindee/Extraction/ImageExtractor.cs
@@ -0,0 +1,174 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using Docnet.Core;
+using Docnet.Core.Models;
+using Mindee.Exceptions;
+using Mindee.Geometry;
+using Mindee.Image;
+using Mindee.Input;
+using SkiaSharp;
+
+namespace Mindee.Extraction
+{
+    /// <summary>
+    /// Extract sub-images from an image.
+    /// </summary>
+    public class ImageExtractor
+    {
+        /// <summary>
+        /// Name of the file.
+        /// </summary>
+        protected readonly string _filename;
+        /// <summary>
+        /// List of SKBitmap representing the pages of the file.
+        /// </summary>
+        private readonly List<SKBitmap> _pageImages;
+        /// <summary>
+        /// Format to save the resulting images as.
+        /// </summary>
+        protected readonly string SaveFormat;
+
+        /// <summary>
+        /// LocalInputSource object used by the ImageExtractor.
+        /// </summary>
+        public readonly LocalInputSource LocalInput;
+
+        /// <summary>
+        /// Init from a Local Input Source.
+        /// </summary>
+        /// <param name="localInput">Locally loaded resource.</param>
+        /// <param name="saveFormat">
+        /// Format to save the resulting images as. When null, the extension of the input filename is used,
+        /// falling back to "jpg" for PDFs and for filenames without an extension.
+        /// </param>
+        /// <exception cref="ArgumentNullException">Thrown when <paramref name="localInput"/> is null.</exception>
+        /// <exception cref="MindeeInputException">Thrown when a non-PDF input cannot be decoded as an image.</exception>
+        public ImageExtractor(LocalInputSource localInput, string saveFormat = null)
+        {
+            if (localInput == null)
+            {
+                throw new ArgumentNullException(nameof(localInput));
+            }
+
+            _filename = localInput.Filename;
+            _pageImages = [];
+            LocalInput = localInput;
+            SaveFormat = saveFormat ?? InferSaveFormat(localInput.Filename);
+
+            if (localInput.IsPdf())
+            {
+                _pageImages.AddRange(PdfToImages(localInput.FileBytes));
+            }
+            else
+            {
+                // SKBitmap.Decode signals failure by returning null; storing it would only
+                // postpone the failure to a NullReferenceException during extraction.
+                var decoded = SKBitmap.Decode(localInput.FileBytes);
+                if (decoded == null)
+                {
+                    throw new MindeeInputException($"The file {localInput.Filename} is not a valid image.");
+                }
+
+                _pageImages.Add(decoded);
+            }
+        }
+
+        /// <summary>
+        /// Picks the default save format from a filename.
+        /// </summary>
+        /// <param name="filename">Name of the input file; may be null.</param>
+        /// <returns>The extension without its dot, or "jpg" when it is missing or is "pdf".</returns>
+        private static string InferSaveFormat(string filename)
+        {
+            // Path.GetExtension returns "" when there is no extension, so Substring(1) would throw here;
+            // TrimStart handles null (via ?.), "" and ".ext" alike.
+            var extension = Path.GetExtension(filename)?.TrimStart('.');
+            var isUsable = !string.IsNullOrEmpty(extension)
+                && !extension.Equals("pdf", StringComparison.OrdinalIgnoreCase);
+            return isUsable ? extension : "jpg";
+        }
+
+        /// <summary>
+        /// Init from a path.
+        /// </summary>
+        /// <param name="filePath">Path to the file.</param>
+        public ImageExtractor(string filePath)
+            : this(new LocalInputSource(filePath))
+        {
+        }
+
+        /// <summary>
+        /// Rasterizes every page of a PDF into its own bitmap.
+        /// </summary>
+        /// <param name="fileBytes">Raw bytes of the PDF.</param>
+        /// <returns>One SKBitmap per page, in page order.</returns>
+        private static List<SKBitmap> PdfToImages(byte[] fileBytes)
+        {
+            // Every Docnet call below runs while holding the DocLib.Instance lock.
+            lock (DocLib.Instance)
+            {
+                var renderedPages = new List<SKBitmap>();
+                using var reader = DocLib.Instance.GetDocReader(fileBytes, new PageDimensions(1));
+                var pageNumber = 0;
+                while (pageNumber < reader.GetPageCount())
+                {
+                    using var page = reader.GetPageReader(pageNumber);
+                    var pageWidth = page.GetPageWidth();
+                    var pageHeight = page.GetPageHeight();
+                    var rawPixels = page.GetImage();
+                    var pixelGrid = ImageUtils.ConvertTo3DArray(rawPixels, pageWidth, pageHeight);
+                    renderedPages.Add(ImageUtils.ArrayToImage(pixelGrid));
+                    pageNumber++;
+                }
+
+                return renderedPages;
+            }
+        }
+
+        /// <summary>
+        /// Breaks a filename into its base name and its extension (without the leading dot).
+        /// </summary>
+        /// <param name="filename">Filename to split.</param>
+        /// <returns>A two-element array: base name first, extension second.</returns>
+        protected static string[] SplitNameStrict(string filename)
+        {
+            var baseName = Path.GetFileNameWithoutExtension(filename);
+            var extension = Path.GetExtension(filename).TrimStart('.');
+            return new[] { baseName, extension };
+        }
+
+        /// <summary>
+        /// Gets the number of pages in the file.
+        /// </summary>
+        /// <returns>The number of page bitmaps held by this extractor.</returns>
+        public int GetPageCount() => _pageImages.Count;
+
+        /// <summary>
+        /// Extracts a single image from a field having position data.
+        /// </summary>
+        /// <param name="bbox">Bounding box of the field.</param>
+        /// <param name="pageIndex">Index of the page containing the field.</param>
+        /// <returns>Extracted image as an SKBitmap.</returns>
+        protected SKBitmap ExtractImage(Bbox bbox, int pageIndex)
+        {
+            var image = _pageImages[pageIndex];
+
+            // The bounding box is multiplied by the page size, i.e. it is used as a fraction of the page.
+            var minX = (int)Math.Round(bbox.MinX * image.Width);
+            var maxX = (int)Math.Round(bbox.MaxX * image.Width);
+            var minY = (int)Math.Round(bbox.MinY * image.Height);
+            var maxY = (int)Math.Round(bbox.MaxY * image.Height);
+
+            var croppedBitmap = new SKBitmap(maxX - minX, maxY - minY);
+            using var canvas = new SKCanvas(croppedBitmap);
+            var sourceRect = new SKRect(minX, minY, maxX, maxY);
+            var destRect = new SKRect(0, 0, croppedBitmap.Width, croppedBitmap.Height);
+            canvas.DrawBitmap(image, sourceRect, destRect);
+
+            return croppedBitmap;
+        }
+
+        /// <summary>
+        /// Extracts multiple images from a field having position data.
+        /// </summary>
+        /// <param name="pageId">The page index to extract, begins at 0.</param>
+        /// <param name="polygons">The list of polygons representing the position data.</param>
+        /// <returns>A list of extracted images, one per polygon, in input order.</returns>
+        /// <exception cref="ArgumentNullException">Thrown when <paramref name="polygons"/> is null.</exception>
+        public List<ExtractedImage> ExtractMultipleImagesFromSource(int pageId, List<Polygon> polygons)
+        {
+            if (polygons == null)
+            {
+                throw new ArgumentNullException(nameof(polygons));
+            }
+
+            var filename = this.LocalInput.Filename;
+            var extractedImages = new List<ExtractedImage>(polygons.Count);
+            for (var i = 0; i < polygons.Count; i++)
+            {
+                var bbox = Utils.BboxFromPolygon(polygons[i]);
+                // Use the loop position for the name: IndexOf() rescans the list on every iteration and
+                // returns the first match, so two equal polygons would share one filename.
+                var fieldFilename = $"{filename}_page{pageId}-{i}.{SaveFormat}";
+                extractedImages.Add(new ExtractedImage(ExtractImage(bbox, pageId), fieldFilename, SaveFormat, pageId, i));
+            }
+
+            return extractedImages;
+        }
+    }
+}
diff --git a/src/Mindee/Extraction/PdfExtractor.cs b/src/Mindee/Extraction/PdfExtractor.cs
new file mode 100644
index 000000000..625745f2d
--- /dev/null
+++ b/src/Mindee/Extraction/PdfExtractor.cs
@@ -0,0 +1,115 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using Docnet.Core;
+using Microsoft.Extensions.Logging.Abstractions;
+using Mindee.Exceptions;
+using Mindee.Input;
+using Mindee.Pdf;
+using SkiaSharp;
+
+namespace Mindee.Extraction
+{
+    /// <summary>
+    /// PDF extraction class.
+    /// </summary>
+    public class PdfExtractor
+    {
+        ///
+        /// Local input source.
+        ///
+        protected readonly LocalInputSource LocalInput;
+
+        /// <summary>
+        /// Source PDF bytes. Null until <see cref="PdfBytes"/> has been called once.
+        /// </summary>
+        protected byte[] SourcePdf;
+
+        /// <summary>
+        /// Initializes a new instance of the <see cref="PdfExtractor"/> class.
+        /// </summary>
+        /// <param name="localInput">Instance of a LocalInputSource, provided by the user.</param>
+        /// <exception cref="ArgumentNullException">Thrown when <paramref name="localInput"/> is null.</exception>
+        public PdfExtractor(LocalInputSource localInput)
+        {
+            LocalInput = localInput ?? throw new ArgumentNullException(nameof(localInput));
+        }
+
+        /// <summary>
+        /// Wrapper for PDF GetPageCount().
+        /// </summary>
+        /// <returns>The number of pages in the file.</returns>
+        public int GetPageCount()
+        {
+            return LocalInput.GetPageCount();
+        }
+
+        /// <summary>
+        /// Gets the PDF bytes of the input, converting an image input to a single-page PDF on first use.
+        /// </summary>
+        /// <returns>The PDF bytes; the same cached array on every later call.</returns>
+        protected byte[] PdfBytes()
+        {
+            // Use the shared conversion helper instead of a second hand-rolled copy: it disposes its
+            // stream and reports an undecodable image with a MindeeInputException rather than
+            // failing on a null SKImage.
+            SourcePdf ??= LocalInput.IsPdf()
+                ? LocalInput.FileBytes
+                : PdfUtils.ConvertImageToPdf(LocalInput.FileBytes, LocalInput.Filename);
+
+            return SourcePdf;
+        }
+
+        /// <summary>
+        /// Extracts sub-documents from the source document using list of page indexes.
+        /// </summary>
+        /// <param name="pageIndexes">List of sub-lists of pages to keep.</param>
+        /// <returns>Extracted documents.</returns>
+        /// <exception cref="MindeeInputException">Thrown when a group of page indexes is null or empty.</exception>
+        public List<ExtractedPdf> ExtractSubDocuments(List<List<int>> pageIndexes)
+        {
+            if (pageIndexes == null)
+            {
+                throw new ArgumentNullException(nameof(pageIndexes));
+            }
+
+            // These values do not depend on the group, so compute them once.
+            var sourceBytes = PdfBytes();
+            var prefix = Path.GetFileNameWithoutExtension(LocalInput.Filename);
+            // PdfBytes() turns an image input into a PDF, so the extracted files are PDFs even when the
+            // input filename carries an image extension.
+            var extension = LocalInput.IsPdf() ? Path.GetExtension(LocalInput.Filename) : ".pdf";
+            var extractedPdfs = new List<ExtractedPdf>(pageIndexes.Count);
+
+            foreach (var pageIndexElem in pageIndexes)
+            {
+                if (pageIndexElem == null || pageIndexElem.Count == 0)
+                {
+                    throw new MindeeInputException("Empty indexes not allowed for extraction.");
+                }
+
+                var fieldFilename =
+                    $"{prefix}_{pageIndexElem[0] + 1:D3}-{pageIndexElem[pageIndexElem.Count - 1] + 1:D3}{extension}";
+
+                // checked: an index that does not fit in a short must fail loudly instead of wrapping
+                // around to a different page.
+                var pages = pageIndexElem.ConvertAll(item => checked((short)item)).ToArray();
+                var splitQuery = new SplitQuery(sourceBytes, new PageOptions(pages));
+                lock (DocLib.Instance)
+                {
+                    var pdfOperation = new DocNetApi(new NullLogger<DocNetApi>());
+                    var mergedPdfBytes = pdfOperation.Split(splitQuery).File;
+                    extractedPdfs.Add(new ExtractedPdf(mergedPdfBytes, fieldFilename));
+                }
+            }
+
+            return extractedPdfs;
+        }
+    }
+}
diff --git a/src/Mindee/Image/ExtractedImage.cs b/src/Mindee/Image/ExtractedImage.cs
index d71c48798..c160522a7 100644
--- a/src/Mindee/Image/ExtractedImage.cs
+++ b/src/Mindee/Image/ExtractedImage.cs
@@ -14,17 +14,31 @@ public class ExtractedImage
         ///
         private readonly string _saveFormat;
 
+        ///
+        /// Page number the image was extracted from.
+        ///
+        public int PageId;
+
+        ///
+        /// ID of the image.
+        ///
+        public int ElementId;
+
         ///
         /// Initializes a new instance of the class.
         ///
         /// The extracted image.
         /// The filename for the image.
         /// The format to save the image.
-        public ExtractedImage(SKBitmap image, string filename, string saveFormat)
+        /// The page number the image was extracted from.
+        /// The ID of the image.
+        public ExtractedImage(SKBitmap image, string filename, string saveFormat, int pageId, int elementId)
         {
             Image = image;
             Filename = filename;
             _saveFormat = saveFormat;
+            PageId = pageId;
+            ElementId = elementId;
         }
 
         ///
@@ -35,45 +49,71 @@ public ExtractedImage(SKBitmap image, string filename, string saveFormat)
         ///
         /// Name of the file.
         ///
-        private string Filename { get; }
+        public string Filename { get; }
 
         ///
         /// Writes the image to a file.
-        /// Uses the default image format and filename.
+        /// When outputPath has an extension and is not an existing directory, it is treated as a full file path.
+        /// Otherwise, it is treated as a directory and uses the default filename.
         ///
-        /// The output directory (must exist).
-        public void WriteToFile(string outputPath)
+        /// <param name="outputPath">The output directory (must exist) or full file path.</param>
+        /// <param name="quality">The quality of the image. Defaults to 100.</param>
+        /// <param name="fileFormat">The desired format. If null or blank, inferred from extension or default.</param>
+        public void WriteToFile(string outputPath, int quality = 100, string fileFormat = null)
         {
-            var imagePath = Path.Combine(outputPath, Filename);
-            var format = GetEncodedImageFormat(_saveFormat);
+            string imagePath;
+            // A blank format means "not specified", exactly like null.
+            var hasExplicitFormat = !string.IsNullOrWhiteSpace(fileFormat);
+            var targetFormat = hasExplicitFormat ? fileFormat : _saveFormat;
 
-            using (var image = SKImage.FromBitmap(Image))
-            using (var data = image.Encode(format, 100))
-            using (var stream = File.OpenWrite(imagePath))
+            // An existing directory is never a file path, even when its name contains a dot.
+            if (Path.HasExtension(outputPath) && !Directory.Exists(outputPath))
             {
-                data.SaveTo(stream);
+                imagePath = outputPath;
+                if (!hasExplicitFormat)
+                {
+                    var extension = Path.GetExtension(outputPath).TrimStart('.');
+                    if (!string.IsNullOrWhiteSpace(extension))
+                    {
+                        // Invariant casing: a culture-sensitive ToLower() maps "GIF" to "gıf" under tr-TR.
+                        targetFormat = extension.ToLowerInvariant();
+                    }
+                }
             }
+            else
+            {
+                var finalFilename = Filename;
+                if (hasExplicitFormat)
+                {
+                    var nameWithoutExtension = Path.GetFileNameWithoutExtension(Filename);
+                    finalFilename = $"{nameWithoutExtension}.{targetFormat.ToLowerInvariant()}";
+                }
+                imagePath = Path.Combine(outputPath, finalFilename);
+            }
+
+            var format = GetEncodedImageFormat(targetFormat);
+
+            using var image = SKImage.FromBitmap(Image);
+            // NOTE(review): assumes Encode returns null when the encoder cannot produce the format.
+            using var data = image.Encode(format, quality)
+                ?? throw new System.NotSupportedException($"The image could not be encoded as '{targetFormat}'.");
+            // File.Create truncates an existing file; File.OpenWrite would leave stale trailing bytes
+            // when the new image is smaller than the file it overwrites.
+            using var stream = File.Create(imagePath);
+            data.SaveTo(stream);
         }
 
         ///
         /// Returns the image in a format suitable for sending to a client for parsing.
         ///
+        /// The quality of the image. Defaults to 100.
         /// An instance of .
- public LocalInputSource AsInputSource() + public LocalInputSource AsInputSource(int quality = 100) { - using (var image = SKImage.FromBitmap(Image)) - using (var data = image.Encode(GetEncodedImageFormat(_saveFormat), 100)) - using (var output = new MemoryStream()) - { - data.SaveTo(output); - return new LocalInputSource(output.ToArray(), Filename); - } + using var image = SKImage.FromBitmap(Image); + using var data = image.Encode(GetEncodedImageFormat(_saveFormat), quality); + using var output = new MemoryStream(); + data.SaveTo(output); + return new LocalInputSource(output.ToArray(), Filename); } - private SKEncodedImageFormat GetEncodedImageFormat(string saveFormat) + private static SKEncodedImageFormat GetEncodedImageFormat(string saveFormat) { return saveFormat.ToLower() switch { + "jpg" or "jpeg" => SKEncodedImageFormat.Jpeg, "png" => SKEncodedImageFormat.Png, "bmp" => SKEncodedImageFormat.Bmp, "gif" => SKEncodedImageFormat.Gif, diff --git a/src/Mindee/Pdf/ExtractedPdf.cs b/src/Mindee/Pdf/ExtractedPdf.cs index 8286d74eb..6191b1e4c 100644 --- a/src/Mindee/Pdf/ExtractedPdf.cs +++ b/src/Mindee/Pdf/ExtractedPdf.cs @@ -1,6 +1,5 @@ using System.IO; -using Docnet.Core; -using Docnet.Core.Models; +using Mindee.Exceptions; using Mindee.Input; namespace Mindee.Pdf @@ -11,24 +10,55 @@ namespace Mindee.Pdf public class ExtractedPdf { /// - /// Name of the original file. + /// Local input source. /// - public readonly string Filename; + public readonly LocalInputSource LocalInput; /// - /// File object for an ExtractedPdf. + /// Page count. /// - public readonly byte[] PdfBytes; + public int PageCount { get; set; } + + /// + /// Original filename. + /// + public readonly string Filename; /// /// Initializes a new instance of the class. /// - /// A byte array representation of the Pdf. + /// A byte array representation of the Pdf. /// Name of the original file. 
- public ExtractedPdf(byte[] pdfBytes, string filename) + public ExtractedPdf(byte[] fileBytes, string filename) { - PdfBytes = pdfBytes; - Filename = filename; + var tmpInput = new LocalInputSource(fileBytes, filename); + if (tmpInput.IsPdf()) + { + LocalInput = tmpInput; + } + else + { + byte[] pdfBytes = PdfUtils.ConvertImageToPdf(fileBytes, filename); + string newFilename = Path.ChangeExtension(filename, ".pdf"); + LocalInput = new LocalInputSource(pdfBytes, newFilename); + } + PageCount = LocalInput.GetPageCount(); + Filename = LocalInput.Filename; + } + + /// + /// Initializes a new instance of the class. + /// + /// LocalInputSource containing the Pdf bytes and filename. + public ExtractedPdf(LocalInputSource localInput) + { + LocalInput = localInput; + if (!localInput.IsPdf()) + { + throw new MindeeInputException("The input file is not a PDF."); + } + PageCount = LocalInput.GetPageCount(); + Filename = LocalInput.Filename; } /// @@ -37,11 +67,7 @@ public ExtractedPdf(byte[] pdfBytes, string filename) /// The number of pages in the file. public int GetPageCount() { - lock (DocLib.Instance) - { - using var docInstance = DocLib.Instance.GetDocReader(PdfBytes, new PageDimensions(1, 1)); - return docInstance.GetPageCount(); - } + return LocalInput.GetPageCount(); } /// @@ -50,13 +76,13 @@ public int GetPageCount() /// the output directory (must exist). 
public void WriteToFile(string outputPath) { - var pdfPath = Path.Combine(outputPath, Filename); + var pdfPath = Path.Combine(outputPath, LocalInput.Filename); if (Path.GetFileName(outputPath) != string.Empty) { pdfPath = Path.GetFullPath(outputPath); } - File.WriteAllBytes(pdfPath, PdfBytes); + File.WriteAllBytes(pdfPath, LocalInput.FileBytes); } /// @@ -65,7 +91,7 @@ public void WriteToFile(string outputPath) /// an instance of public LocalInputSource AsInputSource() { - return new LocalInputSource(PdfBytes, Filename); + return LocalInput; } } } diff --git a/src/Mindee/Pdf/PdfUtils.cs b/src/Mindee/Pdf/PdfUtils.cs index 1aff81561..a44a1406f 100644 --- a/src/Mindee/Pdf/PdfUtils.cs +++ b/src/Mindee/Pdf/PdfUtils.cs @@ -1,4 +1,5 @@ using System; +using System.IO; using System.Linq; using Docnet.Core; using Docnet.Core.Models; @@ -178,5 +179,34 @@ public static bool HasSourceText(byte[] fileBytes) return false; } + + /// + /// Converts an image to a PDF. + /// + /// Raw image bytes. + /// Name of the file. 
+ /// + /// + public static byte[] ConvertImageToPdf(byte[] imageBytes, string filename) + { + using var ms = new MemoryStream(); + using var bitmap = SKBitmap.Decode(imageBytes); + if (bitmap == null) + { + throw new MindeeInputException($"The file {filename} is not a valid image."); + } + + using (var document = SKDocument.CreatePdf(ms)) + { + using (var canvas = document.BeginPage(bitmap.Width, bitmap.Height)) + { + canvas.DrawBitmap(bitmap, 0, 0); + document.EndPage(); + } + document.Close(); + } + + return ms.ToArray(); + } } } diff --git a/src/Mindee/V1/Extraction/PdfExtractor.cs b/src/Mindee/V1/Extraction/PdfExtractor.cs deleted file mode 100644 index 4c1bb1be7..000000000 --- a/src/Mindee/V1/Extraction/PdfExtractor.cs +++ /dev/null @@ -1,165 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Diagnostics; -using System.IO; -using System.Linq; -using Docnet.Core; -using Docnet.Core.Models; -using Microsoft.Extensions.Logging.Abstractions; -using Mindee.Input; -using Mindee.Pdf; -using Mindee.V1.Product.InvoiceSplitter; -using SkiaSharp; - -namespace Mindee.V1.Extraction -{ - /// - /// PDF extraction class. - /// - public class PdfExtractor - { - private readonly string Filename; - private readonly byte[] SourcePdf; - - /// - /// Initializes a new instance of the class. - /// - /// Instance of a LocalInputSource, provided by the user. 
- public PdfExtractor(LocalInputSource localInput) - { - Filename = localInput.Filename; - - if (localInput.IsPdf()) - { - SourcePdf = localInput.FileBytes; - } - else - { - var memoryStream = new MemoryStream(); - using var image = SKImage.FromEncodedData(localInput.FileBytes); - using var bmp = SKBitmap.FromImage(image); - var pageSize = new SKSize(bmp.Width, bmp.Height); - using (var document = SKDocument.CreatePdf(memoryStream)) - { - var canvas = document.BeginPage(pageSize.Width, pageSize.Height); - canvas.DrawBitmap(bmp, SKPoint.Empty); - document.EndPage(); - } - - SourcePdf = memoryStream.ToArray(); - } - } - - /// - /// Wrapper for pdf GetPageCount(); - /// - /// The number of pages in the file. - public int GetPageCount() - { - lock (DocLib.Instance) - { - using var docInstance = DocLib.Instance.GetDocReader(SourcePdf, new PageDimensions(1, 1)); - return docInstance.GetPageCount(); - } - } - - /// - /// Extracts sub-documents from the source document using list of page indexes. - /// - /// List of sub-lists of pages to keep. - /// Extracted documents. 
- /// - public List ExtractSubDocuments(List> pageIndexes) - { - var extractedPdfs = new List(); - - foreach (var pageIndexElem in pageIndexes) - { - if (!pageIndexElem.Any()) - { - throw new ArgumentException("Empty indexes not allowed for extraction."); - } - - var extension = Path.GetExtension(Filename); - var prefix = Path.GetFileNameWithoutExtension(Filename); - var fieldFilename = - $"{prefix}_{pageIndexElem[0] + 1:D3}-{pageIndexElem[pageIndexElem.Count - 1] + 1:D3}{extension}"; - - var splitQuery = new SplitQuery( - SourcePdf, - new PageOptions(pageIndexElem.ConvertAll(item => (short)item).ToArray())); - lock (DocLib.Instance) - { - var pdfOperation = new DocNetApi(new NullLogger()); - var mergedPdfBytes = pdfOperation.Split(splitQuery).File; - extractedPdfs.Add(new ExtractedPdf(mergedPdfBytes, fieldFilename)); - } - } - - return extractedPdfs; - } - - /// - /// Extracts invoices as complete PDFs from the document. Include cuts for confidence scores below 1.0. - /// - /// List of sub-lists of pages to keep. - /// A list of extracted invoices. - public List ExtractInvoices(List pageIndexes) - { - var indexes = pageIndexes.Select(pi => pi.PageIndexes.ToList()).ToList(); - return ExtractSubDocuments(indexes.ToList()); - } - - /// - /// Extracts invoices as complete PDFs from the document. - /// - /// List of sub-lists of pages to keep. - /// Whether to trust confidence scores of 1.0 only or not. - /// A list of extracted invoices. - public List ExtractInvoices(IList pageIndexes, bool strict) - { - if (!strict) - { - return ExtractInvoices(pageIndexes.ToList()); - } - - var correctPageIndexes = new List>(); - var iterator = pageIndexes.GetEnumerator(); - using var iterator1 = (IDisposable)iterator; - var currentList = new List(); - double? previousConfidence = null; - - while (iterator.MoveNext()) - { - var pageIndex = iterator.Current; - Debug.Assert(pageIndex != null, nameof(pageIndex) + " != null"); - var confidence = pageIndex.Confidence ?? 
0.0; - var pageList = pageIndex.PageIndexes; - - if (Math.Abs(confidence - 1.0) < 0.01 && previousConfidence == null) - { - currentList = new List(pageList); - } - else if (Math.Abs(confidence - 1.0) < 0.01) - { - correctPageIndexes.Add(currentList); - currentList = new List(pageList); - } - else if (confidence == 0.0 && !iterator.MoveNext()) - { - currentList.AddRange(pageList); - correctPageIndexes.Add(currentList); - } - else - { - correctPageIndexes.Add(currentList); - correctPageIndexes.Add(pageList.ToList()); - } - - previousConfidence = confidence; - } - - return ExtractSubDocuments(correctPageIndexes); - } - } -} diff --git a/src/Mindee/V1/Extraction/ImageExtractor.cs b/src/Mindee/V1/Image/ImageExtractor.cs similarity index 59% rename from src/Mindee/V1/Extraction/ImageExtractor.cs rename to src/Mindee/V1/Image/ImageExtractor.cs index b6f6576bd..91f544fe8 100644 --- a/src/Mindee/V1/Extraction/ImageExtractor.cs +++ b/src/Mindee/V1/Image/ImageExtractor.cs @@ -1,122 +1,35 @@ -using System; using System.Collections.Generic; -using System.IO; -using Docnet.Core; -using Docnet.Core.Models; +using System.Linq; using Mindee.Exceptions; using Mindee.Geometry; using Mindee.Image; using Mindee.Input; using Mindee.V1.Parsing.Standard; -using SkiaSharp; -namespace Mindee.V1.Extraction +namespace Mindee.V1.Image { /// - /// Extract sub-images from an image. + /// Legacy V1 Wrapper for ImageExtractor. /// - public class ImageExtractor + public sealed class ImageExtractor : Mindee.Extraction.ImageExtractor { - private readonly string _filename; - private readonly List _pageImages; - private readonly string _saveFormat; - - /// - /// LocalInputSource object used by the ImageExtractor. - /// - public readonly LocalInputSource LocalInput; - /// /// Init from a Local Input Source. /// /// Locally loaded resource. /// Format to save the resulting images as. 
public ImageExtractor(LocalInputSource localInput, string saveFormat = null) - { - _filename = localInput.Filename; - _pageImages = new List(); - LocalInput = localInput; - if (saveFormat == null) - { - var extension = Path.GetExtension(localInput.Filename)?.Substring(1); - if (extension != null && !extension.Equals("pdf", StringComparison.CurrentCultureIgnoreCase)) - { - _saveFormat = extension; - } - else - { - _saveFormat = "jpg"; - } - } - else - { - _saveFormat = saveFormat; - } - - if (localInput.IsPdf()) - { - var pdfPageImages = PdfToImages(localInput.FileBytes); - _pageImages.AddRange(pdfPageImages); - } - else - { - _pageImages.Add(SKBitmap.Decode(localInput.FileBytes)); - } - } + : base(localInput, saveFormat) + { } /// /// Init from a path. /// /// Path to the file. - public ImageExtractor(string filePath) : this(new LocalInputSource(filePath)) - { - } - - /// - /// Renders the input Pdf's pages as individual images. - /// - /// Input pdf. - /// A list of pages, as SKBitmap. - private static List PdfToImages(byte[] fileBytes) - { - var images = new List(); - lock (DocLib.Instance) - { - using var docReader = DocLib.Instance.GetDocReader(fileBytes, new PageDimensions(1)); - for (var i = 0; i < docReader.GetPageCount(); i++) - { - using var pageReader = docReader.GetPageReader(i); - var width = pageReader.GetPageWidth(); - var height = pageReader.GetPageHeight(); - var bytes = pageReader.GetImage(); - var bmp = ImageUtils.ArrayToImage(ImageUtils.ConvertTo3DArray(bytes, width, height)); - images.Add(bmp); - } + public ImageExtractor(string filePath) + : base(filePath) + { } - return images; - } - } - - /// - /// Splits the filename into name and extension. - /// - private static string[] SplitNameStrict(string filename) - { - return - [ - Path.GetFileNameWithoutExtension(filename), - Path.GetExtension(filename).TrimStart('.') - ]; - } - - /// - /// Gets the number of pages in the file. - /// - /// The number of pages in the file. 
- public int GetPageCount() - { - return _pageImages.Count; - } /// /// Extract multiple images on a given page from a list of fields having position data. @@ -143,7 +56,7 @@ public IList ExtractImagesFromPage(IList if (GetPageCount() > 1) { var splitName = SplitNameStrict(outputName); - filename = $"{splitName[0]}.{_saveFormat}"; + filename = $"{splitName[0]}.{SaveFormat}"; } else { @@ -167,7 +80,7 @@ public IList ExtractImagesFromPage(IList fields, if (GetPageCount() > 1) { var splitName = SplitNameStrict(outputName); - filename = $"{splitName[0]}.{_saveFormat}"; + filename = $"{splitName[0]}.{SaveFormat}"; } else { @@ -188,25 +101,15 @@ private List ExtractFromPage(IList field string outputName) where TBaseField : BaseField { var splitName = SplitNameStrict(outputName); - var filename = $"{splitName[0]}_page-{pageIndex + 1:D3}.{_saveFormat}"; - - var extractedImages = new List(); - for (var i = 0; i < fields.Count; i++) - { - var extractedImage = ExtractImage(fields[i], pageIndex, i + 1, filename); - if (extractedImage != null) - { - extractedImages.Add(extractedImage); - } - } + var filename = $"{splitName[0]}_page-{pageIndex + 1:D3}.{SaveFormat}"; - return extractedImages; + return fields.Select((t, i) => ExtractImage(t, pageIndex, i + 1, filename)).Where(extractedImage => extractedImage != null).ToList(); } private List ExtractFromPage(IList fields, int pageIndex, string outputName) { var splitName = SplitNameStrict(outputName); - var filename = $"{splitName[0]}_page-{pageIndex + 1:D3}.{_saveFormat}"; + var filename = $"{splitName[0]}_page-{pageIndex + 1:D3}.{SaveFormat}"; var extractedImages = new List(); for (var i = 0; i < fields.Count; i++) @@ -256,8 +159,8 @@ public ExtractedImage ExtractImage(PositionField field, int pageIndex, int index } var bbox = Utils.BboxFromPolygon(boundingBox); - var fieldFilename = $"{splitName[0]}_{index:D3}.{_saveFormat}"; - return new ExtractedImage(ExtractImage(bbox, pageIndex), fieldFilename, _saveFormat); + var 
fieldFilename = $"{splitName[0]}_{index:D3}.{SaveFormat}"; + return new ExtractedImage(ExtractImage(bbox, pageIndex), fieldFilename, SaveFormat, pageIndex, index); } /// @@ -283,29 +186,8 @@ public ExtractedImage ExtractImage(BaseField field, int pageIndex, int index, st } var bbox = Utils.BboxFromPolygon(boundingBox); - var fieldFilename = $"{splitName[0]}_{index:D3}.{_saveFormat}"; - return new ExtractedImage(ExtractImage(bbox, pageIndex), fieldFilename, _saveFormat); - } - - private SKBitmap ExtractImage(Bbox bbox, int pageIndex) - { - var image = _pageImages[pageIndex]; - var width = image.Width; - var height = image.Height; - var minX = (int)Math.Round(bbox.MinX * width); - var maxX = (int)Math.Round(bbox.MaxX * width); - var minY = (int)Math.Round(bbox.MinY * height); - var maxY = (int)Math.Round(bbox.MaxY * height); - - var croppedBitmap = new SKBitmap(maxX - minX, maxY - minY); - using (var canvas = new SKCanvas(croppedBitmap)) - { - var destRect = new SKRect(0, 0, croppedBitmap.Width, croppedBitmap.Height); - var sourceRect = new SKRect(minX, minY, maxX, maxY); - canvas.DrawBitmap(image, sourceRect, destRect); - } - - return croppedBitmap; + var fieldFilename = $"{splitName[0]}_{index:D3}.{SaveFormat}"; + return new ExtractedImage(ExtractImage(bbox, pageIndex), fieldFilename, SaveFormat, pageIndex, index); } } } diff --git a/src/Mindee/V1/Image/PdfExtractor.cs b/src/Mindee/V1/Image/PdfExtractor.cs new file mode 100644 index 000000000..359852ba3 --- /dev/null +++ b/src/Mindee/V1/Image/PdfExtractor.cs @@ -0,0 +1,84 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Linq; +using Mindee.Input; +using Mindee.Pdf; +using Mindee.V1.Product.InvoiceSplitter; + +namespace Mindee.V1.Image +{ + /// + /// V1 wrapper for the PDF extraction class. 
+    ///
+    public class PdfExtractor : Mindee.Extraction.PdfExtractor
+    {
+        /// <inheritdoc />
+        public PdfExtractor(LocalInputSource localInput) : base(localInput)
+        {
+        }
+
+        /// <summary>
+        /// Extracts invoices as complete PDFs from the document. Include cuts for confidence scores below 1.0.
+        /// </summary>
+        /// <param name="pageIndexes">List of sub-lists of pages to keep.</param>
+        /// <returns>A list of extracted invoices.</returns>
+        public List<ExtractedPdf> ExtractInvoices(List<InvoiceSplitterV1InvoicePageGroup> pageIndexes)
+        {
+            var indexes = pageIndexes.Select(pi => pi.PageIndexes.ToList()).ToList();
+            return ExtractSubDocuments(indexes);
+        }
+
+        /// <summary>
+        /// Extracts invoices as complete PDFs from the document.
+        /// </summary>
+        /// <param name="pageIndexes">List of sub-lists of pages to keep.</param>
+        /// <param name="strict">Whether to trust confidence scores of 1.0 only or not.</param>
+        /// <returns>A list of extracted invoices.</returns>
+        public List<ExtractedPdf> ExtractInvoices(IList<InvoiceSplitterV1InvoicePageGroup> pageIndexes, bool strict)
+        {
+            if (!strict)
+            {
+                return ExtractInvoices(pageIndexes.ToList());
+            }
+
+            var correctPageIndexes = new List<List<int>>();
+            var currentList = new List<int>();
+            double? previousConfidence = null;
+
+            // Walk by index so "is this the last group?" can be answered without advancing.
+            // The previous enumerator version tested it with !MoveNext(), which consumed the
+            // following group and silently dropped it from the result.
+            for (var i = 0; i < pageIndexes.Count; i++)
+            {
+                var pageIndex = pageIndexes[i];
+                Debug.Assert(pageIndex != null, nameof(pageIndex) + " != null");
+                var confidence = pageIndex.Confidence ?? 0.0;
+                var pageList = pageIndex.PageIndexes;
+                var isLastGroup = i == pageIndexes.Count - 1;
+                var isConfident = Math.Abs(confidence - 1.0) < 0.01;
+
+                if (isConfident && previousConfidence == null)
+                {
+                    currentList = new List<int>(pageList);
+                }
+                else if (isConfident)
+                {
+                    correctPageIndexes.Add(currentList);
+                    currentList = new List<int>(pageList);
+                }
+                else if (confidence == 0.0 && isLastGroup)
+                {
+                    currentList.AddRange(pageList);
+                    correctPageIndexes.Add(currentList);
+                }
+                else
+                {
+                    // NOTE(review): currentList can still be empty here (first group below 1.0), and a
+                    // trailing confident group is never flushed after the loop — confirm intended behavior.
+                    correctPageIndexes.Add(currentList);
+                    correctPageIndexes.Add(pageList.ToList());
+                }
+
+                previousConfidence = confidence;
+            }
+
+            return ExtractSubDocuments(correctPageIndexes);
+        }
+    }
+}
diff --git a/src/Mindee/V2/FileOperations/Crop.cs b/src/Mindee/V2/FileOperations/Crop.cs
new file mode 100644
index 000000000..596011152
--- /dev/null
+++ b/src/Mindee/V2/FileOperations/Crop.cs
@@ -0,0 +1,59 @@
+using System.Collections.Generic;
+using System.Linq;
+using Mindee.Extraction;
+using Mindee.Geometry;
+using Mindee.Image;
+using Mindee.Input;
+using Mindee.V2.Product.Crop;
+
+namespace Mindee.V2.FileOperations
+{
+    /// <summary>
+    /// V2 Crop operation utility.
+    /// </summary>
+    public sealed class Crop
+    {
+        /// <summary>
+        /// LocalInputSource object.
+        /// </summary>
+        private readonly LocalInputSource _localInput;
+
+        /// <summary>
+        /// Initializes a Crop operation on the given input.
+        /// </summary>
+        /// <param name="inputSource">File the crops are extracted from.</param>
+        public Crop(LocalInputSource inputSource)
+        {
+            this._localInput = inputSource;
+        }
+
+        /// <summary>
+        /// Extract a single crop item from a file.
+        /// </summary>
+        /// <param name="crop">Crop whose location (page and polygon) is extracted.</param>
+        /// <returns>The extracted image.</returns>
+        public ExtractedImage ExtractSingleCrop(CropItem crop)
+        {
+            var polygons = new List<Polygon> { crop.Location.Polygon };
+            var imageExtractor = new ImageExtractor(this._localInput);
+            return imageExtractor.ExtractMultipleImagesFromSource(crop.Location.Page, polygons)[0];
+        }
+
+        /// <summary>
+        /// Extracts multiple crop zones from a file.
+        /// </summary>
+        /// <param name="crops">List of crops.</param>
+        /// <returns>The extracted images, grouped by page in order of each page's first appearance.</returns>
+        public CropFiles ExtractCrops(List<CropItem> crops)
+        {
+            var extractor = new ImageExtractor(this._localInput);
+            var collected = new CropFiles();
+
+            // One extraction call per page, each receiving that page's polygons in input order.
+            foreach (var pageGroup in crops.GroupBy(item => item.Location.Page))
+            {
+                var pagePolygons = pageGroup.Select(item => item.Location.Polygon).ToList();
+                collected.AddRange(extractor.ExtractMultipleImagesFromSource(pageGroup.Key, pagePolygons));
+            }
+
+            return collected;
+        }
+    }
+}
diff --git a/src/Mindee/V2/FileOperations/CropFiles.cs b/src/Mindee/V2/FileOperations/CropFiles.cs
new file mode 100644
index 000000000..f547761a4
--- /dev/null
+++ b/src/Mindee/V2/FileOperations/CropFiles.cs
@@ -0,0 +1,50 @@
+using System.Collections.Generic;
+using System.IO;
+using Mindee.Image;
+
+namespace Mindee.V2.FileOperations
+{
+    /// <summary>
+    /// Collection of cropped files.
+    /// </summary>
+    public class CropFiles : List<ExtractedImage>
+    {
+        /// <summary>
+        /// Creates a collection holding the given images.
+        /// </summary>
+        /// <param name="collection">Images copied into the new collection.</param>
+        public CropFiles(IEnumerable<ExtractedImage> collection) : base(collection)
+        {
+        }
+
+        /// <summary>
+        /// Creates an empty collection.
+        /// </summary>
+        public CropFiles() : base()
+        {
+        }
+
+        ///
+        /// Saves all cropped files to disk.
+        ///
+        /// <param name="path">Directory for all files; created when it does not exist.</param>
+        /// <param name="quality">Quality of the output image.</param>
+        /// <param name="prefix">Prefix for file names.</param>
+        /// <param name="fileFormat">File format for saving; when null or blank, files are saved as "jpg".</param>
+        public void SaveAllToDisk(string path, int quality = 100, string prefix = "crop", string fileFormat = null)
+        {
+            Directory.CreateDirectory(path);
+
+            // The file extension must follow the requested format: with a hard-coded ".jpg",
+            // asking for "png" wrote PNG data into files named *.jpg.
+            var extension = string.IsNullOrWhiteSpace(fileFormat) ? "jpg" : fileFormat.ToLowerInvariant();
+
+            int index = 1;
+            foreach (var crop in this)
+            {
+                string fileName = $"{prefix}_{index:D3}.{extension}";
+                string filePath = Path.Combine(path, fileName);
+
+                crop.WriteToFile(filePath, quality, fileFormat);
+
+                index++;
+            }
+        }
+    }
+}
diff --git a/src/Mindee/V2/FileOperations/Split.cs b/src/Mindee/V2/FileOperations/Split.cs
new file mode 100644
index 000000000..25ca0cd47
--- /dev/null
+++ b/src/Mindee/V2/FileOperations/Split.cs
@@ -0,0 +1,83 @@
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using Mindee.Exceptions;
+using Mindee.Extraction;
+using Mindee.Input;
+using Mindee.Pdf;
+using Mindee.V2.Product.Split;
+
+namespace Mindee.V2.FileOperations
+{
+    /// <summary>
+    /// V2 Split operation utility.
+    /// </summary>
+    public sealed class Split
+    {
+
+        /// <summary>
+        /// LocalInputSource object.
+        /// </summary>
+        private readonly LocalInputSource _localInput;
+
+        /// <summary>
+        /// Expands a range of pages into a list of page indexes.
+        /// </summary>
+        /// <param name="start">Start of the range (inclusive).</param>
+        /// <param name="end">End of the range (inclusive).</param>
+        /// <returns>Every page index from start to end.</returns>
+        /// <exception cref="MindeeInputException">Thrown when start is greater than end.</exception>
+        public static List<int> ExpandRange(int start, int end)
+        {
+            if (start > end)
+            {
+                throw new MindeeInputException("Invalid page range provided.");
+            }
+
+            int count = end - start + 1;
+            return Enumerable.Range(start, count).ToList();
+        }
+
+        ///
+        /// Initializes an instance of a Split operation.
+        /// Transforms images to PDFs if necessary.
+ /// + /// + public Split(LocalInputSource inputSource) + { + if (inputSource.IsPdf()) + { + _localInput = inputSource; + } + else + { + byte[] pdfBytes = PdfUtils.ConvertImageToPdf(inputSource.FileBytes, inputSource.Filename); + string newFilename = Path.ChangeExtension(inputSource.Filename, ".pdf"); + _localInput = new LocalInputSource(pdfBytes, newFilename); + } + } + + /// + /// Extracts a single split from the input file. + /// + /// + /// + public ExtractedPdf ExtractSingleSplit(SplitRange splitRange) + { + return ExtractSplits([splitRange.PageRange])[0]; + } + + /// + /// Extracts the splits from the input file. + /// + /// List of subpage indexes to keep. + /// + public SplitFiles ExtractSplits(List> splits) + { + var pdfExtractor = new PdfExtractor(this._localInput); + + List> expandedPageIndexes = []; + expandedPageIndexes.AddRange(splits.Select(split => ExpandRange(split[0], split[1]))); + return new SplitFiles(pdfExtractor.ExtractSubDocuments(expandedPageIndexes)); + } + } +} diff --git a/src/Mindee/V2/FileOperations/SplitFiles.cs b/src/Mindee/V2/FileOperations/SplitFiles.cs new file mode 100644 index 000000000..443f08478 --- /dev/null +++ b/src/Mindee/V2/FileOperations/SplitFiles.cs @@ -0,0 +1,48 @@ +using System.Collections.Generic; +using System.IO; +using Mindee.Pdf; + +namespace Mindee.V2.FileOperations +{ + /// + /// Collection of split PDFs. + /// + public sealed class SplitFiles : List + { + /// + /// + /// + /// + public SplitFiles(IEnumerable collection) : base(collection) + { + } + + /// + /// + /// + public SplitFiles() : base() + { + } + + /// + /// Saves all the extracted pages to disk. 
+ /// + /// Path for all files + /// Prefix for file names + public void SaveAllToDisk(string path, string prefix = "split") + { + Directory.CreateDirectory(path); + + int index = 1; + foreach (var crop in this) + { + string fileName = $"{prefix}_{index:D3}.pdf"; + string filePath = Path.Combine(path, fileName); + + crop.WriteToFile(filePath); + + index++; + } + } + } +} diff --git a/src/Mindee/V2/Product/Crop/CropItem.cs b/src/Mindee/V2/Product/Crop/CropItem.cs index 923811cfd..d703f28e7 100644 --- a/src/Mindee/V2/Product/Crop/CropItem.cs +++ b/src/Mindee/V2/Product/Crop/CropItem.cs @@ -1,4 +1,9 @@ +using System.Collections.Generic; using System.Text.Json.Serialization; +using Mindee.Extraction; +using Mindee.Geometry; +using Mindee.Image; +using Mindee.Input; using Mindee.V2.Parsing.Inference.Field; namespace Mindee.V2.Product.Crop @@ -28,5 +33,16 @@ public override string ToString() { return $"* :Location: {Location}\n :Object Type: {ObjectType}"; } + + /// + /// Extract the crop from the source document. + /// + /// + /// + public ExtractedImage ExtractFromFile(LocalInputSource inputSource) + { + var crop = new FileOperations.Crop(inputSource); + return crop.ExtractSingleCrop(this); + } } } diff --git a/src/Mindee/V2/Product/Split/SplitRange.cs b/src/Mindee/V2/Product/Split/SplitRange.cs index 6d0cb704c..7b93519d7 100644 --- a/src/Mindee/V2/Product/Split/SplitRange.cs +++ b/src/Mindee/V2/Product/Split/SplitRange.cs @@ -1,5 +1,7 @@ using System.Collections.Generic; using System.Text.Json.Serialization; +using Mindee.Input; +using Mindee.Pdf; namespace Mindee.V2.Product.Split { @@ -29,5 +31,16 @@ public override string ToString() string pageRange = string.Join(",", PageRange); return $"* :Page Range: {pageRange}\n :Document Type: {DocumentType}"; } + + /// + /// Extracts the split from the source document. 
+ /// + /// + /// + public ExtractedPdf ExtractFromFile(LocalInputSource inputSource) + { + var split = new FileOperations.Split(inputSource); + return split.ExtractSingleSplit(this); + } } } diff --git a/tests/Mindee.IntegrationTests/V1/InvoiceSplitterAutoExtractionTest.cs b/tests/Mindee.IntegrationTests/V1/InvoiceSplitterAutoExtractionTest.cs index 98234901b..616d3c16f 100644 --- a/tests/Mindee.IntegrationTests/V1/InvoiceSplitterAutoExtractionTest.cs +++ b/tests/Mindee.IntegrationTests/V1/InvoiceSplitterAutoExtractionTest.cs @@ -1,6 +1,6 @@ using Mindee.Input; using Mindee.Pdf; -using Mindee.V1.Extraction; +using Mindee.V1.Image; using Mindee.V1.Parsing.Common; using Mindee.V1.Product.Invoice; using Mindee.V1.Product.InvoiceSplitter; diff --git a/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs b/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs new file mode 100644 index 000000000..e851e459e --- /dev/null +++ b/tests/Mindee.IntegrationTests/V2/FileOperations/CropTest.cs @@ -0,0 +1,108 @@ +using Mindee.Input; +using Mindee.V2; +using Mindee.V2.FileOperations; +using Mindee.V2.Product.Crop; +using Mindee.V2.Product.Crop.Params; +using Mindee.V2.Product.Extraction; +using Mindee.V2.Product.Extraction.Params; + +namespace Mindee.IntegrationTests.V2.FileOperations +{ + [Trait("Category", "V2")] + [Trait("Category", "FileOperations")] + public class CropTest : IDisposable + { + private readonly string? _cropModelId; + private readonly string? 
_findocModelId; + private readonly Client _client; + private readonly string _outputDir; + + public CropTest() + { + var apiKey = Environment.GetEnvironmentVariable("MindeeV2__ApiKey"); + _client = TestingUtilities.GetOrGenerateMindeeClientV2(apiKey); + _cropModelId = Environment.GetEnvironmentVariable("MindeeV2__Crop__Model__Id"); + _findocModelId = Environment.GetEnvironmentVariable("MindeeV2__Findoc__Model__Id"); + + _outputDir = Path.Combine(Directory.GetCurrentDirectory(), "output"); + if (!Directory.Exists(_outputDir)) + { + Directory.CreateDirectory(_outputDir); + } + } + + public void Dispose() + { + var file1 = Path.Combine(_outputDir, "crop_001.jpg"); + var file2 = Path.Combine(_outputDir, "crop_002.jpg"); + + if (File.Exists(file1)) File.Delete(file1); + if (File.Exists(file2)) File.Delete(file2); + } + + private void CheckFindocReturn(ExtractionResponse findocResponse) + { + Assert.True(findocResponse.Inference.Model.Id.Length > 0); + + var totalAmount = findocResponse.Inference.Result.Fields["total_amount"].SimpleField; + Assert.NotNull(totalAmount); + Assert.True(totalAmount.Value > 0); + } + + [Fact(Timeout = 180000)] + public async Task Extract_Crops_From_Image_Correctly() + { + var inputSource = new LocalInputSource(Path.Combine( + Constants.V2ProductDir, "crop/default_sample.jpg")); + var cropParams = new CropParameters(_cropModelId); + + var response = await _client.EnqueueAndGetResultAsync( + inputSource, cropParams); + + Assert.NotNull(response); + Assert.Equal(2, response.Inference.Result.Crops.Count); + + var cropOperation = new Crop(inputSource); + var extractedImages = cropOperation.ExtractCrops(response.Inference.Result.Crops); + + Assert.Equal(2, extractedImages.Count); + Assert.Equal("default_sample.jpg_page0-0.jpg", extractedImages[0].Filename); + Assert.Equal("default_sample.jpg_page0-1.jpg", extractedImages[1].Filename); + + var extractionInput = extractedImages[0].AsInputSource(); + var findocParams = new 
ExtractionParameters(_findocModelId); + + var invoice0 = await _client.EnqueueAndGetResultAsync( + extractionInput, findocParams); + + CheckFindocReturn(invoice0); + + extractedImages.SaveAllToDisk(_outputDir, 50); + + var file1Info = new FileInfo(Path.Combine(_outputDir, "crop_001.jpg")); + Assert.InRange(file1Info.Length, 99000, 110000); + + var file2Info = new FileInfo(Path.Combine(_outputDir, "crop_002.jpg")); + Assert.InRange(file2Info.Length, 99000, 110000); + } + + [Fact(Timeout = 180000)] + public async Task Extract_Crops_From_Each_Pdf_Page_Correctly() + { + + var inputSource = new LocalInputSource( + new FileInfo(Path.Combine(Constants.V2ProductDir, "crop/multipage_sample.pdf"))); + + var cropParams = new CropParameters(_cropModelId); + + var response = await _client.EnqueueAndGetResultAsync( + inputSource, cropParams); + var cropOperation = new Crop(inputSource); + var extractedImages = cropOperation.ExtractCrops(response.Inference.Result.Crops); + + Assert.Equal(5, extractedImages.Count); + Assert.Equal("multipage_sample.pdf_page0-0.jpg", extractedImages[0].Filename); + Assert.Equal("multipage_sample.pdf_page1-0.jpg", extractedImages[3].Filename); + } + } +} diff --git a/tests/Mindee.IntegrationTests/V2/FileOperations/SplitTest.cs b/tests/Mindee.IntegrationTests/V2/FileOperations/SplitTest.cs new file mode 100644 index 000000000..1801d90d1 --- /dev/null +++ b/tests/Mindee.IntegrationTests/V2/FileOperations/SplitTest.cs @@ -0,0 +1,97 @@ +using Mindee.Input; +using Mindee.V2; +using Mindee.V2.FileOperations; +using Mindee.V2.Product.Extraction; +using Mindee.V2.Product.Extraction.Params; +using Mindee.V2.Product.Split; +using Mindee.V2.Product.Split.Params; + +namespace Mindee.IntegrationTests.V2.FileOperations +{ + [Trait("Category", "V2")] + [Trait("Category", "FileOperations")] + public class SplitTest : IDisposable + { + private readonly string? _splitModelId; + private readonly string? 
_findocModelId; + private readonly Client _client; + private readonly string _outputDir; + + public SplitTest() + { + var apiKey = Environment.GetEnvironmentVariable("MindeeV2__ApiKey"); + _client = TestingUtilities.GetOrGenerateMindeeClientV2(apiKey); + _splitModelId = Environment.GetEnvironmentVariable("MindeeV2__Split__Model__Id"); + _findocModelId = Environment.GetEnvironmentVariable("MindeeV2__Findoc__Model__Id"); + + _outputDir = Path.Combine(Directory.GetCurrentDirectory(), "output"); + if (!Directory.Exists(_outputDir)) + { + Directory.CreateDirectory(_outputDir); + } + } + + public void Dispose() + { + var file1 = Path.Combine(_outputDir, "split_001.pdf"); + var file2 = Path.Combine(_outputDir, "split_002.pdf"); + + if (File.Exists(file1)) File.Delete(file1); + if (File.Exists(file2)) File.Delete(file2); + } + + private void CheckFindocReturn(ExtractionResponse findocResponse) + { + Assert.True(findocResponse.Inference.Model.Id.Length > 0); + + var totalAmount = findocResponse.Inference.Result.Fields["total_amount"].SimpleField; + Assert.NotNull(totalAmount); + Assert.True(totalAmount.Value > 0); + } + + [Fact(Timeout = 180000)] + public async Task Extract_Splits_From_Pdf_Correctly() + { + var inputSource = new LocalInputSource( + Constants.V2ProductDir + "split/default_sample.pdf"); + var splitParams = new SplitParameters(_splitModelId); + + var response = await _client.EnqueueAndGetResultAsync( + inputSource, splitParams); + + Assert.NotNull(response); + Assert.Equal(2, response.Inference.Result.Splits.Count); + + var splitOperation = new Split(inputSource); + var extractedSplits = splitOperation.ExtractSplits( + response.Inference.Result.Splits.Select(s => s.PageRange).ToList()); + + Assert.Equal(2, extractedSplits.Count); + Assert.Equal("default_sample_001-001.pdf", extractedSplits[0].Filename); + Assert.Equal("default_sample_002-002.pdf", extractedSplits[1].Filename); + + var extractionInput = extractedSplits[0].AsInputSource(); + var findocParams = 
new ExtractionParameters(_findocModelId); + + var invoice0 = await _client.EnqueueAndGetResultAsync( + extractionInput, findocParams); + + CheckFindocReturn(invoice0); + + extractedSplits.SaveAllToDisk(_outputDir); + + for (int i = 0; i < extractedSplits.Count; i++) + { + var fileName = $"split_{i + 1:D3}.pdf"; + var filePath = Path.Combine(_outputDir, fileName); + var fileInfo = new FileInfo(filePath); + + Assert.True(fileInfo.Exists); + Assert.True(fileInfo.Length > 0); + + var localInput = new LocalInputSource(fileInfo); + Assert.Equal(extractedSplits[i].PageCount, localInput.GetPageCount()); + } + } + } +} diff --git a/tests/Mindee.UnitTests/Extraction/ImageExtractorTest.cs b/tests/Mindee.UnitTests/Extraction/ImageExtractorTest.cs index e85640256..e3503329f 100644 --- a/tests/Mindee.UnitTests/Extraction/ImageExtractorTest.cs +++ b/tests/Mindee.UnitTests/Extraction/ImageExtractorTest.cs @@ -1,5 +1,5 @@ using Mindee.Input; -using Mindee.V1.Extraction; +using Mindee.V1.Image; using Mindee.V1.Parsing.Common; using Mindee.V1.Product.BarcodeReader; using Mindee.V1.Product.MultiReceiptsDetector; diff --git a/tests/Mindee.UnitTests/Extraction/PdfExtractorTest.cs b/tests/Mindee.UnitTests/Extraction/PdfExtractorTest.cs index b88fd3d54..a62072732 100644 --- a/tests/Mindee.UnitTests/Extraction/PdfExtractorTest.cs +++ b/tests/Mindee.UnitTests/Extraction/PdfExtractorTest.cs @@ -1,5 +1,5 @@ using Mindee.Input; -using Mindee.V1.Extraction; +using Mindee.V1.Image; using Mindee.V1.Parsing.Common; using Mindee.V1.Product.InvoiceSplitter; diff --git a/tests/Mindee.UnitTests/V2/FileOperations/CropTest.cs b/tests/Mindee.UnitTests/V2/FileOperations/CropTest.cs new file mode 100644 index 000000000..5a2fb359a --- /dev/null +++ b/tests/Mindee.UnitTests/V2/FileOperations/CropTest.cs @@ -0,0 +1,67 @@ +using Mindee.Input; +using Mindee.V2.FileOperations; +using Mindee.V2.Parsing; +using Mindee.V2.Product.Crop; + +namespace Mindee.UnitTests.V2.FileOperations +{ + [Trait("Category", "V2")] + 
[Trait("Category", "FileOperations")] + public class CropTest + { + private readonly string _cropDataDir = Path.Combine(Constants.V2RootDir, "products", "crop"); + + [Fact] + public void Processes_SinglePage_CropSplit_Correctly() + { + var inputSample = new LocalInputSource( + new FileInfo(Path.Combine(_cropDataDir, "default_sample.jpg"))); + + var localResponse = new LocalResponse( + new FileInfo(Path.Combine(_cropDataDir, "crop_single.json"))); + var doc = localResponse.DeserializeResponse(); + + var cropOperation = new Crop(inputSample); + var extractedCrops = cropOperation.ExtractCrops(doc.Inference.Result.Crops); + + Assert.Single(extractedCrops); + + Assert.Equal(0, extractedCrops[0].PageId); + Assert.Equal(0, extractedCrops[0].ElementId); + + using var bitmap0 = extractedCrops[0].Image; + Assert.Equal(2822, bitmap0.Width); + Assert.Equal(1572, bitmap0.Height); + } + + [Fact] + public void Processes_MultiPage_ReceiptSplit_Correctly() + { + var inputSample = new LocalInputSource( + new FileInfo(Path.Combine(_cropDataDir, "multipage_sample.pdf"))); + + var localResponse = new LocalResponse( + new FileInfo(Path.Combine(_cropDataDir, "crop_multiple.json"))); + var doc = localResponse.DeserializeResponse(); + + var cropOperation = new Crop(inputSample); + var extractedCrops = cropOperation.ExtractCrops(doc.Inference.Result.Crops); + + Assert.Equal(2, extractedCrops.Count); + + Assert.Equal(0, extractedCrops[0].PageId); + Assert.Equal(0, extractedCrops[0].ElementId); + + using var bitmap0 = extractedCrops[0].Image; + Assert.Equal(156, bitmap0.Width); + Assert.Equal(757, bitmap0.Height); + + Assert.Equal(0, extractedCrops[1].PageId); + Assert.Equal(1, extractedCrops[1].ElementId); + + using var bitmap1 = extractedCrops[1].Image; + Assert.Equal(188, bitmap1.Width); + Assert.Equal(691, bitmap1.Height); + } + } +} diff --git a/tests/Mindee.UnitTests/V2/FileOperations/SplitTest.cs b/tests/Mindee.UnitTests/V2/FileOperations/SplitTest.cs new file mode 100644 index 
000000000..88093be39 --- /dev/null +++ b/tests/Mindee.UnitTests/V2/FileOperations/SplitTest.cs @@ -0,0 +1,55 @@ +using Mindee.Input; +using Mindee.V2.FileOperations; +using Mindee.V2.Parsing; +using Mindee.V2.Product.Split; + +namespace Mindee.UnitTests.V2.FileOperations +{ + [Trait("Category", "V2")] + [Trait("Category", "FileOperations")] + public class SplitTest + { + private readonly string _splitDataDir = Path.Combine(Constants.V2RootDir, "products", "split"); + private readonly string _finDocDataDir = Path.Combine(Constants.V2RootDir, "products", "extraction", "financial_document"); + + [Fact] + public void Processes_SinglePage_Split_Correctly() + { + var inputSample = new LocalInputSource( + new FileInfo(Path.Combine(_finDocDataDir, "default_sample.jpg"))); + + var localResponse = new LocalResponse( + new FileInfo(Path.Combine(_splitDataDir, "split_single.json"))); + var doc = localResponse.DeserializeResponse(); + + var splitOperation = new Split(inputSample); + List splits = doc.Inference.Result.Splits; + var extractedSplits = splitOperation.ExtractSplits(splits.Select(s => s.PageRange).ToList()); + + Assert.Single(extractedSplits); + + Assert.Equal(1, extractedSplits[0].PageCount); + } + + [Fact] + public void Processes_MultiPage_ReceiptSplit_Correctly() + { + var inputSample = new LocalInputSource( + new FileInfo(Path.Combine(_splitDataDir, "invoice_5p.pdf"))); + + var localResponse = new LocalResponse( + new FileInfo(Path.Combine(_splitDataDir, "split_multiple.json"))); + var doc = localResponse.DeserializeResponse(); + + var splitOperation = new Split(inputSample); + List splits = doc.Inference.Result.Splits; + var extractedSplits = splitOperation.ExtractSplits(splits.Select(s => s.PageRange).ToList()); + + Assert.Equal(3, extractedSplits.Count); + + Assert.Equal(1, extractedSplits[0].PageCount); + Assert.Equal(3, extractedSplits[1].PageCount); + Assert.Equal(1, extractedSplits[2].PageCount); + } + } +}