Skip to content
Merged
2 changes: 1 addition & 1 deletion Directory.Build.props
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@

<!-- Common packages for all frameworks -->
<ItemGroup>
<PackageReference Include="SkiaSharp" Version="3.119.1" />
<PackageReference Include="SkiaSharp" Version="3.119.2" />
<PackageReference Include="SkiaSharp.NativeAssets.Linux.NoDependencies" Version="3.119.1" />
<PackageReference Include="Docnet.Core" Version="2.6.0" />
</ItemGroup>
Expand Down
174 changes: 174 additions & 0 deletions src/Mindee/Extraction/ImageExtractor.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
using System;
using System.Collections.Generic;
using System.IO;
using Docnet.Core;
using Docnet.Core.Models;
using Mindee.Geometry;
using Mindee.Image;
using Mindee.Input;
using SkiaSharp;

namespace Mindee.Extraction
{
/// <summary>
/// Extract sub-images from an image.
/// </summary>
public class ImageExtractor
{
/// <summary>
/// Filename of the input, copied from the <see cref="LocalInputSource"/> at construction.
/// </summary>
protected readonly string _filename;
/// <summary>
/// One SKBitmap per page of the input: every rendered page for a PDF, a single entry for an image.
/// </summary>
private readonly List<SKBitmap> _pageImages;
/// <summary>
/// Format to save the resulting images as.
/// When not given to the constructor, it is taken from the input's file extension, or "jpg" when the input is a PDF.
/// </summary>
protected readonly string SaveFormat;

/// <summary>
/// LocalInputSource object this ImageExtractor was built from.
/// </summary>
public readonly LocalInputSource LocalInput;

/// <summary>
/// Init from a Local Input Source.
/// </summary>
/// <remarks>
/// A PDF input is rendered page by page; any other input is decoded as a single image.
/// </remarks>
/// <param name="localInput">Locally loaded resource.</param>
/// <param name="saveFormat">
/// Format to save the resulting images as. When null, the extension of the input's filename is used,
/// falling back to "jpg" when the input is a PDF or the filename has no extension.
/// </param>
public ImageExtractor(LocalInputSource localInput, string saveFormat = null)
{
    _filename = localInput.Filename;
    _pageImages = [];
    LocalInput = localInput;

    if (saveFormat != null)
    {
        SaveFormat = saveFormat;
    }
    else
    {
        // Path.GetExtension returns "" (not null) for a name without an extension, so the
        // leading dot is trimmed instead of calling Substring(1), which would throw on "".
        var extension = Path.GetExtension(localInput.Filename)?.TrimStart('.');

        // The extension is an identifier, not user-facing text: compare ordinally.
        var isImageExtension = !string.IsNullOrEmpty(extension)
            && !extension.Equals("pdf", StringComparison.OrdinalIgnoreCase);

        SaveFormat = isImageExtension ? extension : "jpg";
    }

    if (localInput.IsPdf())
    {
        _pageImages.AddRange(PdfToImages(localInput.FileBytes));
    }
    else
    {
        // NOTE(review): SKBitmap.Decode is documented to return null when the data cannot be
        // decoded; that null is stored as the single page here - confirm whether to reject it.
        _pageImages.Add(SKBitmap.Decode(localInput.FileBytes));
    }
}

/// <summary>
/// Init from a file path, by wrapping it in a new <see cref="LocalInputSource"/>.
/// </summary>
/// <param name="filePath">Path to the file.</param>
public ImageExtractor(string filePath)
    : this(new LocalInputSource(filePath)) { }

/// <summary>
/// Renders each page of the input PDF to its own bitmap.
/// </summary>
/// <remarks>
/// The whole rendering runs under a lock on <c>DocLib.Instance</c>.
/// </remarks>
/// <param name="fileBytes">Raw bytes of the input PDF.</param>
/// <returns>The rendered pages as SKBitmap, in page order.</returns>
private static List<SKBitmap> PdfToImages(byte[] fileBytes)
{
    var renderedPages = new List<SKBitmap>();

    lock (DocLib.Instance)
    {
        using (var reader = DocLib.Instance.GetDocReader(fileBytes, new PageDimensions(1)))
        {
            var pageIndex = 0;
            while (pageIndex < reader.GetPageCount())
            {
                using (var page = reader.GetPageReader(pageIndex))
                {
                    var pageWidth = page.GetPageWidth();
                    var pageHeight = page.GetPageHeight();
                    var rawPixels = page.GetImage();

                    // Raw page bytes -> 3D array -> SKBitmap, via the project's ImageUtils helpers.
                    var pixelArray = ImageUtils.ConvertTo3DArray(rawPixels, pageWidth, pageHeight);
                    renderedPages.Add(ImageUtils.ArrayToImage(pixelArray));
                }

                pageIndex++;
            }
        }
    }

    return renderedPages;
}

/// <summary>
/// Splits a filename into its name (without extension) and its extension (without the leading dot).
/// </summary>
/// <param name="filename">Filename to split.</param>
/// <returns>A two-element array: the name first, then the extension.</returns>
protected static string[] SplitNameStrict(string filename)
{
    var stem = Path.GetFileNameWithoutExtension(filename);
    var extension = Path.GetExtension(filename).TrimStart('.');

    return new[] { stem, extension };
}

/// <summary>
/// Gets how many page images were loaded from the file.
/// </summary>
/// <returns>The number of pages in the file.</returns>
public int GetPageCount() => _pageImages.Count;

/// <summary>
/// Crops the area covered by a bounding box out of one page.
/// </summary>
/// <remarks>
/// The bounding box coordinates are multiplied by the page's pixel width and height and then
/// rounded, so they are presumably relative to the page size - confirm against Bbox.
/// </remarks>
/// <param name="bbox">Bounding box of the field.</param>
/// <param name="pageIndex">Index of the page containing the field.</param>
/// <returns>Extracted image as an SKBitmap.</returns>
protected SKBitmap ExtractImage(Bbox bbox, int pageIndex)
{
    var page = _pageImages[pageIndex];
    var pageWidth = page.Width;
    var pageHeight = page.Height;

    // Scale the box to pixel coordinates on this page.
    var left = (int)Math.Round(bbox.MinX * pageWidth);
    var right = (int)Math.Round(bbox.MaxX * pageWidth);
    var top = (int)Math.Round(bbox.MinY * pageHeight);
    var bottom = (int)Math.Round(bbox.MaxY * pageHeight);

    var cropped = new SKBitmap(right - left, bottom - top);
    using (var canvas = new SKCanvas(cropped))
    {
        var target = new SKRect(0, 0, cropped.Width, cropped.Height);
        var source = new SKRect(left, top, right, bottom);

        // Copy the source region of the page onto the full surface of the new bitmap.
        canvas.DrawBitmap(page, source, target);
    }

    return cropped;
}

/// <summary>
/// Extracts multiple images from a field having position data.
/// </summary>
/// <remarks>
/// Each extracted image is named <c>{Filename}_page{pageId}-{index}.{SaveFormat}</c>, where
/// <c>index</c> is the position of its polygon in <paramref name="polygons"/>.
/// </remarks>
/// <param name="pageId">The page index to extract, begins at 0.</param>
/// <param name="polygons">The list of polygons representing the position data.</param>
/// <returns>A list of extracted images, in the same order as <paramref name="polygons"/>.</returns>
public List<ExtractedImage> ExtractMultipleImagesFromSource(int pageId, List<Polygon> polygons)
{
    var filename = LocalInput.Filename;
    var extractedImages = new List<ExtractedImage>(polygons.Count);

    for (var i = 0; i < polygons.Count; i++)
    {
        var bbox = Utils.BboxFromPolygon(polygons[i]);

        // Use the loop position, not polygons.IndexOf(...): IndexOf returns the first match, so
        // a polygon appearing twice would get a duplicate filename, and each lookup is a linear scan.
        var fieldFilename = $"{filename}_page{pageId}-{i}.{SaveFormat}";

        extractedImages.Add(
            new ExtractedImage(ExtractImage(bbox, pageId), fieldFilename, SaveFormat, pageId, i));
    }

    return extractedImages;
}
}
}
115 changes: 115 additions & 0 deletions src/Mindee/Extraction/PdfExtractor.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
using System;
using System.Collections.Generic;
using System.IO;
using Docnet.Core;
using Microsoft.Extensions.Logging.Abstractions;
using Mindee.Exceptions;
using Mindee.Input;
using Mindee.Pdf;
using SkiaSharp;

namespace Mindee.Extraction
{
/// <summary>
/// PDF extraction class.
/// </summary>
public class PdfExtractor
{
/// <summary>
/// Local input source the extractor was built from.
/// </summary>
protected readonly LocalInputSource LocalInput;

/// <summary>
/// Source PDF bytes. Assigned by <c>PdfBytes()</c> the first time it runs and reused afterwards.
/// </summary>
protected byte[] SourcePdf;

/// <summary>
/// Creates a <see cref="PdfExtractor" /> bound to the given input source.
/// </summary>
/// <param name="localInput">Instance of a LocalInputSource, provided by the user.</param>
public PdfExtractor(LocalInputSource localInput) => LocalInput = localInput;

/// <summary>
/// Gets the page count by delegating to the input source's own GetPageCount().
/// </summary>
/// <returns>The number of pages in the file.</returns>
public int GetPageCount() => LocalInput.GetPageCount();

/// <summary>
/// Gets the input as PDF bytes, converting it first when the input is not a PDF.
/// </summary>
/// <remarks>
/// The result is stored in <c>SourcePdf</c> on the first call and returned as-is on later calls.
/// A non-PDF input is decoded as an image and drawn onto a single PDF page of the same size.
/// </remarks>
/// <returns>The bytes of the PDF document.</returns>
protected byte[] PdfBytes()
{
    if (SourcePdf != null)
    {
        return SourcePdf;
    }

    if (LocalInput.IsPdf())
    {
        SourcePdf = LocalInput.FileBytes;
        return SourcePdf;
    }

    using var image = SKImage.FromEncodedData(LocalInput.FileBytes);
    using var bmp = SKBitmap.FromImage(image);

    // The stream is disposed on exit; its content is copied out with ToArray() before that.
    using var memoryStream = new MemoryStream();

    // The document must be disposed before reading the stream, so it gets its own scope.
    using (var document = SKDocument.CreatePdf(memoryStream))
    {
        var pageSize = new SKSize(bmp.Width, bmp.Height);
        var canvas = document.BeginPage(pageSize.Width, pageSize.Height);
        canvas.DrawBitmap(bmp, SKPoint.Empty);
        document.EndPage();
    }

    SourcePdf = memoryStream.ToArray();
    return SourcePdf;
}

/// <summary>
/// Extracts sub-documents from the source document using list of page indexes.
/// </summary>
/// <remarks>
/// Each sub-document is named <c>{name}_{first}-{last}{extension}</c>, where <c>first</c> and
/// <c>last</c> are the first and last entries of its page list, 1-based and zero-padded to 3 digits.
/// </remarks>
/// <param name="pageIndexes">List of sub-lists of pages to keep.</param>
/// <returns>Extracted documents, one per sub-list, in the same order.</returns>
/// <exception cref="MindeeInputException">A sub-list of page indexes is empty.</exception>
/// <exception cref="OverflowException">A page index does not fit in a 16-bit integer.</exception>
public List<ExtractedPdf> ExtractSubDocuments(List<List<int>> pageIndexes)
{
    var extractedPdfs = new List<ExtractedPdf>();

    foreach (var pageIndexElem in pageIndexes)
    {
        if (pageIndexElem.Count == 0)
        {
            throw new MindeeInputException("Empty indexes not allowed for extraction.");
        }

        var extension = Path.GetExtension(LocalInput.Filename);
        var prefix = Path.GetFileNameWithoutExtension(LocalInput.Filename);
        var firstPage = pageIndexElem[0] + 1;
        var lastPage = pageIndexElem[pageIndexElem.Count - 1] + 1;
        var fieldFilename = $"{prefix}_{firstPage:D3}-{lastPage:D3}{extension}";

        // checked: an index outside the short range must fail loudly rather than silently
        // wrap around to a different page number.
        var pages = pageIndexElem.ConvertAll(item => checked((short)item)).ToArray();
        var splitQuery = new SplitQuery(PdfBytes(), new PageOptions(pages));

        lock (DocLib.Instance)
        {
            var pdfOperation = new DocNetApi(new NullLogger<DocNetApi>());
            var mergedPdfBytes = pdfOperation.Split(splitQuery).File;
            extractedPdfs.Add(new ExtractedPdf(mergedPdfBytes, fieldFilename));
        }
    }

    return extractedPdfs;
}
}
}
Loading
Loading