From a6c695bf37a33739128a600e2d361a907ed17949 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Wed, 1 Apr 2026 16:32:09 +0200 Subject: [PATCH 1/6] :sparkles: add support for V2 crop & split operations --- examples/auto_invoice_splitter_extraction.rb | 2 +- lib/mindee.rb | 12 +- lib/mindee/error.rb | 6 + lib/mindee/{errors => error}/mindee_error.rb | 4 +- .../{errors => error}/mindee_http_error.rb | 6 +- .../{errors => error}/mindee_http_error_v2.rb | 2 +- .../mindee_http_unknown_error_v2.rb | 4 +- .../{errors => error}/mindee_input_error.rb | 4 +- lib/mindee/errors.rb | 6 - lib/mindee/http/http_error_handler.rb | 14 +- lib/mindee/image/extracted_image.rb | 37 ++-- lib/mindee/image/image_extractor.rb | 3 +- lib/mindee/image/image_utils.rb | 6 +- lib/mindee/input/base_parameters.rb | 4 +- lib/mindee/input/local_response.rb | 6 +- .../input/sources/local_input_source.rb | 8 +- lib/mindee/input/sources/url_input_source.rb | 8 +- lib/mindee/pdf/extracted_pdf.rb | 96 +++++----- lib/mindee/pdf/pdf_extractor.rb | 181 +++++++++--------- lib/mindee/v1/client.rb | 18 +- .../v1/extraction/multi_receipts_extractor.rb | 2 +- lib/mindee/v1/http/endpoint.rb | 12 +- lib/mindee/v1/http/workflow_endpoint.rb | 8 +- lib/mindee/v1/parsing/common/api_response.rb | 2 +- lib/mindee/v2.rb | 1 + lib/mindee/v2/client.rb | 6 +- lib/mindee/v2/file_operation.rb | 6 + lib/mindee/v2/file_operation/crop.rb | 51 +++++ lib/mindee/v2/file_operation/crop_files.rb | 25 +++ lib/mindee/v2/file_operation/split.rb | 37 ++++ lib/mindee/v2/file_operation/split_files.rb | 25 +++ lib/mindee/v2/http/api_settings.rb | 4 +- lib/mindee/v2/http/mindee_api.rb | 12 +- lib/mindee/v2/parsing/error_response.rb | 4 +- lib/mindee/v2/parsing/field/base_field.rb | 2 +- lib/mindee/v2/parsing/field/list_field.rb | 2 +- lib/mindee/v2/product/crop/crop_item.rb | 10 + lib/mindee/v2/product/crop/crop_response.rb | 11 ++ .../extraction/params/data_schema_replace.rb | 2 +- 
lib/mindee/v2/product/split/split_range.rb | 8 + lib/mindee/v2/product/split/split_response.rb | 10 + sig/mindee.rbs | 2 +- sig/mindee/{errors => error}/mindee_error.rbs | 4 +- .../{errors => error}/mindee_http_error.rbs | 6 +- .../mindee_http_error_v2.rbs | 4 +- .../mindee_http_unknown_error_v2.rbs | 4 +- .../{errors => error}/mindee_input_error.rbs | 4 +- sig/mindee/http/http_error_handler.rbs | 8 +- sig/mindee/image/extracted_image.rbs | 10 +- sig/mindee/pdf/extracted_pdf.rbs | 20 +- sig/mindee/pdf/pdf_extractor.rbs | 22 ++- sig/mindee/v2/file_operation/crop.rbs | 10 + sig/mindee/v2/file_operation/crop_files.rbs | 9 + sig/mindee/v2/file_operation/split.rbs | 11 ++ sig/mindee/v2/file_operation/split_files.rbs | 9 + sig/mindee/v2/product/crop/crop_item.rbs | 3 + sig/mindee/v2/product/crop/crop_response.rbs | 2 + sig/mindee/v2/product/split/split_range.rbs | 5 +- .../v2/product/split/split_response.rbs | 2 + spec/data.rb | 1 + spec/image/extracted_image_spec.rb | 10 +- spec/image/image_utils_spec.rb | 4 +- spec/input/sources/sources_spec.rb | 4 +- spec/input/sources/url_input_source_spec.rb | 6 +- spec/openssl_crl_workaround.rb | 2 +- spec/pdf/extracted_pdf_spec.rb | 14 +- spec/pdf/pdf_extractor_spec.rb | 6 +- spec/v1/client_spec.rb | 6 +- ...invoice_splitter_extraction_integration.rb | 2 +- .../multi_receipts_extractor_spec.rb | 2 +- spec/v1/http/error_handler_integration.rb | 6 +- spec/v1/http/error_handler_spec.rb | 12 +- spec/v1/input/local_response_v1_spec.rb | 4 +- spec/v2/client_v2_integration.rb | 10 +- spec/v2/client_v2_spec.rb | 8 +- .../crop_operation_integration.rb | 71 +++++++ spec/v2/file_operation/crop_operation_spec.rb | 61 ++++++ .../split_operation_integration.rb | 84 ++++++++ .../v2/file_operation/split_operation_spec.rb | 49 +++++ spec/v2/input/local_response_v2_spec.rb | 4 +- 80 files changed, 852 insertions(+), 316 deletions(-) create mode 100644 lib/mindee/error.rb rename lib/mindee/{errors => error}/mindee_error.rb (86%) rename 
lib/mindee/{errors => error}/mindee_http_error.rb (89%) rename lib/mindee/{errors => error}/mindee_http_error_v2.rb (99%) rename lib/mindee/{errors => error}/mindee_http_unknown_error_v2.rb (89%) rename lib/mindee/{errors => error}/mindee_input_error.rb (92%) delete mode 100644 lib/mindee/errors.rb create mode 100644 lib/mindee/v2/file_operation.rb create mode 100644 lib/mindee/v2/file_operation/crop.rb create mode 100644 lib/mindee/v2/file_operation/crop_files.rb create mode 100644 lib/mindee/v2/file_operation/split.rb create mode 100644 lib/mindee/v2/file_operation/split_files.rb rename sig/mindee/{errors => error}/mindee_error.rbs (81%) rename sig/mindee/{errors => error}/mindee_http_error.rbs (78%) rename sig/mindee/{errors => error}/mindee_http_error_v2.rbs (86%) rename sig/mindee/{errors => error}/mindee_http_unknown_error_v2.rbs (73%) rename sig/mindee/{errors => error}/mindee_input_error.rbs (86%) create mode 100644 sig/mindee/v2/file_operation/crop.rbs create mode 100644 sig/mindee/v2/file_operation/crop_files.rbs create mode 100644 sig/mindee/v2/file_operation/split.rbs create mode 100644 sig/mindee/v2/file_operation/split_files.rbs create mode 100644 spec/v2/file_operation/crop_operation_integration.rb create mode 100644 spec/v2/file_operation/crop_operation_spec.rb create mode 100644 spec/v2/file_operation/split_operation_integration.rb create mode 100644 spec/v2/file_operation/split_operation_spec.rb diff --git a/examples/auto_invoice_splitter_extraction.rb b/examples/auto_invoice_splitter_extraction.rb index 7d45e34d6..9056272c6 100644 --- a/examples/auto_invoice_splitter_extraction.rb +++ b/examples/auto_invoice_splitter_extraction.rb @@ -22,7 +22,7 @@ def parse_single_page(mindee_client, input_source) end def parse_multi_page(mindee_client, input_source) - pdf_extractor = Mindee::PDF::PDFExtractor::PDFExtractor.new(input_source) + pdf_extractor = Mindee::PDF::PDFExtractor.new(input_source) invoice_splitter_response = mindee_client.parse( 
input_source, Mindee::V1::Product::InvoiceSplitter::InvoiceSplitterV1, diff --git a/lib/mindee.rb b/lib/mindee.rb index c0a1f0209..fb157a3e8 100644 --- a/lib/mindee.rb +++ b/lib/mindee.rb @@ -7,7 +7,7 @@ module Mindee # Mindee internal error module. - module Errors + module Error end # Custom extraction module @@ -94,6 +94,16 @@ module V2 module HTTP end + module FileOperation + # Crop operations. + module Crop + end + + # Split operations. + module Split + end + end + # Product-specific module. module Product end diff --git a/lib/mindee/error.rb b/lib/mindee/error.rb new file mode 100644 index 000000000..e6c9b9e6e --- /dev/null +++ b/lib/mindee/error.rb @@ -0,0 +1,6 @@ +# frozen_string_literal: true + +require_relative 'error/mindee_error' +require_relative 'error/mindee_http_error' +require_relative 'error/mindee_http_error_v2' +require_relative 'error/mindee_input_error' diff --git a/lib/mindee/errors/mindee_error.rb b/lib/mindee/error/mindee_error.rb similarity index 86% rename from lib/mindee/errors/mindee_error.rb rename to lib/mindee/error/mindee_error.rb index 0f0cee391..db4211de4 100644 --- a/lib/mindee/errors/mindee_error.rb +++ b/lib/mindee/error/mindee_error.rb @@ -1,8 +1,8 @@ # frozen_string_literal: true module Mindee - module Errors - # Base class for all custom mindee errors. + module Error + # Base class for all custom mindee errors. class MindeeError < StandardError; end # Errors relating to library issues. 
diff --git a/lib/mindee/errors/mindee_http_error.rb b/lib/mindee/error/mindee_http_error.rb similarity index 89% rename from lib/mindee/errors/mindee_http_error.rb rename to lib/mindee/error/mindee_http_error.rb index 1694ef2c6..db4ce35d5 100644 --- a/lib/mindee/errors/mindee_http_error.rb +++ b/lib/mindee/error/mindee_http_error.rb @@ -3,7 +3,7 @@ require_relative 'mindee_error' module Mindee - module Errors + module Error # API HttpError class MindeeHTTPError < MindeeError # @return [String] @@ -27,10 +27,10 @@ def initialize(http_error, url, code) end end - # Base class for all client-side errors. + # Base class for all client-side errors. class MindeeHTTPClientError < MindeeHTTPError; end - # Base class for all server-side errors. + # Base class for all server-side errors. class MindeeHTTPServerError < MindeeHTTPError; end end end diff --git a/lib/mindee/errors/mindee_http_error_v2.rb b/lib/mindee/error/mindee_http_error_v2.rb similarity index 99% rename from lib/mindee/errors/mindee_http_error_v2.rb rename to lib/mindee/error/mindee_http_error_v2.rb index a482d1df9..44d331ec0 100644 --- a/lib/mindee/errors/mindee_http_error_v2.rb +++ b/lib/mindee/error/mindee_http_error_v2.rb @@ -4,7 +4,7 @@ require_relative '../v2/parsing/error_item' module Mindee - module Errors + module Error # API V2 HttpError class MindeeHTTPErrorV2 < MindeeError # @return [Integer] The HTTP status code returned by the server. diff --git a/lib/mindee/errors/mindee_http_unknown_error_v2.rb b/lib/mindee/error/mindee_http_unknown_error_v2.rb similarity index 89% rename from lib/mindee/errors/mindee_http_unknown_error_v2.rb rename to lib/mindee/error/mindee_http_unknown_error_v2.rb index 5f2a9ebec..2bcbcfc5d 100644 --- a/lib/mindee/errors/mindee_http_unknown_error_v2.rb +++ b/lib/mindee/error/mindee_http_unknown_error_v2.rb @@ -3,7 +3,7 @@ require_relative 'mindee_error' module Mindee - module Errors + module Error # Unknown HTTP error for the V2 API. 
class MindeeHTTPUnknownErrorV2 < MindeeHTTPErrorV2 def initialize(http_error) @@ -11,7 +11,7 @@ def initialize(http_error) 'status' => -1, 'title' => 'Unknown Error', 'code' => '000-000', - 'errors' => nil }) + 'error' => nil }) end end end diff --git a/lib/mindee/errors/mindee_input_error.rb b/lib/mindee/error/mindee_input_error.rb similarity index 92% rename from lib/mindee/errors/mindee_input_error.rb rename to lib/mindee/error/mindee_input_error.rb index 6554198c4..015d92510 100644 --- a/lib/mindee/errors/mindee_input_error.rb +++ b/lib/mindee/error/mindee_input_error.rb @@ -1,8 +1,8 @@ # frozen_string_literal: true module Mindee - module Errors - # Base class for errors relating to input documents. + module Error + # Base class for errors relating to input documents. class MindeeInputError < MindeeError; end # Errors relating to sources (documents) handling. diff --git a/lib/mindee/errors.rb b/lib/mindee/errors.rb deleted file mode 100644 index 3c403704f..000000000 --- a/lib/mindee/errors.rb +++ /dev/null @@ -1,6 +0,0 @@ -# frozen_string_literal: true - -require_relative 'errors/mindee_error' -require_relative 'errors/mindee_http_error' -require_relative 'errors/mindee_http_error_v2' -require_relative 'errors/mindee_input_error' diff --git a/lib/mindee/http/http_error_handler.rb b/lib/mindee/http/http_error_handler.rb index 8785aabef..b559d9c8a 100644 --- a/lib/mindee/http/http_error_handler.rb +++ b/lib/mindee/http/http_error_handler.rb @@ -1,7 +1,7 @@ # frozen_string_literal: true require 'json' -require_relative '../errors/mindee_http_error' +require_relative '../error/mindee_http_error' module Mindee module HTTP @@ -81,11 +81,11 @@ def handle_error(url, response) error_obj = create_error_obj(parsed_hash) case code when 400..499 - Errors::MindeeHTTPClientError.new(error_obj || {}, url, code) + Error::MindeeHTTPClientError.new(error_obj || {}, url, code) when 500..599 - Errors::MindeeHTTPServerError.new(error_obj || {}, url, code) + 
Error::MindeeHTTPServerError.new(error_obj || {}, url, code) else - Errors::MindeeHTTPError.new(error_obj || {}, url, code) + Error::MindeeHTTPError.new(error_obj || {}, url, code) end end @@ -94,11 +94,11 @@ def handle_error(url, response) def generate_v2_error(hashed_response) code = hashed_response[:code].to_i if hashed_response.key?(:status) - Errors::MindeeHTTPErrorV2.new(hashed_response.transform_keys(&:to_s)) + Error::MindeeHTTPErrorV2.new(hashed_response.transform_keys(&:to_s)) elsif code < 200 || code > 399 - Errors::MindeeHTTPErrorV2.new({ 'status' => code, 'detail' => 'No details available.' }) + Error::MindeeHTTPErrorV2.new({ 'status' => code, 'detail' => 'No details available.' }) else - Errors::MindeeHTTPErrorV2.new({ 'status' => -1, 'detail' => 'Unknown Error.' }) + Error::MindeeHTTPErrorV2.new({ 'status' => -1, 'detail' => 'Unknown Error.' }) end end end diff --git a/lib/mindee/image/extracted_image.rb b/lib/mindee/image/extracted_image.rb index 71d525455..afd3f5864 100644 --- a/lib/mindee/image/extracted_image.rb +++ b/lib/mindee/image/extracted_image.rb @@ -18,23 +18,29 @@ class ExtractedImage attr_reader :buffer # Internal name for the file. - attr_reader :internal_file_name + attr_reader :filename # Initializes the ExtractedImage with a buffer and an internal file name. # - # @param input_source [LocalInputSource] Local source for input. + # @param input_source [LocalInputSource, BytesInputSource] Local source for input. # @param page_id [Integer] ID of the page the element was found on. # @param element_id [Integer, nil] ID of the element in a page. - def initialize(input_source, page_id, element_id) + # @param preserve_input_filename [Boolean] If true, keep the input source filename as-is. + def initialize(input_source, page_id, element_id, preserve_input_filename: false) @buffer = StringIO.new(input_source.io_stream.read.to_s) @buffer.rewind - extension = if input_source.pdf? 
- '.jpg' + + @filename = if preserve_input_filename + input_source.filename.to_s else - File.extname(input_source.filename) + extension = if input_source.pdf? + '.jpg' + else + File.extname(input_source.filename) + end + base_name = File.basename(input_source.filename, File.extname(input_source.filename)) + "#{base_name}_p#{page_id}_#{element_id}#{extension}" end - base_name = File.basename(input_source.filename, File.extname(input_source.filename)) - @internal_file_name = "#{base_name}_p#{page_id}_#{element_id}#{extension}" @page_id = page_id @element_id = element_id.nil? ? 0 : element_id end @@ -48,7 +54,7 @@ def initialize(input_source, page_id, element_id) def write_to_file(output_path, file_format = nil) resolved_path = Pathname.new(File.expand_path(output_path)) if file_format.nil? - raise Errors::MindeeImageError, 'Invalid file format.' if resolved_path.extname.delete('.').empty? + raise Error::MindeeImageError, 'Invalid file format.' if resolved_path.extname.delete('.').empty? file_format = resolved_path.extname.delete('.').upcase end @@ -59,8 +65,8 @@ def write_to_file(output_path, file_format = nil) image.write resolved_path.to_s logger.info("File saved successfully to '#{resolved_path}'") rescue StandardError - raise Errors::MindeeImageError, "Could not save file '#{output_path}'. " \ - 'Is the provided file path valid?.' + raise Error::MindeeImageError, "Could not save file '#{output_path}'. " \ + 'Is the provided file path valid?.' end end @@ -69,7 +75,14 @@ def write_to_file(output_path, file_format = nil) # @return [FileInputSource] A BufferInput source. def as_source @buffer.rewind - Mindee::Input::Source::BytesInputSource.new(@buffer.read || '', @internal_file_name) + Mindee::Input::Source::BytesInputSource.new(@buffer.read || '', @filename) + end + + # Return the file as a Mindee-compatible BufferInput source. + # + # @return [FileInputSource] A BufferInput source. 
+ def as_input_source + as_source end end end diff --git a/lib/mindee/image/image_extractor.rb b/lib/mindee/image/image_extractor.rb index 685178553..0599b425d 100644 --- a/lib/mindee/image/image_extractor.rb +++ b/lib/mindee/image/image_extractor.rb @@ -94,7 +94,8 @@ def self.create_extracted_image(buffer, file_name, page_id, element_id) ExtractedImage.new( Input::Source::BytesInputSource.new(buffer.read.to_s, file_name), page_id, - element_id + element_id, + preserve_input_filename: true ) end diff --git a/lib/mindee/image/image_utils.rb b/lib/mindee/image/image_utils.rb index 5accd8f1b..a9f3bd694 100644 --- a/lib/mindee/image/image_utils.rb +++ b/lib/mindee/image/image_utils.rb @@ -37,7 +37,7 @@ def self.to_image(image) MiniMagick::Image.read(image) else img_class = image.class ? image.class.to_s : 'unknown format' - raise Errors::MindeeImageError, "Expected an I/O object or a MiniMagick::Image. '#{img_class}' given instead." + raise Error::MindeeImageError, "Expected an I/O object or a MiniMagick::Image. '#{img_class}' given instead." end end @@ -60,7 +60,7 @@ def self.image_to_stringio(image, format = 'JPEG') # @param max_width [Integer] Maximum width. If not specified, the horizontal ratio will remain the same. # @param max_height [Integer] Maximum height. If not specified, the vertical ratio will remain the same. def self.calculate_new_dimensions(original, max_width: nil, max_height: nil) - raise Errors::MindeeImageError, 'Provided image could not be processed for resizing.' if original.nil? + raise Error::MindeeImageError, 'Provided image could not be processed for resizing.' if original.nil? return [original.width, original.height] if max_width.nil? && max_height.nil? @@ -111,7 +111,7 @@ def self.normalize_polygon(polygon) elsif polygon.is_a?(Mindee::Geometry::Quadrilateral) polygon else - raise Errors::MindeeGeometryError, 'Provided polygon has an invalid type.' + raise Error::MindeeGeometryError, 'Provided polygon has an invalid type.' 
end end diff --git a/lib/mindee/input/base_parameters.rb b/lib/mindee/input/base_parameters.rb index 680353e5e..2de9352d4 100644 --- a/lib/mindee/input/base_parameters.rb +++ b/lib/mindee/input/base_parameters.rb @@ -31,7 +31,7 @@ def initialize( polling_options: nil, close_file: true ) - raise Errors::MindeeInputError, 'Model ID is required.' if model_id.empty? || model_id.nil? + raise Error::MindeeInputError, 'Model ID is required.' if model_id.empty? || model_id.nil? @model_id = model_id @file_alias = file_alias @@ -72,7 +72,7 @@ def self.load_from_hash(params: {}) params.transform_keys!(&:to_sym) if params.empty? || params[:model_id].nil? || params[:model_id].empty? - raise Errors::MindeeInputError, 'Model ID is required.' + raise Error::MindeeInputError, 'Model ID is required.' end polling_options_input = params.fetch(:page_options, PollingOptions.new) diff --git a/lib/mindee/input/local_response.rb b/lib/mindee/input/local_response.rb index b32711e5e..211953a0a 100644 --- a/lib/mindee/input/local_response.rb +++ b/lib/mindee/input/local_response.rb @@ -27,7 +27,7 @@ def initialize(input_file) end @file.rewind else - raise Errors::MindeeInputError, "Incompatible type for input '#{input_file.class}'." + raise Error::MindeeInputError, "Incompatible type for input '#{input_file.class}'." end end @@ -38,7 +38,7 @@ def as_hash file_str = @file.read or raise 'File could not be read' JSON.parse(file_str, object_class: Hash) rescue JSON::ParserError - raise Errors::MindeeInputError, "File is not a valid dict. #{file_str}" + raise Error::MindeeInputError, "File is not a valid dict. #{file_str}" end # Processes the secret key @@ -57,7 +57,7 @@ def get_hmac_signature(secret_key) mac = OpenSSL::HMAC.hexdigest(algorithm, self.class.process_secret_key(secret_key), @file.read || raise('File could not be read')) rescue StandardError - raise Errors::MindeeInputError, 'Could not get HMAC signature from payload.' 
+ raise Error::MindeeInputError, 'Could not get HMAC signature from payload.' end mac end diff --git a/lib/mindee/input/sources/local_input_source.rb b/lib/mindee/input/sources/local_input_source.rb index 45e1116b5..7e64c41af 100644 --- a/lib/mindee/input/sources/local_input_source.rb +++ b/lib/mindee/input/sources/local_input_source.rb @@ -53,7 +53,7 @@ def initialize(io_stream, filename, repair_pdf: false) return if ALLOWED_MIME_TYPES.include? @file_mimetype end - raise Errors::MindeeMimeTypeError, @file_mimetype.to_s + raise Error::MindeeMimeTypeError, @file_mimetype.to_s end # @deprecated See {#fix_pdf!} or {#self.fix_pdf} instead. @@ -69,7 +69,7 @@ def pdf? # Attempts to fix the PDF data in the file. # @param maximum_offset [Integer] Maximum offset to look for the PDF header. # @return [void] - # @raise [Mindee::Errors::MindeePDFError] + # @raise [Mindee::Error::MindeePDFError] def fix_pdf!(maximum_offset: 500) @io_stream = LocalInputSource.fix_pdf(@io_stream, maximum_offset: maximum_offset) @io_stream.rewind @@ -80,11 +80,11 @@ def fix_pdf!(maximum_offset: 500) # @param stream [StringIO] The stream to fix. # @param maximum_offset [Integer] Maximum offset to look for the PDF header. # @return [StringIO] The fixed stream. - # @raise [Mindee::Errors::MindeePDFError] + # @raise [Mindee::Error::MindeePDFError] def self.fix_pdf(stream, maximum_offset: 500) out_stream = StringIO.new stream.gets('%PDF-') - raise Errors::MindeePDFError if stream.eof? || stream.pos > maximum_offset + raise Error::MindeePDFError if stream.eof? 
|| stream.pos > maximum_offset stream.pos = stream.pos - 5 out_stream << stream.read diff --git a/lib/mindee/input/sources/url_input_source.rb b/lib/mindee/input/sources/url_input_source.rb index e344fe5b1..3947c7233 100644 --- a/lib/mindee/input/sources/url_input_source.rb +++ b/lib/mindee/input/sources/url_input_source.rb @@ -14,7 +14,7 @@ class URLInputSource attr_reader :url def initialize(url) - raise Errors::MindeeInputError, 'URL must be HTTPS' unless url.start_with? 'https://' + raise Error::MindeeInputError, 'URL must be HTTPS' unless url.start_with? 'https://' logger.debug("URL input: #{url}") @@ -75,9 +75,9 @@ def fetch_file_content(username: nil, password: nil, token: nil, max_redirects: response = make_request(uri, request, max_redirects) if response.code.to_i > 299 - raise Errors::MindeeAPIError, "Failed to download file: HTTP status code #{response.code}" + raise Error::MindeeAPIError, "Failed to download file: HTTP status code #{response.code}" elsif response.code.to_i < 200 - raise Errors::MindeeAPIError, "Failed to download file: Invalid response code #{response.code}." + raise Error::MindeeAPIError, "Failed to download file: Invalid response code #{response.code}." end response.body @@ -103,7 +103,7 @@ def make_request(uri, request, max_redirects) response = http.request(request) if response.is_a?(Net::HTTPRedirection) && max_redirects.positive? location = response['location'] - raise Errors::MindeeInputError, 'No location in redirection header.' if location.nil? + raise Error::MindeeInputError, 'No location in redirection header.' if location.nil? new_uri = URI.parse(location) request = Net::HTTP::Get.new(new_uri) diff --git a/lib/mindee/pdf/extracted_pdf.rb b/lib/mindee/pdf/extracted_pdf.rb index 5ac3f7671..4c7a1ab0d 100644 --- a/lib/mindee/pdf/extracted_pdf.rb +++ b/lib/mindee/pdf/extracted_pdf.rb @@ -3,57 +3,61 @@ module Mindee # PDF Extraction Module. module PDF - module PDFExtractor - # An extracted sub-Pdf. 
- class ExtractedPDF - # Byte contents of the pdf - # @return [StringIO] - attr_reader :pdf_bytes - - # Name of the file. - # @return [String] - attr_reader :filename - - # @param pdf_bytes [StringIO] - # @param filename [String] - def initialize(pdf_bytes, filename) - @pdf_bytes = pdf_bytes - @filename = filename - end + # An extracted sub-Pdf. + class ExtractedPDF + # Byte contents of the pdf + # @return [StringIO] + attr_reader :pdf_bytes - # Retrieves the page count for a given pdf. - # @return [Integer] - def page_count - current_pdf = Mindee::PDF::PDFProcessor.open_pdf(pdf_bytes) - current_pdf.pages.size - rescue TypeError, Origami::InvalidPDFError - raise Errors::MindeePDFError, 'Could not retrieve page count from Extracted PDF object.' - end + # Name of the file. + # @return [String] + attr_reader :filename - # Writes the contents of the current PDF object to a file. - # @param output_path [String] Path to write to. - # @param override [bool] Whether to override the destination file. - def write_to_file(output_path, override: false) - raise Errors::MindeePDFError, 'Provided path is not a file' if File.directory?(output_path) - raise Errors::MindeePDFError, 'Invalid save path provided' unless File.exist?( - File.expand_path('..', output_path) - ) && !override - - if File.extname(output_path).downcase == 'pdf' - base_path = File.expand_path('..', output_path) - output_path = File.expand_path("#{File.basename(output_path)}.pdf", base_path) - end - - File.write(output_path, @pdf_bytes) - end + # @param pdf_bytes [StringIO] + # @param filename [String] + def initialize(pdf_bytes, filename) + @pdf_bytes = pdf_bytes + @filename = filename + end + + # Retrieves the page count for a given pdf. + # @return [Integer] + def page_count + current_pdf = Mindee::PDF::PDFProcessor.open_pdf(pdf_bytes) + current_pdf.pages.size + rescue TypeError, Origami::InvalidPDFError + raise Error::MindeePDFError, 'Could not retrieve page count from Extracted PDF object.' 
+ end - # Returns the current PDF object as a usable BytesInputSource. - # @return [Mindee::Input::Source::BytesInputSource] - def as_input_source - raise Errors::MindeePDFError, 'Bytes object is nil.' if @pdf_bytes.nil? + # Writes the contents of the current PDF object to a file. + # @param output_path [String] Path to write to. + # @param override [bool] Whether to override the destination file. + def write_to_file(output_path, override: false) + raise Error::MindeePDFError, 'Provided path is not a file' if File.directory?(output_path) + raise Error::MindeePDFError, 'Invalid save path provided' unless File.exist?( + File.expand_path('..', output_path) + ) && !override - Mindee::Input::Source::BytesInputSource.new(@pdf_bytes.read || '', @filename) + if File.extname(output_path).downcase == 'pdf' + base_path = File.expand_path('..', output_path) + output_path = File.expand_path("#{File.basename(output_path)}.pdf", base_path) end + + @pdf_bytes.rewind if @pdf_bytes.respond_to?(:rewind) + File.binwrite(output_path, @pdf_bytes.read.to_s) + @pdf_bytes.rewind if @pdf_bytes.respond_to?(:rewind) + end + + # Returns the current PDF object as a usable BytesInputSource. + # @return [Mindee::Input::Source::BytesInputSource] + def as_input_source + raise Error::MindeePDFError, 'Bytes object is nil.' if @pdf_bytes.nil? + + @pdf_bytes.rewind if @pdf_bytes.respond_to?(:rewind) + data = @pdf_bytes.read || '' + @pdf_bytes.rewind if @pdf_bytes.respond_to?(:rewind) + + Mindee::Input::Source::BytesInputSource.new(data, @filename) end end end diff --git a/lib/mindee/pdf/pdf_extractor.rb b/lib/mindee/pdf/pdf_extractor.rb index 45be549d8..c2b0df615 100644 --- a/lib/mindee/pdf/pdf_extractor.rb +++ b/lib/mindee/pdf/pdf_extractor.rb @@ -3,118 +3,115 @@ module Mindee # Pdf Extraction Module. module PDF - # Pdf Extraction class. - module PDFExtractor - # Pdf extraction class. 
- class PDFExtractor - # @param local_input [Mindee::Input::Source::LocalInputSource] - def initialize(local_input) - @filename = local_input.filename - if local_input.pdf? - @source_pdf = local_input.io_stream - else - pdf_image = Image::ImageExtractor.attach_image_as_new_file(local_input.io_stream) - io_buffer = StringIO.new - pdf_image.save(io_buffer) + # Pdf extraction class. + class PDFExtractor + # @param local_input [Mindee::Input::Source::LocalInputSource] + def initialize(local_input) + @filename = local_input.filename + if local_input.pdf? + @source_pdf = local_input.io_stream + else + pdf_image = Image::ImageExtractor.attach_image_as_new_file(local_input.io_stream) + io_buffer = StringIO.new + pdf_image.save(io_buffer) - @source_pdf = io_buffer - end + @source_pdf = io_buffer end + end - # Retrieves the page count for the Pdf object. - # @return [Integer] - def page_count - Mindee::PDF::PDFProcessor.open_pdf(@source_pdf).pages.size - end + # Retrieves the page count for the Pdf object. + # @return [Integer] + def page_count + Mindee::PDF::PDFProcessor.open_pdf(@source_pdf).pages.size + end - # Creates a new Pdf from pages and save it into a buffer. - # @param page_indexes [Array] List of page number to use for merging in the original Pdf. - # @return [StringIO] The buffer containing the new Pdf. - def cut_pages(page_indexes) - options = PageOptions.new(params: { - page_indexes: page_indexes, - }) + # Creates a new Pdf from pages and save it into a buffer. + # @param page_indexes [Array] List of page number to use for merging in the original Pdf. + # @return [StringIO] The buffer containing the new Pdf. + def cut_pages(page_indexes) + options = PageOptions.new(params: { + page_indexes: page_indexes, + }) - Mindee::PDF::PDFProcessor.parse(@source_pdf, options) - end + Mindee::PDF::PDFProcessor.parse(@source_pdf, options) + end - # Extract the sub-documents from the main pdf, based on the given list of page indexes. 
- # @param page_indexes [Array>] List of page number to use for merging in the original Pdf. - # @return [Array] The buffer containing the new Pdf. - def extract_sub_documents(page_indexes) - extracted_pdfs = [] - extension = File.extname(@filename) - basename = File.basename(@filename, extension) - page_indexes.each do |page_index_list| - if page_index_list.nil? || page_index_list.empty? - raise Errors::MindeePDFError, "Empty indexes aren't allowed for extraction #{page_index_list}" - end + # Extract the sub-documents from the main pdf, based on the given list of page indexes. + # @param page_indexes [Array>] List of page number to use for merging in the original Pdf. + # @return [Array] The buffer containing the new Pdf. + def extract_sub_documents(page_indexes) + extracted_pdfs = [] + extension = File.extname(@filename) + basename = File.basename(@filename, extension) + page_indexes.each do |page_index_list| + if page_index_list.nil? || page_index_list.empty? + raise Error::MindeePDFError, "Empty indexes aren't allowed for extraction #{page_index_list}" + end - page_index_list.each do |page_index| - if (page_index > page_count) || page_index.negative? - raise Errors::MindeePDFError, - "Index #{page_index} is out of range." - end + page_index_list.each do |page_index| + if (page_index > page_count) || page_index.negative? + raise Error::MindeePDFError, + "Index #{page_index} is out of range." 
end - formatted_max_index = format('%03d', page_index_list[page_index_list.length - 1] + 1).to_s - field_filename = "#{basename}_#{format('%03d', - page_index_list[0] + 1)}-#{formatted_max_index}#{extension}" - extracted_pdf = Mindee::PDF::PDFExtractor::ExtractedPDF.new(cut_pages(page_index_list), - field_filename) - extracted_pdfs << extracted_pdf end - extracted_pdfs + formatted_max_index = format('%03d', page_index_list[page_index_list.length - 1] + 1).to_s + field_filename = "#{basename}_#{format('%03d', + page_index_list[0] + 1)}-#{formatted_max_index}#{extension}" + extracted_pdf = Mindee::PDF::ExtractedPDF.new(cut_pages(page_index_list), + field_filename) + extracted_pdfs << extracted_pdf end + extracted_pdfs + end - # rubocop:disable Metrics/CyclomaticComplexity - # rubocop:disable Metrics/PerceivedComplexity + # rubocop:disable Metrics/CyclomaticComplexity + # rubocop:disable Metrics/PerceivedComplexity - # Extracts invoices as complete PDFs from the document. - # @param page_indexes [Array, InvoiceSplitterV1InvoicePageGroup>] - # @param strict [bool] - # @return [Array] - def extract_invoices(page_indexes, strict: false) - raise Errors::MindeePDFError, 'No indexes provided.' if page_indexes.empty? + # Extracts invoices as complete PDFs from the document. + # @param page_indexes [Array, InvoiceSplitterV1InvoicePageGroup>] + # @param strict [bool] + # @return [Array] + def extract_invoices(page_indexes, strict: false) + raise Error::MindeePDFError, 'No indexes provided.' if page_indexes.empty? - if page_indexes[0].is_a?(Array) && page_indexes[0].all? { |i| i.is_a?(Integer) } - page_indexes_as_array = page_indexes # @type var page_indexes : Array[Array[Integer]] - return extract_sub_documents(page_indexes_as_array) - end - p_ids = page_indexes # @type var page_indexes: Product::InvoiceSplitter::InvoiceSplitterV1InvoicePageGroups - return extract_sub_documents(p_ids.map(&:page_indexes)) unless strict + if page_indexes[0].is_a?(Array) && page_indexes[0].all? 
{ |i| i.is_a?(Integer) } + page_indexes_as_array = page_indexes # @type var page_indexes : Array[Array[Integer]] + return extract_sub_documents(page_indexes_as_array) + end + p_ids = page_indexes # @type var page_indexes: Product::InvoiceSplitter::InvoiceSplitterV1InvoicePageGroups + return extract_sub_documents(p_ids.map(&:page_indexes)) unless strict - correct_page_indexes = [] - current_list = [] - previous_confidence = nil - p_ids.each_with_index do |p_i, i| - page_index = p_i # @type var page_index: Product::InvoiceSplitter::InvoiceSplitterV1InvoicePageGroup - confidence = page_index.confidence.to_f - page_list = page_index.page_indexes + correct_page_indexes = [] + current_list = [] + previous_confidence = nil + p_ids.each_with_index do |p_i, i| + page_index = p_i # @type var page_index: Product::InvoiceSplitter::InvoiceSplitterV1InvoicePageGroup + confidence = page_index.confidence.to_f + page_list = page_index.page_indexes - if confidence >= 0.5 && previous_confidence.nil? - current_list = page_list - elsif confidence >= 0.5 && i < p_ids.length - 1 - correct_page_indexes << current_list - current_list = page_list - elsif confidence < 0.5 && i == p_ids.length - 1 - current_list.concat page_list - correct_page_indexes << current_list - else - correct_page_indexes << current_list - correct_page_indexes << page_list - end - previous_confidence = confidence + if confidence >= 0.5 && previous_confidence.nil? 
+ current_list = page_list + elsif confidence >= 0.5 && i < p_ids.length - 1 + correct_page_indexes << current_list + current_list = page_list + elsif confidence < 0.5 && i == p_ids.length - 1 + current_list.concat page_list + correct_page_indexes << current_list + else + correct_page_indexes << current_list + correct_page_indexes << page_list end - extract_sub_documents(correct_page_indexes) + previous_confidence = confidence end + extract_sub_documents(correct_page_indexes) + end - # rubocop:enable Metrics/CyclomaticComplexity - # rubocop:enable Metrics/PerceivedComplexity + # rubocop:enable Metrics/CyclomaticComplexity + # rubocop:enable Metrics/PerceivedComplexity - private + private - attr_reader :source_pdf, :filename - end + attr_reader :source_pdf, :filename end end end diff --git a/lib/mindee/v1/client.rb b/lib/mindee/v1/client.rb index 3e781bd4b..96319a263 100644 --- a/lib/mindee/v1/client.rb +++ b/lib/mindee/v1/client.rb @@ -251,14 +251,14 @@ def parse_queued(job_id, product_class, endpoint: nil) def enqueue_and_parse(input_source, product_class, endpoint, options) validate_async_params(options.initial_delay_sec, options.delay_sec, options.max_retries) enqueue_res = enqueue(input_source, product_class, endpoint: endpoint, options: options) - job = enqueue_res.job or raise Errors::MindeeAPIError, 'Expected job to be present' + job = enqueue_res.job or raise Error::MindeeAPIError, 'Expected job to be present' job_id = job.id sleep(options.initial_delay_sec) polling_attempts = 1 logger.debug("Successfully enqueued document with job id: '#{job_id}'") queue_res = parse_queued(job_id, product_class, endpoint: endpoint) - queue_res_job = queue_res.job or raise Errors::MindeeAPIError, 'Expected job to be present' + queue_res_job = queue_res.job or raise Error::MindeeAPIError, 'Expected job to be present' valid_statuses = [ Mindee::V1::Parsing::Common::JobStatus::WAITING, Mindee::V1::Parsing::Common::JobStatus::PROCESSING, @@ -268,13 +268,13 @@ def 
enqueue_and_parse(input_source, product_class, endpoint, options) logger.debug("Polling server for parsing result with job id: '#{job_id}'. Attempt #{polling_attempts}") sleep(options.delay_sec) queue_res = parse_queued(job_id, product_class, endpoint: endpoint) - queue_res_job = queue_res.job or raise Errors::MindeeAPIError, 'Expected job to be present' + queue_res_job = queue_res.job or raise Error::MindeeAPIError, 'Expected job to be present' polling_attempts += 1 end if queue_res_job.status != Mindee::V1::Parsing::Common::JobStatus::COMPLETED elapsed = options.initial_delay_sec + (polling_attempts * options.delay_sec.to_f) - raise Errors::MindeeAPIError, + raise Error::MindeeAPIError, "Asynchronous parsing request timed out after #{elapsed} seconds (#{polling_attempts} tries)" end @@ -325,14 +325,14 @@ def execute_workflow(input_source, workflow_id, options: {}) # @param local_response [Mindee::Input::LocalResponse] # @return [Mindee::V1::Parsing::Common::ApiResponse] def load_prediction(product_class, local_response) - raise Errors::MindeeAPIError, 'Expected LocalResponse to not be nil.' if local_response.nil? + raise Error::MindeeAPIError, 'Expected LocalResponse to not be nil.' if local_response.nil? response_hash = local_response.as_hash || {} - raise Errors::MindeeAPIError, 'Expected LocalResponse#as_hash to return a hash.' if response_hash.nil? + raise Error::MindeeAPIError, 'Expected LocalResponse#as_hash to return a hash.' if response_hash.nil? Mindee::V1::Parsing::Common::ApiResponse.new(product_class, response_hash, response_hash.to_json) - rescue KeyError, Errors::MindeeAPIError - raise Errors::MindeeInputError, 'No prediction found in local response.' + rescue KeyError, Error::MindeeAPIError + raise Error::MindeeInputError, 'No prediction found in local response.' end # Load a document from an absolute path, as a string. 
@@ -430,7 +430,7 @@ def validate_async_params(initial_delay_sec, delay_sec, max_retries) # @return [Mindee::V1::HTTP::Endpoint] def initialize_endpoint(product_class, endpoint_name: '', account_name: '', version: '') if (endpoint_name.nil? || endpoint_name.empty?) && product_class == Mindee::V1::Product::Universal::Universal - raise Mindee::Errors::MindeeConfigurationError, 'Missing argument endpoint_name when using custom class' + raise Mindee::Error::MindeeConfigurationError, 'Missing argument endpoint_name when using custom class' end endpoint_name = fix_endpoint_name(product_class, endpoint_name) diff --git a/lib/mindee/v1/extraction/multi_receipts_extractor.rb b/lib/mindee/v1/extraction/multi_receipts_extractor.rb index da5b9b2a2..0b87f2724 100644 --- a/lib/mindee/v1/extraction/multi_receipts_extractor.rb +++ b/lib/mindee/v1/extraction/multi_receipts_extractor.rb @@ -13,7 +13,7 @@ module Extraction def self.extract_receipts(input_source, inference) images = [] unless inference.prediction.receipts - raise Errors::MindeeInputError, + raise Error::MindeeInputError, 'No possible receipts candidates found for Multi-Receipts extraction.' end diff --git a/lib/mindee/v1/http/endpoint.rb b/lib/mindee/v1/http/endpoint.rb index c33df9372..9465e93c7 100644 --- a/lib/mindee/v1/http/endpoint.rb +++ b/lib/mindee/v1/http/endpoint.rb @@ -69,7 +69,7 @@ def predict(input_source, opts) Mindee::HTTP::ResponseValidation.clean_request!(response) end - raise Errors::MindeeError, 'Could not resolve server response.' if response.nil? + raise Error::MindeeError, 'Could not resolve server response.' if response.nil? error = Mindee::HTTP::ErrorHandler.handle_error(@url_name, response) raise error @@ -89,7 +89,7 @@ def predict_async(input_source, opts) Mindee::HTTP::ResponseValidation.clean_request!(response) end - raise Errors::MindeeError, 'Could not resolve server response.' if response.nil? + raise Error::MindeeError, 'Could not resolve server response.' if response.nil? 
raise Mindee::HTTP::ErrorHandler.handle_error(@url_name, response) end @@ -139,7 +139,7 @@ def predict_req_post(input_source, opts) Net::HTTP.start(uri.hostname, uri.port, use_ssl: true, read_timeout: @request_timeout) do |http| return http.request(req) end - raise Mindee::Errors::MindeeError, 'Could not resolve server response.' + raise Mindee::Error::MindeeError, 'Could not resolve server response.' end # @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::URLInputSource] @@ -176,7 +176,7 @@ def document_queue_req_post(input_source, opts) Net::HTTP.start(uri.hostname, uri.port, use_ssl: true, read_timeout: @request_timeout) do |http| return http.request(req) end - raise Mindee::Errors::MindeeError, 'Could not resolve server response.' + raise Mindee::Error::MindeeError, 'Could not resolve server response.' end # @param job_id [String] @@ -195,7 +195,7 @@ def document_queue_req_get(job_id) http.request(req) end - raise Errors::MindeeError, 'Could not resolve server response.' if response.nil? + raise Error::MindeeError, 'Could not resolve server response.' if response.nil? if response.code.to_i > 299 && response.code.to_i < 400 req = Net::HTTP::Get.new(response['location'], headers) @@ -210,7 +210,7 @@ def document_queue_req_get(job_id) def check_api_key return unless @api_key.nil? || @api_key.empty? - raise Errors::MindeeAPIError, + raise Error::MindeeAPIError, "Missing API key for product \"'#{@url_name}' v#{@version}\" (belonging to \"#{@owner}\"), " \ "check your Client Configuration.\nYou can set this using the " \ "'#{HTTP::API_KEY_ENV_NAME}' environment variable." 
diff --git a/lib/mindee/v1/http/workflow_endpoint.rb b/lib/mindee/v1/http/workflow_endpoint.rb index 03df634e5..46d55cf65 100644 --- a/lib/mindee/v1/http/workflow_endpoint.rb +++ b/lib/mindee/v1/http/workflow_endpoint.rb @@ -31,7 +31,7 @@ def execute_workflow(input_source, opts) check_api_key response = workflow_execution_req_post(input_source, opts) if response.nil? - raise Mindee::Errors::MindeeHTTPError.new( + raise Mindee::Error::MindeeHTTPError.new( { code: 0, details: 'Server response was nil.', message: 'Unknown error.' }, @url, 0 ) end @@ -83,9 +83,9 @@ def workflow_execution_req_post(input_source, opts) def check_api_key return unless @api_key.nil? || @api_key.empty? - raise Errors::MindeeConfigurationError, "Missing API key. Check your Client Configuration.\n" \ - "You can set this using the '#{HTTP::API_KEY_ENV_NAME}'" \ - 'environment variable.' + raise Error::MindeeConfigurationError, "Missing API key. Check your Client Configuration.\n" \ + "You can set this using the '#{HTTP::API_KEY_ENV_NAME}'" \ + 'environment variable.' end end end diff --git a/lib/mindee/v1/parsing/common/api_response.rb b/lib/mindee/v1/parsing/common/api_response.rb index 552fa4d2a..e1b166589 100644 --- a/lib/mindee/v1/parsing/common/api_response.rb +++ b/lib/mindee/v1/parsing/common/api_response.rb @@ -44,7 +44,7 @@ class ApiResponse def initialize(product_class, http_response, raw_http) logger.debug('Handling API response') @raw_http = raw_http - raise Errors::MindeeAPIError, 'Invalid response format.' unless http_response.key?('api_request') + raise Error::MindeeAPIError, 'Invalid response format.' 
unless http_response.key?('api_request') @api_request = Mindee::V1::Parsing::Common::ApiRequest.new(http_response['api_request']) diff --git a/lib/mindee/v2.rb b/lib/mindee/v2.rb index 408dcec68..bdb3fcc3f 100644 --- a/lib/mindee/v2.rb +++ b/lib/mindee/v2.rb @@ -2,5 +2,6 @@ require_relative 'v2/client' require_relative 'v2/http' +require_relative 'v2/file_operation' require_relative 'v2/parsing' require_relative 'v2/product' diff --git a/lib/mindee/v2/client.rb b/lib/mindee/v2/client.rb index 061738236..12d6d2d6f 100644 --- a/lib/mindee/v2/client.rb +++ b/lib/mindee/v2/client.rb @@ -69,7 +69,7 @@ def enqueue_and_get_result( if enqueue_response.job.id.nil? || enqueue_response.job.id.empty? logger.error("Failed enqueueing:\n#{enqueue_response.raw_http}") - raise Mindee::Errors::MindeeError, 'Enqueueing of the document failed.' + raise Mindee::Error::MindeeError, 'Enqueueing of the document failed.' end job_id = enqueue_response.job.id @@ -99,13 +99,13 @@ def enqueue_and_get_result( error = poll_results.job.error unless error.nil? - err_to_raise = Mindee::Errors::MindeeHTTPErrorV2.new(error) + err_to_raise = Mindee::Error::MindeeHTTPErrorV2.new(error) # NOTE: purposefully decoupled from the line above, otherwise rubocop thinks `error` is a `message` param. 
raise err_to_raise end sec_count = normalized_params.polling_options.delay_sec * retry_counter - raise Mindee::Errors::MindeeError, + raise Mindee::Error::MindeeError, "Asynchronous parsing request timed out after #{sec_count} seconds" end diff --git a/lib/mindee/v2/file_operation.rb b/lib/mindee/v2/file_operation.rb new file mode 100644 index 000000000..60547647d --- /dev/null +++ b/lib/mindee/v2/file_operation.rb @@ -0,0 +1,6 @@ +# frozen_string_literal: true + +require_relative 'file_operation/crop' +require_relative 'file_operation/crop_files' +require_relative 'file_operation/split' +require_relative 'file_operation/split_files' diff --git a/lib/mindee/v2/file_operation/crop.rb b/lib/mindee/v2/file_operation/crop.rb new file mode 100644 index 000000000..63c6bf0ca --- /dev/null +++ b/lib/mindee/v2/file_operation/crop.rb @@ -0,0 +1,51 @@ +# frozen_string_literal: true + +module Mindee + module V2 + module FileOperation + # Crop operations. + module Crop + # Extracts a single crop as complete PDFs from the document. + # + # @param input_source [LocalInputSource] Local Input Source to extract sub-receipts from. + # @param crop [FieldLocation] Crop to extract. + # @return [ExtractedImage] + def self.extract_single_crop(input_source, crop) + polygons = [crop.polygon] + Mindee::Image::ImageExtractor.extract_multiple_images_from_source( + input_source, crop.page, polygons + ).first + end + + # Extracts individual receipts from multi-receipts documents. + # + # @param input_source [LocalInputSource] Local Input Source to extract sub-receipts from. + # @param crops [Array] List of crops. + # @return [CropFiles] Individual extracted receipts as an array of ExtractedImage. + # @raise [MindeeError] if the crops array is empty. + def self.extract_crops(input_source, crops) + if crops.nil? || crops.empty? + raise Mindee::Error::MindeeError, + 'No possible candidates found for Crop extraction.' 
+ end + + polygons = Array.new(input_source.page_count) { [] } + + crops.each do |crop| + polygons[crop.location.page] << crop.location.polygon + end + + images = [] + polygons.each_with_index do |page_polygons, page_index| + extracted = Mindee::Image::ImageExtractor.extract_multiple_images_from_source( + input_source, page_index, page_polygons + ) + images.concat(extracted) + end + + CropFiles.new(images) + end + end + end + end +end diff --git a/lib/mindee/v2/file_operation/crop_files.rb b/lib/mindee/v2/file_operation/crop_files.rb new file mode 100644 index 000000000..c57db2463 --- /dev/null +++ b/lib/mindee/v2/file_operation/crop_files.rb @@ -0,0 +1,25 @@ +# frozen_string_literal: true + +module Mindee + module V2 + module FileOperation + # Collection of cropped files. + class CropFiles < Array + # Save all extracted crops to disk. + # + # @param path [String, Pathname] Path to save the extracted crops to. + # @param prefix [String] Prefix to add to the filename, defaults to 'crop'. + # @param file_format [String, nil] File format to save the crops as, defaults to jpg if nil.] + def save_all_to_disk(path, prefix: 'crop', file_format: nil) + FileUtils.mkdir_p(path) + each.with_index(1) do |crop, idx| + filename = "#{prefix}_#{format('%03d', idx)}.jpg" + file_path = File.join(path.to_s, filename) + + crop.write_to_file(file_path, file_format) + end + end + end + end + end +end diff --git a/lib/mindee/v2/file_operation/split.rb b/lib/mindee/v2/file_operation/split.rb new file mode 100644 index 000000000..60dea7fcf --- /dev/null +++ b/lib/mindee/v2/file_operation/split.rb @@ -0,0 +1,37 @@ +# frozen_string_literal: true + +module Mindee + module V2 + module FileOperation + # Split operations. + module Split + # Extracts a single split as a complete PDF from the document. + # + # @param input_source [LocalInputSource] Input source to split. + # @param split [Array] List of pages to keep. 
+ # @return [ExtractedPDF] Extracted PDF + def self.extract_single_split(input_source, split) + extract_splits(input_source, [split]).first + end + + # Extracts splits as complete PDFs from the document. + # + # @param input_source [LocalInputSource] Input source to split. + # @param splits [Array>] List of sub-lists of pages to keep. + # @return [SplitFiles] A list of extracted invoices. + # @raise [MindeeError] if no indexes are provided. + def self.extract_splits(input_source, splits) + raise Mindee::Error::MindeeError, 'No indexes provided.' if splits.nil? || splits.empty? + + pdf_extractor = Mindee::PDF::PDFExtractor.new(input_source) + + page_groups = splits.map do |split| + (split[0]..split[1]).to_a + end + + SplitFiles.new(pdf_extractor.extract_sub_documents(page_groups)) + end + end + end + end +end diff --git a/lib/mindee/v2/file_operation/split_files.rb b/lib/mindee/v2/file_operation/split_files.rb new file mode 100644 index 000000000..d75b3f46d --- /dev/null +++ b/lib/mindee/v2/file_operation/split_files.rb @@ -0,0 +1,25 @@ +# frozen_string_literal: true + +module Mindee + module V2 + module FileOperation + # Collection of split files. + class SplitFiles < Array + # Save all extracted splits to disk. + # + # @param path [String, Pathname] Path to save the extracted splits to. + # @param prefix [String] Prefix to add to the filename, defaults to 'split'. + def save_all_to_disk(path, prefix: 'split') + FileUtils.mkdir_p(path) + + each.with_index(1) do |split, idx| + filename = "#{prefix}_#{format('%03d', idx)}.pdf" + file_path = File.join(path.to_s, filename) + + split.write_to_file(file_path) + end + end + end + end + end +end diff --git a/lib/mindee/v2/http/api_settings.rb b/lib/mindee/v2/http/api_settings.rb index 0d04bca83..d689fd7c3 100644 --- a/lib/mindee/v2/http/api_settings.rb +++ b/lib/mindee/v2/http/api_settings.rb @@ -51,11 +51,11 @@ def initialize(api_key: nil) # Checks API key for a value. 
# @return - # @raise [Errors::MindeeAPIError] Raises if the api key is empty or nil. + # @raise [Error::MindeeAPIError] Raises if the api key is empty or nil. def check_api_key return unless @api_key.nil? || @api_key.to_s.empty? - raise Errors::MindeeAPIError, + raise Error::MindeeAPIError, "Missing API key. check your Client Configuration.\nYou can set this using the " \ "'#{MINDEE_V2_API_KEY_ENV_NAME}' environment variable." end diff --git a/lib/mindee/v2/http/mindee_api.rb b/lib/mindee/v2/http/mindee_api.rb index 97e7e7379..a82c76004 100644 --- a/lib/mindee/v2/http/mindee_api.rb +++ b/lib/mindee/v2/http/mindee_api.rb @@ -2,7 +2,7 @@ require_relative '../../input' require_relative '../../http' -require_relative '../../errors' +require_relative '../../error' module Mindee module V2 @@ -22,7 +22,7 @@ def initialize(api_key: nil) # @param input_source [Input::Source::LocalInputSource, Input::Source::URLInputSource] # @param params [Input::BaseParameters] # @return [Mindee::V2::Parsing::JobResponse] - # @raise [Mindee::Errors::MindeeHttpErrorV2] + # @raise [Mindee::Error::MindeeHttpErrorV2] def req_post_enqueue(input_source, params) @settings.check_api_key response = enqueue( @@ -91,14 +91,14 @@ def req_get_search_models(model_name, model_type) Net::HTTP.start(uri.hostname, uri.port, use_ssl: true, read_timeout: @settings.request_timeout) do |http| return http.request(req) end - raise Mindee::Errors::MindeeError, 'Could not resolve server response.' + raise Mindee::Error::MindeeError, 'Could not resolve server response.' end # @param resource [String] Resource to check. # @return [Boolean] def uri?(resource) uri = URI.parse(resource) - throw Mindee::Errors::MindeeError, 'HTTP is not supported.' if uri.scheme == 'http' + throw Mindee::Error::MindeeError, 'HTTP is not supported.' 
if uri.scheme == 'http' uri.scheme == 'https' rescue URI::BadURIError, URI::InvalidURIError false @@ -160,7 +160,7 @@ def poll(url) Net::HTTP.start(uri.hostname, uri.port, use_ssl: true, read_timeout: @settings.request_timeout) do |http| return http.request(req) end - raise Mindee::Errors::MindeeError, 'Could not resolve server response.' + raise Mindee::Error::MindeeError, 'Could not resolve server response.' end # Polls the API for the result of an inference. @@ -222,7 +222,7 @@ def enqueue(input_source, params) Net::HTTP.start(uri.hostname, uri.port, use_ssl: true, read_timeout: @settings.request_timeout) do |http| return http.request(req) end - raise Mindee::Errors::MindeeError, 'Could not resolve server response.' + raise Mindee::Error::MindeeError, 'Could not resolve server response.' end end end diff --git a/lib/mindee/v2/parsing/error_response.rb b/lib/mindee/v2/parsing/error_response.rb index cec156101..a5b5ad26f 100644 --- a/lib/mindee/v2/parsing/error_response.rb +++ b/lib/mindee/v2/parsing/error_response.rb @@ -22,8 +22,8 @@ def initialize(server_response) @detail = server_response['detail'] @title = server_response['title'] @code = server_response['code'] - @errors = if server_response.key?('errors') - server_response['errors'].map do |error| + @errors = if server_response.key?('error') + server_response['error'].map do |error| ErrorItem.new(error) end else diff --git a/lib/mindee/v2/parsing/field/base_field.rb b/lib/mindee/v2/parsing/field/base_field.rb index 89913d73e..53482da94 100644 --- a/lib/mindee/v2/parsing/field/base_field.rb +++ b/lib/mindee/v2/parsing/field/base_field.rb @@ -54,7 +54,7 @@ def self.create_field(raw_prediction, indent_level = 0) return SimpleField.new(raw_prediction, indent_level) end - raise Errors::MindeeError, "Unrecognized field format in #{raw_prediction.to_json}" + raise Error::MindeeError, "Unrecognized field format in #{raw_prediction.to_json}" end end end diff --git a/lib/mindee/v2/parsing/field/list_field.rb 
b/lib/mindee/v2/parsing/field/list_field.rb index ab8c36a1c..7d17a25e7 100644 --- a/lib/mindee/v2/parsing/field/list_field.rb +++ b/lib/mindee/v2/parsing/field/list_field.rb @@ -18,7 +18,7 @@ def initialize(server_response, indent_level = 0) super unless server_response.key?('items') && server_response['items'].is_a?(Array) - raise Errors::MindeeError, + raise Error::MindeeError, "Expected \"items\" to be an array in #{server_response.to_json}." end diff --git a/lib/mindee/v2/product/crop/crop_item.rb b/lib/mindee/v2/product/crop/crop_item.rb index 592fc999d..74f024d4c 100644 --- a/lib/mindee/v2/product/crop/crop_item.rb +++ b/lib/mindee/v2/product/crop/crop_item.rb @@ -22,6 +22,16 @@ def initialize(server_response) def to_s "* :Location: #{location}\n :Object Type: #{object_type}" end + + # Extracts this crop item from the input file as a single image. + # + # @param input_source [Mindee::Input::Source::LocalInputSource] Local file to extract from + # @return [ExtractedImage] + def extract_from_file(input_source) + Image::ImageExtractor.extract_multiple_images_from_source( + input_source, @location.page, [@location.polygon] + )[0] + end end end end diff --git a/lib/mindee/v2/product/crop/crop_response.rb b/lib/mindee/v2/product/crop/crop_response.rb index 90ada7242..378cb05e1 100644 --- a/lib/mindee/v2/product/crop/crop_response.rb +++ b/lib/mindee/v2/product/crop/crop_response.rb @@ -25,6 +25,17 @@ def initialize(server_response) def to_s @inference.to_s end + + # Apply the crop inference to a file and return a list of extracted images.
+ # + # @param input_source [Mindee::Input::Source::LocalInputSource] Local file to extract from + # @return [FileOperation::CropFiles] List of extracted images + def extract_from_file(input_source) + crop_files = @inference.result.crops.map do |crop| + crop.extract_from_file(input_source) + end + FileOperation::CropFiles.new(crop_files) + end end end end diff --git a/lib/mindee/v2/product/extraction/params/data_schema_replace.rb b/lib/mindee/v2/product/extraction/params/data_schema_replace.rb index 376be497d..8ad45ce79 100644 --- a/lib/mindee/v2/product/extraction/params/data_schema_replace.rb +++ b/lib/mindee/v2/product/extraction/params/data_schema_replace.rb @@ -16,7 +16,7 @@ class DataSchemaReplace def initialize(data_schema_replace) data_schema_replace.transform_keys!(&:to_sym) fields_list = data_schema_replace[:fields] - raise Mindee::Errors::MindeeError, 'Invalid Data Schema provided.' if fields_list.nil? + raise Mindee::Error::MindeeError, 'Invalid Data Schema provided.' if fields_list.nil? raise TypeError, 'Data Schema replacement fields cannot be empty.' if fields_list.empty? @fields = fields_list.map { |field| DataSchemaField.new(field) } diff --git a/lib/mindee/v2/product/split/split_range.rb b/lib/mindee/v2/product/split/split_range.rb index 6eaabd12c..450b0f123 100644 --- a/lib/mindee/v2/product/split/split_range.rb +++ b/lib/mindee/v2/product/split/split_range.rb @@ -23,6 +23,14 @@ def initialize(server_response) def to_s "* :Page Range: #{@page_range}\n :Document Type: #{@document_type}" end + + # Apply the split range inference to a file and return a single extracted PDF.
+ # + # @param input_source [Mindee::Input::Source::LocalInputSource] Local file to extract from + # @return [PDF::ExtractedPDF] + def extract_from_file(input_source) + FileOperation::Split.extract_single_split(input_source, @page_range) + end end end end diff --git a/lib/mindee/v2/product/split/split_response.rb b/lib/mindee/v2/product/split/split_response.rb index 9fdafae18..8e5817ccd 100644 --- a/lib/mindee/v2/product/split/split_response.rb +++ b/lib/mindee/v2/product/split/split_response.rb @@ -25,6 +25,16 @@ def initialize(server_response) def to_s @inference.to_s end + + # Extracts the splits from the input source. + # @param input_source [Mindee::Input::Source::LocalInputSource] Local file to extract from. + # @return [FileOperation::SplitFiles] + def extract_from_file(input_source) + crop_files = @inference.result.splits.map do |crop| + crop.extract_from_file(input_source) + end + FileOperation::SplitFiles.new(crop_files) + end end end end diff --git a/sig/mindee.rbs b/sig/mindee.rbs index c33abc741..bdf2c27fb 100644 --- a/sig/mindee.rbs +++ b/sig/mindee.rbs @@ -5,7 +5,7 @@ class Object end module Mindee - module Errors + module Error end module Extraction diff --git a/sig/mindee/errors/mindee_error.rbs b/sig/mindee/error/mindee_error.rbs similarity index 81% rename from sig/mindee/errors/mindee_error.rbs rename to sig/mindee/error/mindee_error.rbs index 10dac7a6a..2c1838d01 100644 --- a/sig/mindee/errors/mindee_error.rbs +++ b/sig/mindee/error/mindee_error.rbs @@ -1,6 +1,6 @@ -# lib/mindee/errors/mindee_error.rb +# lib/mindee/error/mindee_error.rb module Mindee - module Errors + module Error class MindeeError < StandardError end class MindeeAPIError < MindeeError diff --git a/sig/mindee/errors/mindee_http_error.rbs b/sig/mindee/error/mindee_http_error.rbs similarity index 78% rename from sig/mindee/errors/mindee_http_error.rbs rename to sig/mindee/error/mindee_http_error.rbs index 2811fa700..56322508b 100644 --- a/sig/mindee/errors/mindee_http_error.rbs +++ b/sig/mindee/error/mindee_http_error.rbs @@ -1,7 +1,7 @@ -# lib/mindee/errors/mindee_http_error.rb +# lib/mindee/error/mindee_http_error.rb module Mindee - module Errors - class MindeeHTTPError < Errors::MindeeError + module Error + class MindeeHTTPError < Error::MindeeError attr_reader api_code: Integer attr_reader api_details: String attr_reader api_message: String diff --git a/sig/mindee/errors/mindee_http_error_v2.rbs b/sig/mindee/error/mindee_http_error_v2.rbs similarity index 86% rename from sig/mindee/errors/mindee_http_error_v2.rbs rename to sig/mindee/error/mindee_http_error_v2.rbs index 0c4381e1f..e50c86a4e 100644 ---
a/sig/mindee/errors/mindee_http_error.rbs +++ b/sig/mindee/error/mindee_http_error.rbs @@ -1,7 +1,7 @@ -# lib/mindee/errors/mindee_http_error.rb +# lib/mindee/error/mindee_http_error.rb module Mindee - module Errors - class MindeeHTTPError < Errors::MindeeError + module Error + class MindeeHTTPError < Error::MindeeError attr_reader api_code: Integer attr_reader api_details: String attr_reader api_message: String diff --git a/sig/mindee/errors/mindee_http_error_v2.rbs b/sig/mindee/error/mindee_http_error_v2.rbs similarity index 86% rename from sig/mindee/errors/mindee_http_error_v2.rbs rename to sig/mindee/error/mindee_http_error_v2.rbs index 0c4381e1f..e50c86a4e 100644 --- a/sig/mindee/errors/mindee_http_error_v2.rbs +++ b/sig/mindee/error/mindee_http_error_v2.rbs @@ -1,6 +1,6 @@ -# lib/mindee/errors/mindee_http_error_v2.rb +# lib/mindee/error/mindee_http_error_v2.rb module Mindee - module Errors + module Error # API V2 HttpError class MindeeHTTPErrorV2 < MindeeError attr_reader detail: String diff --git a/sig/mindee/errors/mindee_http_unknown_error_v2.rbs b/sig/mindee/error/mindee_http_unknown_error_v2.rbs similarity index 73% rename from sig/mindee/errors/mindee_http_unknown_error_v2.rbs rename to sig/mindee/error/mindee_http_unknown_error_v2.rbs index 459cef087..e81b1e33a 100644 --- a/sig/mindee/errors/mindee_http_unknown_error_v2.rbs +++ b/sig/mindee/error/mindee_http_unknown_error_v2.rbs @@ -1,6 +1,6 @@ -# lib/mindee/errors/mindee_http_unknown_error_v2.rb +# lib/mindee/error/mindee_http_unknown_error_v2.rb module Mindee - module Errors + module Error # Unknown HTTP error for the V2 API. 
class MindeeHTTPUnknownErrorV2 < MindeeHTTPErrorV2 def initialize: (Hash[String|Symbol, untyped]) -> void diff --git a/sig/mindee/errors/mindee_input_error.rbs b/sig/mindee/error/mindee_input_error.rbs similarity index 86% rename from sig/mindee/errors/mindee_input_error.rbs rename to sig/mindee/error/mindee_input_error.rbs index 94fd985f5..9a34092e5 100644 --- a/sig/mindee/errors/mindee_input_error.rbs +++ b/sig/mindee/error/mindee_input_error.rbs @@ -1,6 +1,6 @@ -# lib/mindee/errors/mindee_input_error.rb +# lib/mindee/error/mindee_input_error.rb module Mindee - module Errors + module Error class MindeeInputError < MindeeError end class MindeeSourceError < MindeeInputError diff --git a/sig/mindee/http/http_error_handler.rbs b/sig/mindee/http/http_error_handler.rbs index 1ab8b3697..e5b30622b 100644 --- a/sig/mindee/http/http_error_handler.rbs +++ b/sig/mindee/http/http_error_handler.rbs @@ -4,11 +4,11 @@ module Mindee module ErrorHandler def extract_error: (Hash[String | Symbol, untyped]) -> Hash[String | Symbol, untyped]? 
def create_error_obj: (Hash[String | Symbol, untyped]) -> Hash[String | Symbol, untyped] - def self.generate_v2_error: (Hash[String, Integer | String]) -> Errors::MindeeHTTPErrorV2 - def generate_v2_error: (Hash[Symbol, String | Integer]) -> Errors::MindeeHTTPErrorV2 + def self.generate_v2_error: (Hash[String, Integer | String]) -> Error::MindeeHTTPErrorV2 + def generate_v2_error: (Hash[Symbol, String | Integer]) -> Error::MindeeHTTPErrorV2 - def self.handle_error: (String, Net::HTTPResponse) -> Errors::MindeeHTTPError - def handle_error: (String, Net::HTTPResponse) -> Errors::MindeeHTTPError + def self.handle_error: (String, Net::HTTPResponse) -> Error::MindeeHTTPError + def handle_error: (String, Net::HTTPResponse) -> Error::MindeeHTTPError def handle_v2_error: (Hash[String | Symbol, untyped]) -> void end end diff --git a/sig/mindee/image/extracted_image.rbs b/sig/mindee/image/extracted_image.rbs index 8f7c3d872..f60abe127 100644 --- a/sig/mindee/image/extracted_image.rbs +++ b/sig/mindee/image/extracted_image.rbs @@ -6,10 +6,16 @@ module Mindee def page_id: -> Integer def element_id: -> Integer def buffer: -> StringIO - def internal_file_name: -> String - def initialize: (Input::Source::LocalInputSource, Integer, Integer) -> Integer + def filename: -> String + def initialize: ( + Input::Source::LocalInputSource | Input::Source::BytesInputSource, + Integer, + Integer?, + ?preserve_input_filename: bool + ) -> Integer def write_to_file: (String, ?String?) 
-> void def as_source: -> Input::Source::BytesInputSource + def as_input_source: -> Input::Source::BytesInputSource end end end diff --git a/sig/mindee/pdf/extracted_pdf.rbs b/sig/mindee/pdf/extracted_pdf.rbs index d2f2427aa..658400f98 100644 --- a/sig/mindee/pdf/extracted_pdf.rbs +++ b/sig/mindee/pdf/extracted_pdf.rbs @@ -1,15 +1,17 @@ # lib/mindee/pdf/extracted_pdf.rb module Mindee module PDF - module PDFExtractor - class ExtractedPDF - attr_reader pdf_bytes: StringIO - attr_reader filename: String - def initialize: (StringIO, String) -> void - def page_count: -> Integer - def write_to_file: (String, ?override: bool) -> Integer - def as_input_source: -> Input::Source::BytesInputSource - end + class ExtractedPDF + attr_reader pdf_bytes: StringIO + attr_reader filename: String + + def initialize: (StringIO, String) -> void + + def page_count: -> Integer + + def write_to_file: (String, ?override: bool) -> void + + def as_input_source: -> Input::Source::BytesInputSource end end end diff --git a/sig/mindee/pdf/pdf_extractor.rbs b/sig/mindee/pdf/pdf_extractor.rbs index c5b91a190..5007823ae 100644 --- a/sig/mindee/pdf/pdf_extractor.rbs +++ b/sig/mindee/pdf/pdf_extractor.rbs @@ -1,17 +1,19 @@ # lib/mindee/pdf/pdf_extractor.rb module Mindee module PDF - module PDFExtractor - class PDFExtractor - attr_reader filename: String - attr_reader source_pdf: StringIO + class PDFExtractor + attr_reader filename: String + attr_reader source_pdf: StringIO - def initialize: (Input::Source::LocalInputSource) -> void - def page_count: -> Integer - def cut_pages: (Array[Integer]) -> StringIO - def extract_sub_documents: (Array[Array[Integer]]) -> Array[ExtractedPDF] - def extract_invoices: (Array[V1::Product::InvoiceSplitter::InvoiceSplitterV1InvoicePageGroup] | Array [Array[Integer]], ?strict: bool) -> Array[ExtractedPDF] - end + def initialize: (Input::Source::LocalInputSource) -> void + + def page_count: -> Integer + + def cut_pages: (Array[Integer]) -> StringIO + + def 
extract_sub_documents: (Array[Array[Integer]]) -> Array[ExtractedPDF] + + def extract_invoices: (Array[V1::Product::InvoiceSplitter::InvoiceSplitterV1InvoicePageGroup] | Array[Array[Integer]], ?strict: bool) -> Array[ExtractedPDF] end end end diff --git a/sig/mindee/v2/file_operation/crop.rbs b/sig/mindee/v2/file_operation/crop.rbs new file mode 100644 index 000000000..f10349df0 --- /dev/null +++ b/sig/mindee/v2/file_operation/crop.rbs @@ -0,0 +1,10 @@ +module Mindee + module V2 + module FileOperation + module Crop + def self.extract_single_crop: (Input::Source::LocalInputSource, Parsing::Field::FieldLocation) -> Image::ExtractedImage + def self.extract_crops: (Input::Source::LocalInputSource, Array[Product::Crop::CropItem]) -> CropFiles + end + end + end +end diff --git a/sig/mindee/v2/file_operation/crop_files.rbs b/sig/mindee/v2/file_operation/crop_files.rbs new file mode 100644 index 000000000..7994d94c9 --- /dev/null +++ b/sig/mindee/v2/file_operation/crop_files.rbs @@ -0,0 +1,9 @@ +module Mindee + module V2 + module FileOperation + class CropFiles < Array[Image::ExtractedImage] + def save_all_to_disk: (String | Pathname, ?prefix: String, ?file_format: String?) 
-> void + end + end + end +end diff --git a/sig/mindee/v2/file_operation/split.rbs b/sig/mindee/v2/file_operation/split.rbs new file mode 100644 index 000000000..5424c987d --- /dev/null +++ b/sig/mindee/v2/file_operation/split.rbs @@ -0,0 +1,11 @@ +module Mindee + module V2 + module FileOperation + module Split + def self.extract_single_split: (Input::Source::LocalInputSource, Array[Integer]) -> PDF::ExtractedPDF + + def self.extract_splits: (Input::Source::LocalInputSource, Array[Array[Integer]]) -> SplitFiles + end + end + end +end diff --git a/sig/mindee/v2/file_operation/split_files.rbs b/sig/mindee/v2/file_operation/split_files.rbs new file mode 100644 index 000000000..38a864575 --- /dev/null +++ b/sig/mindee/v2/file_operation/split_files.rbs @@ -0,0 +1,9 @@ +module Mindee + module V2 + module FileOperation + class SplitFiles < Array[PDF::ExtractedPDF] + def save_all_to_disk: (String | Pathname, ?prefix: String?) -> void + end + end + end +end diff --git a/sig/mindee/v2/product/crop/crop_item.rbs b/sig/mindee/v2/product/crop/crop_item.rbs index f9594bd5e..b87efc6d4 100644 --- a/sig/mindee/v2/product/crop/crop_item.rbs +++ b/sig/mindee/v2/product/crop/crop_item.rbs @@ -7,6 +7,9 @@ module Mindee attr_reader location: Mindee::V2::Parsing::Field::FieldLocation def initialize: (Hash[String | Symbol, untyped]) -> void + + def extract_from_file: (Input::Source::LocalInputSource) -> Image::ExtractedImage + def to_s: -> String end end diff --git a/sig/mindee/v2/product/crop/crop_response.rbs b/sig/mindee/v2/product/crop/crop_response.rbs index 773045a92..7ebadb1a1 100644 --- a/sig/mindee/v2/product/crop/crop_response.rbs +++ b/sig/mindee/v2/product/crop/crop_response.rbs @@ -13,6 +13,8 @@ module Mindee def _params_type: -> singleton(Params::CropParameters) + def extract_from_file: (Input::Source::LocalInputSource) -> FileOperation::CropFiles + def to_s: -> String def self._params_type: () -> singleton(Params::CropParameters) def self.slug: () -> String diff --git 
a/sig/mindee/v2/product/split/split_range.rbs b/sig/mindee/v2/product/split/split_range.rbs index 14a390efc..7ec193c55 100644 --- a/sig/mindee/v2/product/split/split_range.rbs +++ b/sig/mindee/v2/product/split/split_range.rbs @@ -3,10 +3,13 @@ module Mindee module Product module Split class SplitRange - attr_reader page_range: Array[int] + attr_reader page_range: Array[Integer] attr_reader document_type: String def initialize: (Hash[String | Symbol, untyped]) -> void + + def extract_from_file: (Input::Source::LocalInputSource) -> PDF::ExtractedPDF + def to_s: -> String end end diff --git a/sig/mindee/v2/product/split/split_response.rbs b/sig/mindee/v2/product/split/split_response.rbs index 9a0232517..bd04c9696 100644 --- a/sig/mindee/v2/product/split/split_response.rbs +++ b/sig/mindee/v2/product/split/split_response.rbs @@ -13,6 +13,8 @@ module Mindee def _params_type: -> singleton(Params::SplitParameters) + def extract_from_file: (Mindee::Input::Source::LocalInputSource) -> FileOperation::SplitFiles + def to_s: -> String def self._params_type: () -> singleton(Params::SplitParameters) def self.slug: () -> String diff --git a/spec/data.rb b/spec/data.rb index 3550b5de0..d12a2abf6 100644 --- a/spec/data.rb +++ b/spec/data.rb @@ -1,6 +1,7 @@ # frozen_string_literal: true ROOT_DATA_DIR = File.join(__dir__, 'data').freeze +OUTPUT_DIR = File.join(ROOT_DATA_DIR, 'output').freeze FILE_TYPES_DIR = File.join(ROOT_DATA_DIR, 'file_types').freeze V1_DATA_DIR = File.join(ROOT_DATA_DIR, 'v1').freeze V2_DATA_DIR = File.join(ROOT_DATA_DIR, 'v2').freeze diff --git a/spec/image/extracted_image_spec.rb b/spec/image/extracted_image_spec.rb index 7462ddca1..1fbba1853 100644 --- a/spec/image/extracted_image_spec.rb +++ b/spec/image/extracted_image_spec.rb @@ -23,7 +23,7 @@ expect(extracted_image.page_id).to eq(page_id) expect(extracted_image.element_id).to eq(element_id) - expect(extracted_image.internal_file_name).to eq('default_sample_p1_42.jpg') + expect(extracted_image.filename).to 
eq('default_sample_p1_42.jpg') # NOTE: ruby messes up the formatting of binary strings, I don't think it worth it to correct this behavior, but # the result is that we have to remove them from the comparisons. @@ -47,7 +47,7 @@ extracted_image = described_class.new(input_source, page_id, element_id) - expect(extracted_image.internal_file_name).to eq('default_sample_p1_42.jpg') + expect(extracted_image.filename).to eq('default_sample_p1_42.jpg') end end @@ -68,7 +68,7 @@ expect do extracted_image.write_to_file(invalid_output_path) - end.to raise_error(Mindee::Errors::MindeeImageError, %r{Invalid file format}) + end.to raise_error(Mindee::Error::MindeeImageError, %r{Invalid file format}) end it 'raises an error if the file cannot be saved' do @@ -77,7 +77,7 @@ expect do extracted_image.write_to_file(invalid_output_path) - end.to raise_error(Mindee::Errors::MindeeImageError) + end.to raise_error(Mindee::Error::MindeeImageError) end end @@ -108,7 +108,7 @@ Tempfile.create(['output', '.jpg']) do |tempfile| expect do extracted_image.write_to_file(tempfile.path, 'jpg') - end.to raise_error(Mindee::Errors::MindeeImageError, %r{Could not save file}) + end.to raise_error(Mindee::Error::MindeeImageError, %r{Could not save file}) end end diff --git a/spec/image/image_utils_spec.rb b/spec/image/image_utils_spec.rb index 0579770b8..7304a4a36 100644 --- a/spec/image/image_utils_spec.rb +++ b/spec/image/image_utils_spec.rb @@ -24,7 +24,7 @@ it 'Should raise an error for invalid input types' do expect do Mindee::Image::ImageUtils.to_image(123) - end.to raise_error(Mindee::Errors::MindeeImageError, %r{Expected an I/O object or a MiniMagick::Image}) + end.to raise_error(Mindee::Error::MindeeImageError, %r{Expected an I/O object or a MiniMagick::Image}) end it 'Should convert MiniMagick image to StringIO' do @@ -51,7 +51,7 @@ it 'Should raise an error if the original image is nil' do expect do Mindee::Image::ImageUtils.calculate_new_dimensions(nil) - end.to 
raise_error(Mindee::Errors::MindeeImageError, %r{Provided image could not be processed for resizing}) + end.to raise_error(Mindee::Error::MindeeImageError, %r{Provided image could not be processed for resizing}) end it 'Should return dimensions from media box if provided' do diff --git a/spec/input/sources/sources_spec.rb b/spec/input/sources/sources_spec.rb index 71d77bb4a..777cb9f31 100644 --- a/spec/input/sources/sources_spec.rb +++ b/spec/input/sources/sources_spec.rb @@ -2,7 +2,7 @@ require 'mindee' require 'mindee/input/sources' -require 'mindee/errors' +require 'mindee/error' require 'pdf-reader' require_relative '../../data' @@ -102,7 +102,7 @@ mindee_client.source_from_path( "#{FILE_TYPES_DIR}/pdf/broken_unfixable.pdf", repair_pdf: true ) - end.to raise_error Mindee::Errors::MindeePDFError + end.to raise_error Mindee::Error::MindeePDFError end end diff --git a/spec/input/sources/url_input_source_spec.rb b/spec/input/sources/url_input_source_spec.rb index d2f1cb487..87407af02 100644 --- a/spec/input/sources/url_input_source_spec.rb +++ b/spec/input/sources/url_input_source_spec.rb @@ -19,7 +19,7 @@ context 'with invalid URL' do it 'raises an error for invalid URLs' do - expect { described_class.new(invalid_url) }.to raise_error(Mindee::Errors::MindeeInputError) + expect { described_class.new(invalid_url) }.to raise_error(Mindee::Error::MindeeInputError) end end end @@ -60,7 +60,7 @@ it 'raises an error' do expect do url_input_source.as_local_input_source - end.to raise_error(Mindee::Errors::MindeeAPIError, %r{Failed to download file}) + end.to raise_error(Mindee::Error::MindeeAPIError, %r{Failed to download file}) end end end @@ -105,7 +105,7 @@ it 'raises an error' do expect do url_input_source.write_to_file('/tmp') - end.to raise_error(Mindee::Errors::MindeeAPIError, %r{Failed to download file}) + end.to raise_error(Mindee::Error::MindeeAPIError, %r{Failed to download file}) end end end diff --git a/spec/openssl_crl_workaround.rb 
b/spec/openssl_crl_workaround.rb index 1f10c085a..35e6b6432 100644 --- a/spec/openssl_crl_workaround.rb +++ b/spec/openssl_crl_workaround.rb @@ -2,7 +2,7 @@ require 'openssl' -# Workaround for errors in SSL certificates validations on macOS. +# Workaround for error in SSL certificates validations on macOS. params = OpenSSL::SSL::SSLContext::DEFAULT_PARAMS params[:verify_mode] = OpenSSL::SSL::VERIFY_PEER diff --git a/spec/pdf/extracted_pdf_spec.rb b/spec/pdf/extracted_pdf_spec.rb index 06a8ca4a4..5fb37bd4f 100644 --- a/spec/pdf/extracted_pdf_spec.rb +++ b/spec/pdf/extracted_pdf_spec.rb @@ -2,7 +2,7 @@ require 'mindee' -describe Mindee::PDF::PDFExtractor::ExtractedPDF do +describe Mindee::PDF::ExtractedPDF do let(:output_dir) { File.join(V1_DATA_DIR, 'output') } let(:valid_pdf_path) { "#{V1_PRODUCT_DATA_DIR}/invoices/invoice.pdf" } let(:invalid_pdf_path) { "#{FILE_TYPES_DIR}/receipt.txt" } @@ -12,7 +12,7 @@ allow(File).to receive(:directory?).and_return(false) allow(File).to receive(:exist?).and_return(true) allow(File).to receive(:extname).and_return('.pdf') - allow(File).to receive(:write) + allow(File).to receive(:binwrite) end describe '#initialize' do @@ -32,7 +32,7 @@ expect do pdf_wrapper.page_count - end.to raise_error Mindee::Errors::MindeePDFError, %r{Could not retrieve page count} + end.to raise_error Mindee::Error::MindeePDFError, %r{Could not retrieve page count} end it 'returns the correct page count for a valid PDF' do @@ -47,10 +47,12 @@ describe '#write_to_file' do it 'writes the PDF bytes to a specified file path' do pdf_stream = File.open(valid_pdf_path, 'r') + expected_pdf_content = pdf_stream.read + pdf_stream.rewind pdf_wrapper = described_class.new(pdf_stream, 'invoice.pdf') expect { pdf_wrapper.write_to_file(output_path) }.not_to raise_error - expect(File).to have_received(:write).with(output_path, pdf_stream) + expect(File).to have_received(:binwrite).with(output_path, expected_pdf_content) end it 'raises an error if the output path is a 
directory' do @@ -60,7 +62,7 @@ expect do pdf_wrapper.write_to_file(output_path) - end.to raise_error Mindee::Errors::MindeePDFError, %r{Provided path is not a file} + end.to raise_error Mindee::Error::MindeePDFError, %r{Provided path is not a file} end it 'raises an error if the save path is invalid' do @@ -70,7 +72,7 @@ expect do pdf_wrapper.write_to_file(output_path) - end.to raise_error Mindee::Errors::MindeePDFError, %r{Invalid save path provided} + end.to raise_error Mindee::Error::MindeePDFError, %r{Invalid save path provided} end end diff --git a/spec/pdf/pdf_extractor_spec.rb b/spec/pdf/pdf_extractor_spec.rb index 0bdfe0b9d..418d666d5 100644 --- a/spec/pdf/pdf_extractor_spec.rb +++ b/spec/pdf/pdf_extractor_spec.rb @@ -18,13 +18,13 @@ jpg_input = Mindee::Input::Source::PathInputSource.new(invoice_default_sample_path) expect(jpg_input.pdf?).to eq(false) - extractor = Mindee::PDF::PDFExtractor::PDFExtractor.new(jpg_input) + extractor = Mindee::PDF::PDFExtractor.new(jpg_input) expect(extractor.page_count).to eq(1) end it 'should extract invoices from a PDF (no strict mode)' do pdf_input = Mindee::Input::Source::PathInputSource.new(invoice_splitter_5p_path) - extractor = Mindee::PDF::PDFExtractor::PDFExtractor.new(pdf_input) + extractor = Mindee::PDF::PDFExtractor.new(pdf_input) expect(extractor.page_count).to eq(5) @@ -43,7 +43,7 @@ it 'should extract invoices from a PDF (strict mode)' do pdf_input = Mindee::Input::Source::PathInputSource.new(invoice_splitter_5p_path) - extractor = Mindee::PDF::PDFExtractor::PDFExtractor.new(pdf_input) + extractor = Mindee::PDF::PDFExtractor.new(pdf_input) expect(extractor.page_count).to eq(5) expect(loaded_prediction.invoice_page_groups.length).to eq(3) diff --git a/spec/v1/client_spec.rb b/spec/v1/client_spec.rb index a92576fb5..ef0136638 100644 --- a/spec/v1/client_spec.rb +++ b/spec/v1/client_spec.rb @@ -68,7 +68,7 @@ local_resp = Mindee::Input::LocalResponse.new("#{V1_DATA_DIR}/geometry/polygon.json") expect do 
mindee_client.load_prediction(Mindee::V1::Product::Invoice::InvoiceV4, local_resp) - end.to raise_error Mindee::Errors::MindeeInputError + end.to raise_error Mindee::Error::MindeeInputError end it 'should not validate improper async parameters' do @@ -106,7 +106,7 @@ account_name: 'account_name', version: 'version' ) - end.to raise_error Mindee::Errors::MindeeConfigurationError + end.to raise_error Mindee::Error::MindeeConfigurationError expect do mindee_client.send( @@ -116,7 +116,7 @@ account_name: 'account_name', version: 'version' ) - end.to raise_error Mindee::Errors::MindeeConfigurationError + end.to raise_error Mindee::Error::MindeeConfigurationError end end end diff --git a/spec/v1/extraction/invoice_splitter_extraction_integration.rb b/spec/v1/extraction/invoice_splitter_extraction_integration.rb index c6c5cdfda..0b8fd7731 100644 --- a/spec/v1/extraction/invoice_splitter_extraction_integration.rb +++ b/spec/v1/extraction/invoice_splitter_extraction_integration.rb @@ -28,7 +28,7 @@ def prepare_invoice_return(rst_file_path, invoice_prediction) ) inference = response.document.inference - pdf_extractor = Mindee::PDF::PDFExtractor::PDFExtractor.new(invoice_splitter_input) + pdf_extractor = Mindee::PDF::PDFExtractor.new(invoice_splitter_input) expect(pdf_extractor.page_count).to eq(2) extracted_pdfs_strict = pdf_extractor.extract_invoices(inference.prediction.invoice_page_groups, strict: true) diff --git a/spec/v1/extraction/multi_receipts_extractor_spec.rb b/spec/v1/extraction/multi_receipts_extractor_spec.rb index 46e1dbc8f..d966f1c50 100644 --- a/spec/v1/extraction/multi_receipts_extractor_spec.rb +++ b/spec/v1/extraction/multi_receipts_extractor_spec.rb @@ -126,7 +126,7 @@ it 'raises a MindeeInputError' do expect do Mindee::V1::Extraction.extract_receipts(empty_input_source, empty_inference) - end.to raise_error(Mindee::Errors::MindeeInputError, + end.to raise_error(Mindee::Error::MindeeInputError, 'No possible receipts candidates found for Multi-Receipts 
extraction.') end end diff --git a/spec/v1/http/error_handler_integration.rb b/spec/v1/http/error_handler_integration.rb index 9cee3d846..a494ceb16 100644 --- a/spec/v1/http/error_handler_integration.rb +++ b/spec/v1/http/error_handler_integration.rb @@ -13,7 +13,7 @@ doc_class = Mindee::V1::Product::Receipt::ReceiptV5 expect do mindee_client1.parse(input_source, doc_class, options: { all_words: false, close_file: true }) - end.to raise_error Mindee::Errors::MindeeHTTPClientError + end.to raise_error Mindee::Error::MindeeHTTPClientError end it 'should make an invalid API async enqueue call raising an exception' do @@ -23,7 +23,7 @@ doc_class = Mindee::V1::Product::Invoice::InvoiceV4 expect do mindee_client1.enqueue(input_source, doc_class) - end.to raise_error Mindee::Errors::MindeeHTTPClientError + end.to raise_error Mindee::Error::MindeeHTTPClientError end it 'should make an invalid API async parse call raising an exception' do @@ -31,7 +31,7 @@ doc_class = Mindee::V1::Product::InvoiceSplitter::InvoiceSplitterV1 expect do mindee_client1.parse_queued('invalid-job-id', doc_class) - end.to raise_error Mindee::Errors::MindeeHTTPClientError + end.to raise_error Mindee::Error::MindeeHTTPClientError end end end diff --git a/spec/v1/http/error_handler_spec.rb b/spec/v1/http/error_handler_spec.rb index 17157fd80..bfb792e5a 100644 --- a/spec/v1/http/error_handler_spec.rb +++ b/spec/v1/http/error_handler_spec.rb @@ -16,7 +16,7 @@ error400 = Mindee::HTTP::ErrorHandler.handle_error('dummy-url', error_obj) expect do raise error400 - end.to raise_error Mindee::Errors::MindeeHTTPClientError + end.to raise_error Mindee::Error::MindeeHTTPClientError expect(error400.status_code).to eq(400) expect(error400.api_code).to eq('SomeCode') expect(error400.api_message).to eq('Some scary message here') @@ -29,7 +29,7 @@ error401 = Mindee::HTTP::ErrorHandler.handle_error('dummy-url', error_obj) expect do raise error401 - end.to raise_error Mindee::Errors::MindeeHTTPClientError + end.to 
raise_error Mindee::Error::MindeeHTTPClientError expect(error401.status_code).to eq(401) expect(error401.api_code).to eq('Unauthorized') expect(error401.api_message).to eq('Authorization required') @@ -42,7 +42,7 @@ error429 = Mindee::HTTP::ErrorHandler.handle_error('dummy-url', error_obj) expect do raise error429 - end.to raise_error Mindee::Errors::MindeeHTTPClientError + end.to raise_error Mindee::Error::MindeeHTTPClientError expect(error429.status_code).to eq(429) expect(error429.api_code).to eq('TooManyRequests') expect(error429.api_message).to eq('Too many requests') @@ -55,7 +55,7 @@ error500 = Mindee::HTTP::ErrorHandler.handle_error('dummy-url', error_obj) expect do raise error500 - end.to raise_error Mindee::Errors::MindeeHTTPServerError + end.to raise_error Mindee::Error::MindeeHTTPServerError expect(error500.status_code).to eq(500) expect(error500.api_code).to eq('failure') expect(error500.api_message).to eq('Inference failed') @@ -68,7 +68,7 @@ error500 = Mindee::HTTP::ErrorHandler.handle_error('dummy-url', error_obj) expect do raise error500 - end.to raise_error Mindee::Errors::MindeeHTTPServerError + end.to raise_error Mindee::Error::MindeeHTTPServerError expect(error500.status_code).to eq(500) expect(error500.api_code).to eq('UnknownError') expect(error500.api_message).to eq('Server sent back an unexpected reply.') @@ -86,7 +86,7 @@ error500 = Mindee::HTTP::ErrorHandler.handle_error('dummy-url', error_obj) expect do raise error500 - end.to raise_error Mindee::Errors::MindeeHTTPServerError + end.to raise_error Mindee::Error::MindeeHTTPServerError expect(error500.status_code).to eq(500) expect(error500.api_code).to eq('ServerError') expect(error500.api_message).to eq('An error occurred') diff --git a/spec/v1/input/local_response_v1_spec.rb b/spec/v1/input/local_response_v1_spec.rb index abfad1a06..97f84b652 100644 --- a/spec/v1/input/local_response_v1_spec.rb +++ b/spec/v1/input/local_response_v1_spec.rb @@ -56,14 +56,14 @@ it 'should trigger an error 
when something invalid is passed' do expect do Mindee::Input::LocalResponse.new(123) - end.to raise_error Mindee::Errors::MindeeInputError + end.to raise_error Mindee::Error::MindeeInputError end it 'should trigger an error when the payload is not hashable' do local_response = Mindee::Input::LocalResponse.new('Your mother was a hamster.') expect do local_response.as_hash - end.to raise_error Mindee::Errors::MindeeInputError + end.to raise_error Mindee::Error::MindeeInputError end end end diff --git a/spec/v2/client_v2_integration.rb b/spec/v2/client_v2_integration.rb index 67fbe8a64..8cda5f3cd 100644 --- a/spec/v2/client_v2_integration.rb +++ b/spec/v2/client_v2_integration.rb @@ -124,7 +124,7 @@ expect do client.enqueue(Mindee::V2::Product::Extraction::Extraction, input, inference_params) - end.to raise_error(Mindee::Errors::MindeeHTTPErrorV2) { |e| expect(e.status).to eq(422) } + end.to raise_error(Mindee::Error::MindeeHTTPErrorV2) { |e| expect(e.status).to eq(422) } end it 'raises MindeeHTTPErrorV2 (422) on invalid webhook id' do @@ -138,7 +138,7 @@ expect do client.enqueue(Mindee::V2::Product::Extraction::Extraction, input, params) - end.to raise_error(Mindee::Errors::MindeeHTTPErrorV2) { |e| + end.to raise_error(Mindee::Error::MindeeHTTPErrorV2) { |e| expect(e.status).to eq(422) expect(e.code).to start_with('422-') expect(e.detail).to_not be_nil @@ -159,7 +159,7 @@ expect do client.enqueue(Mindee::V2::Product::Extraction::Extraction, input, inference_params) - end.to raise_error(Mindee::Errors::MindeeHTTPErrorV2) { |e| + end.to raise_error(Mindee::Error::MindeeHTTPErrorV2) { |e| expect(e.status).to eq(422) expect(e.code).to start_with('422-') expect(e.detail).to_not be_nil @@ -173,7 +173,7 @@ it 'raises MindeeHTTPErrorV2 on invalid job id' do expect do client.get_result(Mindee::V2::Product::Extraction::Extraction, 'INVALID_JOB_ID') - end.to raise_error(Mindee::Errors::MindeeHTTPErrorV2) { |e| + end.to raise_error(Mindee::Error::MindeeHTTPErrorV2) { |e| 
expect(e.status).to eq(422) expect(e.code).to start_with('422-') expect(e.detail).to_not be_nil @@ -200,7 +200,7 @@ file_alias: 'rb_integration_test' ) client.enqueue_and_get_result(Mindee::V2::Product::Extraction::Extraction, input, inference_params) - end.to raise_error(Mindee::Errors::MindeeHTTPErrorV2) { |e| + end.to raise_error(Mindee::Error::MindeeHTTPErrorV2) { |e| expect(e.status).to eq(404) expect(e.code).to start_with('404-') expect(e.detail).to_not be_nil diff --git a/spec/v2/client_v2_spec.rb b/spec/v2/client_v2_spec.rb index 6c8bace44..b5daabe07 100644 --- a/spec/v2/client_v2_spec.rb +++ b/spec/v2/client_v2_spec.rb @@ -55,7 +55,7 @@ def stub_next_request_with(method, hash:, status_code: 0) text_context: 'Hello my name is mud.' ) client.enqueue(Mindee::V2::Product::Extraction::Extraction, input_doc, params) - end.to raise_error(Mindee::Errors::MindeeHTTPErrorV2) { |e| + end.to raise_error(Mindee::Error::MindeeHTTPErrorV2) { |e| expect(e.status).to eq(400) expect(e.detail).to eq('Unsupported content.') } @@ -66,20 +66,20 @@ def stub_next_request_with(method, hash:, status_code: 0) stub_next_request_with(:enqueue, hash: JSON.generate(json400)) params = Mindee::V2::Product::Extraction::Params::ExtractionParameters.new('dummy-model') client.enqueue_and_get_result(Mindee::V2::Product::Extraction::Extraction, input_doc, params) - end.to raise_error(Mindee::Errors::MindeeHTTPErrorV2) { |e| + end.to raise_error(Mindee::Error::MindeeHTTPErrorV2) { |e| expect(e.status).to eq(400) expect(e.detail).to eq('Unsupported content.') } end - it 'bubbles-up HTTP errors with details' do + it 'bubbles-up HTTP error with details' do error_hash = json400.merge({ status: 413, detail: 'File exceeds size limit' }) expect do stub_next_request_with(:enqueue, hash: JSON.generate(error_hash)) params = Mindee::V2::Product::Extraction::Params::ExtractionParameters.new('dummy-model') client.enqueue(Mindee::V2::Product::Extraction::Extraction, input_doc, params) - end.to 
raise_error(Mindee::Errors::MindeeHTTPErrorV2) { |e| + end.to raise_error(Mindee::Error::MindeeHTTPErrorV2) { |e| expect(e.status).to eq(413) expect(e.detail).to include('File exceeds size limit') } diff --git a/spec/v2/file_operation/crop_operation_integration.rb b/spec/v2/file_operation/crop_operation_integration.rb new file mode 100644 index 000000000..58d508c48 --- /dev/null +++ b/spec/v2/file_operation/crop_operation_integration.rb @@ -0,0 +1,71 @@ +# frozen_string_literal: true + +require 'mindee' +require 'mindee/v2/file_operation' +require 'mindee/v2/product' + +describe Mindee::V2::FileOperation::Crop, :integration, :v2 do + let(:crop_sample) do + File.join(V2_PRODUCT_DATA_DIR, 'crop', 'default_sample.jpg') + end + + let(:v2_client) do + Mindee::V2::Client.new + end + + let(:crop_model_id) do + ENV.fetch('MINDEE_V2_SE_TESTS_CROP_MODEL_ID') + end + + let(:findoc_model_id) do + ENV.fetch('MINDEE_V2_SE_TESTS_FINDOC_MODEL_ID') + end + + after(:all) do + FileUtils.rm_f("#{OUTPUT_DIR}/crop_001.jpg") + FileUtils.rm_f("#{OUTPUT_DIR}/crop_002.jpg") + end + + # Validates the parsed financial document response properties. + # + # @param findoc_response [Mindee::V2::InferenceResponse] The inference response to check. 
+ def check_findoc_return(findoc_response) + expect(findoc_response.inference.model.id.length).to be > 0 + expect(findoc_response.inference.result.fields['total_amount'].value).to be > 0 + end + + it 'extracts crops from image correctly' do + crop_input = Mindee::Input::Source::PathInputSource.new(crop_sample) + + crop_params = { model_id: crop_model_id, close_file: false } + + response = v2_client.enqueue_and_get_result( + Mindee::V2::Product::Crop::Crop, + crop_input, + crop_params + ) + + expect(response.inference.result.crops.size).to eq(2) + + extracted_images = described_class.extract_crops(crop_input, response.inference.result.crops) + + expect(extracted_images.size).to eq(2) + expect(extracted_images[0].filename).to eq('default_sample.jpg_page0-0.jpg') + expect(extracted_images[1].filename).to eq('default_sample.jpg_page0-1.jpg') + + findoc_params = { model_id: findoc_model_id, close_file: false } + + invoice0 = v2_client.enqueue_and_get_result( + Mindee::V2::Product::Extraction::Extraction, + extracted_images[0].as_input_source, + findoc_params + ) + + check_findoc_return(invoice0) + + extracted_images.save_all_to_disk(OUTPUT_DIR) + + expect(File.size(File.join(OUTPUT_DIR, 'crop_001.jpg'))).to eq(672_913) + expect(File.size(File.join(OUTPUT_DIR, 'crop_002.jpg'))).to eq(675_728) + end +end diff --git a/spec/v2/file_operation/crop_operation_spec.rb b/spec/v2/file_operation/crop_operation_spec.rb new file mode 100644 index 000000000..ca8d86fea --- /dev/null +++ b/spec/v2/file_operation/crop_operation_spec.rb @@ -0,0 +1,61 @@ +# frozen_string_literal: true + +require 'json' +require 'mini_magick' +require 'mindee' +require 'mindee/v2/file_operation' +require 'mindee/v2/product' + +describe Mindee::V2::FileOperation::Crop, :v2 do + let(:crops_single_page_path) do + File.join(V2_PRODUCT_DATA_DIR, 'crop', 'default_sample.jpg') + end + + let(:crops_multi_page_path) do + File.join(V2_PRODUCT_DATA_DIR, 'crop', 'multipage_sample.pdf') + end + + 
let(:crops_single_page_json_path) do + File.join(V2_PRODUCT_DATA_DIR, 'crop', 'crop_single.json') + end + + let(:crops_multi_page_json_path) do + File.join(V2_PRODUCT_DATA_DIR, 'crop', 'crop_multiple.json') + end + + it 'processes single page crop split correctly' do + input_sample = Mindee::Input::Source::PathInputSource.new(crops_single_page_path) + response_hash = JSON.parse(File.read(crops_single_page_json_path)) + doc = Mindee::V2::Product::Crop::CropResponse.new(response_hash) + + extracted_crops = described_class.extract_crops(input_sample, doc.inference.result.crops) + + expect(extracted_crops.size).to eq(1) + + expect(extracted_crops[0].page_id).to eq(0) + expect(extracted_crops[0].element_id).to eq(0) + + image_buffer0 = MiniMagick::Image.read(extracted_crops[0].buffer) + expect(image_buffer0.dimensions).to eq([2822, 1572]) + end + + it 'processes multi page receipt split correctly' do + input_sample = Mindee::Input::Source::PathInputSource.new(crops_multi_page_path) + response_hash = JSON.parse(File.read(crops_multi_page_json_path)) + doc = Mindee::V2::Product::Crop::CropResponse.new(response_hash) + + extracted_crops = described_class.extract_crops(input_sample, doc.inference.result.crops) + + expect(extracted_crops.size).to eq(2) + + expect(extracted_crops[0].page_id).to eq(0) + expect(extracted_crops[0].element_id).to eq(0) + image_buffer0 = MiniMagick::Image.read(extracted_crops[0].buffer) + expect(image_buffer0.dimensions).to eq([156, 758]) + + expect(extracted_crops[1].page_id).to eq(0) + expect(extracted_crops[1].element_id).to eq(1) + image_buffer1 = MiniMagick::Image.read(extracted_crops[1].buffer) + expect(image_buffer1.dimensions).to eq([187, 690]) + end +end diff --git a/spec/v2/file_operation/split_operation_integration.rb b/spec/v2/file_operation/split_operation_integration.rb new file mode 100644 index 000000000..40e699b5e --- /dev/null +++ b/spec/v2/file_operation/split_operation_integration.rb @@ -0,0 +1,84 @@ +# frozen_string_literal: 
true + +require 'mindee' +require 'mindee/v2/file_operation' +require 'mindee/v2/product' +require 'fileutils' + +describe Mindee::V2::Product::Split::Split, :integration, :v2 do + let(:split_sample) do + File.join(V2_PRODUCT_DATA_DIR, 'split', 'default_sample.pdf') + end + + let(:invoice_splitter_5p_path) do + File.join(V2_PRODUCT_DATA_DIR, 'split', 'invoice_5p.pdf') + end + + let(:v2_client) do + Mindee::V2::Client.new + end + + let(:split_model_id) do + ENV.fetch('MINDEE_V2_SE_TESTS_SPLIT_MODEL_ID') + end + + let(:findoc_model_id) do + ENV.fetch('MINDEE_V2_SE_TESTS_FINDOC_MODEL_ID') + end + + after(:all) do + FileUtils.rm_f("#{OUTPUT_DIR}/split_001.pdf") + FileUtils.rm_f("#{OUTPUT_DIR}/split_002.pdf") + end + + # Validates the parsed financial document response properties. + # + # @param findoc_response [Mindee::V2::InferenceResponse] The inference response to check. + def check_findoc_return(findoc_response) + expect(findoc_response.inference.model.id.length).to be > 0 + expect(findoc_response.inference.result.fields['total_amount'].value).to be > 0 + end + + it 'extracts splits from pdf correctly' do + split_input = Mindee::Input::Source::PathInputSource.new(split_sample) + + split_params = { model_id: split_model_id, close_file: false } + + response = v2_client.enqueue_and_get_result( + Mindee::V2::Product::Split::Split, + split_input, + split_params + ) + + expect(response.inference.file.page_count).to eq(2) + + extracted_pdfs = response.extract_from_file(split_input) + + expect(extracted_pdfs.size).to eq(2) + expect(extracted_pdfs[0].filename).to eq('default_sample_001-001.pdf') + expect(extracted_pdfs[1].filename).to eq('default_sample_002-002.pdf') + + findoc_params = { model_id: findoc_model_id, close_file: false } + + invoice0 = v2_client.enqueue_and_get_result( + Mindee::V2::Product::Extraction::Extraction, + extracted_pdfs[0].as_input_source, + findoc_params + ) + + check_findoc_return(invoice0) + + extracted_pdfs.save_all_to_disk(OUTPUT_DIR) + + 
extracted_pdfs.each_with_index do |pdf, i| + local_input = Mindee::Input::Source::PathInputSource.new(File.join(OUTPUT_DIR, format('split_%03d.pdf', i + 1))) + begin + expect(local_input.page_count).to eq(pdf.page_count) + ensure + local_input.close if local_input.respond_to?(:close) + end + end + ensure + split_input.close if split_input.respond_to?(:close) + end +end diff --git a/spec/v2/file_operation/split_operation_spec.rb b/spec/v2/file_operation/split_operation_spec.rb new file mode 100644 index 000000000..a8977ff4d --- /dev/null +++ b/spec/v2/file_operation/split_operation_spec.rb @@ -0,0 +1,49 @@ +# frozen_string_literal: true + +require 'json' +require 'mindee' +require 'mindee/v2/product' + +describe Mindee::V2::Product::Split::SplitResponse, :v2 do + let(:splits_default) do + File.join(V2_PRODUCT_DATA_DIR, 'extraction', 'financial_document', 'default_sample.jpg') + end + + let(:splits_5p) do + File.join(V2_PRODUCT_DATA_DIR, 'split', 'invoice_5p.pdf') + end + + let(:splits_single_page_json_path) do + File.join(V2_PRODUCT_DATA_DIR, 'split', 'split_single.json') + end + + let(:splits_multi_page_json_path) do + File.join(V2_PRODUCT_DATA_DIR, 'split', 'split_multiple.json') + end + + it 'processes single page split correctly' do + input_sample = Mindee::Input::Source::PathInputSource.new(splits_default) + response_hash = JSON.parse(File.read(splits_single_page_json_path)) + doc = described_class.new(response_hash) + + extracted_splits = doc.extract_from_file(input_sample) + + expect(extracted_splits.size).to eq(1) + + expect(extracted_splits[0].page_count).to eq(1) + end + + it 'processes multi page receipt split correctly' do + input_sample = Mindee::Input::Source::PathInputSource.new(splits_5p) + response_hash = JSON.parse(File.read(splits_multi_page_json_path)) + doc = described_class.new(response_hash) + + extracted_splits = doc.extract_from_file(input_sample) + + expect(extracted_splits.size).to eq(3) + + expect(extracted_splits[0].page_count).to eq(1) 
+ expect(extracted_splits[1].page_count).to eq(3) + expect(extracted_splits[2].page_count).to eq(1) + end +end diff --git a/spec/v2/input/local_response_v2_spec.rb b/spec/v2/input/local_response_v2_spec.rb index 7cf1d54bd..21582c149 100644 --- a/spec/v2/input/local_response_v2_spec.rb +++ b/spec/v2/input/local_response_v2_spec.rb @@ -50,14 +50,14 @@ def assert_local_response(local_response) it 'should trigger an error when something invalid is passed' do expect do Mindee::Input::LocalResponse.new(123) - end.to raise_error Mindee::Errors::MindeeInputError + end.to raise_error Mindee::Error::MindeeInputError end it 'should trigger an error when the payload is not hashable' do local_response = Mindee::Input::LocalResponse.new('Your mother was a hamster.') expect do local_response.as_hash - end.to raise_error Mindee::Errors::MindeeInputError + end.to raise_error Mindee::Error::MindeeInputError end end end From c4366b2b41d6edef25ce23016c1a200b19ecb8ce Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Wed, 1 Apr 2026 17:26:28 +0200 Subject: [PATCH 2/6] fix integration tests? 
--- .github/workflows/_test-integrations.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_test-integrations.yml b/.github/workflows/_test-integrations.yml index 79323684c..b0c861654 100644 --- a/.github/workflows/_test-integrations.yml +++ b/.github/workflows/_test-integrations.yml @@ -45,11 +45,11 @@ jobs: ruby-version: ${{ matrix.ruby }} bundler-cache: true - - name: Install Ghostscript on Ubuntu + - name: Install Ghostscript and ImageMagick on Ubuntu if: runner.os == 'Linux' run: | sudo apt update - sudo apt-get install -y ghostscript + sudo apt-get install -y ghostscript imagemagick - name: Install Ghostscript and ImageMagick on macOS if: runner.os == 'macOS' From 2262d67192a151557b0bac2b78daf56776839a61 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Wed, 1 Apr 2026 17:45:55 +0200 Subject: [PATCH 3/6] fix mac CI? --- spec/v2/file_operation/crop_operation_integration.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spec/v2/file_operation/crop_operation_integration.rb b/spec/v2/file_operation/crop_operation_integration.rb index 58d508c48..4d176e7bd 100644 --- a/spec/v2/file_operation/crop_operation_integration.rb +++ b/spec/v2/file_operation/crop_operation_integration.rb @@ -65,7 +65,7 @@ def check_findoc_return(findoc_response) extracted_images.save_all_to_disk(OUTPUT_DIR) - expect(File.size(File.join(OUTPUT_DIR, 'crop_001.jpg'))).to eq(672_913) - expect(File.size(File.join(OUTPUT_DIR, 'crop_002.jpg'))).to eq(675_728) + expect(File.size(File.join(OUTPUT_DIR, 'crop_001.jpg'))).to be_between(600_000, 672_913) + expect(File.size(File.join(OUTPUT_DIR, 'crop_002.jpg'))).to be_between(600_000, 675_728) end end From 7662658dcf54ac9b095dbe31702aab30e8bf3a68 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Thu, 2 Apr 2026 00:01:05 +0200 Subject: [PATCH 4/6] fix typos --- 
lib/mindee/error/mindee_error.rb | 2 +- lib/mindee/error/mindee_http_error.rb | 4 ++-- spec/openssl_crl_workaround.rb | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/mindee/error/mindee_error.rb b/lib/mindee/error/mindee_error.rb index db4211de4..1d8aa63f4 100644 --- a/lib/mindee/error/mindee_error.rb +++ b/lib/mindee/error/mindee_error.rb @@ -2,7 +2,7 @@ module Mindee module Error - # Base class for all custom mindee error. + # Base class for all custom mindee errors. class MindeeError < StandardError; end # Errors relating to library issues. diff --git a/lib/mindee/error/mindee_http_error.rb b/lib/mindee/error/mindee_http_error.rb index db4ce35d5..60868305f 100644 --- a/lib/mindee/error/mindee_http_error.rb +++ b/lib/mindee/error/mindee_http_error.rb @@ -27,10 +27,10 @@ def initialize(http_error, url, code) end end - # Base class for all client-side error. + # Base class for all client-side errors. class MindeeHTTPClientError < MindeeHTTPError; end - # Base class for all server-side error. + # Base class for all server-side errors. class MindeeHTTPServerError < MindeeHTTPError; end end end diff --git a/spec/openssl_crl_workaround.rb b/spec/openssl_crl_workaround.rb index 35e6b6432..1f10c085a 100644 --- a/spec/openssl_crl_workaround.rb +++ b/spec/openssl_crl_workaround.rb @@ -2,7 +2,7 @@ require 'openssl' -# Workaround for error in SSL certificates validations on macOS. +# Workaround for errors in SSL certificates validations on macOS. 
params = OpenSSL::SSL::SSLContext::DEFAULT_PARAMS params[:verify_mode] = OpenSSL::SSL::VERIFY_PEER From a760098b4a118fc3192ddb3052d622d7b2551a4f Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Thu, 2 Apr 2026 10:10:39 +0200 Subject: [PATCH 5/6] fix typo in 'error' --- lib/mindee/error/mindee_http_unknown_error_v2.rb | 2 +- lib/mindee/v2/parsing/error_response.rb | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/mindee/error/mindee_http_unknown_error_v2.rb b/lib/mindee/error/mindee_http_unknown_error_v2.rb index 2bcbcfc5d..214264524 100644 --- a/lib/mindee/error/mindee_http_unknown_error_v2.rb +++ b/lib/mindee/error/mindee_http_unknown_error_v2.rb @@ -11,7 +11,7 @@ def initialize(http_error) 'status' => -1, 'title' => 'Unknown Error', 'code' => '000-000', - 'error' => nil }) + 'errors' => nil }) end end end diff --git a/lib/mindee/v2/parsing/error_response.rb b/lib/mindee/v2/parsing/error_response.rb index a5b5ad26f..cec156101 100644 --- a/lib/mindee/v2/parsing/error_response.rb +++ b/lib/mindee/v2/parsing/error_response.rb @@ -22,8 +22,8 @@ def initialize(server_response) @detail = server_response['detail'] @title = server_response['title'] @code = server_response['code'] - @errors = if server_response.key?('error') - server_response['error'].map do |error| + @errors = if server_response.key?('errors') + server_response['errors'].map do |error| ErrorItem.new(error) end else From d65f147955f79904b3a38127d03c76c94e8bd840 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Thu, 2 Apr 2026 10:21:59 +0200 Subject: [PATCH 6/6] fix wrong usage of module --- lib/mindee/v2/product/crop/crop_response.rb | 5 +---- lib/mindee/v2/product/split/split_range.rb | 2 +- lib/mindee/v2/product/split/split_response.rb | 6 ++---- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/lib/mindee/v2/product/crop/crop_response.rb 
b/lib/mindee/v2/product/crop/crop_response.rb index 378cb05e1..c2ec8f4f7 100644 --- a/lib/mindee/v2/product/crop/crop_response.rb +++ b/lib/mindee/v2/product/crop/crop_response.rb @@ -31,10 +31,7 @@ def to_s # @param input_source [Mindee::Input::Source::LocalInputSource] Local file to extract from # @return [FileOperation::CropFiles] List of extracted PDFs def extract_from_file(input_source) - crop_files = @inference.result.crops.map do |crop| - crop.extract_from_file(input_source) - end - FileOperation::CropFiles.new(crop_files) + FileOperation::Crop.extract_crops(input_source, @inference.result.crops) end end end diff --git a/lib/mindee/v2/product/split/split_range.rb b/lib/mindee/v2/product/split/split_range.rb index 450b0f123..f53d81f31 100644 --- a/lib/mindee/v2/product/split/split_range.rb +++ b/lib/mindee/v2/product/split/split_range.rb @@ -27,7 +27,7 @@ def to_s # Apply the split range inference to a file and return a single extracted PDF. # # @param input_source [Mindee::Input::Source::LocalInputSource] Local file to extract from - # @return [Image::ExtractedImage] + # @return [PDF::ExtractedPDF] def extract_from_file(input_source) FileOperation::Split.extract_single_split(input_source, @page_range) end diff --git a/lib/mindee/v2/product/split/split_response.rb b/lib/mindee/v2/product/split/split_response.rb index 8e5817ccd..53d552bbe 100644 --- a/lib/mindee/v2/product/split/split_response.rb +++ b/lib/mindee/v2/product/split/split_response.rb @@ -30,10 +30,8 @@ def to_s # @param input_source [Mindee::Input::Source::LocalInputSource] Path to the file or a File object. # @return [FileOperation::SplitFiles] def extract_from_file(input_source) - crop_files = @inference.result.splits.map do |crop| - crop.extract_from_file(input_source) - end - FileOperation::SplitFiles.new(crop_files) + splits = @inference.result.splits.map(&:page_range) + FileOperation::Split.extract_splits(input_source, splits) end end end