From 401849933198f8c29bde1fcb395500641a6e3209 Mon Sep 17 00:00:00 2001
From: Ibitope Fatoki
Date: Wed, 7 May 2025 23:29:58 +1000
Subject: [PATCH 1/2] This commit features a new helper method to sanitize pdfs
 and includes a test for it.

---
Review notes (text between the three-dash line and the diff is not part of
the commit message):
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch; verify context-line indentation against the tree.
- The failure-path test now builds its own invalid input. sanitize_pdf returns
  success: false with 'File does not exist' for a missing path, so pointing
  the test at a fixture that is absent made it pass without ever reaching
  validation.
- This patch adds no fixtures, so test/fixtures/files/valid.pdf is presumably
  already in the repository (TODO confirm); the success test now asserts that
  it exists and removes any output left by an earlier run.
- The index hash of the test file is the one originally submitted; regenerate
  the patch after applying to refresh it.

 app/helpers/file_helper.rb       | 53 ++++++++++++++++++++++++++++++++
 test/helpers/file_helper_test.rb | 32 ++++++++++++++++++++
 2 files changed, 85 insertions(+)
 create mode 100644 test/helpers/file_helper_test.rb

diff --git a/app/helpers/file_helper.rb b/app/helpers/file_helper.rb
index 2ae3597b2f..b6980d327d 100644
--- a/app/helpers/file_helper.rb
+++ b/app/helpers/file_helper.rb
@@ -83,6 +83,16 @@ def accept_file(file, name, kind)
           msg: msg
         }
       end
+
+      # Sanitize the PDF
+      sanitized_result = sanitize_pdf(file["tempfile"].path)
+      unless sanitized_result[:success]
+        logger.debug "PDF sanitization failed: #{sanitized_result[:msg]}"
+        return { accepted: false, msg: sanitized_result[:msg] }
+      end
+
+      # Replace the original file with the sanitized version
+      FileUtils.mv(sanitized_result[:sanitized_path], file["tempfile"].path)
     end
 
     logger.debug "Uploaded file is accepted"
@@ -398,6 +408,48 @@ def validate_pdf(filename)
     { valid: true, encrypted: false }
   end
 
+  #
+  # Sanitize a PDF file
+  #
+  def sanitize_pdf(input_path, output_path = nil)
+    return { success: false, msg: 'File does not exist' } unless File.exist?(input_path)
+
+    output_path ||= File.join(Dir.tmpdir, "sanitized-#{File.basename(input_path)}")
+
+    begin
+      # Step 1: Validate the PDF
+      validation_result = validate_pdf(input_path)
+      unless validation_result[:valid]
+        return { success: false, msg: 'Invalid or corrupted PDF' }
+      end
+
+      # Step 2: Use qpdf to sanitize the PDF (remove JavaScript and re-encode)
+      logger.debug "Sanitizing PDF #{input_path} using qpdf"
+      qpdf(input_path) # Reuse the existing qpdf function
+
+      # Step 3: Further sanitize using ghostscript
+      sanitized_tmp = File.join(Dir.tmpdir, "gs-sanitized-#{File.basename(input_path)}")
+      logger.debug "Further sanitizing PDF #{input_path} using ghostscript"
+      exec = "gs -sDEVICE=pdfwrite -dDetectDuplicateImages=true -dPDFSETTINGS=/printer -dNOPAUSE -dBATCH -dQUIET -sOutputFile=\"#{sanitized_tmp}\" \"#{input_path}\""
+      system(exec)
+
+      # Replace the output file with the ghostscript-sanitized version if successful
+      if File.exist?(sanitized_tmp)
+        FileUtils.mv(sanitized_tmp, output_path)
+      end
+
+      # Step 4: Validate the sanitized PDF
+      if File.exist?(output_path) && validate_pdf(output_path)[:valid]
+        return { success: true, sanitized_path: output_path }
+      else
+        return { success: false, msg: 'Failed to sanitize PDF' }
+      end
+    rescue => e
+      logger.error "Failed to sanitize PDF #{input_path}. Error: #{e.message}"
+      return { success: false, msg: "Error during sanitization: #{e.message}" }
+    end
+  end
+
   #
   # Copy a PDF into place
   #
@@ -636,6 +688,7 @@ def line_wrap(path, width: 160)
   module_function :qpdf
   module_function :move_files
   module_function :validate_pdf
+  module_function :sanitize_pdf
   module_function :copy_pdf
   module_function :read_file_to_str
   module_function :path_to_plagarism_html
diff --git a/test/helpers/file_helper_test.rb b/test/helpers/file_helper_test.rb
new file mode 100644
index 0000000000..5dd2cfc339
--- /dev/null
+++ b/test/helpers/file_helper_test.rb
@@ -0,0 +1,32 @@
+require 'test_helper'
+require 'tempfile'
+
+class FileHelperTest < ActiveSupport::TestCase
+  test 'sanitize_pdf should sanitize a valid PDF' do
+    input_path = 'test/fixtures/files/valid.pdf'
+    output_path = File.join(Dir.tmpdir, 'sanitized-valid.pdf')
+    # Fail loudly if the fixture is missing, and drop any output left by an
+    # earlier run so it cannot satisfy the assertions below
+    assert File.exist?(input_path), "Missing fixture: #{input_path}"
+    FileUtils.rm_f(output_path)
+
+    result = FileHelper.sanitize_pdf(input_path, output_path)
+    assert result[:success], "Expected sanitization to succeed, but got: #{result[:msg]}"
+    assert File.exist?(result[:sanitized_path]), 'Sanitized file does not exist'
+  ensure
+    FileUtils.rm_f(output_path) if output_path
+  end
+
+  test 'sanitize_pdf should fail for an invalid PDF' do
+    # Build the invalid input here: a missing fixture would also make
+    # sanitize_pdf report failure ('File does not exist') and hide a broken validator
+    Tempfile.create(['invalid', '.pdf']) do |file|
+      file.write('this is not a PDF')
+      file.flush
+
+      result = FileHelper.sanitize_pdf(file.path)
+      assert_not result[:success], 'Expected sanitization to fail for invalid PDF'
+      assert_not_equal 'File does not exist', result[:msg]
+    end
+  end
+end
From c5e75245e477f5bcd7d0685ebf393a432ece9502 Mon Sep 17 00:00:00 2001
From: Ibitope Fatoki
Date: Thu, 8 May 2025 03:09:39 +1000
Subject: [PATCH 2/2] Added loggers for the sanitization process

Also run ghostscript via TimeoutHelper.system_try_within(30, ...) instead of
a bare system call, refuse to report success unless ghostscript wrote fresh
output during this call, and remove the intermediate ghostscript file when
the method exits.

---
Review notes (text between the three-dash line and the diff is not part of
the commit message):
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch; verify context-line indentation against the tree.
- Previously, when ghostscript produced nothing, the method still validated
  whatever already existed at output_path. The default output path is derived
  only from the input basename, so a file left by an earlier call could be
  returned as the sanitized result.
- The return value of TimeoutHelper.system_try_within is not visible in this
  series, so success is detected from the ghostscript output file rather than
  from the call's result.
- The index hashes are the ones originally submitted; regenerate the patch
  after applying to refresh them.

 app/helpers/file_helper.rb | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/app/helpers/file_helper.rb b/app/helpers/file_helper.rb
index b6980d327d..46f616de98 100644
--- a/app/helpers/file_helper.rb
+++ b/app/helpers/file_helper.rb
@@ -417,36 +417,50 @@ def sanitize_pdf(input_path, output_path = nil)
     output_path ||= File.join(Dir.tmpdir, "sanitized-#{File.basename(input_path)}")
 
     begin
+      logger.debug "Starting PDF sanitization for #{input_path}"
+
       # Step 1: Validate the PDF
+      logger.debug "Validating PDF: #{input_path}"
       validation_result = validate_pdf(input_path)
       unless validation_result[:valid]
         return { success: false, msg: 'Invalid or corrupted PDF' }
       end
 
-      # Step 2: Use qpdf to sanitize the PDF (remove JavaScript and re-encode)
-      logger.debug "Sanitizing PDF #{input_path} using qpdf"
-      qpdf(input_path) # Reuse the existing qpdf function
+      # Step 2: Use qpdf to sanitize the PDF
+      logger.debug "Running qpdf on: #{input_path}"
+      qpdf(input_path)
 
       # Step 3: Further sanitize using ghostscript
       sanitized_tmp = File.join(Dir.tmpdir, "gs-sanitized-#{File.basename(input_path)}")
-      logger.debug "Further sanitizing PDF #{input_path} using ghostscript"
+      # Drop any leftover from an earlier run so that only output written by
+      # this ghostscript invocation can be picked up below
+      FileUtils.rm_f(sanitized_tmp)
+      logger.debug "Running ghostscript on: #{input_path}"
       exec = "gs -sDEVICE=pdfwrite -dDetectDuplicateImages=true -dPDFSETTINGS=/printer -dNOPAUSE -dBATCH -dQUIET -sOutputFile=\"#{sanitized_tmp}\" \"#{input_path}\""
-      system(exec)
+      TimeoutHelper.system_try_within(30, "Ghostscript sanitization timeout", exec)
 
-      # Replace the output file with the ghostscript-sanitized version if successful
-      if File.exist?(sanitized_tmp)
-        FileUtils.mv(sanitized_tmp, output_path)
-      end
+      # No fresh ghostscript output means sanitization failed; never fall back
+      # to a file that already existed at output_path
+      unless File.exist?(sanitized_tmp)
+        logger.error "Ghostscript produced no output for #{input_path}"
+        return { success: false, msg: 'Failed to sanitize PDF' }
+      end
+      FileUtils.mv(sanitized_tmp, output_path)
 
       # Step 4: Validate the sanitized PDF
       if File.exist?(output_path) && validate_pdf(output_path)[:valid]
+        logger.debug "Sanitization complete for #{input_path}"
         return { success: true, sanitized_path: output_path }
       else
         return { success: false, msg: 'Failed to sanitize PDF' }
       end
     rescue => e
       logger.error "Failed to sanitize PDF #{input_path}. Error: #{e.message}"
+      logger.error "Backtrace: #{e.backtrace.join("\n")}"
       return { success: false, msg: "Error during sanitization: #{e.message}" }
+    ensure
+      # Do not leave the intermediate ghostscript file behind in Dir.tmpdir
+      FileUtils.rm_f(sanitized_tmp) if sanitized_tmp
     end
   end
 