From 1a3fe40f97fd134402cbb2b382fb28f5d601f975 Mon Sep 17 00:00:00 2001 From: Egor Kovalchuk Date: Sat, 23 Aug 2025 20:56:13 +0000 Subject: [PATCH 1/3] Add support for mixed encodings processing in command line options and help documentation --- src/HELP.in | 1 + src/common.h | 1 + src/enca.c | 190 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/options.c | 8 ++- 4 files changed, 199 insertions(+), 1 deletion(-) diff --git a/src/HELP.in b/src/HELP.in index 642b61b..a35a28c 100644 --- a/src/HELP.in +++ b/src/HELP.in @@ -20,6 +20,7 @@ Output type selectors: Guessing parameters: -L, --language=LANG Set language of FILEs; obligatory, when cannot be determined from locale settings + -M, --mixed-encodings Handle files with mixed encodings (chunk-based detection) Conversion parameters: -E, --external-converter-program=PATH diff --git a/src/common.h b/src/common.h index 3697aef..b6a2b43 100644 --- a/src/common.h +++ b/src/common.h @@ -142,6 +142,7 @@ struct _Options { EncaEncoding target_enc; /* Target encoding for conversion. */ char *target_enc_str; /* How user specified the target encoding. */ int prefix_filename; /* Do prepend filename: before results? */ + int mixed_encodings; /* Handle files with mixed encodings? */ }; /* Enca options. */ diff --git a/src/enca.c b/src/enca.c index 119a850..05dbc77 100644 --- a/src/enca.c +++ b/src/enca.c @@ -22,6 +22,8 @@ /* Local prototypes. */ static int process_file (EncaAnalyser an, const char *fname); +static int process_mixed_file (EncaAnalyser an, + const char *fname); static void dwim_libenca_options(EncaAnalyser an, const File *file); static int print_results (const char *fname, @@ -136,6 +138,13 @@ process_file(EncaAnalyser an, if (!ot_is_convert) file_close(file); + /* Check if mixed encoding processing is requested */ + if (options.mixed_encodings) { + file_close(file); + file_free(file); + return process_mixed_file(an, fname); + } + /* Guess encoding. */ dwim_libenca_options(an, file); if (ot_is_convert) @@ -345,5 +354,186 @@ double_utf8_chk(EncaAnalyser an, enca_free(candidates); } +/** + * Structure to hold information about a detected encoding segment + */ +typedef struct { + size_t start; /* Start position in file */ + size_t length; /* Length of this segment */ + EncaEncoding encoding; /* Detected encoding for this segment */ +} EncodingSegment; + +/** + * Process a file with potentially mixed encodings by analyzing it in chunks + * and handling conversions segment by segment. + */ +static int +process_mixed_file(EncaAnalyser an, const char *fname) +{ + static Buffer *buffer = NULL; + static int utf8 = ENCA_CS_UNKNOWN; + FILE *infile = NULL; + FILE *outfile = NULL; + char *temp_filename = NULL; + int ot_is_convert = (options.output_type == OTYPE_CONVERT); + int res = ERR_OK; + + const size_t CHUNK_SIZE = 1024; /* Process file in 1KB chunks */ + + EncodingSegment *segments = NULL; + size_t segment_count = 0; + size_t segment_capacity = 8; + + unsigned char *chunk_buffer = NULL; + size_t file_pos = 0; + + if (buffer == NULL) + buffer = buffer_new(buffer_size); + + if (!enca_charset_is_known(utf8)) { + utf8 = enca_name_to_charset("utf8"); + assert(enca_charset_is_known(utf8)); + } + + /* Open input file */ + if (fname == NULL) { + infile = stdin; + } else { + infile = fopen(fname, "rb"); + if (infile == NULL) { + fprintf(stderr, "%s: Cannot open file %s: %s\n", + program_name, fname, strerror(errno)); + return EXIT_TROUBLE; + } + } + + /* Allocate segment array and chunk buffer */ + segments = NEW(EncodingSegment, segment_capacity); + chunk_buffer = NEW(unsigned char, CHUNK_SIZE); + + if (!segments || !chunk_buffer) { + fprintf(stderr, "%s: Memory allocation failed\n", program_name); + res = EXIT_TROUBLE; + goto cleanup; + } + + if (options.verbosity_level > 1) { + fprintf(stderr, "Processing file with mixed encodings in %zu-byte chunks\n", CHUNK_SIZE); + } + + /* Process file chunk by chunk */ + while (!feof(infile)) { + size_t bytes_read; + EncaEncoding detected; + + /* Read chunk */ + bytes_read = fread(chunk_buffer, 1, CHUNK_SIZE, infile); + if (bytes_read == 0) break; + + /* Detect encoding for this chunk */ + detected = enca_analyse_const(an, chunk_buffer, bytes_read); + + /* Expand segments array if needed */ + if (segment_count >= segment_capacity) { + segment_capacity *= 2; + segments = realloc(segments, segment_capacity * sizeof(EncodingSegment)); + if (!segments) { + fprintf(stderr, "%s: Memory reallocation failed\n", program_name); + res = EXIT_TROUBLE; + goto cleanup; + } + } + + /* Add segment or merge with previous if same encoding */ + if (segment_count > 0 && + segments[segment_count - 1].encoding.charset == detected.charset && + segments[segment_count - 1].encoding.surface == detected.surface) { + /* Merge with previous segment */ + segments[segment_count - 1].length += bytes_read; + } else { + /* Create new segment */ + segments[segment_count].start = file_pos; + segments[segment_count].length = bytes_read; + segments[segment_count].encoding = detected; + segment_count++; + } + + file_pos += bytes_read; + } + + /* Print detection results */ + if (!ot_is_convert) { + if (options.prefix_filename && fname != NULL) { + printf("%s: ", fname); + } + + if (segment_count == 0) { + printf("No data processed\n"); + } else if (segment_count == 1) { + /* Only one encoding found */ + if (enca_charset_is_known(segments[0].encoding.charset)) { + print_results(fname, an, segments[0].encoding, enca_errno(an)); + } else { + printf("Single segment with unrecognized encoding\n"); + } + } else { + /* Multiple encodings found */ + printf("Mixed encodings detected (%zu segments):\n", segment_count); + for (size_t i = 0; i < segment_count; i++) { + printf(" Segment %zu (offset %zu, %zu bytes): ", + i + 1, segments[i].start, segments[i].length); + + if (enca_charset_is_known(segments[i].encoding.charset)) { + const char *enc_name = enca_charset_name(segments[i].encoding.charset, + ENCA_NAME_STYLE_HUMAN); + if (enc_name != NULL) { + printf("%s", enc_name); + } else { + printf("Known but unnamed charset %d", segments[i].encoding.charset); + } + + if (segments[i].encoding.surface) { + const char *surface_name = enca_get_surface_name(segments[i].encoding.surface, + ENCA_NAME_STYLE_HUMAN); + if (surface_name) { + printf(" (%s)", surface_name); + } + } + } else { + printf("Unrecognized encoding"); + } + printf("\n"); + } + } + } + + /* Handle conversion if requested */ + if (ot_is_convert && enca_charset_is_known(options.target_enc.charset)) { + fprintf(stderr, "Mixed encoding conversion not fully implemented yet.\n"); + fprintf(stderr, "Would convert %zu segments to %s\n", + segment_count, options.target_enc_str); + } + +cleanup: + if (infile != NULL && infile != stdin) { + fclose(infile); + } + if (outfile != NULL && outfile != stdout) { + fclose(outfile); + } + if (temp_filename != NULL) { + unlink(temp_filename); /* Remove temp file on error */ + enca_free(temp_filename); + } + if (segments != NULL) { + enca_free(segments); + } + if (chunk_buffer != NULL) { + enca_free(chunk_buffer); + } + + return res; +} + /* vim: ts=2 */ diff --git a/src/options.c b/src/options.c index 9cc1c33..c087121 100644 --- a/src/options.c +++ b/src/options.c @@ -61,6 +61,7 @@ static const Options DEFAULTS = { { ENCA_CS_UNKNOWN, 0 }, /* target_enc */ NULL, /* target_enc_str */ -1, /* prefix_filename */ + 0, /* mixed_encodings */ }; extern const char *const COPYING_text[]; @@ -209,7 +210,7 @@ interpret_opt(int argc, char *argv[], int cmdl_argc) { /* Short command line options. */ static const char *short_options = - "cC:deE:fgGhil:L:mn:pPrsvVx:"; + "cC:deE:fgGhil:L:mn:pPrsvVx:M"; /* Long `GNU style' command line options {{{. */ static const struct option long_options[] = { @@ -227,6 +228,7 @@ interpret_opt(int argc, char *argv[], int cmdl_argc) { "license", no_argument, NULL, 'G' }, { "list", required_argument, NULL, 'l' }, { "mime-name", no_argument, NULL, 'm' }, + { "mixed-encodings", no_argument, NULL, 'M' }, { "name", required_argument, NULL, 'n' }, { "no-filename", no_argument, NULL, 'P' }, { "rfc1345-name", no_argument, NULL, 'r' }, @@ -325,6 +327,10 @@ interpret_opt(int argc, char *argv[], int cmdl_argc) options.language = optarg; break; + case 'M': /* Mixed encodings mode. */ + options.mixed_encodings = 1; + break; + case 'C': /* Add converters to converter list. */ add_parsed_converters(optarg); break; From b53a65e2361d7b3388b18f922b3e01480bba01ad Mon Sep 17 00:00:00 2001 From: Egor Kovalchuk Date: Sun, 24 Aug 2025 12:10:38 +0000 Subject: [PATCH 2/3] Implement mixed encoding conversion with iconv support and enhanced verbosity logging --- src/enca.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 137 insertions(+), 1 deletion(-) diff --git a/src/enca.c b/src/enca.c index 05dbc77..ba4eacd 100644 --- a/src/enca.c +++ b/src/enca.c @@ -19,6 +19,10 @@ #include #include "common.h" +#ifdef HAVE_GOOD_ICONV +#include +#endif + /* Local prototypes. */ static int process_file (EncaAnalyser an, const char *fname); @@ -509,9 +513,141 @@ process_mixed_file(EncaAnalyser an, const char *fname) /* Handle conversion if requested */ if (ot_is_convert && enca_charset_is_known(options.target_enc.charset)) { - fprintf(stderr, "Mixed encoding conversion not fully implemented yet.\n"); +#ifdef HAVE_GOOD_ICONV + if (options.verbosity_level) { + fprintf(stderr, "%s: converting mixed encoding file `%s' (%zu segments) to %s\n", + program_name, fname ? fname : "stdin", + segment_count, options.target_enc_str); + } + + /* Create output file for conversion */ + FILE *outfile = NULL; + char *out_filename = NULL; + + if (fname) { + /* Create temporary file for output */ + out_filename = enca_malloc(strlen(fname) + 10); + sprintf(out_filename, "%s.encatmp", fname); + outfile = fopen(out_filename, "wb"); + if (!outfile) { + fprintf(stderr, "%s: cannot create temporary file %s\n", + program_name, out_filename); + enca_free(out_filename); + goto cleanup; + } + } else { + outfile = stdout; + } + + /* Read entire file into buffer for conversion */ + fseek(infile, 0, SEEK_END); + long file_size = ftell(infile); + fseek(infile, 0, SEEK_SET); + + if (file_size < 0) { + fprintf(stderr, "%s: cannot determine file size\n", program_name); + if (fname) enca_free(out_filename); + goto cleanup; + } + + char *file_data = enca_malloc(file_size + 1); + size_t total_read = fread(file_data, 1, file_size, infile); + + if (options.verbosity_level > 2) { + fprintf(stderr, "Read %zu bytes from file for conversion\n", total_read); + } + + /* Convert each segment */ + for (size_t i = 0; i < segment_count; i++) { + if (options.verbosity_level > 1) { + fprintf(stderr, " converting segment %zu: %s -> %s (%zu bytes)\n", + i + 1, + enca_charset_name(segments[i].encoding.charset, ENCA_NAME_STYLE_HUMAN), + options.target_enc_str, + segments[i].length); + } + + /* Skip if already target encoding */ + if (segments[i].encoding.charset == options.target_enc.charset && + segments[i].encoding.surface == options.target_enc.surface) { + fwrite(file_data + segments[i].start, 1, segments[i].length, outfile); + if (options.verbosity_level > 2) { + fprintf(stderr, " segment already in target encoding, copied unchanged\n"); + } + continue; + } + + /* Convert segment using iconv if possible */ + if (enca_charset_is_known(segments[i].encoding.charset)) { + const char *from_name = enca_charset_name(segments[i].encoding.charset, ENCA_NAME_STYLE_ICONV); + const char *to_name = enca_charset_name(options.target_enc.charset, ENCA_NAME_STYLE_ICONV); + + if (from_name && to_name) { + iconv_t cd = iconv_open(to_name, from_name); + if (cd != (iconv_t)-1) { + char *inbuf = (char *)(file_data + segments[i].start); + size_t inbytesleft = segments[i].length; + + /* Allocate output buffer (generous size) */ + size_t outbuf_size = segments[i].length * 4 + 1024; + char *outbuf_start = enca_malloc(outbuf_size); + char *outbuf = outbuf_start; + size_t outbytesleft = outbuf_size; + + size_t result = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); + if (result != (size_t)-1 || inbytesleft == 0) { + size_t converted_bytes = outbuf_size - outbytesleft; + fwrite(outbuf_start, 1, converted_bytes, outfile); + + if (options.verbosity_level > 2) { + fprintf(stderr, " converted %zu bytes -> %zu bytes\n", + segments[i].length, converted_bytes); + } + } else { + fprintf(stderr, "%s: conversion failed for segment %zu, copying unchanged\n", + program_name, i + 1); + fwrite(file_data + segments[i].start, 1, segments[i].length, outfile); + } + + enca_free(outbuf_start); + iconv_close(cd); + } else { + fprintf(stderr, "%s: cannot open converter from %s to %s for segment %zu\n", + program_name, from_name, to_name, i + 1); + fwrite(file_data + segments[i].start, 1, segments[i].length, outfile); + } + } else { + fprintf(stderr, "%s: no iconv name for charset in segment %zu\n", program_name, i + 1); + fwrite(file_data + segments[i].start, 1, segments[i].length, outfile); + } + } else { + /* Unknown encoding, copy as-is */ + fwrite(file_data + segments[i].start, 1, segments[i].length, outfile); + } + } + + /* Close and replace original file if successful */ + if (fname) { + fclose(outfile); + if (rename(out_filename, fname) != 0) { + fprintf(stderr, "%s: cannot replace original file %s\n", + program_name, fname); + unlink(out_filename); + } else { + if (options.verbosity_level) { + fprintf(stderr, "%s: successfully converted `%s'\n", + program_name, fname); + } + } + enca_free(out_filename); + } + + enca_free(file_data); +#else + fprintf(stderr, "%s: mixed encoding conversion requires iconv support\n", program_name); fprintf(stderr, "Would convert %zu segments to %s\n", segment_count, options.target_enc_str); +#endif } cleanup: From a23b9a32d7934635f1bfd41836d19fc9ccaf3990 Mon Sep 17 00:00:00 2001 From: Egor Kovalchuk Date: Thu, 28 Aug 2025 21:33:59 +0000 Subject: [PATCH 3/3] Add mixed encoding support with configurable options and error handling --- man/enca.1 | 61 +++++++++++++++++++- src/HELP.in | 4 ++ src/common.h | 3 + src/enca.c | 151 +++++++++++++++++++++++++++++++++++--------------- src/options.c | 24 +++++++- 5 files changed, 196 insertions(+), 47 deletions(-) diff --git a/man/enca.1 b/man/enca.1 index 53a14e3..d0d652a 100644 --- a/man/enca.1 +++ b/man/enca.1 @@ -48,6 +48,17 @@ and find out it's KOI8\-R (for example). Be warned, currently there are not many supported languages (see section \fBLANGUAGES\fR). .PP +For files containing mixed encodings (common with concatenated documents, +email archives, or multi-source data files), use the mixed encoding mode: +.XA "enca \-L pl \-M mixed_file.txt" +This will detect and report all encoding segments within the file. +To convert all segments to a uniform encoding: +.XA "enca \-L pl \-M \-x utf8 mixed_file.txt" +For finer control over segment detection: +.XA "enca \-L pl \-M \-B 256 \-I \-x utf8 mixed_file.txt" +This uses 256-byte chunks for detection and ignores conversion errors +for problematic segments. +.PP Another warning concerns the fact several Enca's features, namely its charset conversion capabilities, strongly depend on what other tools are installed on your system (see section \fBCONVERSION)\fR\-\-run @@ -88,6 +99,14 @@ piece of text/data. In case of multipart files (e.g. mailboxes), you have to use some tool knowing the structure to extract the individual parts first. It's the cost of ability to detect encodings of any damaged, incomplete or otherwise incorrect files. +.PP +However, Enca provides a mixed encoding mode (\fB\-M\fR) that can handle +files containing segments with different character encodings. +This mode analyzes files in configurable chunks and can detect multiple +encodings within a single file, making it useful for concatenated documents, +email archives, or files created by combining content from multiple sources. +When used with conversion, each segment is converted individually, +resulting in a file with uniform encoding throughout. . . .SH "OPTIONS" @@ -232,6 +251,46 @@ to get list of supported languages. When you don't specify any language Enca tries to guess your language from locale settings and assumes input files use this language. See section \fBLANGUAGES\fR for details. +.TP +\fB\-M\fR, \fB\-\-mixed\-encodings\fR +Enables mixed encoding detection and handling mode. +Instead of detecting a single predominant encoding for the entire file, +Enca will analyze the file in chunks and detect different encodings +in different segments of the file. +This is useful for files that contain text from multiple sources +with different character encodings, such as concatenated documents, +email archives, or data files with mixed content. +.sp +When this option is used, Enca will report all detected encoding segments +with their byte offsets and lengths. If used with conversion (\fB\-x\fR), +each segment will be converted individually from its detected encoding +to the target encoding. +.sp +The granularity of detection can be controlled with \fB\-B\fR option. +.TP +\fB\-B\fR, \fB\-\-mixed\-buffer\-size=\fR\fISIZE\fR +Sets the buffer size in bytes for mixed encoding detection when +\fB\-M\fR option is used. +.sp +Default buffer size is 1024 bytes. Smaller values (e.g., 256) provide +finer granularity and may detect more encoding segments, while larger +values (e.g., 4096) provide coarser granularity and better performance. +Valid range is 1\-1048576 bytes. +.sp +This option has no effect unless \fB\-M\fR is also specified. +.TP +\fB\-I\fR, \fB\-\-mixed\-ignore\-errors\fR +Enables graceful error handling in mixed encoding mode. +When conversion fails for a particular segment due to unknown encoding +or conversion errors, the problematic segment will be handled using +the predominant encoding detected in the file, or copied unchanged +if no suitable fallback is available. +.sp +This option is particularly useful when processing files with +segments of unknown or corrupted encodings, allowing the conversion +to continue rather than failing completely. +.sp +This option has no effect unless \fB\-M\fR is also specified. .PP . .SS Conversion parameters @@ -672,7 +731,6 @@ Belarusian @CP1251 IBM866 ISO\-8859\-5 KOI8\-UNI maccyr IBM855 Bulgarian @CP1251 ISO\-8859\-5 IBM855 maccyr ECMA\-113 Czech @ISO\-8859\-2 CP1250 IBM852 KEYBCS2 macce KOI\-8_CS_2 CORK Estonian @ISO\-8859\-4 CP1257 IBM775 ISO\-8859\-13 macce baltic -Finnish @ISO\-8859\-4 CP1257 Croatian @CP1250 ISO\-8859\-2 IBM852 macce CORK Hungarian @ISO\-8859\-2 CP1250 IBM852 macce CORK Lithuanian @CP1257 ISO\-8859\-4 IBM775 ISO\-8859\-13 macce baltic @@ -697,7 +755,6 @@ Belarusian @be Bulgarian @bg Czech @cs Estonian @et -Finnish @fi Croatian @hr Hungarian @hu Lithuanian @lt diff --git a/src/HELP.in b/src/HELP.in index a35a28c..9e7ccee 100644 --- a/src/HELP.in +++ b/src/HELP.in @@ -21,6 +21,10 @@ Guessing parameters: -L, --language=LANG Set language of FILEs; obligatory, when cannot be determined from locale settings -M, --mixed-encodings Handle files with mixed encodings (chunk-based detection) + -B, --mixed-buffer-size=SIZE Set buffer size for mixed encoding detection + (default: 1024 bytes, range: 1-1048576) + -I, --mixed-ignore-errors Ignore conversion errors in mixed mode, copy + problematic segments unchanged Conversion parameters: -E, --external-converter-program=PATH diff --git a/src/common.h b/src/common.h index b6a2b43..cc83bba 100644 --- a/src/common.h +++ b/src/common.h @@ -143,6 +143,9 @@ struct _Options { char *target_enc_str; /* How user specified the target encoding. */ int prefix_filename; /* Do prepend filename: before results? */ int mixed_encodings; /* Handle files with mixed encodings? */ + int mixed_buffer_size; /* Buffer size for mixed encoding detection (bytes). */ + int mixed_ignore_errors; /* Ignore conversion errors in mixed mode? */ + int mixed_use_predominant; /* Use predominant encoding for unknown segments? */ }; /* Enca options. */ diff --git a/src/enca.c b/src/enca.c index ba4eacd..9ca57f5 100644 --- a/src/enca.c +++ b/src/enca.c @@ -382,7 +382,7 @@ process_mixed_file(EncaAnalyser an, const char *fname) int ot_is_convert = (options.output_type == OTYPE_CONVERT); int res = ERR_OK; - const size_t CHUNK_SIZE = 1024; /* Process file in 1KB chunks */ + const size_t CHUNK_SIZE = options.mixed_buffer_size; /* Use user-configurable buffer size */ EncodingSegment *segments = NULL; size_t segment_count = 0; @@ -391,6 +391,10 @@ process_mixed_file(EncaAnalyser an, const char *fname) unsigned char *chunk_buffer = NULL; size_t file_pos = 0; + /* Statistics for predominant encoding detection */ + EncaEncoding predominant_encoding = { ENCA_CS_UNKNOWN, 0 }; + size_t predominant_bytes = 0; + if (buffer == NULL) buffer = buffer_new(buffer_size); @@ -461,6 +465,24 @@ process_mixed_file(EncaAnalyser an, const char *fname) segments[segment_count].encoding = detected; segment_count++; } + + /* Track predominant encoding (most bytes) */ + if (enca_charset_is_known(detected.charset)) { + size_t current_total = 0; + + /* Calculate total bytes for this encoding across all segments */ + for (size_t i = 0; i < segment_count; i++) { + if (segments[i].encoding.charset == detected.charset && + segments[i].encoding.surface == detected.surface) { + current_total += segments[i].length; + } + } + + if (current_total > predominant_bytes) { + predominant_encoding = detected; + predominant_bytes = current_total; + } + } file_pos += bytes_read; } @@ -513,11 +535,17 @@ process_mixed_file(EncaAnalyser an, const char *fname) /* Handle conversion if requested */ if (ot_is_convert && enca_charset_is_known(options.target_enc.charset)) { -#ifdef HAVE_GOOD_ICONV if (options.verbosity_level) { fprintf(stderr, "%s: converting mixed encoding file `%s' (%zu segments) to %s\n", program_name, fname ? fname : "stdin", segment_count, options.target_enc_str); + + if (enca_charset_is_known(predominant_encoding.charset)) { + fprintf(stderr, "%s: predominant encoding: %s (%zu bytes)\n", + program_name, + enca_charset_name(predominant_encoding.charset, ENCA_NAME_STYLE_HUMAN), + predominant_bytes); + } } /* Create output file for conversion */ @@ -557,7 +585,7 @@ process_mixed_file(EncaAnalyser an, const char *fname) fprintf(stderr, "Read %zu bytes from file for conversion\n", total_read); } - /* Convert each segment */ + /* Convert each segment using proper converter system */ for (size_t i = 0; i < segment_count; i++) { if (options.verbosity_level > 1) { fprintf(stderr, " converting segment %zu: %s -> %s (%zu bytes)\n", @@ -577,53 +605,93 @@ process_mixed_file(EncaAnalyser an, const char *fname) continue; } - /* Convert segment using iconv if possible */ - if (enca_charset_is_known(segments[i].encoding.charset)) { - const char *from_name = enca_charset_name(segments[i].encoding.charset, ENCA_NAME_STYLE_ICONV); - const char *to_name = enca_charset_name(options.target_enc.charset, ENCA_NAME_STYLE_ICONV); + /* Handle unknown encodings */ + if (!enca_charset_is_known(segments[i].encoding.charset)) { + if (options.mixed_ignore_errors) { + if (enca_charset_is_known(predominant_encoding.charset)) { + /* Use predominant encoding for unknown segments */ + segments[i].encoding = predominant_encoding; + if (options.verbosity_level > 1) { + fprintf(stderr, " unknown encoding, using predominant: %s\n", + enca_charset_name(predominant_encoding.charset, ENCA_NAME_STYLE_HUMAN)); + } + } else { + /* Copy unchanged */ + fwrite(file_data + segments[i].start, 1, segments[i].length, outfile); + if (options.verbosity_level > 1) { + fprintf(stderr, " unknown encoding, copying unchanged\n"); + } + continue; + } + } else { + /* Copy unchanged if not ignoring errors */ + fwrite(file_data + segments[i].start, 1, segments[i].length, outfile); + if (options.verbosity_level > 1) { + fprintf(stderr, " unknown encoding, copying unchanged\n"); + } + continue; + } + } + + /* Create temporary file for this segment */ + char temp_segment_name[256]; + sprintf(temp_segment_name, "/tmp/enca_segment_%zu_%d", i, getpid()); + + FILE *temp_segment = fopen(temp_segment_name, "wb"); + if (!temp_segment) { + fprintf(stderr, "%s: cannot create temporary segment file\n", program_name); + fwrite(file_data + segments[i].start, 1, segments[i].length, outfile); + continue; + } + + /* Write segment data to temporary file */ + fwrite(file_data + segments[i].start, 1, segments[i].length, temp_segment); + fclose(temp_segment); + + /* Use existing conversion system */ + File *temp_file = file_new(temp_segment_name, buffer); + if (temp_file && file_open(temp_file, "r+b") == 0) { + int conv_result = convert(temp_file, segments[i].encoding); - if (from_name && to_name) { - iconv_t cd = iconv_open(to_name, from_name); - if (cd != (iconv_t)-1) { - char *inbuf = (char *)(file_data + segments[i].start); - size_t inbytesleft = segments[i].length; - - /* Allocate output buffer (generous size) */ - size_t outbuf_size = segments[i].length * 4 + 1024; - char *outbuf_start = enca_malloc(outbuf_size); - char *outbuf = outbuf_start; - size_t outbytesleft = outbuf_size; - - size_t result = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); - if (result != (size_t)-1 || inbytesleft == 0) { - size_t converted_bytes = outbuf_size - outbytesleft; - fwrite(outbuf_start, 1, converted_bytes, outfile); - - if (options.verbosity_level > 2) { - fprintf(stderr, " converted %zu bytes -> %zu bytes\n", - segments[i].length, converted_bytes); - } - } else { - fprintf(stderr, "%s: conversion failed for segment %zu, copying unchanged\n", - program_name, i + 1); - fwrite(file_data + segments[i].start, 1, segments[i].length, outfile); + if (conv_result == ERR_OK) { + /* Read converted data back */ + FILE *converted = fopen(temp_segment_name, "rb"); + if (converted) { + char copy_buffer[4096]; + size_t copied; + while ((copied = fread(copy_buffer, 1, sizeof(copy_buffer), converted)) > 0) { + fwrite(copy_buffer, 1, copied, outfile); } + fclose(converted); - enca_free(outbuf_start); - iconv_close(cd); + if (options.verbosity_level > 2) { + fprintf(stderr, " segment converted successfully\n"); + } } else { - fprintf(stderr, "%s: cannot open converter from %s to %s for segment %zu\n", - program_name, from_name, to_name, i + 1); + fprintf(stderr, "%s: cannot read converted segment\n", program_name); fwrite(file_data + segments[i].start, 1, segments[i].length, outfile); } } else { - fprintf(stderr, "%s: no iconv name for charset in segment %zu\n", program_name, i + 1); - fwrite(file_data + segments[i].start, 1, segments[i].length, outfile); + if (options.mixed_ignore_errors) { + if (options.verbosity_level > 1) { + fprintf(stderr, " conversion failed, copying unchanged (ignore errors mode)\n"); + } + fwrite(file_data + segments[i].start, 1, segments[i].length, outfile); + } else { + fprintf(stderr, "%s: conversion failed for segment %zu\n", program_name, i + 1); + fwrite(file_data + segments[i].start, 1, segments[i].length, outfile); + } } + + file_close(temp_file); + file_free(temp_file); } else { - /* Unknown encoding, copy as-is */ + fprintf(stderr, "%s: cannot create file object for segment conversion\n", program_name); fwrite(file_data + segments[i].start, 1, segments[i].length, outfile); } + + /* Clean up temporary segment file */ + unlink(temp_segment_name); } /* Close and replace original file if successful */ @@ -643,11 +711,6 @@ process_mixed_file(EncaAnalyser an, const char *fname) } enca_free(file_data); -#else - fprintf(stderr, "%s: mixed encoding conversion requires iconv support\n", program_name); - fprintf(stderr, "Would convert %zu segments to %s\n", - segment_count, options.target_enc_str); -#endif } cleanup: diff --git a/src/options.c b/src/options.c index c087121..4b160f8 100644 --- a/src/options.c +++ b/src/options.c @@ -62,6 +62,9 @@ static const Options DEFAULTS = { NULL, /* target_enc_str */ -1, /* prefix_filename */ 0, /* mixed_encodings */ + 1024, /* mixed_buffer_size */ + 0, /* mixed_ignore_errors */ + 0, /* mixed_use_predominant */ }; extern const char *const COPYING_text[]; @@ -210,7 +213,7 @@ interpret_opt(int argc, char *argv[], int cmdl_argc) { /* Short command line options. */ static const char *short_options = - "cC:deE:fgGhil:L:mn:pPrsvVx:M"; + "cC:deE:fgGhil:L:mn:pPrsvVx:MB:I"; /* Long `GNU style' command line options {{{. */ static const struct option long_options[] = { @@ -229,6 +232,8 @@ interpret_opt(int argc, char *argv[], int cmdl_argc) { "list", required_argument, NULL, 'l' }, { "mime-name", no_argument, NULL, 'm' }, { "mixed-encodings", no_argument, NULL, 'M' }, + { "mixed-buffer-size", required_argument, NULL, 'B' }, + { "mixed-ignore-errors", no_argument, NULL, 'I' }, { "name", required_argument, NULL, 'n' }, { "no-filename", no_argument, NULL, 'P' }, { "rfc1345-name", no_argument, NULL, 'r' }, @@ -331,6 +336,23 @@ interpret_opt(int argc, char *argv[], int cmdl_argc) options.mixed_encodings = 1; break; + case 'B': /* Mixed encoding buffer size. */ + { + char *endptr; + long size = strtol(optarg, &endptr, 10); + if (*endptr != '\0' || size <= 0 || size > 1048576) { + fprintf(stderr, "%s: invalid buffer size `%s' (must be 1-1048576)\n", + program_name, optarg); + exit(EXIT_FAILURE); + } + options.mixed_buffer_size = (int)size; + } + break; + + case 'I': /* Mixed encoding ignore errors. */ + options.mixed_ignore_errors = 1; + break; + case 'C': /* Add converters to converter list. */ add_parsed_converters(optarg); break;