From 402d10b1a784fd3268267fa4ead449ec36d66b6b Mon Sep 17 00:00:00 2001 From: Egor Kovalchuk Date: Mon, 21 Jul 2025 21:07:18 +0000 Subject: [PATCH 01/11] Add Finnish language support and encoding data - Introduced Finnish language support in the language detection library. - Added encoding data for ISO 8859-4 (ISO 88594) specific to Finnish. - Created necessary files for Finnish language handling, including `finnish.h`, `lang_fi.c`, and associated data files. - Updated language list to include Finnish and its corresponding character sets. - Modified locale detection logic to accommodate Finnish language detection. - Ensured proper memory management and data structures for Finnish language support. --- DEVELOP.md | 2 +- data/Makefile.am | 4 + data/finnish/doit.sh | 2 + data/finnish/finnish.h | 94 ++++++++++++++ data/finnish/iso88594.base | 131 ++++++++++++++++++++ data/finnish/rawcounts.iso88594 | 132 ++++++++++++++++++++ devel-docs/libenca-decl-list.txt | 1 + devel-docs/libenca-decl.txt | 4 + devel-docs/libenca-sections.txt | 1 + devel-docs/tmpl/internal.sgml | 4 + devel-docs/xml/api-index-full.xml | 1 + devel-docs/xml/internal.xml | 8 ++ lib/Makefile.am | 1 + lib/internal.h | 187 +++++++++++++++------------- lib/lang.c | 132 ++++++++++---------- lib/lang_fi.c | 51 ++++++++ src/locale_detect.c | 197 ++++++++++++++++-------------- test/simtable.c | 15 ++- 18 files changed, 718 insertions(+), 249 deletions(-) create mode 100644 data/finnish/doit.sh create mode 100644 data/finnish/finnish.h create mode 100644 data/finnish/iso88594.base create mode 100644 data/finnish/rawcounts.iso88594 create mode 100644 lib/lang_fi.c diff --git a/DEVELOP.md b/DEVELOP.md index e727e43..b08f84c 100644 --- a/DEVELOP.md +++ b/DEVELOP.md @@ -70,7 +70,7 @@ Specifically, for multibyte encodings: existing languages in `data/*` and read `data/README`. 
* `lib/internal.h`: * Add new `ENCA_LANGUAGE_....` -* `src/lang.c`: +* `lib/lang.c`: * Add a new `LANGUAGE_LIST[]` entry pointing to the `ENCA_LANGUAGE_....` diff --git a/data/Makefile.am b/data/Makefile.am index 14e852b..bd8847e 100644 --- a/data/Makefile.am +++ b/data/Makefile.am @@ -16,6 +16,7 @@ noinst_HEADERS = \ croatian/croatian.h \ czech/czech.h \ estonian/estonian.h \ + finnish/finnish.h \ hungarian/hungarian.h \ latvian/latvian.h \ lithuanian/lithuanian.h \ @@ -46,6 +47,7 @@ noinst_SCRPITS = \ croatian/doit.sh \ czech/doit.sh \ estonian/doit.sh \ + finnish/doit.sh \ hungarian/doit.sh \ latvian/doit.sh \ lithuanian/doit.sh \ @@ -61,6 +63,7 @@ BASES = \ croatian/cp1250.base \ czech/iso88592.base \ estonian/iso88594.base \ + finnish/iso88594.base \ hungarian/iso88592.base \ russian/koi8r.base \ latvian/cp1257.base \ @@ -76,6 +79,7 @@ RAWCOUNTS = \ croatian/rawcounts.cp1250 \ czech/rawcounts.iso88592 \ estonian/rawcounts.iso88594 \ + finnish/rawcounts.iso88594 \ hungarian/rawcounts.iso88592 \ latvian/rawcounts.cp1257 \ lithuanian/rawcounts.cp1257 \ diff --git a/data/finnish/doit.sh b/data/finnish/doit.sh new file mode 100644 index 0000000..26649b3 --- /dev/null +++ b/data/finnish/doit.sh @@ -0,0 +1,2 @@ +#! /bin/bash +../doit.sh iso88594 \ No newline at end of file diff --git a/data/finnish/finnish.h b/data/finnish/finnish.h new file mode 100644 index 0000000..3d8914b --- /dev/null +++ b/data/finnish/finnish.h @@ -0,0 +1,94 @@ +/***** THIS IS A GENERATED FILE. DO NOT TOUCH! *****/ +/* THIS IS A GENERATED TABLE, see data/basetoc.c. 
*/ +static const unsigned short int RAW_ISO88594[] = { + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x08 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x18 */ + 0, 0, 3, 0, 1, 0, 0, 0, /* 0x20 */ + 5, 5, 1, 0, 147, 46, 237, 0, /* 0x28 */ + 33, 23, 26, 9, 7, 9, 6, 7, /* 0x30 */ + 6, 7, 11, 0, 1, 0, 0, 3, /* 0x38 */ + 0, 21, 8, 5, 4, 22, 5, 3, /* 0x40 */ + 29, 10, 18, 47, 25, 36, 19, 15, /* 0x48 */ + 33, 0, 16, 58, 40, 15, 34, 2, /* 0x50 */ + 0, 11, 1, 0, 0, 0, 0, 0, /* 0x58 */ + 1, 2403, 13, 9, 173, 1560, 11, 34, /* 0x60 */ + 337, 2059, 366, 994, 1114, 578, 1677, 1106, /* 0x68 */ + 336, 0, 477, 1475, 1859, 1022, 454, 3, /* 0x70 */ + 1, 342, 2, 0, 0, 0, 0, 0, /* 0x78 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x88 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x98 */ + 0, 0, 0, 0, 1, 0, 0, 1, /* 0xa0 */ + 1, 1, 0, 0, 0, 0, 0, 0, /* 0xa8 */ + 1, 1, 0, 0, 0, 0, 1, 0, /* 0xb0 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xb8 */ + 0, 1, 0, 0, 0, 0, 1, 0, /* 0xc0 */ + 1, 1, 0, 0, 0, 0, 0, 0, /* 0xc8 */ + 0, 0, 0, 1, 0, 1, 0, 0, /* 0xd0 */ + 1, 0, 0, 1, 0, 0, 0, 1, /* 0xd8 */ + 0, 0, 1, 1, 809, 0, 1, 0, /* 0xe0 */ + 1, 0, 0, 1, 1, 0, 1, 1, /* 0xe8 */ + 0, 1, 1, 0, 1, 0, 110, 0, /* 0xf0 */ + 0, 0, 1, 1, 0, 0, 1, 0, /* 0xf8 */ +}; + +/* THIS IS A GENERATED TABLE, see data/totals.pl. 
*/ +static const unsigned short int SIGNIFICANT[] = { + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x08 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x18 */ + 0, 0, 3, 0, 1, 0, 0, 0, /* 0x20 */ + 5, 5, 1, 0, 147, 46, 237, 0, /* 0x28 */ + 33, 23, 26, 9, 7, 9, 6, 7, /* 0x30 */ + 6, 7, 11, 0, 1, 0, 0, 3, /* 0x38 */ + 0, 21, 8, 5, 4, 22, 5, 3, /* 0x40 */ + 29, 10, 18, 47, 25, 36, 19, 15, /* 0x48 */ + 33, 0, 16, 58, 40, 15, 34, 2, /* 0x50 */ + 0, 11, 1, 0, 0, 0, 0, 0, /* 0x58 */ + 1, 2403, 13, 9, 173, 1560, 11, 34, /* 0x60 */ + 337, 2059, 366, 994, 1114, 578, 1677, 1106, /* 0x68 */ + 336, 0, 477, 1475, 1859, 1022, 454, 3, /* 0x70 */ + 1, 342, 2, 0, 0, 0, 0, 0, /* 0x78 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x88 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x98 */ + 0, 0, 0, 0, 1, 0, 0, 1, /* 0xa0 */ + 1, 1, 0, 0, 0, 0, 0, 0, /* 0xa8 */ + 1, 1, 0, 0, 0, 0, 1, 0, /* 0xb0 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xb8 */ + 0, 1, 0, 0, 0, 0, 1, 0, /* 0xc0 */ + 1, 1, 0, 0, 0, 0, 0, 0, /* 0xc8 */ + 0, 0, 0, 1, 0, 1, 0, 0, /* 0xd0 */ + 1, 0, 0, 1, 0, 0, 0, 1, /* 0xd8 */ + 0, 0, 1, 1, 809, 0, 1, 0, /* 0xe0 */ + 1, 0, 0, 1, 1, 0, 1, 1, /* 0xe8 */ + 0, 1, 1, 0, 1, 0, 110, 0, /* 0xf0 */ + 0, 0, 1, 1, 0, 0, 1, 0, /* 0xf8 */ +}; + +/* THIS IS A GENERATED VALUE, see data/totals.pl */ +#define WEIGHT_SUM 20425 + +/* THIS IS A GENERATED TABLE, see data/totals.pl */ +static const char *const CHARSET_NAMES[] = { + "iso88594", +}; + +/* THIS IS A GENERATED TABLE, see data/totals.pl */ +static const unsigned short int *const CHARSET_WEIGHTS[] = { + RAW_ISO88594, +}; + +/* THIS IS A GENERATED VALUE, see data/totals.pl */ +#define CHARSET_LETTERS NULL + +/* THIS IS A GENERATED VALUE, see data/totals.pl */ +#define CHARSET_PAIRS NULL + +/* THIS IS A GENERATED VALUE, see data/totals.pl */ +#define NCHARSETS 1 diff --git a/data/finnish/iso88594.base b/data/finnish/iso88594.base new file mode 100644 index 
0000000..3f5cd5a --- /dev/null +++ b/data/finnish/iso88594.base @@ -0,0 +1,131 @@ +! 0 +" 3 +$ 1 +% 0 +& 0 +' 0 +( 5 +) 5 +* 1 ++ 0 +, 147 +- 46 +. 237 +/ 0 +0 33 +1 23 +2 26 +3 9 +4 7 +5 9 +6 6 +7 7 +8 6 +9 7 +: 11 +; 0 +< 1 +> 0 +? 3 +A 21 +B 8 +C 5 +D 4 +E 22 +F 5 +G 3 +H 29 +I 10 +J 18 +K 47 +L 25 +M 36 +N 19 +O 15 +P 33 +Q 0 +R 16 +S 58 +T 40 +U 15 +V 34 +W 2 +X 0 +Y 11 +Z 1 +` 1 +a 2403 +b 13 +c 9 +d 173 +e 1560 +f 11 +g 34 +h 337 +i 2059 +j 366 +k 994 +l 1114 +m 578 +n 1677 +o 1106 +p 336 +q 0 +r 477 +s 1475 +t 1859 +u 1022 +v 454 +w 3 +x 1 +y 342 +z 2 +� 1 +� 1 +� 1 +� 1 +� 0 +� 1 +� 1 +� 0 +� 1 +� 0 +� 0 +� 1 +� 0 +� 0 +� 1 +� 1 +� 1 +� 1 +� 1 +� 0 +� 0 +� 1 +� 1 +� 0 +� 1 +� 0 +� 1 +� 1 +� 809 +� 0 +� 1 +� 1 +� 0 +� 1 +� 1 +� 0 +� 1 +� 1 +� 0 +� 1 +� 1 +� 1 +� 0 +� 110 +� 0 +� 1 +� 1 +� 0 +� 1 diff --git a/data/finnish/rawcounts.iso88594 b/data/finnish/rawcounts.iso88594 new file mode 100644 index 0000000..30f0326 --- /dev/null +++ b/data/finnish/rawcounts.iso88594 @@ -0,0 +1,132 @@ +0x20 10528250 +0x21 ! 2813 +0x22 " 15298 +0x24 $ 29 +0x25 % 780 +0x26 & 805 +0x27 ' 686 +0x28 ( 20788 +0x29 ) 20496 +0x2a * 26 +0x2b + 1247 +0x2c , 594589 +0x2d - 185156 +0x2e . 957006 +0x2f / 2643 +0x30 0 133109 +0x31 1 94834 +0x32 2 105303 +0x33 3 39179 +0x34 4 30079 +0x35 5 37170 +0x36 6 24389 +0x37 7 24136 +0x38 8 24632 +0x39 9 31908 +0x3a : 47915 +0x3b ; 739 +0x3c < 3 +0x3e > 59 +0x3f ? 
14153 +0x41 A 85373 +0x42 B 34129 +0x43 C 22131 +0x44 D 17099 +0x45 E 88993 +0x46 F 23766 +0x47 G 15482 +0x48 H 117602 +0x49 I 42377 +0x4a J 74860 +0x4b K 192647 +0x4c L 101390 +0x4d M 146307 +0x4e N 79857 +0x4f O 61092 +0x50 P 133127 +0x51 Q 1494 +0x52 R 67132 +0x53 S 233652 +0x54 T 161743 +0x55 U 62303 +0x56 V 137766 +0x57 W 12058 +0x58 X 1665 +0x59 Y 45403 +0x5a Z 4648 +0x60 ` 28 +0x61 a 9662810 +0x62 b 53597 +0x63 c 39673 +0x64 d 695765 +0x65 e 6273896 +0x66 f 45149 +0x67 g 139320 +0x68 h 1356039 +0x69 i 8282073 +0x6a j 1472743 +0x6b k 3999670 +0x6c l 4482255 +0x6d m 2326017 +0x6e n 6744098 +0x6f o 4449343 +0x70 p 1351313 +0x71 q 1897 +0x72 r 1919721 +0x73 s 5932172 +0x74 t 7477814 +0x75 u 4111589 +0x76 v 1828014 +0x77 w 14187 +0x78 x 5852 +0x79 y 1377345 +0x7a z 11311 +0xa4 � 3 +0xa7 � 34 +0xa8 � 3 +0xa9 � 50 +0xae � 53 +0xb0 � 9 +0xb1 � 1 +0xb4 � 86 +0xb6 � 5 +0xb9 � 998 +0xbe � 202 +0xc1 � 11 +0xc4 � 2565 +0xc5 � 305 +0xc6 � 1 +0xc8 � 4 +0xc9 � 7 +0xd3 � 1 +0xd5 � 7 +0xd6 � 677 +0xd7 � 59 +0xd8 � 11 +0xdb � 1 +0xdc � 59 +0xdf � 4 +0xe1 � 431 +0xe2 � 7 +0xe3 � 22 +0xe4 � 3253874 +0xe5 � 657 +0xe6 � 14 +0xe8 � 46 +0xe9 � 1938 +0xeb � 31 +0xec � 28 +0xed � 81 +0xee � 3 +0xef � 1 +0xf0 � 288 +0xf1 � 1 +0xf2 � 2 +0xf4 � 2 +0xf5 � 92 +0xf6 � 443467 +0xf8 � 120 +0xfa � 17 +0xfb � 2 +0xfc � 1134 +0xfe � 3 diff --git a/devel-docs/libenca-decl-list.txt b/devel-docs/libenca-decl-list.txt index 8fbefa1..a641762 100644 --- a/devel-docs/libenca-decl-list.txt +++ b/devel-docs/libenca-decl-list.txt @@ -75,6 +75,7 @@ ENCA_LANGUAGE_BE ENCA_LANGUAGE_BG ENCA_LANGUAGE_CS ENCA_LANGUAGE_ET +ENCA_LANGUAGE_FI ENCA_LANGUAGE_HR ENCA_LANGUAGE_HU ENCA_LANGUAGE_LT diff --git a/devel-docs/libenca-decl.txt b/devel-docs/libenca-decl.txt index 12e7393..8ab0aaa 100644 --- a/devel-docs/libenca-decl.txt +++ b/devel-docs/libenca-decl.txt @@ -438,6 +438,10 @@ extern const EncaLanguageInfo ENCA_LANGUAGE_CS; extern const EncaLanguageInfo ENCA_LANGUAGE_ET; +ENCA_LANGUAGE_FI +extern const 
EncaLanguageInfo ENCA_LANGUAGE_FI; + + ENCA_LANGUAGE_HR extern const EncaLanguageInfo ENCA_LANGUAGE_HR; diff --git a/devel-docs/libenca-sections.txt b/devel-docs/libenca-sections.txt index 390a224..ca93652 100644 --- a/devel-docs/libenca-sections.txt +++ b/devel-docs/libenca-sections.txt @@ -119,6 +119,7 @@ ENCA_LANGUAGE_BE ENCA_LANGUAGE_BG ENCA_LANGUAGE_CS ENCA_LANGUAGE_ET +ENCA_LANGUAGE_FI ENCA_LANGUAGE_HR ENCA_LANGUAGE_HU ENCA_LANGUAGE_LT diff --git a/devel-docs/tmpl/internal.sgml b/devel-docs/tmpl/internal.sgml index 3b87cee..448f158 100644 --- a/devel-docs/tmpl/internal.sgml +++ b/devel-docs/tmpl/internal.sgml @@ -438,6 +438,10 @@ Do not use outside Enca library. + + + + diff --git a/devel-docs/xml/api-index-full.xml b/devel-docs/xml/api-index-full.xml index 1c9fa72..aa69d35 100644 --- a/devel-docs/xml/api-index-full.xml +++ b/devel-docs/xml/api-index-full.xml @@ -50,6 +50,7 @@ ENCA_LANGUAGE_CS, variable in Internal Functions enca_language_destroy, function in Internal Functions ENCA_LANGUAGE_ET, variable in Internal Functions +ENCA_LANGUAGE_FI, variable in Internal Functions enca_language_hook_eol, function in Internal Functions enca_language_hook_ncs, function in Internal Functions ENCA_LANGUAGE_HR, variable in Internal Functions diff --git a/devel-docs/xml/internal.xml b/devel-docs/xml/internal.xml index a6526f5..827f33e 100644 --- a/devel-docs/xml/internal.xml +++ b/devel-docs/xml/internal.xml @@ -1511,6 +1511,14 @@ UTF-8, negative doubly-encoded. Estonian language. Everything the world out there needs to know about this language. + +ENCA_LANGUAGE_FI +ENCA_LANGUAGE_FI +extern const EncaLanguageInfo ENCA_LANGUAGE_FI; + +Finnish language. +Everything the world out there needs to know about this language. 
+ ENCA_LANGUAGE_HR ENCA_LANGUAGE_HR diff --git a/lib/Makefile.am b/lib/Makefile.am index 9771adb..b7b5c43 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -21,6 +21,7 @@ libenca_la_SOURCES = \ lang_bg.c \ lang_cs.c \ lang_et.c \ + lang_fi.c \ lang_hr.c \ lang_hu.c \ lang_lt.c \ diff --git a/lib/internal.h b/lib/internal.h index 1330b9f..8ba02db 100644 --- a/lib/internal.h +++ b/lib/internal.h @@ -15,38 +15,39 @@ /* str- an mem- function, theoretically they are all in string.h */ #ifdef HAVE_STRING_H -# include +#include #else /* HAVE_STRING_H */ -# ifdef HAVE_STRINGS_H -# include -# endif /* HAVE_STRINGS_H */ +#ifdef HAVE_STRINGS_H +#include +#endif /* HAVE_STRINGS_H */ #endif /* HAVE_STRING_H */ #ifdef HAVE_MEMORY_H -# include +#include #endif /* HAVE_MEMORY_H */ #ifdef DEBUG -# include +#include #endif /* DEBUG */ /* Flags for character type table. * 0-10 are standard ones, 11-13 Enca-specific. */ -enum { - ENCA_CTYPE_ALNUM = 1 << 0, - ENCA_CTYPE_ALPHA = 1 << 1, - ENCA_CTYPE_CNTRL = 1 << 2, - ENCA_CTYPE_DIGIT = 1 << 3, - ENCA_CTYPE_GRAPH = 1 << 4, - ENCA_CTYPE_LOWER = 1 << 5, - ENCA_CTYPE_PRINT = 1 << 6, - ENCA_CTYPE_PUNCT = 1 << 7, - ENCA_CTYPE_SPACE = 1 << 8, - ENCA_CTYPE_UPPER = 1 << 9, +enum +{ + ENCA_CTYPE_ALNUM = 1 << 0, + ENCA_CTYPE_ALPHA = 1 << 1, + ENCA_CTYPE_CNTRL = 1 << 2, + ENCA_CTYPE_DIGIT = 1 << 3, + ENCA_CTYPE_GRAPH = 1 << 4, + ENCA_CTYPE_LOWER = 1 << 5, + ENCA_CTYPE_PRINT = 1 << 6, + ENCA_CTYPE_PUNCT = 1 << 7, + ENCA_CTYPE_SPACE = 1 << 8, + ENCA_CTYPE_UPPER = 1 << 9, ENCA_CTYPE_XDIGIT = 1 << 10, - ENCA_CTYPE_NAME = 1 << 11, + ENCA_CTYPE_NAME = 1 << 11, ENCA_CTYPE_BINARY = 1 << 12, - ENCA_CTYPE_TEXT = 1 << 13 + ENCA_CTYPE_TEXT = 1 << 13 }; /* Forward delcarations of structured Enca types */ @@ -74,7 +75,8 @@ typedef struct _EncaUTFCheckData EncaUTFCheckData; * * All the #int fields are indices in #ALIAS_LIST[]. 
**/ -struct _EncaCharsetInfo { +struct _EncaCharsetInfo +{ int enca; int rfc1345; int cstocs; @@ -96,7 +98,7 @@ struct _EncaCharsetInfo { * Returns: Nonzero if charset ratigns have been actually modified, zero * otherwise. **/ -typedef int (* EncaHookFunc)(EncaAnalyserState *analyser); +typedef int (*EncaHookFunc)(EncaAnalyserState *analyser); /** * EncaGuessFunc: @@ -106,7 +108,7 @@ typedef int (* EncaHookFunc)(EncaAnalyserState *analyser); * * Returns: Nonzero if analyser->result has been set, zero otherwise. **/ -typedef int (* EncaGuessFunc)(EncaAnalyserState *analyser); +typedef int (*EncaGuessFunc)(EncaAnalyserState *analyser); /** * EncaLanguageInfo: @@ -126,7 +128,8 @@ typedef int (* EncaGuessFunc)(EncaAnalyserState *analyser); * * Language specific data. **/ -struct _EncaLanguageInfo { +struct _EncaLanguageInfo +{ const char *name; const char *humanname; size_t ncharsets; @@ -157,7 +160,8 @@ struct _EncaLanguageInfo { * * Analyser options, a part of analyser state. **/ -struct _EncaAnalyserOptions { +struct _EncaAnalyserOptions +{ int const_buffer; size_t min_chars; double threshold; @@ -210,7 +214,8 @@ struct _EncaAnalyserOptions { * * Passed as an opaque object (`this') to analyser calls. **/ -struct _EncaAnalyserState { +struct _EncaAnalyserState +{ /* Language data. */ const EncaLanguageInfo *lang; size_t ncharsets; @@ -251,7 +256,8 @@ struct _EncaAnalyserState { * * Cointainer for data needed by enca_language_hook_ncs(). **/ -struct _EncaLanguageHookData1CS { +struct _EncaLanguageHookData1CS +{ const char *name; size_t size; const unsigned char *list; @@ -267,7 +273,8 @@ struct _EncaLanguageHookData1CS { * * Cointainer for data needed by enca_language_hook_eol(). **/ -struct _EncaLanguageHookDataEOL { +struct _EncaLanguageHookDataEOL +{ const char *name; EncaSurface eol; size_t cs; @@ -285,7 +292,8 @@ struct _EncaLanguageHookDataEOL { * * Data needed by double-UTF-8 check, per language charset. 
**/ -struct _EncaUTFCheckData { +struct _EncaUTFCheckData +{ double rating; size_t size; int result; @@ -331,20 +339,20 @@ struct _EncaUTFCheckData { **/ #define enca_ctype_test(c, t) ((enca_ctype_data[(unsigned char)c] & t) != 0) -#define enca_isalnum(c) enca_ctype_test((c), ENCA_CTYPE_ALNUM) -#define enca_isalpha(c) enca_ctype_test((c), ENCA_CTYPE_ALPHA) -#define enca_iscntrl(c) enca_ctype_test((c), ENCA_CTYPE_CNTRL) -#define enca_isdigit(c) enca_ctype_test((c), ENCA_CTYPE_DIGIT) -#define enca_isgraph(c) enca_ctype_test((c), ENCA_CTYPE_GRAPH) -#define enca_islower(c) enca_ctype_test((c), ENCA_CTYPE_LOWER) -#define enca_isprint(c) enca_ctype_test((c), ENCA_CTYPE_PRINT) -#define enca_ispunct(c) enca_ctype_test((c), ENCA_CTYPE_PUNCT) -#define enca_isspace(c) enca_ctype_test((c), ENCA_CTYPE_SPACE) -#define enca_isupper(c) enca_ctype_test((c), ENCA_CTYPE_UPPER) +#define enca_isalnum(c) enca_ctype_test((c), ENCA_CTYPE_ALNUM) +#define enca_isalpha(c) enca_ctype_test((c), ENCA_CTYPE_ALPHA) +#define enca_iscntrl(c) enca_ctype_test((c), ENCA_CTYPE_CNTRL) +#define enca_isdigit(c) enca_ctype_test((c), ENCA_CTYPE_DIGIT) +#define enca_isgraph(c) enca_ctype_test((c), ENCA_CTYPE_GRAPH) +#define enca_islower(c) enca_ctype_test((c), ENCA_CTYPE_LOWER) +#define enca_isprint(c) enca_ctype_test((c), ENCA_CTYPE_PRINT) +#define enca_ispunct(c) enca_ctype_test((c), ENCA_CTYPE_PUNCT) +#define enca_isspace(c) enca_ctype_test((c), ENCA_CTYPE_SPACE) +#define enca_isupper(c) enca_ctype_test((c), ENCA_CTYPE_UPPER) #define enca_isxdigit(c) enca_ctype_test((c), ENCA_CTYPE_XDIGIT) -#define enca_isname(c) enca_ctype_test((c), ENCA_CTYPE_NAME) +#define enca_isname(c) enca_ctype_test((c), ENCA_CTYPE_NAME) #define enca_isbinary(c) enca_ctype_test((c), ENCA_CTYPE_BINARY) -#define enca_istext(c) enca_ctype_test((c), ENCA_CTYPE_TEXT) +#define enca_istext(c) enca_ctype_test((c), ENCA_CTYPE_TEXT) /** * ELEMENTS: @@ -354,11 +362,11 @@ struct _EncaUTFCheckData { * * Returns: the number of elements. 
**/ -#define ELEMENTS(array) (sizeof(array)/sizeof((array)[0])) +#define ELEMENTS(array) (sizeof(array) / sizeof((array)[0])) -void* enca_malloc (size_t size); -void* enca_realloc (void *ptr, - size_t size); +void *enca_malloc(size_t size); +void *enca_realloc(void *ptr, + size_t size); /** * enca_free: @@ -370,7 +378,11 @@ void* enca_realloc (void *ptr, * @ptr MUST be l-value. **/ #define enca_free(ptr) \ - { if (ptr) free(ptr); ptr=NULL; } + { \ + if (ptr) \ + free(ptr); \ + ptr = NULL; \ + } /** * NEW: @@ -381,7 +393,7 @@ void* enca_realloc (void *ptr, * * Returns: Pointer to the newly allocated memory. **/ -#define NEW(type,n) ((type*)enca_malloc((n)*sizeof(type))) +#define NEW(type, n) ((type *)enca_malloc((n) * sizeof(type))) /** * RENEW: @@ -394,7 +406,7 @@ void* enca_realloc (void *ptr, * Returns: Pointer to the reallocated memory (or pointer safe to call free() * on when @n is zero). **/ -#define RENEW(ptr,type,n) ((type*)enca_realloc((ptr),(n)*sizeof(type))) +#define RENEW(ptr, type, n) ((type *)enca_realloc((ptr), (n) * sizeof(type))) /** * MAKE_HOOK_LINE: @@ -403,23 +415,23 @@ void* enca_realloc (void *ptr, * Ugly code `beautifier' macro for language hooks. 
**/ #define MAKE_HOOK_LINE(name) \ - { #name, ELEMENTS(list_##name), list_##name, (size_t)-1 } + {#name, ELEMENTS(list_##name), list_##name, (size_t)-1} /* Always use our, since we rely on enca_strdup(NULL) -> NULL */ -char* enca_strdup(const char *s); +char *enca_strdup(const char *s); #ifndef HAVE_STRSTR -const char* enca_strstr(const char *haystack, - const char* needle); -#else/* not HAVE_STRSTR */ -# define enca_strstr strstr +const char *enca_strstr(const char *haystack, + const char *needle); +#else /* not HAVE_STRSTR */ +#define enca_strstr strstr #endif /* not HAVE_STRSTR */ #ifndef HAVE_STPCPY -char* enca_stpcpy(char *dest, +char *enca_stpcpy(char *dest, const char *src); #else /* not HAVE_STPCPY */ -# define enca_stpcpy stpcpy +#define enca_stpcpy stpcpy #endif /* not HAVE_STPCPY */ /** @@ -431,58 +443,59 @@ char* enca_stpcpy(char *dest, #define enca_csname(cs) enca_charset_name((cs), ENCA_NAME_STYLE_ENCA) /* common.c */ -char* enca_strconcat (const char *str, - ...); -char* enca_strappend (char *str, - ...); +char *enca_strconcat(const char *str, + ...); +char *enca_strappend(char *str, + ...); /* encnames.c */ -int enca_name_to_charset (const char *csname); -EncaSurface enca_name_to_surface (const char *sname); +int enca_name_to_charset(const char *csname); +EncaSurface enca_name_to_surface(const char *sname); /* enca.c */ -int enca_language_init (EncaAnalyserState *analyser, - const char *langname); -void enca_language_destroy (EncaAnalyserState *analyser); -double* enca_get_charset_similarity_matrix(const EncaLanguageInfo *lang); +int enca_language_init(EncaAnalyserState *analyser, + const char *langname); +void enca_language_destroy(EncaAnalyserState *analyser); +double *enca_get_charset_similarity_matrix(const EncaLanguageInfo *lang); /* unicodemap.c */ -int enca_charsets_subset_identical (int charset1, - int charset2, - const size_t *counts); +int enca_charsets_subset_identical(int charset1, + int charset2, + const size_t *counts); /* filters.c */ 
-size_t enca_filter_boxdraw (EncaAnalyserState *analyser, - unsigned char fill_char); -int enca_language_hook_ncs (EncaAnalyserState *analyser, - size_t ncs, - EncaLanguageHookData1CS *hookdata); -int enca_language_hook_eol (EncaAnalyserState *analyser, - size_t ncs, - EncaLanguageHookDataEOL *hookdata); +size_t enca_filter_boxdraw(EncaAnalyserState *analyser, + unsigned char fill_char); +int enca_language_hook_ncs(EncaAnalyserState *analyser, + size_t ncs, + EncaLanguageHookData1CS *hookdata); +int enca_language_hook_eol(EncaAnalyserState *analyser, + size_t ncs, + EncaLanguageHookDataEOL *hookdata); /* guess.c */ -void enca_guess_init (EncaAnalyserState *analyser); -void enca_guess_destroy (EncaAnalyserState *analyser); -EncaSurface enca_eol_surface (const unsigned char *buffer, - size_t size, - const size_t *counts); -void enca_find_max_sec (EncaAnalyserState *analyser); +void enca_guess_init(EncaAnalyserState *analyser); +void enca_guess_destroy(EncaAnalyserState *analyser); +EncaSurface enca_eol_surface(const unsigned char *buffer, + size_t size, + const size_t *counts); +void enca_find_max_sec(EncaAnalyserState *analyser); /* utf8_double.c */ -void enca_double_utf8_init (EncaAnalyserState *analyser); -void enca_double_utf8_destroy (EncaAnalyserState *analyser); +void enca_double_utf8_init(EncaAnalyserState *analyser); +void enca_double_utf8_destroy(EncaAnalyserState *analyser); /* pair.c */ -void enca_pair_init (EncaAnalyserState *analyser); -void enca_pair_destroy (EncaAnalyserState *analyser); -int enca_pair_analyse (EncaAnalyserState *analyser); +void enca_pair_init(EncaAnalyserState *analyser); +void enca_pair_destroy(EncaAnalyserState *analyser); +int enca_pair_analyse(EncaAnalyserState *analyser); /* Languages. 
*/ extern const EncaLanguageInfo ENCA_LANGUAGE_BE; extern const EncaLanguageInfo ENCA_LANGUAGE_BG; extern const EncaLanguageInfo ENCA_LANGUAGE_CS; extern const EncaLanguageInfo ENCA_LANGUAGE_ET; +extern const EncaLanguageInfo ENCA_LANGUAGE_FI; extern const EncaLanguageInfo ENCA_LANGUAGE_HR; extern const EncaLanguageInfo ENCA_LANGUAGE_HU; extern const EncaLanguageInfo ENCA_LANGUAGE_LT; diff --git a/lib/lang.c b/lib/lang.c index bda21f2..73bf823 100644 --- a/lib/lang.c +++ b/lib/lang.c @@ -17,7 +17,7 @@ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ #ifdef HAVE_CONFIG_H -# include "config.h" +#include "config.h" #endif /* HAVE_CONFIG_H */ #include "enca.h" @@ -30,45 +30,46 @@ * tested **/ static const EncaLanguageInfo ENCA_LANGUAGE___ = { - "__", /* name */ - "none", /* human name */ - 0, /* number of charsets */ - NULL, /* their names */ - NULL, /* character weights */ - NULL, /* significancy data */ - NULL, /* letter data */ - NULL, /* pair data */ - 0, /* sum of weights */ - NULL, /* hook function */ - NULL, /* eolhook function */ - NULL, /* lcuchook function */ - NULL, /* ratinghook function */ + "__", /* name */ + "none", /* human name */ + 0, /* number of charsets */ + NULL, /* their names */ + NULL, /* character weights */ + NULL, /* significancy data */ + NULL, /* letter data */ + NULL, /* pair data */ + 0, /* sum of weights */ + NULL, /* hook function */ + NULL, /* eolhook function */ + NULL, /* lcuchook function */ + NULL, /* ratinghook function */ }; /* All languages. */ static const EncaLanguageInfo *const LANGUAGE_LIST[] = { - &ENCA_LANGUAGE_BE, /* Belarusian. */ - &ENCA_LANGUAGE_BG, /* Bulgarian. */ - &ENCA_LANGUAGE_CS, /* Czech. */ - &ENCA_LANGUAGE_ET, /* Estonian. */ - &ENCA_LANGUAGE_HR, /* Croatian. */ - &ENCA_LANGUAGE_HU, /* Hungarian. */ - &ENCA_LANGUAGE_LT, /* Latvian. */ - &ENCA_LANGUAGE_LV, /* Lithuanian. */ - &ENCA_LANGUAGE_PL, /* Polish. */ - &ENCA_LANGUAGE_RU, /* Russian. */ - &ENCA_LANGUAGE_SK, /* Slovak. 
*/ - &ENCA_LANGUAGE_SL, /* Slovene. */ - &ENCA_LANGUAGE_UK, /* Ukrainian. */ - &ENCA_LANGUAGE_ZH, /* Chinese. */ - &ENCA_LANGUAGE___, /* None. */ + &ENCA_LANGUAGE_BE, /* Belarusian. */ + &ENCA_LANGUAGE_BG, /* Bulgarian. */ + &ENCA_LANGUAGE_CS, /* Czech. */ + &ENCA_LANGUAGE_ET, /* Estonian. */ + &ENCA_LANGUAGE_FI, /* Finnish. */ + &ENCA_LANGUAGE_HR, /* Croatian. */ + &ENCA_LANGUAGE_HU, /* Hungarian. */ + &ENCA_LANGUAGE_LT, /* Lithuanian. */ + &ENCA_LANGUAGE_LV, /* Latvian. */ + &ENCA_LANGUAGE_PL, /* Polish. */ + &ENCA_LANGUAGE_RU, /* Russian. */ + &ENCA_LANGUAGE_SK, /* Slovak. */ + &ENCA_LANGUAGE_SL, /* Slovene. */ + &ENCA_LANGUAGE_UK, /* Ukrainian. */ + &ENCA_LANGUAGE_ZH, /* Chinese. */ + &ENCA_LANGUAGE___, /* None. */ }; #define NLANGUAGES (ELEMENTS(LANGUAGE_LIST)) /* Local prototypes. */ -static int* language_charsets_ids(const EncaLanguageInfo *lang); -static const EncaLanguageInfo* find_language(const char *langname); +static int *language_charsets_ids(const EncaLanguageInfo *lang); +static const EncaLanguageInfo *find_language(const char *langname); /** * enca_language_init: @@ -82,9 +83,8 @@ static const EncaLanguageInfo* find_language(const char *langname); * * Returns: Nonzero on success, zero otherwise. **/ -int -enca_language_init(EncaAnalyserState *analyser, - const char *langname) +int enca_language_init(EncaAnalyserState *analyser, + const char *langname) { const EncaLanguageInfo *lang; @@ -116,8 +116,7 @@ enca_language_init(EncaAnalyserState *analyser, * * Destroys the language part of analyser state @analyser. **/ -void -enca_language_destroy(EncaAnalyserState *analyser) +void enca_language_destroy(EncaAnalyserState *analyser) { enca_free(analyser->charsets); enca_free(analyser->lcbits); @@ -140,13 +139,13 @@ enca_language_destroy(EncaAnalyserState *analyser) * * Returns: The list of languages, storing their number into *@n. 
**/ -const char** +const char ** enca_get_languages(size_t *n) { const char **languages; size_t i; - languages = NEW(const char*, NLANGUAGES); + languages = NEW(const char *, NLANGUAGES); for (i = 0; i < NLANGUAGES; i++) languages[i] = LANGUAGE_LIST[i]->name; @@ -164,7 +163,7 @@ enca_get_languages(size_t *n) * * Returns: The language name. **/ -const char* +const char * enca_analyser_language(EncaAnalyser analyser) { assert(analyser != NULL); @@ -182,7 +181,7 @@ enca_analyser_language(EncaAnalyser analyser) * * Returns: The English language name. **/ -const char* +const char * enca_language_english_name(const char *lang) { const EncaLanguageInfo *linfo; @@ -207,16 +206,16 @@ enca_language_english_name(const char *lang) * contains no charsets or @langname is invalid, #NULL is returned * and zero stored into *@n. **/ -int* -enca_get_language_charsets(const char *langname, - size_t *n) +int *enca_get_language_charsets(const char *langname, + size_t *n) { const EncaLanguageInfo *lang; assert(langname != NULL); lang = find_language(langname); - if (lang == NULL) { + if (lang == NULL) + { *n = 0; return NULL; } @@ -236,7 +235,7 @@ enca_get_language_charsets(const char *langname, * * Returns: The charsets id table; #NULL when @lang has no charsets. **/ -static int* +static int * language_charsets_ids(const EncaLanguageInfo *lang) { int *charsets; @@ -248,7 +247,8 @@ language_charsets_ids(const EncaLanguageInfo *lang) return NULL; charsets = NEW(int, lang->ncharsets); - for (i = 0; i < lang->ncharsets; i++) { + for (i = 0; i < lang->ncharsets; i++) + { charsets[i] = enca_name_to_charset(lang->csnames[i]); assert(charsets[i] != ENCA_CS_UNKNOWN); } @@ -264,7 +264,7 @@ language_charsets_ids(const EncaLanguageInfo *lang) * * Returns: Pointer to its language information data; #NULL if not found. 
**/ -static const EncaLanguageInfo* +static const EncaLanguageInfo * find_language(const char *langname) { const EncaLanguageInfo *lang = NULL; @@ -273,8 +273,10 @@ find_language(const char *langname) if (langname == NULL) return NULL; - for (i = 0; i < NLANGUAGES; i++) { - if (strcmp(langname, LANGUAGE_LIST[i]->name) == 0) { + for (i = 0; i < NLANGUAGES; i++) + { + if (strcmp(langname, LANGUAGE_LIST[i]->name) == 0) + { lang = LANGUAGE_LIST[i]; break; } @@ -303,7 +305,7 @@ find_language(const char *langname) * Returns: The matrix, its size is determined by @lang->ncharsets; #NULL * for language with no charsets. **/ -double* +double * enca_get_charset_similarity_matrix(const EncaLanguageInfo *lang) { const size_t n = lang->ncharsets; @@ -319,27 +321,32 @@ enca_get_charset_similarity_matrix(const EncaLanguageInfo *lang) return NULL; /* Below diagonal. */ - smat = NEW(double, n*n); - for (i = 0; i < n; i++) { - for (j = 0; j <= i; j++) { - smat[i*n + j] = 0.0; + smat = NEW(double, n *n); + for (i = 0; i < n; i++) + { + for (j = 0; j <= i; j++) + { + smat[i * n + j] = 0.0; for (c = 0; c < 0x100; c++) - smat[i*n + j] += (double)w[i][c] * (double)w[j][c] / (s[c] + EPSILON); + smat[i * n + j] += (double)w[i][c] * (double)w[j][c] / (s[c] + EPSILON); } } /* Above diagonal. */ - for (i = 0; i < n; i++) { - for (j = i+1; j < n; j++) - smat[i*n + j] = smat[j*n + i]; + for (i = 0; i < n; i++) + { + for (j = i + 1; j < n; j++) + smat[i * n + j] = smat[j * n + i]; } /* Normalize. 
*/ - for (i = 0; i < n; i++) { - double wmax = smat[i*n + i]; + for (i = 0; i < n; i++) + { + double wmax = smat[i * n + i]; - for (j = 0; j < n; j++) { - smat[i*n + j] /= wmax; + for (j = 0; j < n; j++) + { + smat[i * n + j] /= wmax; } } @@ -347,4 +354,3 @@ enca_get_charset_similarity_matrix(const EncaLanguageInfo *lang) } /* vim: ts=2 */ - diff --git a/lib/lang_fi.c b/lib/lang_fi.c new file mode 100644 index 0000000..5dc7034 --- /dev/null +++ b/lib/lang_fi.c @@ -0,0 +1,51 @@ +/* + encoding data and routines dependent on language; finnish + + Copyright (C) 2025 + + This program is free software; you can redistribute it and/or modify it + under the terms of version 2 of the GNU General Public License as published + by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. +*/ +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif /* HAVE_CONFIG_H */ + +#include "enca.h" +#include "internal.h" +#include "data/finnish/finnish.h" + +/** + * ENCA_LANGUAGE_FI: + * + * Finnish language. + * + * Everything the world out there needs to know about this language. 
+ **/ +const EncaLanguageInfo ENCA_LANGUAGE_FI = { + "fi", + "finnish", + NCHARSETS, + CHARSET_NAMES, + CHARSET_WEIGHTS, + SIGNIFICANT, + CHARSET_LETTERS, + CHARSET_PAIRS, + WEIGHT_SUM, + NULL, + NULL, + NULL, + NULL, +}; + +/* vim: ts=2 + */ \ No newline at end of file diff --git a/src/locale_detect.c b/src/locale_detect.c index 4502228..446b990 100644 --- a/src/locale_detect.c +++ b/src/locale_detect.c @@ -19,32 +19,32 @@ #include "common.h" #ifdef HAVE_SETLOCALE -# ifdef HAVE_LOCALE_H -# include -# else /* HAVE_LOCALE_H */ -char* setlocale(int category, const char *locale); -# endif /* HAVE_LOCALE_H */ +#ifdef HAVE_LOCALE_H +#include +#else /* HAVE_LOCALE_H */ +char *setlocale(int category, const char *locale); +#endif /* HAVE_LOCALE_H */ #endif /* HAVE_SETLOCALE */ #ifdef HAVE_NL_LANGINFO -# ifdef HAVE_LANGINFO_H -# include -# else /* HAVE_LANGINFO_H */ +#ifdef HAVE_LANGINFO_H +#include +#else /* HAVE_LANGINFO_H */ char *nl_langinfo(nl_item *item); -# endif /* HAVE_LANGINFO_H */ +#endif /* HAVE_LANGINFO_H */ #endif /* HAVE_NL_LANGINFO */ static char *codeset = NULL; /* Local prototypes. 
*/ -static char* locale_alias_convert(const char *locname); -static char* strip_locale_name(const char *locname); -static char* static_iso639_alias_convert(const char *locname); +static char *locale_alias_convert(const char *locname); +static char *strip_locale_name(const char *locname); +static char *static_iso639_alias_convert(const char *locname); #ifdef HAVE_SETLOCALE -static char* detect_target_charset(const char *locname); -static char* detect_user_language(void); +static char *detect_target_charset(const char *locname); +static char *detect_user_language(void); #endif /* HAVE_SETLOCALE */ -static void codeset_free(void); +static void codeset_free(void); /* * when lang is not NULL converts it to two-character language code @@ -52,7 +52,7 @@ static void codeset_free(void); * returns string of length 2 containig language code (to be freed by caller) * or NULL if not detected or unable to convert. */ -char* +char * detect_lang(const char *lang) { char *locname, *result, *cvt; @@ -60,7 +60,8 @@ detect_lang(const char *lang) atexit(codeset_free); #ifdef HAVE_SETLOCALE /* No lang, detect locale, then CODESET, then try to transform it */ - if (!lang) { + if (!lang) + { locname = detect_user_language(); /* HERE: locname is (a) newly allocated (b) NULL */ codeset = detect_target_charset(locname); @@ -81,7 +82,7 @@ detect_lang(const char *lang) enca_free(locname); return result; -#else /* HAVE_SETLOCALE */ +#else /* HAVE_SETLOCALE */ UNUSED(locname); cvt = locale_alias_convert(lang); result = strip_locale_name(cvt); @@ -98,7 +99,7 @@ detect_lang(const char *lang) * * Returns: A string (to be freed) with charset name or NULL on failure. 
**/ -static char* +static char * detect_target_charset(const char *locname) { char *s = NULL; @@ -112,9 +113,10 @@ detect_target_charset(const char *locname) s = enca_strdup(nl_langinfo(CODESET)); - if (setlocale(LC_CTYPE, "C") == NULL) { + if (setlocale(LC_CTYPE, "C") == NULL) + { fprintf(stderr, "%s: Cannot set LC_CTYPE to the portable \"C\" locale\n", - program_name); + program_name); exit(EXIT_TROUBLE); } if (options.verbosity_level > 2) @@ -137,33 +139,33 @@ detect_target_charset(const char *locname) * * Returns: A string (to be freed) with locale name or NULL on failure. **/ -static char* +static char * detect_user_language(void) { static const int test_categories[] = { - LC_CTYPE, LC_COLLATE, + LC_CTYPE, + LC_COLLATE, #if HAVE_LC_MESSAGES - LC_MESSAGES, + LC_MESSAGES, #endif }; char *s = NULL; size_t i; - for (i = 0; i < ELEMENTS(test_categories); i++) { + for (i = 0; i < ELEMENTS(test_categories); i++) + { enca_free(s); if ((s = setlocale(test_categories[i], "")) == NULL) continue; s = enca_strdup(s); - if (setlocale(test_categories[i], "C") == NULL) { + if (setlocale(test_categories[i], "C") == NULL) + { fprintf(stderr, "%s: Cannot set locale to the portable \"C\" locale\n", - program_name); + program_name); exit(EXIT_TROUBLE); } - if (strcmp(s, "") == 0 - || strcmp(s, "C") == 0 - || strcmp(s, "POSIX") == 0 - || (strncmp(s, "en", 2) == 0 && !isalpha(s[2]))) + if (strcmp(s, "") == 0 || strcmp(s, "C") == 0 || strcmp(s, "POSIX") == 0 || (strncmp(s, "en", 2) == 0 && !isalpha(s[2]))) continue; if (options.verbosity_level > 2) @@ -186,13 +188,13 @@ detect_user_language(void) (but the worst thing that can happen is we return wrong locale name) the locale.alias format is nowhere described, so we assume every line consists of alias (row 1), some whitespace and canonical name */ -static char* +static char * locale_alias_convert(const char *locname) { #ifdef HAVE_LOCALE_ALIAS File *fla; /* locale.alias file */ Buffer *buf; - char *s,*p,*q; + char *s, *p, *q; size_t 
n; #endif /* HAVE_LOCALE_ALIAS */ @@ -207,8 +209,10 @@ locale_alias_convert(const char *locname) /* try to read locale.alias */ buf = buffer_new(0); fla = file_new(LOCALE_ALIAS_PATH, buf); - if (file_open(fla, "r") != 0) { - if (options.verbosity_level) { + if (file_open(fla, "r") != 0) + { + if (options.verbosity_level) + { fprintf(stderr, "Cannot find locale.alias file.\n" "This build of enca probably has been configured for " "quite a different system\n"); @@ -222,16 +226,20 @@ locale_alias_convert(const char *locname) somewhat crude now */ n = strlen(locname); p = NULL; - s = (char*)buf->data; /* alias */ - while (file_getline(fla) != NULL) { + s = (char *)buf->data; /* alias */ + while (file_getline(fla) != NULL) + { if (strncmp(s, locname, n) == 0 && - (isspace(s[n]) || (s[n] == ':' && isspace(s[n+1])))) { + (isspace(s[n]) || (s[n] == ':' && isspace(s[n + 1])))) + { p = s + n; /* skip any amount of whitespace */ - while (isspace(*p)) p++; + while (isspace(*p)) + p++; q = p; /* anything up to next whitespace is the canonical locale name */ - while (*q != '\0' && !isspace(*q)) q++; + while (*q != '\0' && !isspace(*q)) + q++; *q = '\0'; p = enca_strdup(p); break; @@ -242,7 +250,7 @@ locale_alias_convert(const char *locname) buffer_free(buf); return p != NULL ? p : static_iso639_alias_convert(locname); -#else /* HAVE_LOCALE_ALIAS */ +#else /* HAVE_LOCALE_ALIAS */ return static_iso639_alias_convert(locname); #endif /* HAVE_LOCALE_ALIAS */ } @@ -256,7 +264,7 @@ locale_alias_convert(const char *locname) * * Returns: the codeset name. **/ -const char* +const char * get_lang_codeset(void) { if (!codeset) @@ -271,18 +279,18 @@ get_lang_codeset(void) * * Returned string should be freed by caller. 
**/ -static char* +static char * strip_locale_name(const char *locname) { /* Some supported languages can also appear as dialects of some other * language */ - struct { + struct + { const char *dialect; const char *iso639; - } - const DIALECTS[] = { - { "cs_SK", "sk" }, - { "ru_UA", "uk" }, + } const DIALECTS[] = { + {"cs_SK", "sk"}, + {"ru_UA", "uk"}, }; size_t n; @@ -298,13 +306,15 @@ strip_locale_name(const char *locname) return s; /* Some long specification (either X/Open or CEN). */ - if (n >= 5 && s[2] == '_' - && (s[5] == '\0' || s[5] == '.' || s[5] == '+')) { + if (n >= 5 && s[2] == '_' && (s[5] == '\0' || s[5] == '.' || s[5] == '+')) + { size_t i; /* Convert dialects. */ - for (i = 0; i < ELEMENTS(DIALECTS); i++) { - if (strncmp(DIALECTS[i].dialect, s, 5) == 0) { + for (i = 0; i < ELEMENTS(DIALECTS); i++) + { + if (strncmp(DIALECTS[i].dialect, s, 5) == 0) + { s[0] = DIALECTS[i].iso639[0]; s[1] = DIALECTS[i].iso639[1]; break; @@ -313,7 +323,8 @@ strip_locale_name(const char *locname) s[2] = '\0'; } - else { + else + { /* Just garbage or some unresolved locale alias. */ enca_free(s); } @@ -328,57 +339,59 @@ strip_locale_name(const char *locname) * * Returned string should be freed by caller. 
**/ -static char* +static char * static_iso639_alias_convert(const char *locname) { - struct { + struct + { const char *alias; const char *iso639; - } - const ALIASES[] = { - { "byelarussian", "be" }, - { "byelarusian", "be" }, - { "belarussian", "be" }, - { "belarusian", "be" }, - { "byelorussian", "be" }, - { "belorussian", "be" }, - { "byelorusian", "be" }, - { "belorusian", "be" }, - { "bosnian", "hr" }, - { "bulgarian", "bg" }, - { "chinese", "zh" }, - { "croatian", "hr" }, - { "czech", "cs" }, - { "estonian", "et" }, - { "hungarian", "hu" }, - { "lativan", "lt" }, - { "lettic", "lv" }, - { "lettish", "lv" }, - { "lithuanian", "lt" }, - { "macedonian", "bg"}, - { "magyar", "hu" }, - { "montenegrin-cyrilic", "bg"}, - { "montenegrin-latin", "hr" }, - { "polish", "pl" }, - { "russian", "ru" }, - { "serbian-cyrilic", "bg"}, - { "serbian-latin", "hr"}, - { "slovak", "sk" }, - { "slovene", "sl" }, - { "slovenian", "sl" }, - { "ukrainian", "uk" } - }; + } const ALIASES[] = { + {"byelarussian", "be"}, + {"byelarusian", "be"}, + {"belarussian", "be"}, + {"belarusian", "be"}, + {"byelorussian", "be"}, + {"belorussian", "be"}, + {"byelorusian", "be"}, + {"belorusian", "be"}, + {"bosnian", "hr"}, + {"bulgarian", "bg"}, + {"chinese", "zh"}, + {"croatian", "hr"}, + {"czech", "cs"}, + {"estonian", "et"}, + {"finnish", "fi"}, + {"hungarian", "hu"}, + {"lativan", "lt"}, + {"lettic", "lv"}, + {"lettish", "lv"}, + {"lithuanian", "lt"}, + {"macedonian", "bg"}, + {"magyar", "hu"}, + {"montenegrin-cyrilic", "bg"}, + {"montenegrin-latin", "hr"}, + {"polish", "pl"}, + {"russian", "ru"}, + {"serbian-cyrilic", "bg"}, + {"serbian-latin", "hr"}, + {"slovak", "sk"}, + {"slovene", "sl"}, + {"slovenian", "sl"}, + {"ukrainian", "uk"}}; size_t i; if (!locname) return NULL; - for (i = 0; i < ELEMENTS(ALIASES); i++) { - if (strcmp(ALIASES[i].alias, locname) == 0) { + for (i = 0; i < ELEMENTS(ALIASES); i++) + { + if (strcmp(ALIASES[i].alias, locname) == 0) + { if (options.verbosity_level > 2) 
fprintf(stderr, "Decrypted locale alias using built-in table: %s\n", - ALIASES[i].iso639); + ALIASES[i].iso639); return enca_strdup(ALIASES[i].iso639); } diff --git a/test/simtable.c b/test/simtable.c index 1d41e05..21a317c 100644 --- a/test/simtable.c +++ b/test/simtable.c @@ -16,7 +16,8 @@ prl(const EncaLanguageInfo *l, const char *hooks) int a; size_t i, j; - if (myargc > 1) { + if (myargc > 1) + { a = 1; while (a < myargc && strcmp(myargv[a], l->name)) a++; @@ -26,9 +27,11 @@ prl(const EncaLanguageInfo *l, const char *hooks) printf("\n==\x1b[1m%s\x1b[m==\n", l->name); m = enca_get_charset_similarity_matrix(l); - for (i = 0; i < l->ncharsets; i++) { - for (j = 0; j < l->ncharsets; j++) { - double q = 1000.0*m[i*l->ncharsets + j]; + for (i = 0; i < l->ncharsets; i++) + { + for (j = 0; j < l->ncharsets; j++) + { + double q = 1000.0 * m[i * l->ncharsets + j]; if (i == j) printf("\x1b[36m"); @@ -49,8 +52,7 @@ prl(const EncaLanguageInfo *l, const char *hooks) free(m); } -int -main(int argc, char *argv[]) +int main(int argc, char *argv[]) { myargc = argc; myargv = argv; @@ -59,6 +61,7 @@ main(int argc, char *argv[]) prl(&ENCA_LANGUAGE_BG, "1251mac"); prl(&ENCA_LANGUAGE_CS, "isowin 852kam"); prl(&ENCA_LANGUAGE_ET, ""); + prl(&ENCA_LANGUAGE_FI, ""); prl(&ENCA_LANGUAGE_HR, "isowin"); prl(&ENCA_LANGUAGE_HU, "isocork isowin[XXX]"); prl(&ENCA_LANGUAGE_LT, "winbalt lat4balt iso13win[XXX]"); From b4fa0d06507682d56cd2df3127e1d7b3a5f6ee92 Mon Sep 17 00:00:00 2001 From: Egor Kovalchuk Date: Mon, 21 Jul 2025 21:41:07 +0000 Subject: [PATCH 02/11] Add Finnish language support to Makefile configurations --- data/Makefile.in | 4 ++++ lib/Makefile.in | 27 ++++++++++++++++----------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/data/Makefile.in b/data/Makefile.in index b968a56..7e3c1f5 100644 --- a/data/Makefile.in +++ b/data/Makefile.in @@ -367,6 +367,7 @@ noinst_HEADERS = \ croatian/croatian.h \ czech/czech.h \ estonian/estonian.h \ + finnish/finnish.h \ 
hungarian/hungarian.h \ latvian/latvian.h \ lithuanian/lithuanian.h \ @@ -396,6 +397,7 @@ noinst_SCRPITS = \ croatian/doit.sh \ czech/doit.sh \ estonian/doit.sh \ + finnish/doit.sh \ hungarian/doit.sh \ latvian/doit.sh \ lithuanian/doit.sh \ @@ -411,6 +413,7 @@ BASES = \ croatian/cp1250.base \ czech/iso88592.base \ estonian/iso88594.base \ + finnish/iso88594.base \ hungarian/iso88592.base \ russian/koi8r.base \ latvian/cp1257.base \ @@ -426,6 +429,7 @@ RAWCOUNTS = \ croatian/rawcounts.cp1250 \ czech/rawcounts.iso88592 \ estonian/rawcounts.iso88594 \ + finnish/rawcounts.iso88594 \ hungarian/rawcounts.iso88592 \ latvian/rawcounts.cp1257 \ lithuanian/rawcounts.cp1257 \ diff --git a/lib/Makefile.in b/lib/Makefile.in index 8daf0dc..820c000 100644 --- a/lib/Makefile.in +++ b/lib/Makefile.in @@ -140,9 +140,10 @@ LTLIBRARIES = $(lib_LTLIBRARIES) libenca_la_LIBADD = am_libenca_la_OBJECTS = common.lo ctype.lo enca.lo encnames.lo \ filters.lo guess.lo lang.lo lang_be.lo lang_bg.lo lang_cs.lo \ - lang_et.lo lang_hr.lo lang_hu.lo lang_lt.lo lang_lv.lo \ - lang_pl.lo lang_ru.lo lang_sk.lo lang_sl.lo lang_uk.lo \ - lang_zh.lo multibyte.lo pair.lo unicodemap.lo utf8_double.lo + lang_et.lo lang_fi.lo lang_hr.lo lang_hu.lo lang_lt.lo \ + lang_lv.lo lang_pl.lo lang_ru.lo lang_sk.lo lang_sl.lo \ + lang_uk.lo lang_zh.lo multibyte.lo pair.lo unicodemap.lo \ + utf8_double.lo libenca_la_OBJECTS = $(am_libenca_la_OBJECTS) AM_V_lt = $(am__v_lt_@AM_V@) am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) @@ -171,14 +172,14 @@ am__depfiles_remade = ./$(DEPDIR)/common.Plo ./$(DEPDIR)/ctype.Plo \ ./$(DEPDIR)/filters.Plo ./$(DEPDIR)/guess.Plo \ ./$(DEPDIR)/lang.Plo ./$(DEPDIR)/lang_be.Plo \ ./$(DEPDIR)/lang_bg.Plo ./$(DEPDIR)/lang_cs.Plo \ - ./$(DEPDIR)/lang_et.Plo ./$(DEPDIR)/lang_hr.Plo \ - ./$(DEPDIR)/lang_hu.Plo ./$(DEPDIR)/lang_lt.Plo \ - ./$(DEPDIR)/lang_lv.Plo ./$(DEPDIR)/lang_pl.Plo \ - ./$(DEPDIR)/lang_ru.Plo ./$(DEPDIR)/lang_sk.Plo \ - ./$(DEPDIR)/lang_sl.Plo ./$(DEPDIR)/lang_uk.Plo \ - 
./$(DEPDIR)/lang_zh.Plo ./$(DEPDIR)/multibyte.Plo \ - ./$(DEPDIR)/pair.Plo ./$(DEPDIR)/unicodemap.Plo \ - ./$(DEPDIR)/utf8_double.Plo + ./$(DEPDIR)/lang_et.Plo ./$(DEPDIR)/lang_fi.Plo \ + ./$(DEPDIR)/lang_hr.Plo ./$(DEPDIR)/lang_hu.Plo \ + ./$(DEPDIR)/lang_lt.Plo ./$(DEPDIR)/lang_lv.Plo \ + ./$(DEPDIR)/lang_pl.Plo ./$(DEPDIR)/lang_ru.Plo \ + ./$(DEPDIR)/lang_sk.Plo ./$(DEPDIR)/lang_sl.Plo \ + ./$(DEPDIR)/lang_uk.Plo ./$(DEPDIR)/lang_zh.Plo \ + ./$(DEPDIR)/multibyte.Plo ./$(DEPDIR)/pair.Plo \ + ./$(DEPDIR)/unicodemap.Plo ./$(DEPDIR)/utf8_double.Plo am__mv = mv -f COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) @@ -397,6 +398,7 @@ libenca_la_SOURCES = \ lang_bg.c \ lang_cs.c \ lang_et.c \ + lang_fi.c \ lang_hr.c \ lang_hu.c \ lang_lt.c \ @@ -504,6 +506,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lang_bg.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lang_cs.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lang_et.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lang_fi.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lang_hr.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lang_hu.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lang_lt.Plo@am__quote@ # am--include-marker @@ -711,6 +714,7 @@ distclean: distclean-am -rm -f ./$(DEPDIR)/lang_bg.Plo -rm -f ./$(DEPDIR)/lang_cs.Plo -rm -f ./$(DEPDIR)/lang_et.Plo + -rm -f ./$(DEPDIR)/lang_fi.Plo -rm -f ./$(DEPDIR)/lang_hr.Plo -rm -f ./$(DEPDIR)/lang_hu.Plo -rm -f ./$(DEPDIR)/lang_lt.Plo @@ -781,6 +785,7 @@ maintainer-clean: maintainer-clean-am -rm -f ./$(DEPDIR)/lang_bg.Plo -rm -f ./$(DEPDIR)/lang_cs.Plo -rm -f ./$(DEPDIR)/lang_et.Plo + -rm -f ./$(DEPDIR)/lang_fi.Plo -rm -f ./$(DEPDIR)/lang_hr.Plo -rm -f 
./$(DEPDIR)/lang_hu.Plo -rm -f ./$(DEPDIR)/lang_lt.Plo From 5f90ab78dcf05932db94901061bf8c4bad11bbae Mon Sep 17 00:00:00 2001 From: Egor-OSSRevival Date: Fri, 25 Jul 2025 11:44:00 +0300 Subject: [PATCH 03/11] Improve input handling in normalize.pl to skip malformed lines and correctly process space character counts --- data/normalize.pl | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/data/normalize.pl b/data/normalize.pl index b86de69..4e67486 100755 --- a/data/normalize.pl +++ b/data/normalize.pl @@ -19,7 +19,15 @@ if (!defined $ARGV[0]) { while () { - ($char_hex[$n], $char[$n], $count[$n]) = split /\s+/, $_, 3; + chomp; + # Handle special case of space character (0x20 count) + if (/^(0x20)\s+(\d+)$/) { + $char_hex[$n] = $1; + $char[$n] = ' '; + $count[$n] = $2; + } else { + ($char_hex[$n], $char[$n], $count[$n]) = split /\s+/, $_, 3; + } if ($max < $count[$n]) { $max = $count[$n]; @@ -49,7 +57,15 @@ my $sum2 = 0; while () { - ($char_hex[$n], $char[$n], $count[$n]) = split /\s+/, $_, 3; + chomp; + # Handle special case of space character (0x20 count) + if (/^(0x20)\s+(\d+)$/) { + $char_hex[$n] = $1; + $char[$n] = ' '; + $count[$n] = $2; + } else { + ($char_hex[$n], $char[$n], $count[$n]) = split /\s+/, $_, 3; + } $sum2 += $count[$n]; $n++; From fd9acfc048f1e7ad939390ea5db9ed7bc23447a1 Mon Sep 17 00:00:00 2001 From: Egor Kovalchuk Date: Fri, 25 Jul 2025 13:23:25 +0000 Subject: [PATCH 04/11] Enhance Finnish language support by adding cp1257 charset handling and updating related files. Modify doit.sh to include cp1257, expand finnish.h to define RAW_CP1257, and update iso88594.base and rawcounts.iso88594 with new data. Implement hooks in lang_fi.c for charset selection between iso8859-4 and cp1257. 
--- data/finnish/doit.sh | 2 +- data/finnish/finnish.h | 170 +++++++++++-------- data/finnish/iso88594.base | 240 ++++++++++++++------------ data/finnish/rawcounts.iso88594 | 287 +++++++++++++++++--------------- lib/lang_fi.c | 45 ++++- 5 files changed, 436 insertions(+), 308 deletions(-) mode change 100644 => 100755 data/finnish/doit.sh diff --git a/data/finnish/doit.sh b/data/finnish/doit.sh old mode 100644 new mode 100755 index 26649b3..9b02625 --- a/data/finnish/doit.sh +++ b/data/finnish/doit.sh @@ -1,2 +1,2 @@ #! /bin/bash -../doit.sh iso88594 \ No newline at end of file +../doit.sh iso88594 cp1257 \ No newline at end of file diff --git a/data/finnish/finnish.h b/data/finnish/finnish.h index 3d8914b..140ef26 100644 --- a/data/finnish/finnish.h +++ b/data/finnish/finnish.h @@ -1,87 +1,125 @@ /***** THIS IS A GENERATED FILE. DO NOT TOUCH! *****/ /* THIS IS A GENERATED TABLE, see data/basetoc.c. */ static const unsigned short int RAW_ISO88594[] = { - 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00 */ - 0, 0, 0, 0, 0, 0, 0, 0, /* 0x08 */ - 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10 */ - 0, 0, 0, 0, 0, 0, 0, 0, /* 0x18 */ - 0, 0, 3, 0, 1, 0, 0, 0, /* 0x20 */ - 5, 5, 1, 0, 147, 46, 237, 0, /* 0x28 */ - 33, 23, 26, 9, 7, 9, 6, 7, /* 0x30 */ - 6, 7, 11, 0, 1, 0, 0, 3, /* 0x38 */ - 0, 21, 8, 5, 4, 22, 5, 3, /* 0x40 */ - 29, 10, 18, 47, 25, 36, 19, 15, /* 0x48 */ - 33, 0, 16, 58, 40, 15, 34, 2, /* 0x50 */ - 0, 11, 1, 0, 0, 0, 0, 0, /* 0x58 */ - 1, 2403, 13, 9, 173, 1560, 11, 34, /* 0x60 */ - 337, 2059, 366, 994, 1114, 578, 1677, 1106, /* 0x68 */ - 336, 0, 477, 1475, 1859, 1022, 454, 3, /* 0x70 */ - 1, 342, 2, 0, 0, 0, 0, 0, /* 0x78 */ - 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 */ - 0, 0, 0, 0, 0, 0, 0, 0, /* 0x88 */ - 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 */ - 0, 0, 0, 0, 0, 0, 0, 0, /* 0x98 */ - 0, 0, 0, 0, 1, 0, 0, 1, /* 0xa0 */ - 1, 1, 0, 0, 0, 0, 0, 0, /* 0xa8 */ - 1, 1, 0, 0, 0, 0, 1, 0, /* 0xb0 */ - 0, 0, 0, 0, 0, 0, 0, 0, /* 0xb8 */ - 0, 1, 0, 0, 0, 0, 1, 0, /* 0xc0 */ - 1, 1, 0, 0, 0, 0, 0, 0, /* 0xc8 
*/ - 0, 0, 0, 1, 0, 1, 0, 0, /* 0xd0 */ - 1, 0, 0, 1, 0, 0, 0, 1, /* 0xd8 */ - 0, 0, 1, 1, 809, 0, 1, 0, /* 0xe0 */ - 1, 0, 0, 1, 1, 0, 1, 1, /* 0xe8 */ - 0, 1, 1, 0, 1, 0, 110, 0, /* 0xf0 */ - 0, 0, 1, 1, 0, 0, 1, 0, /* 0xf8 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x08 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x18 */ + 0, 4, 21, 0, 0, 2, 1, 2, /* 0x20 */ + 24, 24, 0, 0, 326, 128, 0, 7, /* 0x28 */ + 122, 105, 76, 33, 28, 34, 25, 25, /* 0x30 */ + 28, 49, 39, 2, 0, 0, 0, 8, /* 0x38 */ + 0, 58, 21, 18, 14, 52, 14, 11, /* 0x40 */ + 69, 31, 46, 118, 61, 83, 42, 37, /* 0x48 */ + 74, 0, 39, 133, 96, 23, 66, 9, /* 0x50 */ + 1, 23, 1, 0, 0, 0, 0, 1, /* 0x58 */ + 0, 5399, 37, 32, 403, 3579, 32, 84, /* 0x60 */ + 765, 4735, 859, 2254, 2539, 1338, 3767, 2470, /* 0x68 */ + 768, 1, 1096, 3362, 4242, 2326, 1031, 11, /* 0x70 */ + 4, 800, 6, 0, 0, 0, 0, 0, /* 0x78 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x88 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x98 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xa0 */ + 0, 0, 0, 0, 0, 6, 0, 0, /* 0xa8 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xb0 */ + 0, 1, 0, 0, 0, 0, 0, 0, /* 0xb8 */ + 0, 0, 0, 0, 2, 0, 0, 0, /* 0xc0 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xc8 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xd0 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xd8 */ + 0, 0, 0, 0, 1808, 0, 0, 0, /* 0xe0 */ + 0, 1, 0, 0, 0, 0, 0, 0, /* 0xe8 */ + 0, 0, 0, 0, 0, 0, 255, 0, /* 0xf0 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xf8 */ +}; + +/* THIS IS A GENERATED TABLE, see data/basetoc.c. 
*/ +static const unsigned short int RAW_CP1257[] = { + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x08 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x18 */ + 0, 4, 21, 0, 0, 2, 1, 2, /* 0x20 */ + 24, 24, 0, 0, 326, 128, 0, 7, /* 0x28 */ + 122, 105, 76, 33, 28, 34, 25, 25, /* 0x30 */ + 28, 49, 39, 2, 0, 0, 0, 8, /* 0x38 */ + 0, 58, 21, 18, 14, 52, 14, 11, /* 0x40 */ + 69, 31, 46, 118, 61, 83, 42, 37, /* 0x48 */ + 74, 0, 39, 133, 96, 23, 66, 9, /* 0x50 */ + 1, 23, 1, 0, 0, 0, 0, 1, /* 0x58 */ + 0, 5399, 37, 32, 403, 3579, 32, 84, /* 0x60 */ + 765, 4735, 859, 2254, 2539, 1338, 3767, 2470, /* 0x68 */ + 768, 1, 1096, 3362, 4242, 2326, 1031, 11, /* 0x70 */ + 4, 800, 6, 0, 0, 0, 0, 0, /* 0x78 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x88 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x98 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xa0 */ + 0, 0, 0, 0, 0, 6, 0, 0, /* 0xa8 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xb0 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xb8 */ + 0, 0, 0, 0, 2, 0, 0, 0, /* 0xc0 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xc8 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xd0 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xd8 */ + 0, 0, 0, 0, 1808, 0, 0, 0, /* 0xe0 */ + 0, 1, 0, 0, 0, 0, 0, 0, /* 0xe8 */ + 1, 0, 0, 0, 0, 0, 255, 0, /* 0xf0 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xf8 */ }; /* THIS IS A GENERATED TABLE, see data/totals.pl. 
*/ static const unsigned short int SIGNIFICANT[] = { - 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00 */ - 0, 0, 0, 0, 0, 0, 0, 0, /* 0x08 */ - 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10 */ - 0, 0, 0, 0, 0, 0, 0, 0, /* 0x18 */ - 0, 0, 3, 0, 1, 0, 0, 0, /* 0x20 */ - 5, 5, 1, 0, 147, 46, 237, 0, /* 0x28 */ - 33, 23, 26, 9, 7, 9, 6, 7, /* 0x30 */ - 6, 7, 11, 0, 1, 0, 0, 3, /* 0x38 */ - 0, 21, 8, 5, 4, 22, 5, 3, /* 0x40 */ - 29, 10, 18, 47, 25, 36, 19, 15, /* 0x48 */ - 33, 0, 16, 58, 40, 15, 34, 2, /* 0x50 */ - 0, 11, 1, 0, 0, 0, 0, 0, /* 0x58 */ - 1, 2403, 13, 9, 173, 1560, 11, 34, /* 0x60 */ - 337, 2059, 366, 994, 1114, 578, 1677, 1106, /* 0x68 */ - 336, 0, 477, 1475, 1859, 1022, 454, 3, /* 0x70 */ - 1, 342, 2, 0, 0, 0, 0, 0, /* 0x78 */ - 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 */ - 0, 0, 0, 0, 0, 0, 0, 0, /* 0x88 */ - 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 */ - 0, 0, 0, 0, 0, 0, 0, 0, /* 0x98 */ - 0, 0, 0, 0, 1, 0, 0, 1, /* 0xa0 */ - 1, 1, 0, 0, 0, 0, 0, 0, /* 0xa8 */ - 1, 1, 0, 0, 0, 0, 1, 0, /* 0xb0 */ - 0, 0, 0, 0, 0, 0, 0, 0, /* 0xb8 */ - 0, 1, 0, 0, 0, 0, 1, 0, /* 0xc0 */ - 1, 1, 0, 0, 0, 0, 0, 0, /* 0xc8 */ - 0, 0, 0, 1, 0, 1, 0, 0, /* 0xd0 */ - 1, 0, 0, 1, 0, 0, 0, 1, /* 0xd8 */ - 0, 0, 1, 1, 809, 0, 1, 0, /* 0xe0 */ - 1, 0, 0, 1, 1, 0, 1, 1, /* 0xe8 */ - 0, 1, 1, 0, 1, 0, 110, 0, /* 0xf0 */ - 0, 0, 1, 1, 0, 0, 1, 0, /* 0xf8 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x08 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x18 */ + 0, 8, 42, 0, 0, 4, 2, 4, /* 0x20 */ + 48, 48, 0, 0, 652, 256, 0, 14, /* 0x28 */ + 244, 210, 152, 66, 56, 68, 50, 50, /* 0x30 */ + 56, 98, 78, 4, 0, 0, 0, 16, /* 0x38 */ + 0, 116, 42, 36, 28, 104, 28, 22, /* 0x40 */ + 138, 62, 92, 236, 122, 166, 84, 74, /* 0x48 */ + 148, 0, 78, 266, 192, 46, 132, 18, /* 0x50 */ + 2, 46, 2, 0, 0, 0, 0, 2, /* 0x58 */ + 0, 10798, 74, 64, 806, 7158, 64, 168, /* 0x60 */ + 1530, 9470, 1718, 4508, 5078, 2676, 7534, 4940, /* 0x68 */ + 1536, 2, 2192, 6724, 8484, 4652, 2062, 22, /* 0x70 */ + 8, 1600, 12, 
0, 0, 0, 0, 0, /* 0x78 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x88 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x98 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xa0 */ + 0, 0, 0, 0, 0, 12, 0, 0, /* 0xa8 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xb0 */ + 0, 1, 0, 0, 0, 0, 0, 0, /* 0xb8 */ + 0, 0, 0, 0, 4, 0, 0, 0, /* 0xc0 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xc8 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xd0 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xd8 */ + 0, 0, 0, 0, 3616, 0, 0, 0, /* 0xe0 */ + 0, 2, 0, 0, 0, 0, 0, 0, /* 0xe8 */ + 1, 0, 0, 0, 0, 0, 510, 0, /* 0xf0 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xf8 */ }; /* THIS IS A GENERATED VALUE, see data/totals.pl */ -#define WEIGHT_SUM 20425 +#define WEIGHT_SUM 46267 /* THIS IS A GENERATED TABLE, see data/totals.pl */ static const char *const CHARSET_NAMES[] = { "iso88594", + "cp1257", }; /* THIS IS A GENERATED TABLE, see data/totals.pl */ static const unsigned short int *const CHARSET_WEIGHTS[] = { RAW_ISO88594, + RAW_CP1257, }; /* THIS IS A GENERATED VALUE, see data/totals.pl */ @@ -91,4 +129,4 @@ static const unsigned short int *const CHARSET_WEIGHTS[] = { #define CHARSET_PAIRS NULL /* THIS IS A GENERATED VALUE, see data/totals.pl */ -#define NCHARSETS 1 +#define NCHARSETS 2 diff --git a/data/finnish/iso88594.base b/data/finnish/iso88594.base index 3f5cd5a..31f7520 100644 --- a/data/finnish/iso88594.base +++ b/data/finnish/iso88594.base` 1 -a 2403 -b 13 -c 9 -d 173 -e 1560 -f 11 -g 34 -h 337 -i 2059 -j 366 -k 994 -l 1114 -m 578 -n 1677 -o 1106 -p 336 -q 0 -r 477 -s 1475 -t 1859 -u 1022 -v 454 -w 3 -x 1 -y 342 -z 2 -� 1 -� 1 -� 1 -� 1 +[ 0 +\ 0 +] 0 +^ 0 +_ 1 +` 0 +a 5399 +b 37 +c 32 +d 403 +e 3579 +f 32 +g 84 +h 765 +i 4735 +j 859 +k 2254 +l 2539 +m 1338 +n 3767 +o 2470 +p 768 +q 1 +r 1096 +s 3362 +t 4242 +u 2326 +v 1031 +w 11 +x 4 +y 800 +z 6 +{ 0 +| 0 +} 0 +~ 0 +. 
0 � 0 -� 1 -� 1 � 0 -� 1 � 0 � 0 -� 1 � 0 � 0 -� 1 -� 1 -� 1 -� 1 -� 1 � 0 � 0 -� 1 -� 1 +� 6 +� 0 � 0 -� 1 � 0 -� 1 -� 1 -� 809 � 0 -� 1 -� 1 � 0 -� 1 -� 1 � 0 -� 1 -� 1 � 0 -� 1 -� 1 � 1 � 0 -� 110 � 0 -� 1 -� 1 +� 0 +� 0 +� 2 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 1808 +� 0 +� 0 +� 0 � 0 � 1 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 255 +� 0 +� 0 +� 0 +� 0 +� 0 diff --git a/data/finnish/rawcounts.iso88594 b/data/finnish/rawcounts.iso88594 index 30f0326..71ef656 100644 --- a/data/finnish/rawcounts.iso88594 +++ b/data/finnish/rawcounts.iso88594 @@ -1,132 +1,155 @@ -0x20 10528250 -0x21 ! 2813 -0x22 " 15298 -0x24 $ 29 -0x25 % 780 -0x26 & 805 -0x27 ' 686 -0x28 ( 20788 -0x29 ) 20496 -0x2a * 26 -0x2b + 1247 -0x2c , 594589 -0x2d - 185156 -0x2e . 957006 -0x2f / 2643 -0x30 0 133109 -0x31 1 94834 -0x32 2 105303 -0x33 3 39179 -0x34 4 30079 -0x35 5 37170 -0x36 6 24389 -0x37 7 24136 -0x38 8 24632 -0x39 9 31908 -0x3a : 47915 -0x3b ; 739 -0x3c < 3 -0x3e > 59 -0x3f ? 
14153 -0x41 A 85373 -0x42 B 34129 -0x43 C 22131 -0x44 D 17099 -0x45 E 88993 -0x46 F 23766 -0x47 G 15482 -0x48 H 117602 -0x49 I 42377 -0x4a J 74860 -0x4b K 192647 -0x4c L 101390 -0x4d M 146307 -0x4e N 79857 -0x4f O 61092 -0x50 P 133127 -0x51 Q 1494 -0x52 R 67132 -0x53 S 233652 -0x54 T 161743 -0x55 U 62303 -0x56 V 137766 -0x57 W 12058 -0x58 X 1665 -0x59 Y 45403 -0x5a Z 4648 -0x60 ` 28 -0x61 a 9662810 -0x62 b 53597 -0x63 c 39673 -0x64 d 695765 -0x65 e 6273896 -0x66 f 45149 -0x67 g 139320 -0x68 h 1356039 -0x69 i 8282073 -0x6a j 1472743 -0x6b k 3999670 -0x6c l 4482255 -0x6d m 2326017 -0x6e n 6744098 -0x6f o 4449343 -0x70 p 1351313 -0x71 q 1897 -0x72 r 1919721 -0x73 s 5932172 -0x74 t 7477814 -0x75 u 4111589 -0x76 v 1828014 -0x77 w 14187 -0x78 x 5852 -0x79 y 1377345 -0x7a z 11311 -0xa4 � 3 -0xa7 � 34 -0xa8 � 3 -0xa9 � 50 -0xae � 53 -0xb0 � 9 -0xb1 � 1 -0xb4 � 86 -0xb6 � 5 -0xb9 � 998 -0xbe � 202 -0xc1 � 11 -0xc4 � 2565 -0xc5 � 305 -0xc6 � 1 -0xc8 � 4 -0xc9 � 7 -0xd3 � 1 -0xd5 � 7 -0xd6 � 677 -0xd7 � 59 -0xd8 � 11 -0xdb � 1 -0xdc � 59 -0xdf � 4 -0xe1 � 431 -0xe2 � 7 -0xe3 � 22 -0xe4 � 3253874 -0xe5 � 657 -0xe6 � 14 -0xe8 � 46 -0xe9 � 1938 -0xeb � 31 -0xec � 28 -0xed � 81 -0xee � 3 -0xef � 1 -0xf0 � 288 -0xf1 � 1 -0xf2 � 2 -0xf4 � 2 -0xf5 � 92 -0xf6 � 443467 -0xf8 � 120 -0xfa � 17 -0xfb � 2 -0xfc � 1134 -0xfe � 3 +0x08 . 7 +0x15 . 1 +0x16 . 1 +0x20 43309824 +0x21 ! 32929 +0x22 " 157749 +0x23 # 2231 +0x24 $ 273 +0x25 % 20671 +0x26 & 10414 +0x27 ' 14661 +0x28 ( 176988 +0x29 ) 173923 +0x2a * 2473 +0x2b + 7176 +0x2c , 2356929 +0x2d - 929385 +0x2e . 4034374 +0x2f / 50930 +0x30 0 881852 +0x31 1 758265 +0x32 2 555082 +0x33 3 241716 +0x34 4 206462 +0x35 5 248893 +0x36 6 184643 +0x37 7 180858 +0x38 8 203671 +0x39 9 359659 +0x3a : 285993 +0x3b ; 19592 +0x3c < 339 +0x3d = 2455 +0x3e > 1246 +0x3f ? 
63572 +0x40 @ 2648 +0x41 A 424041 +0x42 B 154984 +0x43 C 132076 +0x44 D 105440 +0x45 E 376940 +0x46 F 106667 +0x47 G 81094 +0x48 H 500425 +0x49 I 228774 +0x4a J 338869 +0x4b K 857247 +0x4c L 443843 +0x4d M 599604 +0x4e N 306990 +0x4f O 273646 +0x50 P 537341 +0x51 Q 7082 +0x52 R 282878 +0x53 S 964224 +0x54 T 697305 +0x55 U 170075 +0x56 V 482988 +0x57 W 65684 +0x58 X 9926 +0x59 Y 168603 +0x5a Z 13982 +0x5b [ 2555 +0x5c \ 54 +0x5d ] 2574 +0x5e ^ 90 +0x5f _ 7538 +0x60 ` 269 +0x61 a 38977603 +0x62 b 273986 +0x63 c 236578 +0x64 d 2913605 +0x65 e 25839170 +0x66 f 233861 +0x67 g 611191 +0x68 h 5525729 +0x69 i 34184835 +0x6a j 6203886 +0x6b k 16273609 +0x6c l 18331533 +0x6d m 9660734 +0x6e n 27200205 +0x6f o 17835561 +0x70 p 5544908 +0x71 q 9220 +0x72 r 7912194 +0x73 s 24276850 +0x74 t 30624995 +0x75 u 16795139 +0x76 v 7446789 +0x77 w 80798 +0x78 x 34546 +0x79 y 5775453 +0x7a z 49256 +0x7b { 233 +0x7c | 635 +0x7d } 229 +0x7e ~ 200 +0x7f . 7 +0xa1 � 1 +0xa4 � 51 +0xa6 � 1 +0xa7 � 7098 +0xa8 � 95 +0xa9 � 962 +0xaa � 1 +0xab � 1 +0xad � 48990 +0xae � 256 +0xaf � 2 +0xb0 � 1561 +0xb1 � 31 +0xb4 � 803 +0xb6 � 21 +0xb8 � 3 +0xb9 � 7560 +0xba � 217 +0xbb � 3 +0xbe � 1435 +0xc0 � 7 +0xc4 � 15430 +0xc5 � 1636 +0xc6 � 45 +0xc8 � 87 +0xc9 � 314 +0xcf � 1 +0xd2 � 183 +0xd3 � 16 +0xd5 � 21 +0xd6 � 3626 +0xd7 � 920 +0xd8 � 151 +0xdc � 207 +0xde � 1 +0xdf � 230 +0xe0 � 272 +0xe4 � 13055748 +0xe5 � 4188 +0xe6 � 275 +0xe7 � 2 +0xe8 � 353 +0xe9 � 11018 +0xea � 32 +0xec � 93 +0xef � 102 +0xf1 � 53 +0xf2 � 1948 +0xf3 � 13 +0xf5 � 557 +0xf6 � 1847997 +0xf7 � 1 +0xf8 � 1072 +0xf9 � 5 +0xfc � 4651 +0xfe � 599 \ No newline at end of file diff --git a/lib/lang_fi.c b/lib/lang_fi.c index 5dc7034..cccd13b 100644 --- a/lib/lang_fi.c +++ b/lib/lang_fi.c @@ -24,6 +24,10 @@ #include "internal.h" #include "data/finnish/finnish.h" +/* Local prototypes. 
*/ +static int hook(EncaAnalyserState *analyser); +static int hook_iso4cp1257(EncaAnalyserState *analyser); + /** * ENCA_LANGUAGE_FI: * @@ -41,11 +45,50 @@ const EncaLanguageInfo ENCA_LANGUAGE_FI = { CHARSET_LETTERS, CHARSET_PAIRS, WEIGHT_SUM, - NULL, + &hook, NULL, NULL, NULL, }; +/** + * hook: + * @analyser: Analyser state whose charset ratings are to be modified. + * + * Launches language specific hooks for language "fi". + * + * Returns: Nonzero if charset ratigns have been actually modified, zero + * otherwise. + **/ +static int +hook(EncaAnalyserState *analyser) +{ + return hook_iso4cp1257(analyser); +} + +/** + * hook_iso4cp1257: + * @analyser: Analyser state whose charset ratings are to be modified. + * + * Decides between iso8859-4 and cp1257 charsets for language "fi". + * + * Returns: Nonzero if charset ratigns have been actually modified, zero + * otherwise. + **/ +static int +hook_iso4cp1257(EncaAnalyserState *analyser) +{ + static const unsigned char list_iso88594[] = { + 0xb9, 0xbe, 0xa9, 0xae, 0xa8}; + static const unsigned char list_cp1257[] = { + 0xf0, 0xfe, 0xd0, 0xde, 0xb8}; + static EncaLanguageHookData1CS hookdata[] = { + MAKE_HOOK_LINE(iso88594), + MAKE_HOOK_LINE(cp1257), + }; + + return enca_language_hook_ncs(analyser, ELEMENTS(hookdata), hookdata); +} + /* vim: ts=2 */ \ No newline at end of file From 943abd6535625d0980f362dcebd9eb5ff809cb02 Mon Sep 17 00:00:00 2001 From: Egor Kovalchuk Date: Mon, 28 Jul 2025 12:14:00 +0000 Subject: [PATCH 05/11] Refactor header inclusions and formatting in internal.h, lang.c, locale_detect.c, and simtable.c for improved readability and consistency. 
--- lib/internal.h | 188 +++++++++++++++++++---------------------- lib/lang.c | 132 ++++++++++++++--------------- src/locale_detect.c | 200 +++++++++++++++++++++----------------------- test/simtable.c | 16 ++-- 4 files changed, 252 insertions(+), 284 deletions(-) diff --git a/lib/internal.h b/lib/internal.h index 8ba02db..191c108 100644 --- a/lib/internal.h +++ b/lib/internal.h @@ -15,39 +15,38 @@ /* str- an mem- function, theoretically they are all in string.h */ #ifdef HAVE_STRING_H -#include +# include #else /* HAVE_STRING_H */ -#ifdef HAVE_STRINGS_H -#include -#endif /* HAVE_STRINGS_H */ +# ifdef HAVE_STRINGS_H +# include +# endif /* HAVE_STRINGS_H */ #endif /* HAVE_STRING_H */ #ifdef HAVE_MEMORY_H -#include +# include #endif /* HAVE_MEMORY_H */ #ifdef DEBUG -#include +# include #endif /* DEBUG */ /* Flags for character type table. * 0-10 are standard ones, 11-13 Enca-specific. */ -enum -{ - ENCA_CTYPE_ALNUM = 1 << 0, - ENCA_CTYPE_ALPHA = 1 << 1, - ENCA_CTYPE_CNTRL = 1 << 2, - ENCA_CTYPE_DIGIT = 1 << 3, - ENCA_CTYPE_GRAPH = 1 << 4, - ENCA_CTYPE_LOWER = 1 << 5, - ENCA_CTYPE_PRINT = 1 << 6, - ENCA_CTYPE_PUNCT = 1 << 7, - ENCA_CTYPE_SPACE = 1 << 8, - ENCA_CTYPE_UPPER = 1 << 9, +enum { + ENCA_CTYPE_ALNUM = 1 << 0, + ENCA_CTYPE_ALPHA = 1 << 1, + ENCA_CTYPE_CNTRL = 1 << 2, + ENCA_CTYPE_DIGIT = 1 << 3, + ENCA_CTYPE_GRAPH = 1 << 4, + ENCA_CTYPE_LOWER = 1 << 5, + ENCA_CTYPE_PRINT = 1 << 6, + ENCA_CTYPE_PUNCT = 1 << 7, + ENCA_CTYPE_SPACE = 1 << 8, + ENCA_CTYPE_UPPER = 1 << 9, ENCA_CTYPE_XDIGIT = 1 << 10, - ENCA_CTYPE_NAME = 1 << 11, + ENCA_CTYPE_NAME = 1 << 11, ENCA_CTYPE_BINARY = 1 << 12, - ENCA_CTYPE_TEXT = 1 << 13 + ENCA_CTYPE_TEXT = 1 << 13 }; /* Forward delcarations of structured Enca types */ @@ -75,8 +74,7 @@ typedef struct _EncaUTFCheckData EncaUTFCheckData; * * All the #int fields are indices in #ALIAS_LIST[]. 
**/ -struct _EncaCharsetInfo -{ +struct _EncaCharsetInfo { int enca; int rfc1345; int cstocs; @@ -98,7 +96,7 @@ struct _EncaCharsetInfo * Returns: Nonzero if charset ratigns have been actually modified, zero * otherwise. **/ -typedef int (*EncaHookFunc)(EncaAnalyserState *analyser); +typedef int (* EncaHookFunc)(EncaAnalyserState *analyser); /** * EncaGuessFunc: @@ -108,7 +106,7 @@ typedef int (*EncaHookFunc)(EncaAnalyserState *analyser); * * Returns: Nonzero if analyser->result has been set, zero otherwise. **/ -typedef int (*EncaGuessFunc)(EncaAnalyserState *analyser); +typedef int (* EncaGuessFunc)(EncaAnalyserState *analyser); /** * EncaLanguageInfo: @@ -128,8 +126,7 @@ typedef int (*EncaGuessFunc)(EncaAnalyserState *analyser); * * Language specific data. **/ -struct _EncaLanguageInfo -{ +struct _EncaLanguageInfo { const char *name; const char *humanname; size_t ncharsets; @@ -160,8 +157,7 @@ struct _EncaLanguageInfo * * Analyser options, a part of analyser state. **/ -struct _EncaAnalyserOptions -{ +struct _EncaAnalyserOptions { int const_buffer; size_t min_chars; double threshold; @@ -214,8 +210,7 @@ struct _EncaAnalyserOptions * * Passed as an opaque object (`this') to analyser calls. **/ -struct _EncaAnalyserState -{ +struct _EncaAnalyserState { /* Language data. */ const EncaLanguageInfo *lang; size_t ncharsets; @@ -256,8 +251,7 @@ struct _EncaAnalyserState * * Cointainer for data needed by enca_language_hook_ncs(). **/ -struct _EncaLanguageHookData1CS -{ +struct _EncaLanguageHookData1CS { const char *name; size_t size; const unsigned char *list; @@ -273,8 +267,7 @@ struct _EncaLanguageHookData1CS * * Cointainer for data needed by enca_language_hook_eol(). **/ -struct _EncaLanguageHookDataEOL -{ +struct _EncaLanguageHookDataEOL { const char *name; EncaSurface eol; size_t cs; @@ -292,8 +285,7 @@ struct _EncaLanguageHookDataEOL * * Data needed by double-UTF-8 check, per language charset. 
**/ -struct _EncaUTFCheckData -{ +struct _EncaUTFCheckData { double rating; size_t size; int result; @@ -339,20 +331,20 @@ struct _EncaUTFCheckData **/ #define enca_ctype_test(c, t) ((enca_ctype_data[(unsigned char)c] & t) != 0) -#define enca_isalnum(c) enca_ctype_test((c), ENCA_CTYPE_ALNUM) -#define enca_isalpha(c) enca_ctype_test((c), ENCA_CTYPE_ALPHA) -#define enca_iscntrl(c) enca_ctype_test((c), ENCA_CTYPE_CNTRL) -#define enca_isdigit(c) enca_ctype_test((c), ENCA_CTYPE_DIGIT) -#define enca_isgraph(c) enca_ctype_test((c), ENCA_CTYPE_GRAPH) -#define enca_islower(c) enca_ctype_test((c), ENCA_CTYPE_LOWER) -#define enca_isprint(c) enca_ctype_test((c), ENCA_CTYPE_PRINT) -#define enca_ispunct(c) enca_ctype_test((c), ENCA_CTYPE_PUNCT) -#define enca_isspace(c) enca_ctype_test((c), ENCA_CTYPE_SPACE) -#define enca_isupper(c) enca_ctype_test((c), ENCA_CTYPE_UPPER) +#define enca_isalnum(c) enca_ctype_test((c), ENCA_CTYPE_ALNUM) +#define enca_isalpha(c) enca_ctype_test((c), ENCA_CTYPE_ALPHA) +#define enca_iscntrl(c) enca_ctype_test((c), ENCA_CTYPE_CNTRL) +#define enca_isdigit(c) enca_ctype_test((c), ENCA_CTYPE_DIGIT) +#define enca_isgraph(c) enca_ctype_test((c), ENCA_CTYPE_GRAPH) +#define enca_islower(c) enca_ctype_test((c), ENCA_CTYPE_LOWER) +#define enca_isprint(c) enca_ctype_test((c), ENCA_CTYPE_PRINT) +#define enca_ispunct(c) enca_ctype_test((c), ENCA_CTYPE_PUNCT) +#define enca_isspace(c) enca_ctype_test((c), ENCA_CTYPE_SPACE) +#define enca_isupper(c) enca_ctype_test((c), ENCA_CTYPE_UPPER) #define enca_isxdigit(c) enca_ctype_test((c), ENCA_CTYPE_XDIGIT) -#define enca_isname(c) enca_ctype_test((c), ENCA_CTYPE_NAME) +#define enca_isname(c) enca_ctype_test((c), ENCA_CTYPE_NAME) #define enca_isbinary(c) enca_ctype_test((c), ENCA_CTYPE_BINARY) -#define enca_istext(c) enca_ctype_test((c), ENCA_CTYPE_TEXT) +#define enca_istext(c) enca_ctype_test((c), ENCA_CTYPE_TEXT) /** * ELEMENTS: @@ -362,11 +354,11 @@ struct _EncaUTFCheckData * * Returns: the number of elements. 
**/ -#define ELEMENTS(array) (sizeof(array) / sizeof((array)[0])) +#define ELEMENTS(array) (sizeof(array)/sizeof((array)[0])) -void *enca_malloc(size_t size); -void *enca_realloc(void *ptr, - size_t size); +void* enca_malloc (size_t size); +void* enca_realloc (void *ptr, + size_t size); /** * enca_free: @@ -378,11 +370,7 @@ void *enca_realloc(void *ptr, * @ptr MUST be l-value. **/ #define enca_free(ptr) \ - { \ - if (ptr) \ - free(ptr); \ - ptr = NULL; \ - } + { if (ptr) free(ptr); ptr=NULL; } /** * NEW: @@ -393,7 +381,7 @@ void *enca_realloc(void *ptr, * * Returns: Pointer to the newly allocated memory. **/ -#define NEW(type, n) ((type *)enca_malloc((n) * sizeof(type))) +#define NEW(type,n) ((type*)enca_malloc((n)*sizeof(type))) /** * RENEW: @@ -406,7 +394,7 @@ void *enca_realloc(void *ptr, * Returns: Pointer to the reallocated memory (or pointer safe to call free() * on when @n is zero). **/ -#define RENEW(ptr, type, n) ((type *)enca_realloc((ptr), (n) * sizeof(type))) +#define RENEW(ptr,type,n) ((type*)enca_realloc((ptr),(n)*sizeof(type))) /** * MAKE_HOOK_LINE: @@ -415,23 +403,23 @@ void *enca_realloc(void *ptr, * Ugly code `beautifier' macro for language hooks. 
**/ #define MAKE_HOOK_LINE(name) \ - {#name, ELEMENTS(list_##name), list_##name, (size_t)-1} + { #name, ELEMENTS(list_##name), list_##name, (size_t)-1 } /* Always use our, since we rely on enca_strdup(NULL) -> NULL */ -char *enca_strdup(const char *s); +char* enca_strdup(const char *s); #ifndef HAVE_STRSTR -const char *enca_strstr(const char *haystack, - const char *needle); -#else /* not HAVE_STRSTR */ -#define enca_strstr strstr +const char* enca_strstr(const char *haystack, + const char* needle); +#else/* not HAVE_STRSTR */ +# define enca_strstr strstr #endif /* not HAVE_STRSTR */ #ifndef HAVE_STPCPY -char *enca_stpcpy(char *dest, +char* enca_stpcpy(char *dest, const char *src); #else /* not HAVE_STPCPY */ -#define enca_stpcpy stpcpy +# define enca_stpcpy stpcpy #endif /* not HAVE_STPCPY */ /** @@ -443,52 +431,52 @@ char *enca_stpcpy(char *dest, #define enca_csname(cs) enca_charset_name((cs), ENCA_NAME_STYLE_ENCA) /* common.c */ -char *enca_strconcat(const char *str, - ...); -char *enca_strappend(char *str, - ...); +char* enca_strconcat (const char *str, + ...); +char* enca_strappend (char *str, + ...); /* encnames.c */ -int enca_name_to_charset(const char *csname); -EncaSurface enca_name_to_surface(const char *sname); +int enca_name_to_charset (const char *csname); +EncaSurface enca_name_to_surface (const char *sname); /* enca.c */ -int enca_language_init(EncaAnalyserState *analyser, - const char *langname); -void enca_language_destroy(EncaAnalyserState *analyser); -double *enca_get_charset_similarity_matrix(const EncaLanguageInfo *lang); +int enca_language_init (EncaAnalyserState *analyser, + const char *langname); +void enca_language_destroy (EncaAnalyserState *analyser); +double* enca_get_charset_similarity_matrix(const EncaLanguageInfo *lang); /* unicodemap.c */ -int enca_charsets_subset_identical(int charset1, - int charset2, - const size_t *counts); +int enca_charsets_subset_identical (int charset1, + int charset2, + const size_t *counts); /* filters.c */ 
-size_t enca_filter_boxdraw(EncaAnalyserState *analyser, - unsigned char fill_char); -int enca_language_hook_ncs(EncaAnalyserState *analyser, - size_t ncs, - EncaLanguageHookData1CS *hookdata); -int enca_language_hook_eol(EncaAnalyserState *analyser, - size_t ncs, - EncaLanguageHookDataEOL *hookdata); +size_t enca_filter_boxdraw (EncaAnalyserState *analyser, + unsigned char fill_char); +int enca_language_hook_ncs (EncaAnalyserState *analyser, + size_t ncs, + EncaLanguageHookData1CS *hookdata); +int enca_language_hook_eol (EncaAnalyserState *analyser, + size_t ncs, + EncaLanguageHookDataEOL *hookdata); /* guess.c */ -void enca_guess_init(EncaAnalyserState *analyser); -void enca_guess_destroy(EncaAnalyserState *analyser); -EncaSurface enca_eol_surface(const unsigned char *buffer, - size_t size, - const size_t *counts); -void enca_find_max_sec(EncaAnalyserState *analyser); +void enca_guess_init (EncaAnalyserState *analyser); +void enca_guess_destroy (EncaAnalyserState *analyser); +EncaSurface enca_eol_surface (const unsigned char *buffer, + size_t size, + const size_t *counts); +void enca_find_max_sec (EncaAnalyserState *analyser); /* utf8_double.c */ -void enca_double_utf8_init(EncaAnalyserState *analyser); -void enca_double_utf8_destroy(EncaAnalyserState *analyser); +void enca_double_utf8_init (EncaAnalyserState *analyser); +void enca_double_utf8_destroy (EncaAnalyserState *analyser); /* pair.c */ -void enca_pair_init(EncaAnalyserState *analyser); -void enca_pair_destroy(EncaAnalyserState *analyser); -int enca_pair_analyse(EncaAnalyserState *analyser); +void enca_pair_init (EncaAnalyserState *analyser); +void enca_pair_destroy (EncaAnalyserState *analyser); +int enca_pair_analyse (EncaAnalyserState *analyser); /* Languages. */ extern const EncaLanguageInfo ENCA_LANGUAGE_BE; @@ -517,4 +505,4 @@ extern EncaGuessFunc ENCA_MULTIBYTE_TESTS_8BIT_TOLERANT[]; /* Locale-independent character type table. 
*/ extern const short int enca_ctype_data[0x100]; -#endif /* not LIBENCA_H */ +#endif /* not LIBENCA_H */ \ No newline at end of file diff --git a/lib/lang.c b/lib/lang.c index 73bf823..2663b2a 100644 --- a/lib/lang.c +++ b/lib/lang.c @@ -17,7 +17,7 @@ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ #ifdef HAVE_CONFIG_H -#include "config.h" +# include "config.h" #endif /* HAVE_CONFIG_H */ #include "enca.h" @@ -30,46 +30,46 @@ * tested **/ static const EncaLanguageInfo ENCA_LANGUAGE___ = { - "__", /* name */ - "none", /* human name */ - 0, /* number of charsets */ - NULL, /* their names */ - NULL, /* character weights */ - NULL, /* significancy data */ - NULL, /* letter data */ - NULL, /* pair data */ - 0, /* sum of weights */ - NULL, /* hook function */ - NULL, /* eolhook function */ - NULL, /* lcuchook function */ - NULL, /* ratinghook function */ + "__", /* name */ + "none", /* human name */ + 0, /* number of charsets */ + NULL, /* their names */ + NULL, /* character weights */ + NULL, /* significancy data */ + NULL, /* letter data */ + NULL, /* pair data */ + 0, /* sum of weights */ + NULL, /* hook function */ + NULL, /* eolhook function */ + NULL, /* lcuchook function */ + NULL, /* ratinghook function */ }; /* All languages. */ static const EncaLanguageInfo *const LANGUAGE_LIST[] = { - &ENCA_LANGUAGE_BE, /* Belarusian. */ - &ENCA_LANGUAGE_BG, /* Bulgarian. */ - &ENCA_LANGUAGE_CS, /* Czech. */ - &ENCA_LANGUAGE_ET, /* Estonian. */ - &ENCA_LANGUAGE_FI, /* Finnish. */ - &ENCA_LANGUAGE_HR, /* Croatian. */ - &ENCA_LANGUAGE_HU, /* Hungarian. */ - &ENCA_LANGUAGE_LT, /* Latvian. */ - &ENCA_LANGUAGE_LV, /* Lithuanian. */ - &ENCA_LANGUAGE_PL, /* Polish. */ - &ENCA_LANGUAGE_RU, /* Russian. */ - &ENCA_LANGUAGE_SK, /* Slovak. */ - &ENCA_LANGUAGE_SL, /* Slovene. */ - &ENCA_LANGUAGE_UK, /* Ukrainian. */ - &ENCA_LANGUAGE_ZH, /* Chinese. */ - &ENCA_LANGUAGE___, /* None. */ + &ENCA_LANGUAGE_BE, /* Belarusian. */ + &ENCA_LANGUAGE_BG, /* Bulgarian. 
*/ + &ENCA_LANGUAGE_CS, /* Czech. */ + &ENCA_LANGUAGE_ET, /* Estonian. */ + &ENCA_LANGUAGE_FI, /* Finnish. */ + &ENCA_LANGUAGE_HR, /* Croatian. */ + &ENCA_LANGUAGE_HU, /* Hungarian. */ + &ENCA_LANGUAGE_LT, /* Latvian. */ + &ENCA_LANGUAGE_LV, /* Lithuanian. */ + &ENCA_LANGUAGE_PL, /* Polish. */ + &ENCA_LANGUAGE_RU, /* Russian. */ + &ENCA_LANGUAGE_SK, /* Slovak. */ + &ENCA_LANGUAGE_SL, /* Slovene. */ + &ENCA_LANGUAGE_UK, /* Ukrainian. */ + &ENCA_LANGUAGE_ZH, /* Chinese. */ + &ENCA_LANGUAGE___, /* None. */ }; #define NLANGUAGES (ELEMENTS(LANGUAGE_LIST)) /* Local prototypes. */ -static int *language_charsets_ids(const EncaLanguageInfo *lang); -static const EncaLanguageInfo *find_language(const char *langname); +static int* language_charsets_ids(const EncaLanguageInfo *lang); +static const EncaLanguageInfo* find_language(const char *langname); /** * enca_language_init: @@ -83,8 +83,9 @@ static const EncaLanguageInfo *find_language(const char *langname); * * Returns: Nonzero on success, zero otherwise. **/ -int enca_language_init(EncaAnalyserState *analyser, - const char *langname) +int +enca_language_init(EncaAnalyserState *analyser, + const char *langname) { const EncaLanguageInfo *lang; @@ -116,7 +117,8 @@ int enca_language_init(EncaAnalyserState *analyser, * * Destroys the language part of analyser state @analyser. **/ -void enca_language_destroy(EncaAnalyserState *analyser) +void +enca_language_destroy(EncaAnalyserState *analyser) { enca_free(analyser->charsets); enca_free(analyser->lcbits); @@ -139,13 +141,13 @@ void enca_language_destroy(EncaAnalyserState *analyser) * * Returns: The list of languages, storing their number into *@n. 
**/ -const char ** +const char** enca_get_languages(size_t *n) { const char **languages; size_t i; - languages = NEW(const char *, NLANGUAGES); + languages = NEW(const char*, NLANGUAGES); for (i = 0; i < NLANGUAGES; i++) languages[i] = LANGUAGE_LIST[i]->name; @@ -163,7 +165,7 @@ enca_get_languages(size_t *n) * * Returns: The language name. **/ -const char * +const char* enca_analyser_language(EncaAnalyser analyser) { assert(analyser != NULL); @@ -181,7 +183,7 @@ enca_analyser_language(EncaAnalyser analyser) * * Returns: The English language name. **/ -const char * +const char* enca_language_english_name(const char *lang) { const EncaLanguageInfo *linfo; @@ -206,16 +208,16 @@ enca_language_english_name(const char *lang) * contains no charsets or @langname is invalid, #NULL is returned * and zero stored into *@n. **/ -int *enca_get_language_charsets(const char *langname, - size_t *n) +int* +enca_get_language_charsets(const char *langname, + size_t *n) { const EncaLanguageInfo *lang; assert(langname != NULL); lang = find_language(langname); - if (lang == NULL) - { + if (lang == NULL) { *n = 0; return NULL; } @@ -235,7 +237,7 @@ int *enca_get_language_charsets(const char *langname, * * Returns: The charsets id table; #NULL when @lang has no charsets. **/ -static int * +static int* language_charsets_ids(const EncaLanguageInfo *lang) { int *charsets; @@ -247,8 +249,7 @@ language_charsets_ids(const EncaLanguageInfo *lang) return NULL; charsets = NEW(int, lang->ncharsets); - for (i = 0; i < lang->ncharsets; i++) - { + for (i = 0; i < lang->ncharsets; i++) { charsets[i] = enca_name_to_charset(lang->csnames[i]); assert(charsets[i] != ENCA_CS_UNKNOWN); } @@ -264,7 +265,7 @@ language_charsets_ids(const EncaLanguageInfo *lang) * * Returns: Pointer to its language information data; #NULL if not found. 
**/ -static const EncaLanguageInfo * +static const EncaLanguageInfo* find_language(const char *langname) { const EncaLanguageInfo *lang = NULL; @@ -273,10 +274,8 @@ find_language(const char *langname) if (langname == NULL) return NULL; - for (i = 0; i < NLANGUAGES; i++) - { - if (strcmp(langname, LANGUAGE_LIST[i]->name) == 0) - { + for (i = 0; i < NLANGUAGES; i++) { + if (strcmp(langname, LANGUAGE_LIST[i]->name) == 0) { lang = LANGUAGE_LIST[i]; break; } @@ -305,7 +304,7 @@ find_language(const char *langname) * Returns: The matrix, its size is determined by @lang->ncharsets; #NULL * for language with no charsets. **/ -double * +double* enca_get_charset_similarity_matrix(const EncaLanguageInfo *lang) { const size_t n = lang->ncharsets; @@ -321,32 +320,27 @@ enca_get_charset_similarity_matrix(const EncaLanguageInfo *lang) return NULL; /* Below diagonal. */ - smat = NEW(double, n *n); - for (i = 0; i < n; i++) - { - for (j = 0; j <= i; j++) - { - smat[i * n + j] = 0.0; + smat = NEW(double, n*n); + for (i = 0; i < n; i++) { + for (j = 0; j <= i; j++) { + smat[i*n + j] = 0.0; for (c = 0; c < 0x100; c++) - smat[i * n + j] += (double)w[i][c] * (double)w[j][c] / (s[c] + EPSILON); + smat[i*n + j] += (double)w[i][c] * (double)w[j][c] / (s[c] + EPSILON); } } /* Above diagonal. */ - for (i = 0; i < n; i++) - { - for (j = i + 1; j < n; j++) - smat[i * n + j] = smat[j * n + i]; + for (i = 0; i < n; i++) { + for (j = i+1; j < n; j++) + smat[i*n + j] = smat[j*n + i]; } /* Normalize. 
*/ - for (i = 0; i < n; i++) - { - double wmax = smat[i * n + i]; + for (i = 0; i < n; i++) { + double wmax = smat[i*n + i]; - for (j = 0; j < n; j++) - { - smat[i * n + j] /= wmax; + for (j = 0; j < n; j++) { + smat[i*n + j] /= wmax; } } diff --git a/src/locale_detect.c b/src/locale_detect.c index 446b990..90a8883 100644 --- a/src/locale_detect.c +++ b/src/locale_detect.c @@ -19,32 +19,32 @@ #include "common.h" #ifdef HAVE_SETLOCALE -#ifdef HAVE_LOCALE_H -#include -#else /* HAVE_LOCALE_H */ -char *setlocale(int category, const char *locale); -#endif /* HAVE_LOCALE_H */ +# ifdef HAVE_LOCALE_H +# include +# else /* HAVE_LOCALE_H */ +char* setlocale(int category, const char *locale); +# endif /* HAVE_LOCALE_H */ #endif /* HAVE_SETLOCALE */ #ifdef HAVE_NL_LANGINFO -#ifdef HAVE_LANGINFO_H -#include -#else /* HAVE_LANGINFO_H */ +# ifdef HAVE_LANGINFO_H +# include +# else /* HAVE_LANGINFO_H */ char *nl_langinfo(nl_item *item); -#endif /* HAVE_LANGINFO_H */ +# endif /* HAVE_LANGINFO_H */ #endif /* HAVE_NL_LANGINFO */ static char *codeset = NULL; /* Local prototypes. */ -static char *locale_alias_convert(const char *locname); -static char *strip_locale_name(const char *locname); -static char *static_iso639_alias_convert(const char *locname); +static char* locale_alias_convert(const char *locname); +static char* strip_locale_name(const char *locname); +static char* static_iso639_alias_convert(const char *locname); #ifdef HAVE_SETLOCALE -static char *detect_target_charset(const char *locname); -static char *detect_user_language(void); +static char* detect_target_charset(const char *locname); +static char* detect_user_language(void); #endif /* HAVE_SETLOCALE */ -static void codeset_free(void); +static void codeset_free(void); /* * when lang is not NULL converts it to two-character language code @@ -52,7 +52,7 @@ static void codeset_free(void); * returns string of length 2 containig language code (to be freed by caller) * or NULL if not detected or unable to convert. 
*/ -char * +char* detect_lang(const char *lang) { char *locname, *result, *cvt; @@ -60,8 +60,7 @@ detect_lang(const char *lang) atexit(codeset_free); #ifdef HAVE_SETLOCALE /* No lang, detect locale, then CODESET, then try to transform it */ - if (!lang) - { + if (!lang) { locname = detect_user_language(); /* HERE: locname is (a) newly allocated (b) NULL */ codeset = detect_target_charset(locname); @@ -82,7 +81,7 @@ detect_lang(const char *lang) enca_free(locname); return result; -#else /* HAVE_SETLOCALE */ +#else /* HAVE_SETLOCALE */ UNUSED(locname); cvt = locale_alias_convert(lang); result = strip_locale_name(cvt); @@ -99,7 +98,7 @@ detect_lang(const char *lang) * * Returns: A string (to be freed) with charset name or NULL on failure. **/ -static char * +static char* detect_target_charset(const char *locname) { char *s = NULL; @@ -113,10 +112,9 @@ detect_target_charset(const char *locname) s = enca_strdup(nl_langinfo(CODESET)); - if (setlocale(LC_CTYPE, "C") == NULL) - { + if (setlocale(LC_CTYPE, "C") == NULL) { fprintf(stderr, "%s: Cannot set LC_CTYPE to the portable \"C\" locale\n", - program_name); + program_name); exit(EXIT_TROUBLE); } if (options.verbosity_level > 2) @@ -139,33 +137,33 @@ detect_target_charset(const char *locname) * * Returns: A string (to be freed) with locale name or NULL on failure. 
**/ -static char * +static char* detect_user_language(void) { static const int test_categories[] = { - LC_CTYPE, - LC_COLLATE, + LC_CTYPE, LC_COLLATE, #if HAVE_LC_MESSAGES - LC_MESSAGES, + LC_MESSAGES, #endif }; char *s = NULL; size_t i; - for (i = 0; i < ELEMENTS(test_categories); i++) - { + for (i = 0; i < ELEMENTS(test_categories); i++) { enca_free(s); if ((s = setlocale(test_categories[i], "")) == NULL) continue; s = enca_strdup(s); - if (setlocale(test_categories[i], "C") == NULL) - { + if (setlocale(test_categories[i], "C") == NULL) { fprintf(stderr, "%s: Cannot set locale to the portable \"C\" locale\n", - program_name); + program_name); exit(EXIT_TROUBLE); } - if (strcmp(s, "") == 0 || strcmp(s, "C") == 0 || strcmp(s, "POSIX") == 0 || (strncmp(s, "en", 2) == 0 && !isalpha(s[2]))) + if (strcmp(s, "") == 0 + || strcmp(s, "C") == 0 + || strcmp(s, "POSIX") == 0 + || (strncmp(s, "en", 2) == 0 && !isalpha(s[2]))) continue; if (options.verbosity_level > 2) @@ -188,13 +186,13 @@ detect_user_language(void) (but the worst thing that can happen is we return wrong locale name) the locale.alias format is nowhere described, so we assume every line consists of alias (row 1), some whitespace and canonical name */ -static char * +static char* locale_alias_convert(const char *locname) { #ifdef HAVE_LOCALE_ALIAS File *fla; /* locale.alias file */ Buffer *buf; - char *s, *p, *q; + char *s,*p,*q; size_t n; #endif /* HAVE_LOCALE_ALIAS */ @@ -209,10 +207,8 @@ locale_alias_convert(const char *locname) /* try to read locale.alias */ buf = buffer_new(0); fla = file_new(LOCALE_ALIAS_PATH, buf); - if (file_open(fla, "r") != 0) - { - if (options.verbosity_level) - { + if (file_open(fla, "r") != 0) { + if (options.verbosity_level) { fprintf(stderr, "Cannot find locale.alias file.\n" "This build of enca probably has been configured for " "quite a different system\n"); @@ -226,20 +222,16 @@ locale_alias_convert(const char *locname) somewhat crude now */ n = strlen(locname); p = NULL; - s 
= (char *)buf->data; /* alias */ - while (file_getline(fla) != NULL) - { + s = (char*)buf->data; /* alias */ + while (file_getline(fla) != NULL) { if (strncmp(s, locname, n) == 0 && - (isspace(s[n]) || (s[n] == ':' && isspace(s[n + 1])))) - { + (isspace(s[n]) || (s[n] == ':' && isspace(s[n+1])))) { p = s + n; /* skip any amount of whitespace */ - while (isspace(*p)) - p++; + while (isspace(*p)) p++; q = p; /* anything up to next whitespace is the canonical locale name */ - while (*q != '\0' && !isspace(*q)) - q++; + while (*q != '\0' && !isspace(*q)) q++; *q = '\0'; p = enca_strdup(p); break; @@ -250,7 +242,7 @@ locale_alias_convert(const char *locname) buffer_free(buf); return p != NULL ? p : static_iso639_alias_convert(locname); -#else /* HAVE_LOCALE_ALIAS */ +#else /* HAVE_LOCALE_ALIAS */ return static_iso639_alias_convert(locname); #endif /* HAVE_LOCALE_ALIAS */ } @@ -264,7 +256,7 @@ locale_alias_convert(const char *locname) * * Returns: the codeset name. **/ -const char * +const char* get_lang_codeset(void) { if (!codeset) @@ -279,18 +271,18 @@ get_lang_codeset(void) * * Returned string should be freed by caller. **/ -static char * +static char* strip_locale_name(const char *locname) { /* Some supported languages can also appear as dialects of some other * language */ - struct - { + struct { const char *dialect; const char *iso639; - } const DIALECTS[] = { - {"cs_SK", "sk"}, - {"ru_UA", "uk"}, + } + const DIALECTS[] = { + { "cs_SK", "sk" }, + { "ru_UA", "uk" }, }; size_t n; @@ -306,15 +298,13 @@ strip_locale_name(const char *locname) return s; /* Some long specification (either X/Open or CEN). */ - if (n >= 5 && s[2] == '_' && (s[5] == '\0' || s[5] == '.' || s[5] == '+')) - { + if (n >= 5 && s[2] == '_' + && (s[5] == '\0' || s[5] == '.' || s[5] == '+')) { size_t i; /* Convert dialects. 
*/ - for (i = 0; i < ELEMENTS(DIALECTS); i++) - { - if (strncmp(DIALECTS[i].dialect, s, 5) == 0) - { + for (i = 0; i < ELEMENTS(DIALECTS); i++) { + if (strncmp(DIALECTS[i].dialect, s, 5) == 0) { s[0] = DIALECTS[i].iso639[0]; s[1] = DIALECTS[i].iso639[1]; break; @@ -323,8 +313,7 @@ strip_locale_name(const char *locname) s[2] = '\0'; } - else - { + else { /* Just garbage or some unresolved locale alias. */ enca_free(s); } @@ -339,59 +328,58 @@ strip_locale_name(const char *locname) * * Returned string should be freed by caller. **/ -static char * +static char* static_iso639_alias_convert(const char *locname) { - struct - { + struct { const char *alias; const char *iso639; - } const ALIASES[] = { - {"byelarussian", "be"}, - {"byelarusian", "be"}, - {"belarussian", "be"}, - {"belarusian", "be"}, - {"byelorussian", "be"}, - {"belorussian", "be"}, - {"byelorusian", "be"}, - {"belorusian", "be"}, - {"bosnian", "hr"}, - {"bulgarian", "bg"}, - {"chinese", "zh"}, - {"croatian", "hr"}, - {"czech", "cs"}, - {"estonian", "et"}, - {"finnish", "fi"}, - {"hungarian", "hu"}, - {"lativan", "lt"}, - {"lettic", "lv"}, - {"lettish", "lv"}, - {"lithuanian", "lt"}, - {"macedonian", "bg"}, - {"magyar", "hu"}, - {"montenegrin-cyrilic", "bg"}, - {"montenegrin-latin", "hr"}, - {"polish", "pl"}, - {"russian", "ru"}, - {"serbian-cyrilic", "bg"}, - {"serbian-latin", "hr"}, - {"slovak", "sk"}, - {"slovene", "sl"}, - {"slovenian", "sl"}, - {"ukrainian", "uk"}}; + } + const ALIASES[] = { + { "byelarussian", "be" }, + { "byelarusian", "be" }, + { "belarussian", "be" }, + { "belarusian", "be" }, + { "byelorussian", "be" }, + { "belorussian", "be" }, + { "byelorusian", "be" }, + { "belorusian", "be" }, + { "bosnian", "hr" }, + { "bulgarian", "bg" }, + { "chinese", "zh" }, + { "croatian", "hr" }, + { "czech", "cs" }, + { "estonian", "et" }, + { "finnish", "fi"}, + { "hungarian", "hu" }, + { "lativan", "lt" }, + { "lettic", "lv" }, + { "lettish", "lv" }, + { "lithuanian", "lt" }, + { "macedonian", 
"bg"}, + { "magyar", "hu" }, + { "montenegrin-cyrilic", "bg"}, + { "montenegrin-latin", "hr" }, + { "polish", "pl" }, + { "russian", "ru" }, + { "serbian-cyrilic", "bg"}, + { "serbian-latin", "hr"}, + { "slovak", "sk" }, + { "slovene", "sl" }, + { "slovenian", "sl" }, + { "ukrainian", "uk" } + }; size_t i; if (!locname) return NULL; - for (i = 0; i < ELEMENTS(ALIASES); i++) - { - if (strcmp(ALIASES[i].alias, locname) == 0) - { + for (i = 0; i < ELEMENTS(ALIASES); i++) { + if (strcmp(ALIASES[i].alias, locname) == 0) { if (options.verbosity_level > 2) fprintf(stderr, "Decrypted locale alias using built-in table: %s\n", - ALIASES[i].iso639); + ALIASES[i].iso639); return enca_strdup(ALIASES[i].iso639); } @@ -407,4 +395,4 @@ codeset_free(void) } /* vim: ts=2 - */ + */ \ No newline at end of file diff --git a/test/simtable.c b/test/simtable.c index 21a317c..9170c37 100644 --- a/test/simtable.c +++ b/test/simtable.c @@ -16,8 +16,7 @@ prl(const EncaLanguageInfo *l, const char *hooks) int a; size_t i, j; - if (myargc > 1) - { + if (myargc > 1) { a = 1; while (a < myargc && strcmp(myargv[a], l->name)) a++; @@ -27,11 +26,9 @@ prl(const EncaLanguageInfo *l, const char *hooks) printf("\n==\x1b[1m%s\x1b[m==\n", l->name); m = enca_get_charset_similarity_matrix(l); - for (i = 0; i < l->ncharsets; i++) - { - for (j = 0; j < l->ncharsets; j++) - { - double q = 1000.0 * m[i * l->ncharsets + j]; + for (i = 0; i < l->ncharsets; i++) { + for (j = 0; j < l->ncharsets; j++) { + double q = 1000.0*m[i*l->ncharsets + j]; if (i == j) printf("\x1b[36m"); @@ -52,7 +49,8 @@ prl(const EncaLanguageInfo *l, const char *hooks) free(m); } -int main(int argc, char *argv[]) +int +main(int argc, char *argv[]) { myargc = argc; myargv = argv; @@ -73,4 +71,4 @@ int main(int argc, char *argv[]) prl(&ENCA_LANGUAGE_UK, "macwin isokoi ibm1125"); return 0; -} +} \ No newline at end of file From 6c7e496d47b3f6919a02286816dd4966d4ec7ff4 Mon Sep 17 00:00:00 2001 From: Egor Kovalchuk Date: Tue, 29 Jul 2025 20:33:23 
+0000 Subject: [PATCH 06/11] Add Finnish language support to setup script and include sample text files in cp1257, iso88594, and utf8 encodings --- test/fi-s.cp1257 | 1 + test/fi-s.iso88594 | 1 + test/fi-s.utf8 | 1 + test/setup.sh | 3 ++- 4 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 test/fi-s.cp1257 create mode 100644 test/fi-s.iso88594 create mode 100644 test/fi-s.utf8 diff --git a/test/fi-s.cp1257 b/test/fi-s.cp1257 new file mode 100644 index 0000000..2c501e2 --- /dev/null +++ b/test/fi-s.cp1257 @@ -0,0 +1 @@ +Hyv yt! Kyttk shk? iti lysi ypydlt ljy. \ No newline at end of file diff --git a/test/fi-s.iso88594 b/test/fi-s.iso88594 new file mode 100644 index 0000000..2c501e2 --- /dev/null +++ b/test/fi-s.iso88594 @@ -0,0 +1 @@ +Hyv yt! Kyttk shk? iti lysi ypydlt ljy. \ No newline at end of file diff --git a/test/fi-s.utf8 b/test/fi-s.utf8 new file mode 100644 index 0000000..d1bdda7 --- /dev/null +++ b/test/fi-s.utf8 @@ -0,0 +1 @@ +Hyvää yötä! Käytätkö sähköä? Äiti löysi yöpöydältä öljyä. 
\ No newline at end of file diff --git a/test/setup.sh b/test/setup.sh index a1b886f..b6f634c 100644 --- a/test/setup.sh +++ b/test/setup.sh @@ -1,10 +1,11 @@ ENCA=$top_builddir/src/enca -TEST_LANGUAGES="be bg cs et hr hu lt lv pl ru sk sl uk zh" +TEST_LANGUAGES="be bg cs et fi hr hu lt lv pl ru sk sl uk zh" ALL_TEST_LANGUAGES="$TEST_LANGUAGES none" TEST_PAIR_be="koi8uni cp1251" TEST_PAIR_bg="ibm855 cp1251" TEST_PAIR_cs="keybcs2 ibm852" TEST_PAIR_et="iso885913 baltic" +TEST_PAIR_fi="iso88594 cp1257" TEST_PAIR_hr="ibm852 cp1250" TEST_PAIR_hu="cp1250 ibm852" TEST_PAIR_lt="iso88594 baltic" From f5392452448b24d0fef69e1d72eaa012b59fc97e Mon Sep 17 00:00:00 2001 From: Egor Kovalchuk Date: Tue, 29 Jul 2025 21:25:10 +0000 Subject: [PATCH 07/11] Update Finnish test files with new sample text for cp1257, iso88594, and utf8 encodings --- test/fi-s.cp1257 | 2 +- test/fi-s.iso88594 | 2 +- test/fi-s.utf8 | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/fi-s.cp1257 b/test/fi-s.cp1257 index 2c501e2..7bb4af6 100644 --- a/test/fi-s.cp1257 +++ b/test/fi-s.cp1257 @@ -1 +1 @@ -Hyv yt! Kyttk shk? iti lysi ypydlt ljy. \ No newline at end of file +Tss on testimerkkej: \ No newline at end of file diff --git a/test/fi-s.iso88594 b/test/fi-s.iso88594 index 2c501e2..afc51c8 100644 --- a/test/fi-s.iso88594 +++ b/test/fi-s.iso88594 @@ -1 +1 @@ -Hyv yt! Kyttk shk? iti lysi ypydlt ljy. \ No newline at end of file +Tss on testimerkkej: \ No newline at end of file diff --git a/test/fi-s.utf8 b/test/fi-s.utf8 index d1bdda7..055a354 100644 --- a/test/fi-s.utf8 +++ b/test/fi-s.utf8 @@ -1 +1 @@ -Hyvää yötä! Käytätkö sähköä? Äiti löysi yöpöydältä öljyä. 
\ No newline at end of file +Tässä on testimerkkejä: š ž ¸ \ No newline at end of file From 66123e2c1e20decfd36a5223f82181fd5623b511 Mon Sep 17 00:00:00 2001 From: Egor Kovalchuk Date: Tue, 29 Jul 2025 21:49:57 +0000 Subject: [PATCH 08/11] Update Finnish test files with new sample text for cp1257, iso88594, and utf8 encodings --- test/fi-s.cp1257 | 2 +- test/fi-s.iso88594 | 2 +- test/fi-s.utf8 | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/fi-s.cp1257 b/test/fi-s.cp1257 index 7bb4af6..32d0937 100644 --- a/test/fi-s.cp1257 +++ b/test/fi-s.cp1257 @@ -1 +1 @@ -Tss on testimerkkej: \ No newline at end of file +Hyv ystv, l kyt ljy yll. Kvely jrven rell on trke. , , \ No newline at end of file diff --git a/test/fi-s.iso88594 b/test/fi-s.iso88594 index afc51c8..32d0937 100644 --- a/test/fi-s.iso88594 +++ b/test/fi-s.iso88594 @@ -1 +1 @@ -Tss on testimerkkej: \ No newline at end of file +Hyv ystv, l kyt ljy yll. Kvely jrven rell on trke. , , \ No newline at end of file diff --git a/test/fi-s.utf8 b/test/fi-s.utf8 index 055a354..95fc5cb 100644 --- a/test/fi-s.utf8 +++ b/test/fi-s.utf8 @@ -1 +1 @@ -Tässä on testimerkkejä: š ž ¸ \ No newline at end of file +Hyvä ystävä, älä käytä öljyä yöllä. Kävelyä järven äärellä on tärkeää. 
š ž \ No newline at end of file From 1a963bc012f5b99e866af03548f72513800cf7ce Mon Sep 17 00:00:00 2001 From: Egor Kovalchuk Date: Tue, 29 Jul 2025 22:13:23 +0000 Subject: [PATCH 09/11] Add Finnish encoding support to test-guess-short.expected --- test/test-guess-short.expected | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/test-guess-short.expected b/test/test-guess-short.expected index 4de1a58..6ecb9c8 100644 --- a/test/test-guess-short.expected +++ b/test/test-guess-short.expected @@ -31,6 +31,9 @@ et-s.iso88594: ISO-8859-4 et-s.iso88594.qp: ISO-8859-4/qp et-s.macce: macce et-s.utf8: UTF-8/CRLF +fi-s.cp1257: CP1257/LF +fi-s.iso88594: ISO-8859-4 +fi-s.utf8: UTF-8 hr-s.cork: CORK/LF hr-s.cp1250: CP1250 hr-s.ibm852: IBM852 From f1104679c8eb074a14560f1d26eac059af1a7f49 Mon Sep 17 00:00:00 2001 From: Egor Kovalchuk Date: Tue, 29 Jul 2025 22:17:06 +0000 Subject: [PATCH 10/11] Fix encoding test files by removing extraneous characters and ensuring proper formatting for cp1257, iso88594, and utf8 encodings. --- test/fi-s.cp1257 | 2 +- test/fi-s.iso88594 | 2 +- test/fi-s.utf8 | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/fi-s.cp1257 b/test/fi-s.cp1257 index 32d0937..41e0274 100644 --- a/test/fi-s.cp1257 +++ b/test/fi-s.cp1257 @@ -1 +1 @@ -Hyv ystv, l kyt ljy yll. Kvely jrven rell on trke. , , \ No newline at end of file +Hyv ystv, l kyt ljy yll. Kvely jrven rell on trke. \ No newline at end of file diff --git a/test/fi-s.iso88594 b/test/fi-s.iso88594 index 32d0937..91fe12f 100644 --- a/test/fi-s.iso88594 +++ b/test/fi-s.iso88594 @@ -1 +1 @@ -Hyv ystv, l kyt ljy yll. Kvely jrven rell on trke. , , \ No newline at end of file +Hyv ystv, l kyt ljy yll. Kvely jrven rell on trke. \ No newline at end of file diff --git a/test/fi-s.utf8 b/test/fi-s.utf8 index 95fc5cb..e3d7bff 100644 --- a/test/fi-s.utf8 +++ b/test/fi-s.utf8 @@ -1 +1 @@ -Hyvä ystävä, älä käytä öljyä yöllä. Kävelyä järven äärellä on tärkeää. 
š ž \ No newline at end of file +Hyvä ystävä, älä käytä öljyä yöllä. Kävelyä järven äärellä on tärkeää. š \ No newline at end of file From 1d05dcc4443070d49405e1e19a4a250bd6ad9f2c Mon Sep 17 00:00:00 2001 From: Egor Kovalchuk Date: Wed, 30 Jul 2025 08:19:39 +0000 Subject: [PATCH 11/11] Add Finnish encoding support to test-guess-stdin.msys2.expected --- test/test-guess-stdin.msys2.expected | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/test-guess-stdin.msys2.expected b/test/test-guess-stdin.msys2.expected index c152c45..10ba0b5 100644 --- a/test/test-guess-stdin.msys2.expected +++ b/test/test-guess-stdin.msys2.expected @@ -31,6 +31,9 @@ et-s.iso88594: ISO-8859-4 et-s.iso88594.qp: ISO-8859-4/qp et-s.macce: macce et-s.utf8: UTF-8 +fi-s.cp1257: CP1257/LF +fi-s.iso88594: ISO-8859-4 +fi-s.utf8: UTF-8 hr-s.cork: CORK/LF hr-s.cp1250: CP1250/LF hr-s.ibm852: IBM852/LF