diff --git a/DEVELOP.md b/DEVELOP.md index e727e43..b08f84c 100644 --- a/DEVELOP.md +++ b/DEVELOP.md @@ -70,7 +70,7 @@ Specifically, for multibyte encodings: existing languages in `data/*` and read `data/README`. * `lib/internal.h`: * Add new `ENCA_LANGUAGE_....` -* `src/lang.c`: +* `lib/lang.c`: * Add a new `LANGUAGE_LIST[]` entry pointing to the `ENCA_LANGUAGE_....` diff --git a/data/Makefile.am b/data/Makefile.am index 14e852b..bd8847e 100644 --- a/data/Makefile.am +++ b/data/Makefile.am @@ -16,6 +16,7 @@ noinst_HEADERS = \ croatian/croatian.h \ czech/czech.h \ estonian/estonian.h \ + finnish/finnish.h \ hungarian/hungarian.h \ latvian/latvian.h \ lithuanian/lithuanian.h \ @@ -46,6 +47,7 @@ noinst_SCRPITS = \ croatian/doit.sh \ czech/doit.sh \ estonian/doit.sh \ + finnish/doit.sh \ hungarian/doit.sh \ latvian/doit.sh \ lithuanian/doit.sh \ @@ -61,6 +63,7 @@ BASES = \ croatian/cp1250.base \ czech/iso88592.base \ estonian/iso88594.base \ + finnish/iso88594.base \ hungarian/iso88592.base \ russian/koi8r.base \ latvian/cp1257.base \ @@ -76,6 +79,7 @@ RAWCOUNTS = \ croatian/rawcounts.cp1250 \ czech/rawcounts.iso88592 \ estonian/rawcounts.iso88594 \ + finnish/rawcounts.iso88594 \ hungarian/rawcounts.iso88592 \ latvian/rawcounts.cp1257 \ lithuanian/rawcounts.cp1257 \ diff --git a/data/Makefile.in b/data/Makefile.in index b968a56..7e3c1f5 100644 --- a/data/Makefile.in +++ b/data/Makefile.in @@ -367,6 +367,7 @@ noinst_HEADERS = \ croatian/croatian.h \ czech/czech.h \ estonian/estonian.h \ + finnish/finnish.h \ hungarian/hungarian.h \ latvian/latvian.h \ lithuanian/lithuanian.h \ @@ -396,6 +397,7 @@ noinst_SCRPITS = \ croatian/doit.sh \ czech/doit.sh \ estonian/doit.sh \ + finnish/doit.sh \ hungarian/doit.sh \ latvian/doit.sh \ lithuanian/doit.sh \ @@ -411,6 +413,7 @@ BASES = \ croatian/cp1250.base \ czech/iso88592.base \ estonian/iso88594.base \ + finnish/iso88594.base \ hungarian/iso88592.base \ russian/koi8r.base \ latvian/cp1257.base \ @@ -426,6 +429,7 @@ RAWCOUNTS = 
\ croatian/rawcounts.cp1250 \ czech/rawcounts.iso88592 \ estonian/rawcounts.iso88594 \ + finnish/rawcounts.iso88594 \ hungarian/rawcounts.iso88592 \ latvian/rawcounts.cp1257 \ lithuanian/rawcounts.cp1257 \ diff --git a/data/finnish/doit.sh b/data/finnish/doit.sh new file mode 100755 index 0000000..9b02625 --- /dev/null +++ b/data/finnish/doit.sh @@ -0,0 +1,2 @@ +#! /bin/bash +../doit.sh iso88594 cp1257 \ No newline at end of file diff --git a/data/finnish/finnish.h b/data/finnish/finnish.h new file mode 100644 index 0000000..140ef26 --- /dev/null +++ b/data/finnish/finnish.h @@ -0,0 +1,132 @@ +/***** THIS IS A GENERATED FILE. DO NOT TOUCH! *****/ +/* THIS IS A GENERATED TABLE, see data/basetoc.c. */ +static const unsigned short int RAW_ISO88594[] = { + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x08 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x18 */ + 0, 4, 21, 0, 0, 2, 1, 2, /* 0x20 */ + 24, 24, 0, 0, 326, 128, 0, 7, /* 0x28 */ + 122, 105, 76, 33, 28, 34, 25, 25, /* 0x30 */ + 28, 49, 39, 2, 0, 0, 0, 8, /* 0x38 */ + 0, 58, 21, 18, 14, 52, 14, 11, /* 0x40 */ + 69, 31, 46, 118, 61, 83, 42, 37, /* 0x48 */ + 74, 0, 39, 133, 96, 23, 66, 9, /* 0x50 */ + 1, 23, 1, 0, 0, 0, 0, 1, /* 0x58 */ + 0, 5399, 37, 32, 403, 3579, 32, 84, /* 0x60 */ + 765, 4735, 859, 2254, 2539, 1338, 3767, 2470, /* 0x68 */ + 768, 1, 1096, 3362, 4242, 2326, 1031, 11, /* 0x70 */ + 4, 800, 6, 0, 0, 0, 0, 0, /* 0x78 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x88 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x98 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xa0 */ + 0, 0, 0, 0, 0, 6, 0, 0, /* 0xa8 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xb0 */ + 0, 1, 0, 0, 0, 0, 0, 0, /* 0xb8 */ + 0, 0, 0, 0, 2, 0, 0, 0, /* 0xc0 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xc8 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xd0 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xd8 */ + 0, 0, 0, 0, 1808, 0, 0, 0, /* 0xe0 */ + 0, 1, 0, 0, 0, 0, 0, 0, /* 0xe8 */ + 0, 0, 0, 0, 0, 0, 255, 0, 
/* 0xf0 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xf8 */ +}; + +/* THIS IS A GENERATED TABLE, see data/basetoc.c. */ +static const unsigned short int RAW_CP1257[] = { + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x08 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x18 */ + 0, 4, 21, 0, 0, 2, 1, 2, /* 0x20 */ + 24, 24, 0, 0, 326, 128, 0, 7, /* 0x28 */ + 122, 105, 76, 33, 28, 34, 25, 25, /* 0x30 */ + 28, 49, 39, 2, 0, 0, 0, 8, /* 0x38 */ + 0, 58, 21, 18, 14, 52, 14, 11, /* 0x40 */ + 69, 31, 46, 118, 61, 83, 42, 37, /* 0x48 */ + 74, 0, 39, 133, 96, 23, 66, 9, /* 0x50 */ + 1, 23, 1, 0, 0, 0, 0, 1, /* 0x58 */ + 0, 5399, 37, 32, 403, 3579, 32, 84, /* 0x60 */ + 765, 4735, 859, 2254, 2539, 1338, 3767, 2470, /* 0x68 */ + 768, 1, 1096, 3362, 4242, 2326, 1031, 11, /* 0x70 */ + 4, 800, 6, 0, 0, 0, 0, 0, /* 0x78 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x88 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x98 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xa0 */ + 0, 0, 0, 0, 0, 6, 0, 0, /* 0xa8 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xb0 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xb8 */ + 0, 0, 0, 0, 2, 0, 0, 0, /* 0xc0 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xc8 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xd0 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xd8 */ + 0, 0, 0, 0, 1808, 0, 0, 0, /* 0xe0 */ + 0, 1, 0, 0, 0, 0, 0, 0, /* 0xe8 */ + 1, 0, 0, 0, 0, 0, 255, 0, /* 0xf0 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xf8 */ +}; + +/* THIS IS A GENERATED TABLE, see data/totals.pl. 
*/ +static const unsigned short int SIGNIFICANT[] = { + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x08 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x18 */ + 0, 8, 42, 0, 0, 4, 2, 4, /* 0x20 */ + 48, 48, 0, 0, 652, 256, 0, 14, /* 0x28 */ + 244, 210, 152, 66, 56, 68, 50, 50, /* 0x30 */ + 56, 98, 78, 4, 0, 0, 0, 16, /* 0x38 */ + 0, 116, 42, 36, 28, 104, 28, 22, /* 0x40 */ + 138, 62, 92, 236, 122, 166, 84, 74, /* 0x48 */ + 148, 0, 78, 266, 192, 46, 132, 18, /* 0x50 */ + 2, 46, 2, 0, 0, 0, 0, 2, /* 0x58 */ + 0, 10798, 74, 64, 806, 7158, 64, 168, /* 0x60 */ + 1530, 9470, 1718, 4508, 5078, 2676, 7534, 4940, /* 0x68 */ + 1536, 2, 2192, 6724, 8484, 4652, 2062, 22, /* 0x70 */ + 8, 1600, 12, 0, 0, 0, 0, 0, /* 0x78 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x88 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x98 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xa0 */ + 0, 0, 0, 0, 0, 12, 0, 0, /* 0xa8 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xb0 */ + 0, 1, 0, 0, 0, 0, 0, 0, /* 0xb8 */ + 0, 0, 0, 0, 4, 0, 0, 0, /* 0xc0 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xc8 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xd0 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xd8 */ + 0, 0, 0, 0, 3616, 0, 0, 0, /* 0xe0 */ + 0, 2, 0, 0, 0, 0, 0, 0, /* 0xe8 */ + 1, 0, 0, 0, 0, 0, 510, 0, /* 0xf0 */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xf8 */ +}; + +/* THIS IS A GENERATED VALUE, see data/totals.pl */ +#define WEIGHT_SUM 46267 + +/* THIS IS A GENERATED TABLE, see data/totals.pl */ +static const char *const CHARSET_NAMES[] = { + "iso88594", + "cp1257", +}; + +/* THIS IS A GENERATED TABLE, see data/totals.pl */ +static const unsigned short int *const CHARSET_WEIGHTS[] = { + RAW_ISO88594, + RAW_CP1257, +}; + +/* THIS IS A GENERATED VALUE, see data/totals.pl */ +#define CHARSET_LETTERS NULL + +/* THIS IS A GENERATED VALUE, see data/totals.pl */ +#define CHARSET_PAIRS NULL + +/* THIS IS A GENERATED VALUE, see data/totals.pl */ +#define NCHARSETS 2 diff --git 
a/data/finnish/iso88594.base b/data/finnish/iso88594.base new file mode 100644 index 0000000..31f7520 --- /dev/null +++ b/data/finnish/iso88594.base @@ -0,0 +1,155 @@ +. 0 +. 0 +. 0 + 5999 +! 4 +" 21 +# 0 +$ 0 +% 2 +& 1 +' 2 +( 24 +) 24 +* 0 ++ 0 +, 326 +- 128 +. 558 +/ 7 +0 122 +1 105 +2 76 +3 33 +4 28 +5 34 +6 25 +7 25 +8 28 +9 49 +: 39 +; 2 +< 0 += 0 +> 0 +? 8 +@ 0 +A 58 +B 21 +C 18 +D 14 +E 52 +F 14 +G 11 +H 69 +I 31 +J 46 +K 118 +L 61 +M 83 +N 42 +O 37 +P 74 +Q 0 +R 39 +S 133 +T 96 +U 23 +V 66 +W 9 +X 1 +Y 23 +Z 1 +[ 0 +\ 0 +] 0 +^ 0 +_ 1 +` 0 +a 5399 +b 37 +c 32 +d 403 +e 3579 +f 32 +g 84 +h 765 +i 4735 +j 859 +k 2254 +l 2539 +m 1338 +n 3767 +o 2470 +p 768 +q 1 +r 1096 +s 3362 +t 4242 +u 2326 +v 1031 +w 11 +x 4 +y 800 +z 6 +{ 0 +| 0 +} 0 +~ 0 +. 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 6 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 1 +� 0 +� 0 +� 0 +� 0 +� 2 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 1808 +� 0 +� 0 +� 0 +� 0 +� 1 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 0 +� 255 +� 0 +� 0 +� 0 +� 0 +� 0 diff --git a/data/finnish/rawcounts.iso88594 b/data/finnish/rawcounts.iso88594 new file mode 100644 index 0000000..71ef656 --- /dev/null +++ b/data/finnish/rawcounts.iso88594 @@ -0,0 +1,155 @@ +0x08 . 7 +0x15 . 1 +0x16 . 1 +0x20 43309824 +0x21 ! 32929 +0x22 " 157749 +0x23 # 2231 +0x24 $ 273 +0x25 % 20671 +0x26 & 10414 +0x27 ' 14661 +0x28 ( 176988 +0x29 ) 173923 +0x2a * 2473 +0x2b + 7176 +0x2c , 2356929 +0x2d - 929385 +0x2e . 4034374 +0x2f / 50930 +0x30 0 881852 +0x31 1 758265 +0x32 2 555082 +0x33 3 241716 +0x34 4 206462 +0x35 5 248893 +0x36 6 184643 +0x37 7 180858 +0x38 8 203671 +0x39 9 359659 +0x3a : 285993 +0x3b ; 19592 +0x3c < 339 +0x3d = 2455 +0x3e > 1246 +0x3f ? 
63572 +0x40 @ 2648 +0x41 A 424041 +0x42 B 154984 +0x43 C 132076 +0x44 D 105440 +0x45 E 376940 +0x46 F 106667 +0x47 G 81094 +0x48 H 500425 +0x49 I 228774 +0x4a J 338869 +0x4b K 857247 +0x4c L 443843 +0x4d M 599604 +0x4e N 306990 +0x4f O 273646 +0x50 P 537341 +0x51 Q 7082 +0x52 R 282878 +0x53 S 964224 +0x54 T 697305 +0x55 U 170075 +0x56 V 482988 +0x57 W 65684 +0x58 X 9926 +0x59 Y 168603 +0x5a Z 13982 +0x5b [ 2555 +0x5c \ 54 +0x5d ] 2574 +0x5e ^ 90 +0x5f _ 7538 +0x60 ` 269 +0x61 a 38977603 +0x62 b 273986 +0x63 c 236578 +0x64 d 2913605 +0x65 e 25839170 +0x66 f 233861 +0x67 g 611191 +0x68 h 5525729 +0x69 i 34184835 +0x6a j 6203886 +0x6b k 16273609 +0x6c l 18331533 +0x6d m 9660734 +0x6e n 27200205 +0x6f o 17835561 +0x70 p 5544908 +0x71 q 9220 +0x72 r 7912194 +0x73 s 24276850 +0x74 t 30624995 +0x75 u 16795139 +0x76 v 7446789 +0x77 w 80798 +0x78 x 34546 +0x79 y 5775453 +0x7a z 49256 +0x7b { 233 +0x7c | 635 +0x7d } 229 +0x7e ~ 200 +0x7f . 7 +0xa1 � 1 +0xa4 � 51 +0xa6 � 1 +0xa7 � 7098 +0xa8 � 95 +0xa9 � 962 +0xaa � 1 +0xab � 1 +0xad � 48990 +0xae � 256 +0xaf � 2 +0xb0 � 1561 +0xb1 � 31 +0xb4 � 803 +0xb6 � 21 +0xb8 � 3 +0xb9 � 7560 +0xba � 217 +0xbb � 3 +0xbe � 1435 +0xc0 � 7 +0xc4 � 15430 +0xc5 � 1636 +0xc6 � 45 +0xc8 � 87 +0xc9 � 314 +0xcf � 1 +0xd2 � 183 +0xd3 � 16 +0xd5 � 21 +0xd6 � 3626 +0xd7 � 920 +0xd8 � 151 +0xdc � 207 +0xde � 1 +0xdf � 230 +0xe0 � 272 +0xe4 � 13055748 +0xe5 � 4188 +0xe6 � 275 +0xe7 � 2 +0xe8 � 353 +0xe9 � 11018 +0xea � 32 +0xec � 93 +0xef � 102 +0xf1 � 53 +0xf2 � 1948 +0xf3 � 13 +0xf5 � 557 +0xf6 � 1847997 +0xf7 � 1 +0xf8 � 1072 +0xf9 � 5 +0xfc � 4651 +0xfe � 599 \ No newline at end of file diff --git a/data/normalize.pl b/data/normalize.pl index b86de69..4e67486 100755 --- a/data/normalize.pl +++ b/data/normalize.pl @@ -19,7 +19,15 @@ if (!defined $ARGV[0]) { while () { - ($char_hex[$n], $char[$n], $count[$n]) = split /\s+/, $_, 3; + chomp; + # Handle special case of space character (0x20 count) + if (/^(0x20)\s+(\d+)$/) { + $char_hex[$n] = $1; + 
$char[$n] = ' '; + $count[$n] = $2; + } else { + ($char_hex[$n], $char[$n], $count[$n]) = split /\s+/, $_, 3; + } if ($max < $count[$n]) { $max = $count[$n]; @@ -49,7 +57,15 @@ my $sum2 = 0; while () { - ($char_hex[$n], $char[$n], $count[$n]) = split /\s+/, $_, 3; + chomp; + # Handle special case of space character (0x20 count) + if (/^(0x20)\s+(\d+)$/) { + $char_hex[$n] = $1; + $char[$n] = ' '; + $count[$n] = $2; + } else { + ($char_hex[$n], $char[$n], $count[$n]) = split /\s+/, $_, 3; + } $sum2 += $count[$n]; $n++; diff --git a/devel-docs/libenca-decl-list.txt b/devel-docs/libenca-decl-list.txt index 8fbefa1..a641762 100644 --- a/devel-docs/libenca-decl-list.txt +++ b/devel-docs/libenca-decl-list.txt @@ -75,6 +75,7 @@ ENCA_LANGUAGE_BE ENCA_LANGUAGE_BG ENCA_LANGUAGE_CS ENCA_LANGUAGE_ET +ENCA_LANGUAGE_FI ENCA_LANGUAGE_HR ENCA_LANGUAGE_HU ENCA_LANGUAGE_LT diff --git a/devel-docs/libenca-decl.txt b/devel-docs/libenca-decl.txt index 12e7393..8ab0aaa 100644 --- a/devel-docs/libenca-decl.txt +++ b/devel-docs/libenca-decl.txt @@ -438,6 +438,10 @@ extern const EncaLanguageInfo ENCA_LANGUAGE_CS; extern const EncaLanguageInfo ENCA_LANGUAGE_ET; +ENCA_LANGUAGE_FI +extern const EncaLanguageInfo ENCA_LANGUAGE_FI; + + ENCA_LANGUAGE_HR extern const EncaLanguageInfo ENCA_LANGUAGE_HR; diff --git a/devel-docs/libenca-sections.txt b/devel-docs/libenca-sections.txt index 390a224..ca93652 100644 --- a/devel-docs/libenca-sections.txt +++ b/devel-docs/libenca-sections.txt @@ -119,6 +119,7 @@ ENCA_LANGUAGE_BE ENCA_LANGUAGE_BG ENCA_LANGUAGE_CS ENCA_LANGUAGE_ET +ENCA_LANGUAGE_FI ENCA_LANGUAGE_HR ENCA_LANGUAGE_HU ENCA_LANGUAGE_LT diff --git a/devel-docs/tmpl/internal.sgml b/devel-docs/tmpl/internal.sgml index 3b87cee..448f158 100644 --- a/devel-docs/tmpl/internal.sgml +++ b/devel-docs/tmpl/internal.sgml @@ -438,6 +438,10 @@ Do not use outside Enca library. 
+ + + + diff --git a/devel-docs/xml/api-index-full.xml b/devel-docs/xml/api-index-full.xml index 1c9fa72..aa69d35 100644 --- a/devel-docs/xml/api-index-full.xml +++ b/devel-docs/xml/api-index-full.xml @@ -50,6 +50,7 @@ ENCA_LANGUAGE_CS, variable in Internal Functions enca_language_destroy, function in Internal Functions ENCA_LANGUAGE_ET, variable in Internal Functions +ENCA_LANGUAGE_FI, variable in Internal Functions enca_language_hook_eol, function in Internal Functions enca_language_hook_ncs, function in Internal Functions ENCA_LANGUAGE_HR, variable in Internal Functions diff --git a/devel-docs/xml/internal.xml b/devel-docs/xml/internal.xml index a6526f5..827f33e 100644 --- a/devel-docs/xml/internal.xml +++ b/devel-docs/xml/internal.xml @@ -1511,6 +1511,14 @@ UTF-8, negative doubly-encoded. Estonian language. Everything the world out there needs to know about this language. + +ENCA_LANGUAGE_FI +ENCA_LANGUAGE_FI +extern const EncaLanguageInfo ENCA_LANGUAGE_FI; + +Finnish language. +Everything the world out there needs to know about this language. 
+ ENCA_LANGUAGE_HR ENCA_LANGUAGE_HR diff --git a/lib/Makefile.am b/lib/Makefile.am index 9771adb..b7b5c43 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -21,6 +21,7 @@ libenca_la_SOURCES = \ lang_bg.c \ lang_cs.c \ lang_et.c \ + lang_fi.c \ lang_hr.c \ lang_hu.c \ lang_lt.c \ diff --git a/lib/Makefile.in b/lib/Makefile.in index 8daf0dc..820c000 100644 --- a/lib/Makefile.in +++ b/lib/Makefile.in @@ -140,9 +140,10 @@ LTLIBRARIES = $(lib_LTLIBRARIES) libenca_la_LIBADD = am_libenca_la_OBJECTS = common.lo ctype.lo enca.lo encnames.lo \ filters.lo guess.lo lang.lo lang_be.lo lang_bg.lo lang_cs.lo \ - lang_et.lo lang_hr.lo lang_hu.lo lang_lt.lo lang_lv.lo \ - lang_pl.lo lang_ru.lo lang_sk.lo lang_sl.lo lang_uk.lo \ - lang_zh.lo multibyte.lo pair.lo unicodemap.lo utf8_double.lo + lang_et.lo lang_fi.lo lang_hr.lo lang_hu.lo lang_lt.lo \ + lang_lv.lo lang_pl.lo lang_ru.lo lang_sk.lo lang_sl.lo \ + lang_uk.lo lang_zh.lo multibyte.lo pair.lo unicodemap.lo \ + utf8_double.lo libenca_la_OBJECTS = $(am_libenca_la_OBJECTS) AM_V_lt = $(am__v_lt_@AM_V@) am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) @@ -171,14 +172,14 @@ am__depfiles_remade = ./$(DEPDIR)/common.Plo ./$(DEPDIR)/ctype.Plo \ ./$(DEPDIR)/filters.Plo ./$(DEPDIR)/guess.Plo \ ./$(DEPDIR)/lang.Plo ./$(DEPDIR)/lang_be.Plo \ ./$(DEPDIR)/lang_bg.Plo ./$(DEPDIR)/lang_cs.Plo \ - ./$(DEPDIR)/lang_et.Plo ./$(DEPDIR)/lang_hr.Plo \ - ./$(DEPDIR)/lang_hu.Plo ./$(DEPDIR)/lang_lt.Plo \ - ./$(DEPDIR)/lang_lv.Plo ./$(DEPDIR)/lang_pl.Plo \ - ./$(DEPDIR)/lang_ru.Plo ./$(DEPDIR)/lang_sk.Plo \ - ./$(DEPDIR)/lang_sl.Plo ./$(DEPDIR)/lang_uk.Plo \ - ./$(DEPDIR)/lang_zh.Plo ./$(DEPDIR)/multibyte.Plo \ - ./$(DEPDIR)/pair.Plo ./$(DEPDIR)/unicodemap.Plo \ - ./$(DEPDIR)/utf8_double.Plo + ./$(DEPDIR)/lang_et.Plo ./$(DEPDIR)/lang_fi.Plo \ + ./$(DEPDIR)/lang_hr.Plo ./$(DEPDIR)/lang_hu.Plo \ + ./$(DEPDIR)/lang_lt.Plo ./$(DEPDIR)/lang_lv.Plo \ + ./$(DEPDIR)/lang_pl.Plo ./$(DEPDIR)/lang_ru.Plo \ + ./$(DEPDIR)/lang_sk.Plo ./$(DEPDIR)/lang_sl.Plo \ + 
./$(DEPDIR)/lang_uk.Plo ./$(DEPDIR)/lang_zh.Plo \ + ./$(DEPDIR)/multibyte.Plo ./$(DEPDIR)/pair.Plo \ + ./$(DEPDIR)/unicodemap.Plo ./$(DEPDIR)/utf8_double.Plo am__mv = mv -f COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) @@ -397,6 +398,7 @@ libenca_la_SOURCES = \ lang_bg.c \ lang_cs.c \ lang_et.c \ + lang_fi.c \ lang_hr.c \ lang_hu.c \ lang_lt.c \ @@ -504,6 +506,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lang_bg.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lang_cs.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lang_et.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lang_fi.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lang_hr.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lang_hu.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lang_lt.Plo@am__quote@ # am--include-marker @@ -711,6 +714,7 @@ distclean: distclean-am -rm -f ./$(DEPDIR)/lang_bg.Plo -rm -f ./$(DEPDIR)/lang_cs.Plo -rm -f ./$(DEPDIR)/lang_et.Plo + -rm -f ./$(DEPDIR)/lang_fi.Plo -rm -f ./$(DEPDIR)/lang_hr.Plo -rm -f ./$(DEPDIR)/lang_hu.Plo -rm -f ./$(DEPDIR)/lang_lt.Plo @@ -781,6 +785,7 @@ maintainer-clean: maintainer-clean-am -rm -f ./$(DEPDIR)/lang_bg.Plo -rm -f ./$(DEPDIR)/lang_cs.Plo -rm -f ./$(DEPDIR)/lang_et.Plo + -rm -f ./$(DEPDIR)/lang_fi.Plo -rm -f ./$(DEPDIR)/lang_hr.Plo -rm -f ./$(DEPDIR)/lang_hu.Plo -rm -f ./$(DEPDIR)/lang_lt.Plo diff --git a/lib/internal.h b/lib/internal.h index 1330b9f..191c108 100644 --- a/lib/internal.h +++ b/lib/internal.h @@ -483,6 +483,7 @@ extern const EncaLanguageInfo ENCA_LANGUAGE_BE; extern const EncaLanguageInfo ENCA_LANGUAGE_BG; extern const EncaLanguageInfo ENCA_LANGUAGE_CS; extern const EncaLanguageInfo ENCA_LANGUAGE_ET; +extern const 
EncaLanguageInfo ENCA_LANGUAGE_FI; extern const EncaLanguageInfo ENCA_LANGUAGE_HR; extern const EncaLanguageInfo ENCA_LANGUAGE_HU; extern const EncaLanguageInfo ENCA_LANGUAGE_LT; @@ -504,4 +505,4 @@ extern EncaGuessFunc ENCA_MULTIBYTE_TESTS_8BIT_TOLERANT[]; /* Locale-independent character type table. */ extern const short int enca_ctype_data[0x100]; -#endif /* not LIBENCA_H */ +#endif /* not LIBENCA_H */ \ No newline at end of file diff --git a/lib/lang.c b/lib/lang.c index bda21f2..2663b2a 100644 --- a/lib/lang.c +++ b/lib/lang.c @@ -51,6 +51,7 @@ static const EncaLanguageInfo *const LANGUAGE_LIST[] = { &ENCA_LANGUAGE_BG, /* Bulgarian. */ &ENCA_LANGUAGE_CS, /* Czech. */ &ENCA_LANGUAGE_ET, /* Estonian. */ + &ENCA_LANGUAGE_FI, /* Finnish. */ &ENCA_LANGUAGE_HR, /* Croatian. */ &ENCA_LANGUAGE_HU, /* Hungarian. */ &ENCA_LANGUAGE_LT, /* Latvian. */ @@ -347,4 +348,3 @@ enca_get_charset_similarity_matrix(const EncaLanguageInfo *lang) } /* vim: ts=2 */ - diff --git a/lib/lang_fi.c b/lib/lang_fi.c new file mode 100644 index 0000000..cccd13b --- /dev/null +++ b/lib/lang_fi.c @@ -0,0 +1,94 @@ +/* + encoding data and routines dependent on language; finnish + + Copyright (C) 2025 + + This program is free software; you can redistribute it and/or modify it + under the terms of version 2 of the GNU General Public License as published + by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
+*/ +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif /* HAVE_CONFIG_H */ + +#include "enca.h" +#include "internal.h" +#include "data/finnish/finnish.h" + +/* Local prototypes. */ +static int hook(EncaAnalyserState *analyser); +static int hook_iso4cp1257(EncaAnalyserState *analyser); + +/** + * ENCA_LANGUAGE_FI: + * + * Finnish language. + * + * Everything the world out there needs to know about this language. + **/ +const EncaLanguageInfo ENCA_LANGUAGE_FI = { + "fi", + "finnish", + NCHARSETS, + CHARSET_NAMES, + CHARSET_WEIGHTS, + SIGNIFICANT, + CHARSET_LETTERS, + CHARSET_PAIRS, + WEIGHT_SUM, + &hook, + NULL, + NULL, + NULL, +}; + +/** + * hook: + * @analyser: Analyser state whose charset ratings are to be modified. + * + * Launches language specific hooks for language "fi". + * + * Returns: Nonzero if charset ratings have been actually modified, zero + * otherwise. + **/ +static int +hook(EncaAnalyserState *analyser) +{ + return hook_iso4cp1257(analyser); +} + +/** + * hook_iso4cp1257: + * @analyser: Analyser state whose charset ratings are to be modified. + * + * Decides between iso8859-4 and cp1257 charsets for language "fi". + * + * Returns: Nonzero if charset ratings have been actually modified, zero + * otherwise. 
+ **/ +static int +hook_iso4cp1257(EncaAnalyserState *analyser) +{ + static const unsigned char list_iso88594[] = { + 0xb9, 0xbe, 0xa9, 0xae, 0xa8}; + static const unsigned char list_cp1257[] = { + 0xf0, 0xfe, 0xd0, 0xde, 0xb8}; + static EncaLanguageHookData1CS hookdata[] = { + MAKE_HOOK_LINE(iso88594), + MAKE_HOOK_LINE(cp1257), + }; + + return enca_language_hook_ncs(analyser, ELEMENTS(hookdata), hookdata); +} + +/* vim: ts=2 + */ \ No newline at end of file diff --git a/src/locale_detect.c b/src/locale_detect.c index 4502228..90a8883 100644 --- a/src/locale_detect.c +++ b/src/locale_detect.c @@ -350,6 +350,7 @@ static_iso639_alias_convert(const char *locname) { "croatian", "hr" }, { "czech", "cs" }, { "estonian", "et" }, + { "finnish", "fi"}, { "hungarian", "hu" }, { "lativan", "lt" }, { "lettic", "lv" }, @@ -394,4 +395,4 @@ codeset_free(void) } /* vim: ts=2 - */ + */ \ No newline at end of file diff --git a/test/fi-s.cp1257 b/test/fi-s.cp1257 new file mode 100644 index 0000000..41e0274 --- /dev/null +++ b/test/fi-s.cp1257 @@ -0,0 +1 @@ +Hyvä ystävä, älä käytä öljyä yöllä. Kävelyä järven äärellä on tärkeää. ð \ No newline at end of file diff --git a/test/fi-s.iso88594 b/test/fi-s.iso88594 new file mode 100644 index 0000000..91fe12f --- /dev/null +++ b/test/fi-s.iso88594 @@ -0,0 +1 @@ +Hyvä ystävä, älä käytä öljyä yöllä. Kävelyä järven äärellä on tärkeää. ¹ \ No newline at end of file diff --git a/test/fi-s.utf8 b/test/fi-s.utf8 new file mode 100644 index 0000000..e3d7bff --- /dev/null +++ b/test/fi-s.utf8 @@ -0,0 +1 @@ +Hyvä ystävä, älä käytä öljyä yöllä. Kävelyä järven äärellä on tärkeää. 
Å¡ \ No newline at end of file diff --git a/test/setup.sh b/test/setup.sh index a1b886f..b6f634c 100644 --- a/test/setup.sh +++ b/test/setup.sh @@ -1,10 +1,11 @@ ENCA=$top_builddir/src/enca -TEST_LANGUAGES="be bg cs et hr hu lt lv pl ru sk sl uk zh" +TEST_LANGUAGES="be bg cs et fi hr hu lt lv pl ru sk sl uk zh" ALL_TEST_LANGUAGES="$TEST_LANGUAGES none" TEST_PAIR_be="koi8uni cp1251" TEST_PAIR_bg="ibm855 cp1251" TEST_PAIR_cs="keybcs2 ibm852" TEST_PAIR_et="iso885913 baltic" +TEST_PAIR_fi="iso88594 cp1257" TEST_PAIR_hr="ibm852 cp1250" TEST_PAIR_hu="cp1250 ibm852" TEST_PAIR_lt="iso88594 baltic" diff --git a/test/simtable.c b/test/simtable.c index 1d41e05..9170c37 100644 --- a/test/simtable.c +++ b/test/simtable.c @@ -59,6 +59,7 @@ main(int argc, char *argv[]) prl(&ENCA_LANGUAGE_BG, "1251mac"); prl(&ENCA_LANGUAGE_CS, "isowin 852kam"); prl(&ENCA_LANGUAGE_ET, ""); + prl(&ENCA_LANGUAGE_FI, ""); prl(&ENCA_LANGUAGE_HR, "isowin"); prl(&ENCA_LANGUAGE_HU, "isocork isowin[XXX]"); prl(&ENCA_LANGUAGE_LT, "winbalt lat4balt iso13win[XXX]"); @@ -70,4 +71,4 @@ main(int argc, char *argv[]) prl(&ENCA_LANGUAGE_UK, "macwin isokoi ibm1125"); return 0; -} +} \ No newline at end of file diff --git a/test/test-guess-short.expected b/test/test-guess-short.expected index 4de1a58..6ecb9c8 100644 --- a/test/test-guess-short.expected +++ b/test/test-guess-short.expected @@ -31,6 +31,9 @@ et-s.iso88594: ISO-8859-4 et-s.iso88594.qp: ISO-8859-4/qp et-s.macce: macce et-s.utf8: UTF-8/CRLF +fi-s.cp1257: CP1257/LF +fi-s.iso88594: ISO-8859-4 +fi-s.utf8: UTF-8 hr-s.cork: CORK/LF hr-s.cp1250: CP1250 hr-s.ibm852: IBM852 diff --git a/test/test-guess-stdin.msys2.expected b/test/test-guess-stdin.msys2.expected index c152c45..10ba0b5 100644 --- a/test/test-guess-stdin.msys2.expected +++ b/test/test-guess-stdin.msys2.expected @@ -31,6 +31,9 @@ et-s.iso88594: ISO-8859-4 et-s.iso88594.qp: ISO-8859-4/qp et-s.macce: macce et-s.utf8: UTF-8 +fi-s.cp1257: CP1257/LF +fi-s.iso88594: ISO-8859-4 +fi-s.utf8: UTF-8 hr-s.cork: 
CORK/LF hr-s.cp1250: CP1250/LF hr-s.ibm852: IBM852/LF