Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion CPP/7zip/Archive/LzhHandler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -463,7 +463,12 @@ Z7_COM7F_IMF(CHandler::GetProperty(UInt32 index, PROPID propID, PROPVARIANT *val
{
case kpidPath:
{
UString s = NItemName::WinPathToOsPath(MultiByteToUnicodeString(item.GetName(), CP_OEMCP));
UString res;
#ifndef _WIN32
if (!UnixConvertLegacyToUnicode(item.GetName(), res, true, false, 0))
#endif
MultiByteToUnicodeString2(res, item.GetName(), CP_OEMCP);
UString s = NItemName::WinPathToOsPath(res);
if (!s.IsEmpty())
{
if (s.Back() == WCHAR_PATH_SEPARATOR)
Expand Down
9 changes: 9 additions & 0 deletions CPP/7zip/Archive/Zip/ZipItem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,15 @@ void CItem::GetUnicodeString(UString &res, const AString &s, bool isComment, boo
}
}

#ifndef _WIN32
bool isOem = false;
bool isAnsi = false;
if (MadeByVersion.HostOS == NFileHeader::NHostOS::kNTFS && MadeByVersion.Version >= 20) isAnsi = true;
else if (MadeByVersion.HostOS == NFileHeader::NHostOS::kNTFS || MadeByVersion.HostOS == NFileHeader::NHostOS::kFAT) isOem = true;
if (isOem || isAnsi || (useSpecifiedCodePage && codePage != 65001))
if (UnixConvertLegacyToUnicode(s, res, isOem, useSpecifiedCodePage, codePage)) return;
#endif

if (useSpecifiedCodePage)
isUtf8 = (codePage == CP_UTF8);
#ifdef _WIN32
Expand Down
58 changes: 58 additions & 0 deletions CPP/7zip/TEST_LegacyMapping.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#include "StdAfx.h"
#include "../Common/StringConvert.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifndef _WIN32
// Linker dummy required for MyString.o dependencies.
// This stand-in ignores its argument and always reports a length of 0; it exists only
// so that the standalone test links (presumably without the real BSTR support — confirm).
extern "C" UINT WINAPI SysStringLen(BSTR) { return 0; }

void RunTest(const char* lc_all, bool isOem)
{
if (lc_all) setenv("LC_ALL", lc_all, 1);
else unsetenv("LC_ALL");

UString result;
AString src("A"); // Standard ASCII "A" (0x41)

printf("Testing Locale: %-15s (isOem=%d)... ", lc_all ? lc_all : "DEFAULT", isOem);

// Test the internal mapping logic via iconv_open verification
bool ok = UnixConvertLegacyToUnicode(src, result, isOem, false, 0);

if (ok) {
printf("[OK] (Mapping accepted by iconv)\n");
} else {
printf("[FAIL] (Mapping rejected or conversion failed)\n");
exit(1);
}
}

// Test driver: feeds a fixed list of (LC_ALL value, OEM/ANSI selector) pairs to
// RunTest in order. RunTest terminates the process on the first failure, so reaching
// the final message means every case was accepted.
int main()
{
  struct CLocaleCase
  {
    const char *Locale;
    bool IsOem;
  };

  static const CLocaleCase kCases[] =
  {
    // 1. Japanese (Shift-JIS)
    { "ja_JP.UTF-8", false },
    // 2. Russian (Cyrillic OEM/ANSI)
    { "ru_RU.UTF-8", true  },   // Should map to IBM866
    { "ru_RU.UTF-8", false },   // Should map to WINDOWS-1251
    // 3. German (Western Europe)
    { "de_DE.UTF-8", false },   // Should map to WINDOWS-1252 / IBM850
    // 4. Fallback/Standard
    { "C",           true  },   // Should map to IBM437
    { "POSIX",       false }    // Should map to WINDOWS-1252
  };

  printf("Starting Legacy Codepage Mapping Tests...\n\n");

  const unsigned numCases = (unsigned)(sizeof(kCases) / sizeof(kCases[0]));
  for (unsigned k = 0; k < numCases; k++)
    RunTest(kCases[k].Locale, kCases[k].IsOem);

  printf("\nAll codepage mapping tests PASSED.\n");
  return 0;
}
#else
// Windows build of the test: the helper under test is compiled only when _WIN32 is
// not defined, so this entry point just reports that and succeeds.
int main()
{
  fputs("This test is for UNIX platforms only.\n", stdout);
  return 0;
}
#endif
134 changes: 134 additions & 0 deletions CPP/Common/StringConvert.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// Common/StringConvert.cpp

#include <cstdio>

#include "StdAfx.h"

#include "StringConvert.h"
Expand All @@ -17,6 +19,138 @@
#include <locale.h>
#endif

#ifndef _WIN32
#include <iconv.h>

// Locale -> encoding map used by UnixConvertLegacyToUnicode when isOem is true.
// The array is a flat sequence of string pairs: each even index holds a locale name
// (language_TERRITORY, optionally followed by @modifier) and the next odd index holds
// the encoding name that is passed to iconv_open() for that locale.
// The converter scans the pairs in order and stops at the first match, so the element
// count must stay even.
static const char *lcToOemTable[] = {
"af_ZA", "IBM850", "ar_SA", "IBM720", "ar_LB", "IBM720", "ar_EG", "IBM720",
"ar_DZ", "IBM720", "ar_BH", "IBM720", "ar_IQ", "IBM720", "ar_JO", "IBM720",
"ar_KW", "IBM720", "ar_LY", "IBM720", "ar_MA", "IBM720", "ar_OM", "IBM720",
"ar_QA", "IBM720", "ar_SY", "IBM720", "ar_TN", "IBM720", "ar_AE", "IBM720",
"ar_YE", "IBM720", "ast_ES", "IBM850", "az_AZ@cyrillic", "IBM866", "az_AZ", "IBM857",
"be_BY", "IBM866", "bg_BG", "IBM866", "br_FR", "IBM850", "ca_ES", "IBM850",
"zh_CN", "GBK", "zh_TW", "BIG5", "kw_GB", "IBM850", "cs_CZ", "IBM852",
"cy_GB", "IBM850", "da_DK", "IBM850", "de_AT", "IBM850", "de_LI", "IBM850",
"de_LU", "IBM850", "de_CH", "IBM850", "de_DE", "IBM850", "el_GR", "IBM737",
"en_AU", "IBM850", "en_CA", "IBM850", "en_GB", "IBM850", "en_IE", "IBM850",
"en_JM", "IBM850", "en_BZ", "IBM850", "en_PH", "IBM437", "en_ZA", "IBM437",
"en_TT", "IBM850", "en_US", "IBM437", "en_ZW", "IBM437", "en_NZ", "IBM850",
"es_PA", "IBM850", "es_BO", "IBM850", "es_CR", "IBM850", "es_DO", "IBM850",
"es_SV", "IBM850", "es_EC", "IBM850", "es_GT", "IBM850", "es_HN", "IBM850",
"es_NI", "IBM850", "es_CL", "IBM850", "es_MX", "IBM850", "es_ES", "IBM850",
"es_CO", "IBM850", "es_PE", "IBM850", "es_AR", "IBM850",
"es_PR", "IBM850", "es_VE", "IBM850", "es_UY", "IBM850", "es_PY", "IBM850",
"et_EE", "IBM775", "eu_ES", "IBM850", "fa_IR", "IBM720", "fi_FI", "IBM850",
"fo_FO", "IBM850", "fr_FR", "IBM850", "fr_BE", "IBM850", "fr_CA", "IBM850",
"fr_LU", "IBM850", "fr_MC", "IBM850", "fr_CH", "IBM850", "ga_IE", "IBM437",
"gd_GB", "IBM850", "gv_IM", "IBM850", "gl_ES", "IBM850", "he_IL", "IBM862",
"hr_HR", "IBM852", "hu_HU", "IBM852", "id_ID", "IBM850", "is_IS", "IBM850",
"it_IT", "IBM850", "it_CH", "IBM850", "iv_IV", "IBM437", "ja_JP", "CP932",
"kk_KZ", "IBM866", "ko_KR", "CP949", "ky_KG", "IBM866", "lt_LT", "IBM775",
"lv_LV", "IBM775", "mk_MK", "IBM866", "mn_MN", "IBM866", "ms_BN", "IBM850",
"ms_MY", "IBM850", "nl_BE", "IBM850", "nl_NL", "IBM850", "nl_SR", "IBM850",
"nn_NO", "IBM850", "nb_NO", "IBM850", "pl_PL", "IBM852", "pt_BR", "IBM850",
"pt_PT", "IBM850", "rm_CH", "IBM850", "ro_RO", "IBM852", "ru_RU", "IBM866",
"sk_SK", "IBM852", "sl_SI", "IBM852", "sq_AL", "IBM852", "sr_RS@latin", "IBM852",
"sr_RS", "IBM855", "sv_SE", "IBM850", "sv_FI", "IBM850", "sw_KE", "IBM437",
"th_TH", "TIS-620", "tr_TR", "IBM857", "tt_RU", "IBM866", "uk_UA", "IBM866",
"ur_PK", "IBM720", "uz_UZ@cyrillic", "IBM866", "uz_UZ", "IBM857", "vi_VN", "WINDOWS-1258",
"wa_BE", "IBM850", "zh_HK", "BIG5-HKSCS", "zh_SG", "GBK"
};

// Locale -> encoding map used by UnixConvertLegacyToUnicode when isOem is false.
// Same layout as the OEM table: a flat sequence of string pairs where each even index
// holds a locale name (language_TERRITORY, optionally followed by @modifier) and the
// next odd index holds the encoding name that is passed to iconv_open().
// The converter scans the pairs in order and stops at the first match, so the element
// count must stay even.
static const char *lcToAnsiTable[] = {
"af_ZA", "WINDOWS-1252", "ar_SA", "WINDOWS-1256", "ar_LB", "WINDOWS-1256", "ar_EG", "WINDOWS-1256",
"ar_DZ", "WINDOWS-1256", "ar_BH", "WINDOWS-1256", "ar_IQ", "WINDOWS-1256", "ar_JO", "WINDOWS-1256",
"ar_KW", "WINDOWS-1256", "ar_LY", "WINDOWS-1256", "ar_MA", "WINDOWS-1256", "ar_OM", "WINDOWS-1256",
"ar_QA", "WINDOWS-1256", "ar_SY", "WINDOWS-1256", "ar_TN", "WINDOWS-1256", "ar_AE", "WINDOWS-1256",
"ar_YE", "WINDOWS-1256", "ast_ES", "WINDOWS-1252", "az_AZ@cyrillic", "WINDOWS-1251", "az_AZ", "WINDOWS-1254",
"be_BY", "WINDOWS-1251", "bg_BG", "WINDOWS-1251", "br_FR", "WINDOWS-1252", "ca_ES", "WINDOWS-1252",
"zh_CN", "GBK", "zh_TW", "BIG5", "kw_GB", "WINDOWS-1252", "cs_CZ", "WINDOWS-1250",
"cy_GB", "WINDOWS-1252", "da_DK", "WINDOWS-1252", "de_AT", "WINDOWS-1252", "de_LI", "WINDOWS-1252",
"de_LU", "WINDOWS-1252", "de_CH", "WINDOWS-1252", "de_DE", "WINDOWS-1252", "el_GR", "WINDOWS-1253",
"en_AU", "WINDOWS-1252", "en_CA", "WINDOWS-1252", "en_GB", "WINDOWS-1252", "en_IE", "WINDOWS-1252",
"en_JM", "WINDOWS-1252", "en_BZ", "WINDOWS-1252", "en_PH", "WINDOWS-1252", "en_ZA", "WINDOWS-1252",
"en_TT", "WINDOWS-1252", "en_US", "WINDOWS-1252", "en_ZW", "WINDOWS-1252", "en_NZ", "WINDOWS-1252",
"es_PA", "WINDOWS-1252", "es_BO", "WINDOWS-1252", "es_CR", "WINDOWS-1252", "es_DO", "WINDOWS-1252",
"es_SV", "WINDOWS-1252", "es_EC", "WINDOWS-1252", "es_GT", "WINDOWS-1252", "es_HN", "WINDOWS-1252",
"es_NI", "WINDOWS-1252", "es_CL", "WINDOWS-1252", "es_MX", "WINDOWS-1252", "es_ES", "WINDOWS-1252",
"es_CO", "WINDOWS-1252", "es_PE", "WINDOWS-1252", "es_AR", "WINDOWS-1252",
"es_PR", "WINDOWS-1252", "es_VE", "WINDOWS-1252", "es_UY", "WINDOWS-1252", "es_PY", "WINDOWS-1252",
"et_EE", "WINDOWS-1257", "eu_ES", "WINDOWS-1252", "fa_IR", "WINDOWS-1256", "fi_FI", "WINDOWS-1252",
"fo_FO", "WINDOWS-1252", "fr_FR", "WINDOWS-1252", "fr_BE", "WINDOWS-1252", "fr_CA", "WINDOWS-1252",
"fr_LU", "WINDOWS-1252", "fr_MC", "WINDOWS-1252", "fr_CH", "WINDOWS-1252", "ga_IE", "WINDOWS-1252",
"gd_GB", "WINDOWS-1252", "gv_IM", "WINDOWS-1252", "gl_ES", "WINDOWS-1252", "he_IL", "WINDOWS-1255",
"hr_HR", "WINDOWS-1250", "hu_HU", "WINDOWS-1250", "id_ID", "WINDOWS-1252", "is_IS", "WINDOWS-1252",
"it_IT", "WINDOWS-1252", "it_CH", "WINDOWS-1252", "iv_IV", "WINDOWS-1252", "ja_JP", "CP932",
"kk_KZ", "WINDOWS-1251", "ko_KR", "CP949", "ky_KG", "WINDOWS-1251", "lt_LT", "WINDOWS-1257",
"lv_LV", "WINDOWS-1257", "mk_MK", "WINDOWS-1251", "mn_MN", "WINDOWS-1251", "ms_BN", "WINDOWS-1252",
"ms_MY", "WINDOWS-1252", "nl_BE", "WINDOWS-1252", "nl_NL", "WINDOWS-1252", "nl_SR", "WINDOWS-1252",
"nn_NO", "WINDOWS-1252", "nb_NO", "WINDOWS-1252", "pl_PL", "WINDOWS-1250", "pt_BR", "WINDOWS-1252",
"pt_PT", "WINDOWS-1252", "rm_CH", "WINDOWS-1252", "ro_RO", "WINDOWS-1250", "ru_RU", "WINDOWS-1251",
"sk_SK", "WINDOWS-1250", "sl_SI", "WINDOWS-1250", "sq_AL", "WINDOWS-1250", "sr_RS@latin", "WINDOWS-1250",
"sr_RS", "WINDOWS-1251", "sv_SE", "WINDOWS-1252", "sv_FI", "WINDOWS-1252", "sw_KE", "WINDOWS-1252",
"th_TH", "WINDOWS-874", "tr_TR", "WINDOWS-1254", "tt_RU", "WINDOWS-1251", "uk_UA", "WINDOWS-1251",
"ur_PK", "WINDOWS-1256", "uz_UZ@cyrillic", "WINDOWS-1251", "uz_UZ", "WINDOWS-1254", "vi_VN", "WINDOWS-1258",
"wa_BE", "WINDOWS-1252", "zh_HK", "BIG5-HKSCS", "zh_SG", "GBK"
};

// Returns the encoding paired with locale name `key` in a flat
// { locale, encoding, locale, encoding, ... } table of `numStrings` strings,
// or NULL when the table has no entry for that locale.
static const char *FindLocaleEncoding(const char * const *table, size_t numStrings, const char *key)
{
  for (size_t i = 0; i + 1 < numStrings; i += 2)
    if (strcmp(table[i], key) == 0)
      return table[i + 1];
  return NULL;
}

// Chooses the legacy encoding for locale string `lc`
// (format: language[_TERRITORY][.codeset][@modifier]).
//
// The codeset part is ignored. When a modifier is present, the more specific
// "language_TERRITORY@modifier" key is tried first (this is what makes entries such
// as "sr_RS@latin" or "uz_UZ@cyrillic" reachable, also for "sr_RS.UTF-8@latin"),
// then the plain "language_TERRITORY" key. Empty, "C", "POSIX" and unknown locales
// get IBM437 (OEM) or WINDOWS-1252 (ANSI).
static const char *GetLocaleLegacyEncoding(const char *lc, bool isOem)
{
  const char * const fallback = isOem ? "IBM437" : "WINDOWS-1252";
  if (!lc || !lc[0] || strcmp(lc, "C") == 0 || strcmp(lc, "POSIX") == 0)
    return fallback;

  const char * const *table = isOem ? lcToOemTable : lcToAnsiTable;
  const size_t numStrings = isOem ?
      sizeof(lcToOemTable) / sizeof(lcToOemTable[0]) :
      sizeof(lcToAnsiTable) / sizeof(lcToAnsiTable[0]);

  char key[64];
  const size_t baseLen = strcspn(lc, ".@");
  if (baseLen >= sizeof(key))
    return fallback; // far longer than any table key: cannot match

  memcpy(key, lc, baseLen);
  key[baseLen] = '\0';

  const char *found = NULL;
  const char *modifier = strchr(lc, '@');
  if (modifier)
  {
    const size_t modLen = strlen(modifier); // includes the leading '@'
    if (modLen < sizeof(key) - baseLen)
    {
      memcpy(key + baseLen, modifier, modLen + 1);
      found = FindLocaleEncoding(table, numStrings, key);
      key[baseLen] = '\0'; // back to the bare "language_TERRITORY" key
    }
  }
  if (!found)
    found = FindLocaleEncoding(table, numStrings, key);
  return found ? found : fallback;
}

// Converts `src`, encoded in a legacy (non-UTF-8) code page, to Unicode via iconv.
//
//   src                  - bytes in the legacy encoding.
//   dest                 - receives the converted string on success.
//   isOem                - selects the OEM (true) or ANSI (false) locale table when the
//                          encoding is derived from the environment.
//   useSpecifiedCodePage - when true and codePage is not 65001, codePage is honored:
//                          0 forces the ANSI table, 1 forces the OEM table, any other
//                          value is passed to iconv as "CP<codePage>".
//   codePage             - see useSpecifiedCodePage.
//
// When no explicit encoding was selected, the locale is read from LC_ALL, then
// LC_CTYPE, then LANG (first non-empty value wins).
//
// Returns false when iconv does not know the encoding, when the conversion fails, or
// when the intermediate UTF-8 text cannot be converted to Unicode.
bool UnixConvertLegacyToUnicode(const AString &src, UString &dest, bool isOem, bool useSpecifiedCodePage, UINT codePage)
{
  const char *targetCp = NULL;
  char specCP[32];

  if (useSpecifiedCodePage && codePage != 65001)
  {
    if (codePage == 0)
      isOem = false;
    else if (codePage == 1)
      isOem = true;
    else
    {
      snprintf(specCP, sizeof(specCP), "CP%u", (unsigned)codePage);
      targetCp = specCP;
    }
  }

  if (!targetCp)
  {
    const char *lc = getenv("LC_ALL");
    if (!lc || !lc[0]) lc = getenv("LC_CTYPE");
    if (!lc || !lc[0]) lc = getenv("LANG");
    targetCp = GetLocaleLegacyEncoding(lc, isOem);
  }

  iconv_t cd = iconv_open("UTF-8", targetCp);
  if (cd == (iconv_t)-1)
    return false;

  AString sUtf8;
  const unsigned slen = src.Len();
  // iconv() takes a non-const input pointer but does not modify the input bytes.
  char *srcPtr = const_cast<char *>(src.Ptr());
  // One input byte expands to at most 4 UTF-8 bytes; +1 keeps the buffer non-empty.
  const unsigned dlen = slen * 4 + 1;
  char *dstPtr = sUtf8.GetBuf_SetEnd(dlen);
  const char * const dstStart = dstPtr;
  size_t srcLeft = slen;
  size_t dstLeft = dlen;
  const size_t done = iconv(cd, &srcPtr, &srcLeft, &dstPtr, &dstLeft);
  iconv_close(cd);

  if (done == (size_t)-1)
    return false;

  *dstPtr = '\0';
  sUtf8.ReleaseBuf_SetEnd((unsigned)(dstPtr - dstStart));
  return ConvertUTF8ToUnicode(sUtf8, dest);
}
#endif
// NOTE(review): the identifier is spelled "Defult" (sic). Its users are outside this
// view, so the name is left unchanged. Presumably this is the substitute character
// emitted for characters that cannot be converted — confirm against the callers.
static const char k_DefultChar = '_';

#ifdef _WIN32
Expand Down
3 changes: 3 additions & 0 deletions CPP/Common/StringConvert.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ void UnicodeStringToMultiByte2(AString &dest, const UString &src, UINT codePage)

AString UnicodeStringToMultiByte(const UString &src, UINT codePage, char defaultChar, bool &defaultCharWasUsed);
AString UnicodeStringToMultiByte(const UString &src, UINT codePage = CP_ACP);
#ifndef _WIN32
// Converts `src` from a legacy (non-UTF-8) code page to Unicode through iconv.
// If useSpecifiedCodePage is true and codePage is not 65001, codePage selects the
// encoding: 0 forces the ANSI mapping, 1 forces the OEM mapping, any other value is
// used as "CP<codePage>". Otherwise the encoding is looked up from the locale found
// in LC_ALL / LC_CTYPE / LANG, using the OEM mapping when isOem is true and the ANSI
// mapping when it is false.
// Returns false if iconv cannot open the encoding or the conversion fails.
bool UnixConvertLegacyToUnicode(const AString &src, UString &dest, bool isOem, bool useSpecifiedCodePage, UINT codePage);
#endif

inline const wchar_t* GetUnicodeString(const wchar_t *u) { return u; }
inline const UString& GetUnicodeString(const UString &u) { return u; }
Expand Down