From 5f62aedd08e18f60687565d560516d43f6af4d44 Mon Sep 17 00:00:00 2001 From: himmel Date: Tue, 12 Aug 2025 16:09:14 +0800 Subject: [PATCH 1/3] Add GB18030 character set support documentation and navigation links --- CN/modules/ROOT/nav.adoc | 1 + CN/modules/ROOT/pages/master/6.4.adoc | 121 +++++++++++++++++++++++++ EN/modules/ROOT/nav.adoc | 1 + EN/modules/ROOT/pages/master/6.4.adoc | 122 ++++++++++++++++++++++++++ 4 files changed, 245 insertions(+) create mode 100644 CN/modules/ROOT/pages/master/6.4.adoc create mode 100644 EN/modules/ROOT/pages/master/6.4.adoc diff --git a/CN/modules/ROOT/nav.adoc b/CN/modules/ROOT/nav.adoc index 0f8e70f7..1c4ee668 100644 --- a/CN/modules/ROOT/nav.adoc +++ b/CN/modules/ROOT/nav.adoc @@ -23,6 +23,7 @@ *** xref:master/6.3.1.adoc[like] *** xref:master/6.3.3.adoc[RowID] *** xref:master/6.3.2.adoc[OUT 参数] +** xref:master/6.4.adoc[国标GB18030] * Oracle兼容功能列表 ** xref:master/7.1.adoc[1、框架设计] ** xref:master/7.2.adoc[2、GUC框架] diff --git a/CN/modules/ROOT/pages/master/6.4.adoc b/CN/modules/ROOT/pages/master/6.4.adoc new file mode 100644 index 00000000..731a230a --- /dev/null +++ b/CN/modules/ROOT/pages/master/6.4.adoc @@ -0,0 +1,121 @@ +:sectnums: +:sectnumlevels: 5 + +:imagesdir: ./_images + += GB18030 字符集服务端支持 + +== 目的 + +PostgreSQL 服务端提供了对 GB18030 字符集的全面支持。GB18030是中国国家标准,旨在包含所有汉字和多种少数民族文字,实现与Unicode的统一。在PostgreSQL中正确配置和使用GB18030字符集,对于处理和存储需要符合此标准的中文数据至关重要。 + +服务端 GB18030 支持应当具备以下特性: + +|==== +| 1. 支持 GB18030 作为服务端编码:initdb -E GB18030 可用,SHOW server_encoding 显示为 GB18030。 +| 2. 提供 GB18030 <-> UTF8 的双向转换。 +| 3. 
支持多字节边界判定。 +|==== + +== 实现说明 + +=== initdb时通过-E指定GB18030或GB18030_2022 + +PostgreSQL已支持GB18030-2000版本作为客户端编码,通过扩展的方式支持GB18030_2022字符集与UTF-8的转换。 + +修改pg_enc来实现可指定GB18030作为服务端编码,PostgreSQL编码框架中增加底层函数以供pg内核调用。 + +设置一个全局变量is_load_gb18030_2022,默认为true,当用户指定-E选项时,在get_encoding_id中判断其设置的是否为gb18030_2022,如果是,将其字符串转为gb18030,然后将is_load_gb18030_2022 设为true,如果-E 选项为GB18030,将其设为false。 + +在适当位置判断是否要加载插件,如果是,执行load_gb18030_2022,并在ivorysql.conf的shared_preload_libraries中添加gb18030_2022。 + +``` +if (encoding_name && *encoding_name) +{ + encoding_name_modify = pg_strdup(encoding_name); + if(pg_strcasecmp(encoding_name,"gb18030_2022") == 0) + { +    encoding_name_modify = pg_strdup("gb18030"); +    is_load_gb18030_2022 =  true; + } +   else if(pg_strcasecmp(encoding_name,"gb18030") == 0) +    is_load_gb18030_2022 = false; + +   if ((enc = pg_valid_server_encoding((const char *)encoding_name_modify)) >= 0) +    return enc; +} +``` + +=== 多字节处理 + +wchar.c,增加 GB18030 的函数指针: + pg_gb180302wchar_with_len(const unsigned char *from, pg_wchar *to, int len) gb18030 -> wchar + pg_wchar2gb18030_with_len(const pg_wchar *from, unsigned char *to, int len) wchar -> gb18030 + pg_gb18030_mblen(const unsigned char *s):返回 1/2/4。 + pg_gb18030_dsplen(const unsigned char *s):ASCII 显示宽度 1;其它按 1 处理(与UTF8一致)。 + pg_gb18030_verifier(const unsigned char *s, int len):校验字节范围,拒绝非法序列。 + +=== 与客户端的交互 + +接收数据: 如果一个使用 UTF-8 编码的客户端连接上来,服务端在接收到数据后,会调用其内部的 utf8_to_gb18030 函数,将数据转换为 GB18030 格式,然后才进行验证和存储。 + +发送数据: 当该客户端执行 SELECT 查询时,服务端会从磁盘/内存中读取原生的 GB18030 数据,然后调用 gb18030_to_utf8 函数将其转换为 UTF-8 格式,最后再通过网络协议发送给客户端。 + +新增GB18030-2022.xml数据文件,通过perl脚本解析为map文件,提供 gb18030_to_utf8() 与 utf8_to_gb18030(),优先表驱动,覆盖不到的区间通过算法映射。 + +``` +static inline uint32 +unicode_to_utf8word(uint32 c) +{ + uint32 word; + + if (c <= 0x7F) + { + word = c; + } + else if (c <= 0x7FF) + { + word = (0xC0 | ((c >> 6) & 0x1F)) << 8; + word |= 0x80 | (c & 0x3F); + } + else if (c <= 0xFFFF) + { + word = (0xE0 | ((c >> 12) & 0x0F)) << 16; + word |= (0x80 | ((c >> 6) 
& 0x3F)) << 8; + word |= 0x80 | (c & 0x3F); + } + else + { + word = (0xF0 | ((c >> 18) & 0x07)) << 24; + word |= (0x80 | ((c >> 12) & 0x3F)) << 16; + word |= (0x80 | ((c >> 6) & 0x3F)) << 8; + word |= 0x80 | (c & 0x3F); + } + + return word; +} + +static uint32 +conv_18030_2022_to_utf8(uint32 code) +{ +#define conv18030(minunicode, mincode, maxcode) \ + if (code >= mincode && code <= maxcode) \ + return unicode_to_utf8word(gb_linear(code) - gb_linear(mincode) + minunicode) + + conv18030(0x0452, 0x8130D330, 0x8136A531); + conv18030(0x2643, 0x8137A839, 0x8138FD38); + conv18030(0x361B, 0x8230A633, 0x8230F237); + conv18030(0x3CE1, 0x8231D438, 0x8232AF32); + conv18030(0x4160, 0x8232C937, 0x8232F837); + conv18030(0x44D7, 0x8233A339, 0x8233C931); + conv18030(0x478E, 0x8233E838, 0x82349638); + conv18030(0x49B8, 0x8234A131, 0x8234E733); + conv18030(0x9FA6, 0x82358F33, 0x8336C738); + conv18030(0xE865, 0x8336D030, 0x84308534); + conv18030(0xFA2A, 0x84309C38, 0x84318537); + conv18030(0xFFE6, 0x8431A234, 0x8431A439); + conv18030(0x10000, 0x90308130, 0xE3329A35); + /* No mapping exists */ + return 0; +} +``` \ No newline at end of file diff --git a/EN/modules/ROOT/nav.adoc b/EN/modules/ROOT/nav.adoc index 30b12a22..bcaec0c1 100644 --- a/EN/modules/ROOT/nav.adoc +++ b/EN/modules/ROOT/nav.adoc @@ -23,6 +23,7 @@ *** xref:master/6.3.1.adoc[like] *** xref:master/6.3.3.adoc[RowID] *** xref:master/6.3.2.adoc[OUT Parameter] +** xref:master/6.4.adoc[GB18030 Character Set] * List of Oracle compatible features ** xref:master/7.1.adoc[1、Ivorysql frame design] ** xref:master/7.2.adoc[2、GUC Framework] diff --git a/EN/modules/ROOT/pages/master/6.4.adoc b/EN/modules/ROOT/pages/master/6.4.adoc new file mode 100644 index 00000000..3492674b --- /dev/null +++ b/EN/modules/ROOT/pages/master/6.4.adoc @@ -0,0 +1,122 @@ +:sectnums: +:sectnumlevels: 5 + +:imagesdir: ./_images + += Server-Side Support for the GB18030 Character Set + +== Objective + +PostgreSQL provides comprehensive server-side support 
for the GB18030 character set. GB18030 is a Chinese national standard designed to include all Chinese characters and various minority scripts, aiming for alignment with Unicode. Proper configuration and use of the GB18030 character set within PostgreSQL are essential for processing and storing Chinese data that must comply with this standard. + +Server-side GB18030 support should provide the following features: +|==== +| 1. Support for GB18030 as a server encoding: initdb -E GB18030 is available, and SHOW server_encoding displays GB18030. +| 2. Provide bidirectional conversion between GB18030 and UTF8. +| 3. Support for multibyte character boundary determination. +|==== + +== Implementation Details + +=== Specifying GB18030 or GB18030_2022 with the -E Option during initdb + +PostgreSQL has historically supported the GB18030-2000 standard as a client-side encoding. Support for conversion between the GB18030_2022 character set and UTF-8 is provided via an extension. + +To enable GB18030 as a server encoding, modifications are made to pg_enc, and new low-level functions are added to the PostgreSQL encoding framework for invocation by the core system. + +A global variable, is_load_gb18030_2022, is introduced with a default value of true. When the -E option is used during initdb, the get_encoding_id function checks the specified encoding name. If the name is gb18030_2022, it is internally mapped to the gb18030 encoding ID, and the is_load_gb18030_2022 flag is set to true. If the -E option is GB18030, the flag is set to false. + +At the appropriate stage in the startup process, the system checks this flag to determine if the extension should be loaded. If required, the load_gb18030_2022 function is executed, and the gb18030_2022 extension is added to the shared_preload_libraries parameter in ivorysql.conf. 
+``` +if (encoding_name && *encoding_name) +{ + encoding_name_modify = pg_strdup(encoding_name); + if(pg_strcasecmp(encoding_name,"gb18030_2022") == 0) + { +    encoding_name_modify = pg_strdup("gb18030"); +    is_load_gb18030_2022 =  true; + } +   else if(pg_strcasecmp(encoding_name,"gb18030") == 0) +    is_load_gb18030_2022 = false; + +   if ((enc = pg_valid_server_encoding((const char *)encoding_name_modify)) >= 0) +    return enc; +} +``` + +=== Multibyte Character Handling + +Function pointers for GB18030 are added in wchar.c: + pg_gb180302wchar_with_len(const unsigned char *from, pg_wchar *to, int len) gb18030 -> wchar + pg_wchar2gb18030_with_len(const pg_wchar *from, unsigned char *to, int len) wchar -> gb18030 + pg_gb18030_mblen(const unsigned char *s):Returns 1/2/4. + pg_gb18030_dsplen(const unsigned char *s):Calculates the display width of a character. ASCII characters have a width of 1, while others are also treated as having a width of 1 . + pg_gb18030_verifier(const unsigned char *s, int len):Verifies that a byte sequence is a valid GB18030 character, rejecting illegal sequences. + +=== Client-Server Interaction + +Receiving Data + +When a client using UTF-8 encoding connects to the server, the server, upon receiving data, invokes its internal utf8_to_gb18030 function. This converts the data to the GB18030 format, which is then validated and stored. + +Sending Data + +When the same client executes a SELECT query, the server reads the native GB18030 data from disk or memory. It then calls the gb18030_to_utf8 function to convert the data to UTF-8 format before sending it to the client via the network protocol. + +A new data file, GB18030-2022.xml, is introduced. This file is parsed by a Perl script to generate mapping files that provide the logic for the gb18030_to_utf8() and utf8_to_gb18030() conversion functions. The implementation prioritizes a table-driven approach, falling back to algorithmic mapping for ranges not covered by the tables. 
+``` +static inline uint32 +unicode_to_utf8word(uint32 c) +{ + uint32 word; + + if (c <= 0x7F) + { + word = c; + } + else if (c <= 0x7FF) + { + word = (0xC0 | ((c >> 6) & 0x1F)) << 8; + word |= 0x80 | (c & 0x3F); + } + else if (c <= 0xFFFF) + { + word = (0xE0 | ((c >> 12) & 0x0F)) << 16; + word |= (0x80 | ((c >> 6) & 0x3F)) << 8; + word |= 0x80 | (c & 0x3F); + } + else + { + word = (0xF0 | ((c >> 18) & 0x07)) << 24; + word |= (0x80 | ((c >> 12) & 0x3F)) << 16; + word |= (0x80 | ((c >> 6) & 0x3F)) << 8; + word |= 0x80 | (c & 0x3F); + } + + return word; +} + +static uint32 +conv_18030_2022_to_utf8(uint32 code) +{ +#define conv18030(minunicode, mincode, maxcode) \ + if (code >= mincode && code <= maxcode) \ + return unicode_to_utf8word(gb_linear(code) - gb_linear(mincode) + minunicode) + + conv18030(0x0452, 0x8130D330, 0x8136A531); + conv18030(0x2643, 0x8137A839, 0x8138FD38); + conv18030(0x361B, 0x8230A633, 0x8230F237); + conv18030(0x3CE1, 0x8231D438, 0x8232AF32); + conv18030(0x4160, 0x8232C937, 0x8232F837); + conv18030(0x44D7, 0x8233A339, 0x8233C931); + conv18030(0x478E, 0x8233E838, 0x82349638); + conv18030(0x49B8, 0x8234A131, 0x8234E733); + conv18030(0x9FA6, 0x82358F33, 0x8336C738); + conv18030(0xE865, 0x8336D030, 0x84308534); + conv18030(0xFA2A, 0x84309C38, 0x84318537); + conv18030(0xFFE6, 0x8431A234, 0x8431A439); + conv18030(0x10000, 0x90308130, 0xE3329A35); + /* No mapping exists */ + return 0; +} +``` \ No newline at end of file From b518bb9b39415f75339a0b2425ce562e6f99b5e8 Mon Sep 17 00:00:00 2001 From: himmel Date: Tue, 12 Aug 2025 16:16:56 +0800 Subject: [PATCH 2/3] Docs update --- CN/modules/ROOT/pages/master/6.4.adoc | 4 ++++ EN/modules/ROOT/pages/master/6.4.adoc | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/CN/modules/ROOT/pages/master/6.4.adoc b/CN/modules/ROOT/pages/master/6.4.adoc index 731a230a..e1bef321 100644 --- a/CN/modules/ROOT/pages/master/6.4.adoc +++ b/CN/modules/ROOT/pages/master/6.4.adoc @@ -50,9 +50,13 @@ if (encoding_name && 
*encoding_name) wchar.c,增加 GB18030 的函数指针: pg_gb180302wchar_with_len(const unsigned char *from, pg_wchar *to, int len) gb18030 -> wchar + pg_wchar2gb18030_with_len(const pg_wchar *from, unsigned char *to, int len) wchar -> gb18030 + pg_gb18030_mblen(const unsigned char *s):返回 1/2/4。 + pg_gb18030_dsplen(const unsigned char *s):ASCII 显示宽度 1;其它按 1 处理(与UTF8一致)。 + pg_gb18030_verifier(const unsigned char *s, int len):校验字节范围,拒绝非法序列。 === 与客户端的交互 diff --git a/EN/modules/ROOT/pages/master/6.4.adoc b/EN/modules/ROOT/pages/master/6.4.adoc index 3492674b..32ca24c0 100644 --- a/EN/modules/ROOT/pages/master/6.4.adoc +++ b/EN/modules/ROOT/pages/master/6.4.adoc @@ -47,10 +47,15 @@ if (encoding_name && *encoding_name) === Multibyte Character Handling Function pointers for GB18030 are added in wchar.c: + pg_gb180302wchar_with_len(const unsigned char *from, pg_wchar *to, int len) gb18030 -> wchar + pg_wchar2gb18030_with_len(const pg_wchar *from, unsigned char *to, int len) wchar -> gb18030 + pg_gb18030_mblen(const unsigned char *s):Returns 1/2/4. + pg_gb18030_dsplen(const unsigned char *s):Calculates the display width of a character. ASCII characters have a width of 1, while others are also treated as having a width of 1 . + pg_gb18030_verifier(const unsigned char *s, int len):Verifies that a byte sequence is a valid GB18030 character, rejecting illegal sequences. 
=== Client-Server Interaction From b35231a215967da6f309639be2058c0c4be43c53 Mon Sep 17 00:00:00 2001 From: himmel Date: Tue, 12 Aug 2025 16:21:43 +0800 Subject: [PATCH 3/3] Docs update --- CN/modules/ROOT/pages/master/6.4.adoc | 13 +++++++------ EN/modules/ROOT/pages/master/6.4.adoc | 12 ++++++------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/CN/modules/ROOT/pages/master/6.4.adoc b/CN/modules/ROOT/pages/master/6.4.adoc index e1bef321..fcad85f5 100644 --- a/CN/modules/ROOT/pages/master/6.4.adoc +++ b/CN/modules/ROOT/pages/master/6.4.adoc @@ -49,15 +49,16 @@ if (encoding_name && *encoding_name) === 多字节处理 wchar.c,增加 GB18030 的函数指针: - pg_gb180302wchar_with_len(const unsigned char *from, pg_wchar *to, int len) gb18030 -> wchar - pg_wchar2gb18030_with_len(const pg_wchar *from, unsigned char *to, int len) wchar -> gb18030 +pg_gb180302wchar_with_len(const unsigned char *from, pg_wchar *to, int len) gb18030 -> wchar - pg_gb18030_mblen(const unsigned char *s):返回 1/2/4。 +pg_wchar2gb18030_with_len(const pg_wchar *from, unsigned char *to, int len) wchar -> gb18030 - pg_gb18030_dsplen(const unsigned char *s):ASCII 显示宽度 1;其它按 1 处理(与UTF8一致)。 - - pg_gb18030_verifier(const unsigned char *s, int len):校验字节范围,拒绝非法序列。 +pg_gb18030_mblen(const unsigned char *s):返回 1/2/4。 + +pg_gb18030_dsplen(const unsigned char *s):ASCII 显示宽度 1;其它按 1 处理(与UTF8一致)。 + +pg_gb18030_verifier(const unsigned char *s, int len):校验字节范围,拒绝非法序列。 === 与客户端的交互 diff --git a/EN/modules/ROOT/pages/master/6.4.adoc b/EN/modules/ROOT/pages/master/6.4.adoc index 32ca24c0..05776583 100644 --- a/EN/modules/ROOT/pages/master/6.4.adoc +++ b/EN/modules/ROOT/pages/master/6.4.adoc @@ -48,15 +48,15 @@ if (encoding_name && *encoding_name) Function pointers for GB18030 are added in wchar.c: - pg_gb180302wchar_with_len(const unsigned char *from, pg_wchar *to, int len) gb18030 -> wchar +pg_gb180302wchar_with_len(const unsigned char *from, pg_wchar *to, int len) gb18030 -> wchar - pg_wchar2gb18030_with_len(const pg_wchar 
*from, unsigned char *to, int len) wchar -> gb18030 +pg_wchar2gb18030_with_len(const pg_wchar *from, unsigned char *to, int len) wchar -> gb18030 - pg_gb18030_mblen(const unsigned char *s):Returns 1/2/4. +pg_gb18030_mblen(const unsigned char *s): Returns 1/2/4. - pg_gb18030_dsplen(const unsigned char *s):Calculates the display width of a character. ASCII characters have a width of 1, while others are also treated as having a width of 1 . - - pg_gb18030_verifier(const unsigned char *s, int len):Verifies that a byte sequence is a valid GB18030 character, rejecting illegal sequences. +pg_gb18030_dsplen(const unsigned char *s): Calculates the display width of a character. ASCII characters have a width of 1, while others are also treated as having a width of 1. + +pg_gb18030_verifier(const unsigned char *s, int len): Verifies that a byte sequence is a valid GB18030 character, rejecting illegal sequences. === Client-Server Interaction