From 51a011ae0ad72bce5bc8bf303d49f62e98406498 Mon Sep 17 00:00:00 2001 From: notfoundzzz Date: Wed, 21 Jan 2026 14:40:54 +0800 Subject: [PATCH 1/3] =?UTF-8?q?[209=5F9]=20=E4=BF=AE=E5=A4=8D=E4=BB=A3?= =?UTF-8?q?=E7=A0=81=E6=A8=A1=E5=BC=8F=E7=9A=84=E4=B8=AD=E6=96=87=E6=8D=A2?= =?UTF-8?q?=E8=A1=8C=E6=98=BE=E7=A4=BA=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TeXmacs/tests/tmu/209_9.tmu | 36 ++++++++++++++++++++ devel/209_9.md | 47 +++++++++++++++++++++++++++ src/System/Language/prog_language.cpp | 45 +++++++++++++++++++++++-- src/System/Language/verb_language.cpp | 45 +++++++++++++++++++++++-- 4 files changed, 169 insertions(+), 4 deletions(-) create mode 100644 TeXmacs/tests/tmu/209_9.tmu create mode 100644 devel/209_9.md diff --git a/TeXmacs/tests/tmu/209_9.tmu b/TeXmacs/tests/tmu/209_9.tmu new file mode 100644 index 0000000000..f36ea0b188 --- /dev/null +++ b/TeXmacs/tests/tmu/209_9.tmu @@ -0,0 +1,36 @@ +> + +> + +<\body> + code模式示例: + + <\cpp-code> + > + + + python-code模式示例: + + <\python-code> + + + + cpp-code模式示例: + + <\cpp-code> + + + + r-code模式示例: + + <\r-code> + + + + +<\initial> + <\collection> + + + + diff --git a/devel/209_9.md b/devel/209_9.md new file mode 100644 index 0000000000..ed50b24521 --- /dev/null +++ b/devel/209_9.md @@ -0,0 +1,47 @@ +# 209_9 修复代码模式的中文换行显示问题 + +## 如何测试 +1. 启动 Mogan / TeXmacs +2. 插入以下任意代码环境之一(或其他支持的代码环境): + - `\code` + - `\python-code` + - `\cpp-code` + - `\r-code` + +3. 输入一行**足够长**、包含**中文字符**的内容,例如: +```tex +z中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中中 +``` +4. 在开头输入字符以触发不同位置的自动换行 + +期望结果: + +- 中文字符不会被拆分 + +- 不再出现 <#XXXX> 或在 < 处断裂的异常显示 + +- 中文字符要么完整出现在上一行,要么完整出现在下一行 + +测试文档: TeXmacs/tests/tmu/209_9.tmu + +## 2026/1/21 +### What +修复在代码模式下(包括 \code、\python-code、\cpp-code 等环境) +中文字符在自动换行时被错误拆分、显示为 <#XXXX> 的问题。 + +### Why +代码模式在自动换行时直接按字符串下标切分字符串, +当断行位置落在 <#XXXX> 内部时,会破坏内部转义结构, +最终导致渲染失败并显示为 <#XXXX>。 + +关联issue #2605 + +### How +在 verb_language_rep::hyphenate 与 prog_language_rep::hyphenate 中 +引入断行边界保护机制: + +- 将 <#...> 内部转义序列视为不可拆分的原子 + +- 若断行位置落在原子内部,则向左吸附到最近的合法边界 + +- 仅在合法边界处对字符串进行切分 diff --git a/src/System/Language/prog_language.cpp b/src/System/Language/prog_language.cpp index fe4cb40071..02d393a6ef 100644 --- a/src/System/Language/prog_language.cpp +++ b/src/System/Language/prog_language.cpp @@ -21,6 +21,46 @@ #include "tm_url.hpp" #include "tree_helper.hpp" +// Protect TeXmacs internal escape sequences like "<#4E2D>" (CJK, etc.) +// from being split during automatic line wrapping in prog/code environments. +static inline int +tm_atom_end_for_code_wrap (string s, int i) { + int n= N (s); + if (i < 0 || i >= n) return i; + if (s[i] != '<') return i + 1; + + // Only treat "<#...>" as an indivisible atom to avoid affecting normal code + // like "" + if (i + 1 >= n || s[i + 1] != '#') return i + 1; + + int j= i + 2; + while (j < n && s[j] != '>') + j++; + if (j < n && s[j] == '>') return j + 1; + + // malformed sequence: degrade gracefully + return i + 1; +} + +// Snap "after" to the greatest atom boundary <= after, so we never split inside +// "<#...>". +static inline int +tm_snap_after_boundary_for_code_wrap (string s, int after) { + int n= N (s); + if (after <= 0) return 0; + if (after >= n) return n; + + int i = 0; + int last= 0; + while (i < n) { + int j= tm_atom_end_for_code_wrap (s, i); + if (j > after) break; + last= j; + i = j; + } + return last; +} + prog_language_rep::prog_language_rep (string name) : abstract_language_rep (name) { if (DEBUG_PARSER) @@ -434,8 +474,9 @@ prog_language_rep::get_hyphens (string s) { void prog_language_rep::hyphenate (string s, int after, string& left, string& right) { - left = s (0, after); - right= s (after, N (s)); + int a= tm_snap_after_boundary_for_code_wrap (s, after); + left = s (0, a); + right= s (a, N (s)); } string diff --git a/src/System/Language/verb_language.cpp b/src/System/Language/verb_language.cpp index 2ed1714e69..9581a8025f 100644 --- a/src/System/Language/verb_language.cpp +++ b/src/System/Language/verb_language.cpp @@ -24,6 +24,46 @@ is_sep_char (char c) { return c == '-' || c == '/' || c == '\\' || c == ',' || c == '?'; } +// Protect TeXmacs internal escape sequences like "<#4E2D>" (CJK, etc.) +// from being split during automatic line wrapping in code/prog environments. +static inline int +tm_atom_end_for_code_wrap (string s, int i) { + int n= N (s); + if (i < 0 || i >= n) return i; + if (s[i] != '<') return i + 1; + + // Only treat "<#...>" as an indivisible atom to avoid affecting normal code + // like "" + if (i + 1 >= n || s[i + 1] != '#') return i + 1; + + int j= i + 2; + while (j < n && s[j] != '>') + j++; + if (j < n && s[j] == '>') return j + 1; + + // malformed sequence: degrade gracefully + return i + 1; +} + +// Snap "after" to the greatest atom boundary <= after, so we never split inside +// "<#...>". +static inline int +tm_snap_after_boundary_for_code_wrap (string s, int after) { + int n= N (s); + if (after <= 0) return 0; + if (after >= n) return n; + + int i = 0; + int last= 0; + while (i < n) { + int j= tm_atom_end_for_code_wrap (s, i); + if (j > after) break; + last= j; + i = j; + } + return last; +} + text_property verb_language_rep::advance (tree t, int& pos) { string s= t->label; @@ -64,8 +104,9 @@ verb_language_rep::get_hyphens (string s) { void verb_language_rep::hyphenate (string s, int after, string& left, string& right) { - left = s (0, after); - right= s (after, N (s)); + int a= tm_snap_after_boundary_for_code_wrap (s, after); + left = s (0, a); + right= s (a, N (s)); } string From c6804bf9481357af7ecbd07de0e5421122dc456e Mon Sep 17 00:00:00 2001 From: notfoundzzz Date: Wed, 21 Jan 2026 15:45:39 +0800 Subject: [PATCH 2/3] =?UTF-8?q?=E6=8F=90=E5=8F=96=E8=BE=85=E5=8A=A9?= =?UTF-8?q?=E5=87=BD=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/System/Language/code_wrap.hpp | 46 +++++++++++++++++++++++++++ src/System/Language/prog_language.cpp | 41 +----------------------- src/System/Language/verb_language.cpp | 41 +----------------------- 3 files changed, 48 insertions(+), 80 deletions(-) create mode 100644 src/System/Language/code_wrap.hpp diff --git a/src/System/Language/code_wrap.hpp b/src/System/Language/code_wrap.hpp new file mode 100644 index 0000000000..325ca2768c --- /dev/null +++ b/src/System/Language/code_wrap.hpp @@ -0,0 +1,46 @@ +#ifndef TM_CODE_WRAP_HPP +#define TM_CODE_WRAP_HPP + +#include "basic.hpp" +#include "string.hpp" +// Protect TeXmacs internal escape sequences like "<#4E2D>" (CJK, etc.) +// from being split during automatic line wrapping in code/prog environments. +// +// NOTE: +// - We only protect "<#...>" to avoid affecting normal code like "". +// - This is a last-resort safety net: even if the line breaker proposes an +// invalid split position, we snap it to a valid boundary here. +static inline int +tm_atom_end_for_code_wrap (string s, int i) { + int n= N (s); + if (i < 0 || i >= n) return i; + if (s[i] != '<') return i + 1; + + if (i + 1 >= n || s[i + 1] != '#') return i + 1; + + int j= i + 2; + while (j < n && s[j] != '>') + j++; + if (j < n && s[j] == '>') return j + 1; + + return i + 1; +} + +static inline int +tm_snap_after_boundary_for_code_wrap (string s, int after) { + int n= N (s); + if (after <= 0) return 0; + if (after >= n) return n; + + int i = 0; + int last= 0; + while (i < n) { + int j= tm_atom_end_for_code_wrap (s, i); + if (j > after) break; + last= j; + i = j; + } + return last; +} + +#endif // TM_CODE_WRAP_HPP diff --git a/src/System/Language/prog_language.cpp b/src/System/Language/prog_language.cpp index 02d393a6ef..218825ed5e 100644 --- a/src/System/Language/prog_language.cpp +++ b/src/System/Language/prog_language.cpp @@ -11,6 +11,7 @@ ******************************************************************************/ #include "analyze.hpp" +#include "code_wrap.hpp" #include "convert.hpp" #include "converter.hpp" #include "cork.hpp" @@ -21,46 +22,6 @@ #include "tm_url.hpp" #include "tree_helper.hpp" -// Protect TeXmacs internal escape sequences like "<#4E2D>" (CJK, etc.) -// from being split during automatic line wrapping in prog/code environments. -static inline int -tm_atom_end_for_code_wrap (string s, int i) { - int n= N (s); - if (i < 0 || i >= n) return i; - if (s[i] != '<') return i + 1; - - // Only treat "<#...>" as an indivisible atom to avoid affecting normal code - // like "" - if (i + 1 >= n || s[i + 1] != '#') return i + 1; - - int j= i + 2; - while (j < n && s[j] != '>') - j++; - if (j < n && s[j] == '>') return j + 1; - - // malformed sequence: degrade gracefully - return i + 1; -} - -// Snap "after" to the greatest atom boundary <= after, so we never split inside -// "<#...>". -static inline int -tm_snap_after_boundary_for_code_wrap (string s, int after) { - int n= N (s); - if (after <= 0) return 0; - if (after >= n) return n; - - int i = 0; - int last= 0; - while (i < n) { - int j= tm_atom_end_for_code_wrap (s, i); - if (j > after) break; - last= j; - i = j; - } - return last; -} - prog_language_rep::prog_language_rep (string name) : abstract_language_rep (name) { if (DEBUG_PARSER) diff --git a/src/System/Language/verb_language.cpp b/src/System/Language/verb_language.cpp index 9581a8025f..a31828a5c8 100644 --- a/src/System/Language/verb_language.cpp +++ b/src/System/Language/verb_language.cpp @@ -10,6 +10,7 @@ ******************************************************************************/ #include "analyze.hpp" +#include "code_wrap.hpp" #include "impl_language.hpp" #include "observers.hpp" #include "packrat.hpp" @@ -24,46 +25,6 @@ is_sep_char (char c) { return c == '-' || c == '/' || c == '\\' || c == ',' || c == '?'; } -// Protect TeXmacs internal escape sequences like "<#4E2D>" (CJK, etc.) -// from being split during automatic line wrapping in code/prog environments. -static inline int -tm_atom_end_for_code_wrap (string s, int i) { - int n= N (s); - if (i < 0 || i >= n) return i; - if (s[i] != '<') return i + 1; - - // Only treat "<#...>" as an indivisible atom to avoid affecting normal code - // like "" - if (i + 1 >= n || s[i + 1] != '#') return i + 1; - - int j= i + 2; - while (j < n && s[j] != '>') - j++; - if (j < n && s[j] == '>') return j + 1; - - // malformed sequence: degrade gracefully - return i + 1; -} - -// Snap "after" to the greatest atom boundary <= after, so we never split inside -// "<#...>". -static inline int -tm_snap_after_boundary_for_code_wrap (string s, int after) { - int n= N (s); - if (after <= 0) return 0; - if (after >= n) return n; - - int i = 0; - int last= 0; - while (i < n) { - int j= tm_atom_end_for_code_wrap (s, i); - if (j > after) break; - last= j; - i = j; - } - return last; -} - text_property verb_language_rep::advance (tree t, int& pos) { string s= t->label; From 9c008737ba9f54fe831cc87a2a2e9b512482a960 Mon Sep 17 00:00:00 2001 From: notfoundzzz Date: Thu, 5 Feb 2026 17:32:54 +0800 Subject: [PATCH 3/3] =?UTF-8?q?=E6=B7=BB=E5=8A=A0LICENCE=20HEADER=E5=92=8C?= =?UTF-8?q?=E6=B3=A8=E9=87=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/System/Language/code_wrap.hpp | 10 ++++++++++ src/System/Language/verb_language.cpp | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/src/System/Language/code_wrap.hpp b/src/System/Language/code_wrap.hpp index 325ca2768c..22cc9ae0ca 100644 --- a/src/System/Language/code_wrap.hpp +++ b/src/System/Language/code_wrap.hpp @@ -1,3 +1,13 @@ +/****************************************************************************** + * MODULE : code_wrap.hpp + * DESCRIPTION: helpers for safe code wrapping boundaries + * COPYRIGHT : (C) 2026 The MoganSTEM contributors + ******************************************************************************* + * This software falls under the GNU general public license version 3 or later. + * It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE + * in the root directory or . + ******************************************************************************/ + #ifndef TM_CODE_WRAP_HPP #define TM_CODE_WRAP_HPP diff --git a/src/System/Language/verb_language.cpp b/src/System/Language/verb_language.cpp index a31828a5c8..e845cd0723 100644 --- a/src/System/Language/verb_language.cpp +++ b/src/System/Language/verb_language.cpp @@ -62,6 +62,16 @@ verb_language_rep::get_hyphens (string s) { return penalty; } +/** + * @brief 按代码原子边界切分 verbatim 文本,避免将 "<#...>" 内部拆开。 + * @param s 待切分的原始字符串。 + * @param after 布局器建议的断行位置(可能落在原子内部)。 + * @param left 返回断点左侧内容。 + * @param right 返回断点右侧内容。 + * + * @note 示例:当 s 为 "ab<#4E2D>cd" 且 after 落在 "<#4E2D>" 内部时, + * 会将断点回退到原子起止边界,避免产生非法拆分。 + */ void verb_language_rep::hyphenate (string s, int after, string& left, string& right) {