diff --git a/go.mod b/go.mod index 89fd71460..11c3a8168 100644 --- a/go.mod +++ b/go.mod @@ -43,6 +43,7 @@ require ( github.com/mark3labs/mcp-go v0.43.2 github.com/microcosm-cc/bluemonday v1.0.27 github.com/mozillazg/go-pinyin v0.20.0 + github.com/mozillazg/go-unidecode v0.2.0 github.com/ory/dockertest/v3 v3.11.0 github.com/robfig/cron/v3 v3.0.1 github.com/sashabaranov/go-openai v1.41.2 diff --git a/go.sum b/go.sum index bf30d85b3..be61e11f6 100644 --- a/go.sum +++ b/go.sum @@ -462,6 +462,8 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/mozillazg/go-pinyin v0.20.0 h1:BtR3DsxpApHfKReaPO1fCqF4pThRwH9uwvXzm+GnMFQ= github.com/mozillazg/go-pinyin v0.20.0/go.mod h1:iR4EnMMRXkfpFVV5FMi4FNB6wGq9NV6uDWbUuPhP4Yc= +github.com/mozillazg/go-unidecode v0.2.0 h1:vFGEzAH9KSwyWmXCOblazEWDh7fOkpmy/Z4ArmamSUc= +github.com/mozillazg/go-unidecode v0.2.0/go.mod h1:zB48+/Z5toiRolOZy9ksLryJ976VIwmDmpQ2quyt1aA= github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/nats-io/jwt v0.3.0/go.mod h1:fRYCDE99xlTsqUzISS1Bi75UBJ6ljOJQOAAu5VglpSg= github.com/nats-io/jwt v0.3.2/go.mod h1:/euKqTS1ZD+zzjYrY7pseZrTtWQSjujC7xjPc8wL6eU= diff --git a/pkg/htmltext/htmltext.go b/pkg/htmltext/htmltext.go index e2e017c8d..929080838 100644 --- a/pkg/htmltext/htmltext.go +++ b/pkg/htmltext/htmltext.go @@ -25,6 +25,8 @@ import ( "net/url" "regexp" "strings" + "sync/atomic" + "unicode" "unicode/utf8" "github.com/Machiel/slugify" @@ -32,6 +34,7 @@ import ( "github.com/apache/answer/pkg/converter" strip "github.com/grokify/html-strip-tags-go" "github.com/mozillazg/go-pinyin" + "github.com/mozillazg/go-unidecode" ) var ( @@ -47,8 +50,27 @@ var ( "\r", " ", "\t", " ", ) + + // Without this, pure non-Latin titles (Arabic, Cyrillic, Hebrew, ...) get + // stripped by slugify and collapse to the "topic" fallback. Chinese is + // handled separately by convertChinese. + transliterateNonLatin atomic.Bool ) +func init() { + transliterateNonLatin.Store(true) +} + +// SetTransliterateNonLatin toggles non-Latin script transliteration for URL slugs. +func SetTransliterateNonLatin(enabled bool) { + transliterateNonLatin.Store(enabled) +} + +// IsTransliterateNonLatinEnabled reports whether non-Latin transliteration is on. +func IsTransliterateNonLatinEnabled() bool { + return transliterateNonLatin.Load() +} + // ClearText clear HTML, get the clear text func ClearText(html string) string { if html == "" { @@ -66,6 +88,9 @@ func ClearText(html string) string { func UrlTitle(title string) (text string) { title = convertChinese(title) + if transliterateNonLatin.Load() { + title = convertNonLatin(title) + } title = clearEmoji(title) title = slugify.Slugify(title) title = url.QueryEscape(title) @@ -95,6 +120,30 @@ func convertChinese(content string) string { return strings.Join(pinyin.LazyConvert(content, nil), "-") } +// Short-circuits on Latin-only / Chinese-only input so existing slugs stay byte-identical. +func convertNonLatin(content string) string { + if !containsNonLatin(content) { + return content + } + return unidecode.Unidecode(content) +} + +func containsNonLatin(content string) bool { + for _, r := range content { + switch { + case r < 0x0080: // ASCII + continue + case r >= 0x0080 && r <= 0x024F: // Latin-1 Supplement, Latin Extended-A/B + continue + case unicode.Is(unicode.Han, r): // handled by convertChinese + continue + case unicode.IsLetter(r): + return true + } + } + return false +} + func cutLongTitle(title string) string { maxBytes := 150 if len(title) <= maxBytes { diff --git a/pkg/htmltext/htmltext_test.go b/pkg/htmltext/htmltext_test.go index 39de9e960..3fd263db6 100644 --- a/pkg/htmltext/htmltext_test.go +++ b/pkg/htmltext/htmltext_test.go @@ -87,6 +87,75 @@ func TestUrlTitle(t *testing.T) { } } +func TestUrlTitleTable(t *testing.T) { + // Long pure-Arabic title: 50 copies of the same Arabic word, joined by spaces. + // Unidecode of "كيف" is "kyf", so the slug becomes "kyf-" repeated and + // exceeds cutLongTitle's 150-byte cap. + longArabic := strings.Repeat("كيف ", 50) + wantLongArabic := strings.Repeat("kyf-", 37) + "ky" // 37*4 + 2 = 150 bytes + + cases := []struct { + name string + title string + want string + }{ + { + name: "empty", + title: "", + want: "topic", + }, + { + name: "pure latin unchanged", + title: "hello world", + want: "hello-world", + }, + { + // Pinyin conversion drops Latin runes by design — matches pre-fix behavior. + name: "pure chinese unchanged", + title: "这是一个,标题,title", + want: "zhe-shi-yi-ge-biao-ti", + }, + { + name: "pure arabic transliterated", + title: "كيف حالك", + want: "kyf-hlk", + }, + { + name: "mixed latin and arabic", + title: "hello كيف", + want: "hello-kyf", + }, + { + name: "emoji only falls back to topic", + title: "😂😂😂", + want: "topic", + }, + { + name: "long arabic truncates at cutLongTitle boundary", + title: longArabic, + want: wantLongArabic, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := UrlTitle(tc.title) + assert.Equal(t, tc.want, got) + }) + } +} + +func TestUrlTitleTransliterationToggle(t *testing.T) { + defer SetTransliterateNonLatin(true) + + SetTransliterateNonLatin(false) + // With transliteration off, pure-Arabic titles collapse to the existing + // "topic" fallback (the pre-fix behavior). + assert.Equal(t, "topic", UrlTitle("كيف حالك")) + + SetTransliterateNonLatin(true) + assert.Equal(t, "kyf-hlk", UrlTitle("كيف حالك")) +} + func TestFindFirstMatchedWord(t *testing.T) { var ( expectedWord,