From a806e07a8be22f92aa81c0feae0885be3e00d798 Mon Sep 17 00:00:00 2001 From: butschster Date: Tue, 31 Mar 2026 22:13:06 +0400 Subject: [PATCH] fix: convert non-UTF-8 email body charset to UTF-8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Email bodies arriving in charsets like windows-1250, iso-8859-2, etc. were stored as raw bytes and displayed as garbled text (�). Added convertToUTF8() using golang.org/x/text to transcode body content based on the charset parameter from Content-Type headers. --- go.mod | 1 + go.sum | 3 ++ modules/smtp/handler.go | 21 ++++++++ modules/smtp/handler_test.go | 93 ++++++++++++++++++++++++++++++++++++ 4 files changed, 118 insertions(+) diff --git a/go.mod b/go.mod index 7b7400c..b440503 100644 --- a/go.mod +++ b/go.mod @@ -34,6 +34,7 @@ require ( github.com/yosida95/uritemplate/v3 v3.0.2 // indirect go.yaml.in/yaml/v2 v2.4.2 // indirect golang.org/x/sys v0.42.0 // indirect + golang.org/x/text v0.35.0 // indirect google.golang.org/protobuf v1.36.8 // indirect modernc.org/libc v1.70.0 // indirect modernc.org/mathutil v1.7.1 // indirect diff --git a/go.sum b/go.sum index 1af3d42..7b50f84 100644 --- a/go.sum +++ b/go.sum @@ -71,9 +71,12 @@ golang.org/x/oauth2 v0.35.0 h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ= golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8= +golang.org/x/text v0.35.0/go.mod 
h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA= golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k= golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0= google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= diff --git a/modules/smtp/handler.go b/modules/smtp/handler.go index 7781723..f259afe 100644 --- a/modules/smtp/handler.go +++ b/modules/smtp/handler.go @@ -19,6 +19,7 @@ import ( "github.com/buggregator/go-buggregator/internal/event" "github.com/buggregator/go-buggregator/internal/storage" gosmtp "github.com/emersion/go-smtp" + "golang.org/x/text/encoding/ianaindex" ) // smtpServer wraps go-smtp and implements tcp.Starter. @@ -226,6 +227,7 @@ func parseEmail(raw []byte, recipients []string) (*ParsedEmail, []parsedAttachme if err != nil || !strings.HasPrefix(mediaType, "multipart/") { body, _ := io.ReadAll(msg.Body) decoded := decodeContent(body, msg.Header.Get("Content-Transfer-Encoding")) + decoded = convertToUTF8(decoded, params["charset"]) if strings.HasPrefix(mediaType, "text/html") { parsed.HTML = string(decoded) } else { @@ -307,6 +309,7 @@ func processPart(part *multipart.Part, parsed *ParsedEmail, atts *[]parsedAttach // Body content. body, _ := io.ReadAll(part) decoded := decodeContent(body, part.Header.Get("Content-Transfer-Encoding")) + decoded = convertToUTF8(decoded, params["charset"]) if strings.HasPrefix(mediaType, "text/html") { parsed.HTML += string(decoded) @@ -345,3 +348,21 @@ func decodeContent(data []byte, encoding string) []byte { return data } } + +// convertToUTF8 converts data from the given charset to UTF-8. +// If charset is empty, "utf-8", or "us-ascii", the data is returned as-is. 
+func convertToUTF8(data []byte, charset string) []byte { + charset = strings.ToLower(strings.TrimSpace(charset)) + if charset == "" || charset == "utf-8" || charset == "us-ascii" { + return data + } + enc, err := ianaindex.MIME.Encoding(charset) + if err != nil || enc == nil { + return data + } + decoded, err := enc.NewDecoder().Bytes(data) + if err != nil { + return data + } + return decoded +} diff --git a/modules/smtp/handler_test.go b/modules/smtp/handler_test.go index 72646f8..60c7790 100644 --- a/modules/smtp/handler_test.go +++ b/modules/smtp/handler_test.go @@ -128,6 +128,99 @@ func TestDecodeContent(t *testing.T) { }) } +func TestConvertToUTF8(t *testing.T) { + t.Run("utf-8 passthrough", func(t *testing.T) { + input := []byte("Привет мир") + got := convertToUTF8(input, "utf-8") + if string(got) != "Привет мир" { + t.Errorf("got %q", got) + } + }) + + t.Run("empty charset passthrough", func(t *testing.T) { + input := []byte("hello") + got := convertToUTF8(input, "") + if string(got) != "hello" { + t.Errorf("got %q", got) + } + }) + + t.Run("windows-1250 czech", func(t *testing.T) { + // "týmu se vám ozve" in windows-1250 + win1250 := []byte{0x74, 0xFD, 0x6D, 0x75, 0x20, 0x73, 0x65, 0x20, 0x76, 0xE1, 0x6D, 0x20, 0x6F, 0x7A, 0x76, 0x65} + got := convertToUTF8(win1250, "windows-1250") + want := "týmu se vám ozve" + if string(got) != want { + t.Errorf("got %q, want %q", got, want) + } + }) + + t.Run("iso-8859-1 latin", func(t *testing.T) { + // "café" in iso-8859-1: 0x63 0x61 0x66 0xe9 + latin1 := []byte{0x63, 0x61, 0x66, 0xe9} + got := convertToUTF8(latin1, "iso-8859-1") + if string(got) != "café" { + t.Errorf("got %q, want %q", got, "café") + } + }) + + t.Run("unknown charset passthrough", func(t *testing.T) { + input := []byte("data") + got := convertToUTF8(input, "x-nonexistent-999") + if string(got) != "data" { + t.Errorf("got %q", got) + } + }) +} + +func TestParseEmail_CharsetConversion(t *testing.T) { + t.Run("single part windows-1250", func(t 
*testing.T) { + // "café" in windows-1250: 0x63 0x61 0x66 0xe9 + body := string([]byte{0x63, 0x61, 0x66, 0xe9}) + raw := []byte("From: sender@example.com\r\nTo: recipient@example.com\r\nSubject: Test\r\nContent-Type: text/plain; charset=windows-1250\r\n\r\n" + body) + + parsed, _, err := parseEmail(raw, []string{"recipient@example.com"}) + if err != nil { + t.Fatal(err) + } + if parsed.Text != "café" { + t.Errorf("Text = %q, want %q", parsed.Text, "café") + } + }) + + t.Run("single part html iso-8859-2", func(t *testing.T) { + // "Vaše zpráva" in iso-8859-2: V=56 a=61 š=B9 e=65 sp=20 z=7A p=70 r=72 á=E1 v=76 a=61 + body := string([]byte{0x56, 0x61, 0xB9, 0x65, 0x20, 0x7A, 0x70, 0x72, 0xE1, 0x76, 0x61}) + raw := []byte("From: sender@example.com\r\nTo: recipient@example.com\r\nSubject: Test\r\nContent-Type: text/html; charset=iso-8859-2\r\n\r\n" + body) + + parsed, _, err := parseEmail(raw, []string{"recipient@example.com"}) + if err != nil { + t.Fatal(err) + } + want := "Vaše zpráva" + if parsed.HTML != want { + t.Errorf("HTML = %q, want %q", parsed.HTML, want) + } + }) + + t.Run("multipart with charset", func(t *testing.T) { + // "café" in iso-8859-1 + body := string([]byte{0x63, 0x61, 0x66, 0xe9}) + raw := []byte("From: sender@example.com\r\nTo: recipient@example.com\r\nSubject: Test\r\nContent-Type: multipart/alternative; boundary=\"bnd\"\r\n\r\n--bnd\r\nContent-Type: text/plain; charset=iso-8859-1\r\n\r\n" + body + "\r\n--bnd\r\nContent-Type: text/html; charset=iso-8859-1\r\n\r\n
<p>" + body + "</p>
\r\n--bnd--") + + parsed, _, err := parseEmail(raw, []string{"recipient@example.com"}) + if err != nil { + t.Fatal(err) + } + if !strings.Contains(parsed.Text, "café") { + t.Errorf("Text = %q, want to contain %q", parsed.Text, "café") + } + if !strings.Contains(parsed.HTML, "café") { + t.Errorf("HTML = %q, want to contain %q", parsed.HTML, "café") + } + }) +} + func TestParseAddresses(t *testing.T) { t.Run("name and email", func(t *testing.T) { header := make(map[string][]string)