Skip to content

Commit dece0fe

Browse files
authored
Merge pull request #324 from buggregator/fix/smtp-charset-conversion
fix: convert non-UTF-8 email body charset to UTF-8
2 parents 32b9af3 + a806e07 commit dece0fe

File tree

4 files changed

+118
-0
lines changed

4 files changed

+118
-0
lines changed

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ require (
3434
github.com/yosida95/uritemplate/v3 v3.0.2 // indirect
3535
go.yaml.in/yaml/v2 v2.4.2 // indirect
3636
golang.org/x/sys v0.42.0 // indirect
37+
golang.org/x/text v0.35.0 // indirect
3738
google.golang.org/protobuf v1.36.8 // indirect
3839
modernc.org/libc v1.70.0 // indirect
3940
modernc.org/mathutil v1.7.1 // indirect

go.sum

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,9 +71,12 @@ golang.org/x/oauth2 v0.35.0 h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ=
7171
golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
7272
golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
7373
golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
74+
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
7475
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
7576
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
7677
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
78+
golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8=
79+
golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA=
7780
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
7881
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
7982
google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc=

modules/smtp/handler.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"github.com/buggregator/go-buggregator/internal/event"
2020
"github.com/buggregator/go-buggregator/internal/storage"
2121
gosmtp "github.com/emersion/go-smtp"
22+
"golang.org/x/text/encoding/ianaindex"
2223
)
2324

2425
// smtpServer wraps go-smtp and implements tcp.Starter.
@@ -226,6 +227,7 @@ func parseEmail(raw []byte, recipients []string) (*ParsedEmail, []parsedAttachme
226227
if err != nil || !strings.HasPrefix(mediaType, "multipart/") {
227228
body, _ := io.ReadAll(msg.Body)
228229
decoded := decodeContent(body, msg.Header.Get("Content-Transfer-Encoding"))
230+
decoded = convertToUTF8(decoded, params["charset"])
229231
if strings.HasPrefix(mediaType, "text/html") {
230232
parsed.HTML = string(decoded)
231233
} else {
@@ -307,6 +309,7 @@ func processPart(part *multipart.Part, parsed *ParsedEmail, atts *[]parsedAttach
307309
// Body content.
308310
body, _ := io.ReadAll(part)
309311
decoded := decodeContent(body, part.Header.Get("Content-Transfer-Encoding"))
312+
decoded = convertToUTF8(decoded, params["charset"])
310313

311314
if strings.HasPrefix(mediaType, "text/html") {
312315
parsed.HTML += string(decoded)
@@ -345,3 +348,21 @@ func decodeContent(data []byte, encoding string) []byte {
345348
return data
346349
}
347350
}
351+
352+
// convertToUTF8 converts data from the given charset to UTF-8.
353+
// If charset is empty, "utf-8", or "us-ascii", the data is returned as-is.
354+
func convertToUTF8(data []byte, charset string) []byte {
355+
charset = strings.ToLower(strings.TrimSpace(charset))
356+
if charset == "" || charset == "utf-8" || charset == "us-ascii" {
357+
return data
358+
}
359+
enc, err := ianaindex.MIME.Encoding(charset)
360+
if err != nil || enc == nil {
361+
return data
362+
}
363+
decoded, err := enc.NewDecoder().Bytes(data)
364+
if err != nil {
365+
return data
366+
}
367+
return decoded
368+
}

modules/smtp/handler_test.go

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,99 @@ func TestDecodeContent(t *testing.T) {
128128
})
129129
}
130130

131+
func TestConvertToUTF8(t *testing.T) {
132+
t.Run("utf-8 passthrough", func(t *testing.T) {
133+
input := []byte("Привет мир")
134+
got := convertToUTF8(input, "utf-8")
135+
if string(got) != "Привет мир" {
136+
t.Errorf("got %q", got)
137+
}
138+
})
139+
140+
t.Run("empty charset passthrough", func(t *testing.T) {
141+
input := []byte("hello")
142+
got := convertToUTF8(input, "")
143+
if string(got) != "hello" {
144+
t.Errorf("got %q", got)
145+
}
146+
})
147+
148+
t.Run("windows-1250 czech", func(t *testing.T) {
149+
// "týmu se vám ozve" in windows-1250
150+
win1250 := []byte{0x74, 0xFD, 0x6D, 0x75, 0x20, 0x73, 0x65, 0x20, 0x76, 0xE1, 0x6D, 0x20, 0x6F, 0x7A, 0x76, 0x65}
151+
got := convertToUTF8(win1250, "windows-1250")
152+
want := "týmu se vám ozve"
153+
if string(got) != want {
154+
t.Errorf("got %q, want %q", got, want)
155+
}
156+
})
157+
158+
t.Run("iso-8859-1 latin", func(t *testing.T) {
159+
// "café" in iso-8859-1: 0x63 0x61 0x66 0xe9
160+
latin1 := []byte{0x63, 0x61, 0x66, 0xe9}
161+
got := convertToUTF8(latin1, "iso-8859-1")
162+
if string(got) != "café" {
163+
t.Errorf("got %q, want %q", got, "café")
164+
}
165+
})
166+
167+
t.Run("unknown charset passthrough", func(t *testing.T) {
168+
input := []byte("data")
169+
got := convertToUTF8(input, "x-nonexistent-999")
170+
if string(got) != "data" {
171+
t.Errorf("got %q", got)
172+
}
173+
})
174+
}
175+
176+
func TestParseEmail_CharsetConversion(t *testing.T) {
177+
t.Run("single part windows-1250", func(t *testing.T) {
178+
// "café" in windows-1250: 0x63 0x61 0x66 0xe9
179+
body := string([]byte{0x63, 0x61, 0x66, 0xe9})
180+
raw := []byte("From: sender@example.com\r\nTo: recipient@example.com\r\nSubject: Test\r\nContent-Type: text/plain; charset=windows-1250\r\n\r\n" + body)
181+
182+
parsed, _, err := parseEmail(raw, []string{"recipient@example.com"})
183+
if err != nil {
184+
t.Fatal(err)
185+
}
186+
if parsed.Text != "café" {
187+
t.Errorf("Text = %q, want %q", parsed.Text, "café")
188+
}
189+
})
190+
191+
t.Run("single part html iso-8859-2", func(t *testing.T) {
192+
// "Vaše zpráva" in iso-8859-2: V=56 a=61 š=B9 e=65 sp=20 z=7A p=70 r=72 á=E1 v=76 a=61
193+
body := string([]byte{0x56, 0x61, 0xB9, 0x65, 0x20, 0x7A, 0x70, 0x72, 0xE1, 0x76, 0x61})
194+
raw := []byte("From: sender@example.com\r\nTo: recipient@example.com\r\nSubject: Test\r\nContent-Type: text/html; charset=iso-8859-2\r\n\r\n" + body)
195+
196+
parsed, _, err := parseEmail(raw, []string{"recipient@example.com"})
197+
if err != nil {
198+
t.Fatal(err)
199+
}
200+
want := "Vaše zpráva"
201+
if parsed.HTML != want {
202+
t.Errorf("HTML = %q, want %q", parsed.HTML, want)
203+
}
204+
})
205+
206+
t.Run("multipart with charset", func(t *testing.T) {
207+
// "café" in iso-8859-1
208+
body := string([]byte{0x63, 0x61, 0x66, 0xe9})
209+
raw := []byte("From: sender@example.com\r\nTo: recipient@example.com\r\nSubject: Test\r\nContent-Type: multipart/alternative; boundary=\"bnd\"\r\n\r\n--bnd\r\nContent-Type: text/plain; charset=iso-8859-1\r\n\r\n" + body + "\r\n--bnd\r\nContent-Type: text/html; charset=iso-8859-1\r\n\r\n<p>" + body + "</p>\r\n--bnd--")
210+
211+
parsed, _, err := parseEmail(raw, []string{"recipient@example.com"})
212+
if err != nil {
213+
t.Fatal(err)
214+
}
215+
if !strings.Contains(parsed.Text, "café") {
216+
t.Errorf("Text = %q, want to contain %q", parsed.Text, "café")
217+
}
218+
if !strings.Contains(parsed.HTML, "café") {
219+
t.Errorf("HTML = %q, want to contain %q", parsed.HTML, "café")
220+
}
221+
})
222+
}
223+
131224
func TestParseAddresses(t *testing.T) {
132225
t.Run("name and email", func(t *testing.T) {
133226
header := make(map[string][]string)

0 commit comments

Comments
 (0)