Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ go get github.com/joshtechnologygroup/chardet
<details>
<summary>Expand the list of supported encodings</summary>

- **Ascii**
- **ASCII** (“US-ASCII”)
- **UTF-8**
- **UTF-8-SIG**
- **UTF-16**
Expand All @@ -40,13 +40,13 @@ go get github.com/joshtechnologygroup/chardet
- **UTF-32LE**
- **GB2312**
- **HZ-GB-2312**
- **SHIFT_JIS**
- **Shift_JIS**
- **Big5**
- **Johab**
- **Johab** (“KS_C_5601-1987”)
- **KOI8-R**
- **TIS-620**
- **MacCyrillic**
- **MacRoman**
- **MacRoman** (“macintosh”)
- **EUC-TW**
- **EUC-KR**
- **EUC-JP**
Expand Down Expand Up @@ -112,7 +112,7 @@ func main() {
data := []byte("Your text data here...")
result := chardet.Detect(data)
fmt.Printf("Detected result: %+v\n", result)
//Output: Detected result: {Encoding:Ascii Confidence:1 Language:}
//Output: Detected result: {Encoding:US-ASCII Confidence:1 Language:}
}
```

Expand All @@ -139,7 +139,7 @@ func main() {
// Get the result
result := detector.GetResult()
fmt.Printf("Detected result: %+v\n", result)
// Output: Detected result: {Encoding:Ascii Confidence:1 Language:}
// Output: Detected result: {Encoding:US-ASCII Confidence:1 Language:}
}
```

Expand Down
23 changes: 13 additions & 10 deletions consts/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,11 @@ const (
Turkish = "Turkish"
)

// Please use IANA names when extending this
//
// http://www.iana.org/assignments/character-sets/character-sets.xhtml
const (
Ascii = "Ascii"
Ascii = "US-ASCII" // IANA Name
UTF8 = "UTF-8"
UTF8SIG = "UTF-8-SIG"
UTF16 = "UTF-16"
Expand All @@ -25,21 +28,21 @@ const (

GB2312 = "GB2312"
HzGB2312 = "HZ-GB-2312"
ShiftJis = "SHIFT_JIS"
ShiftJis = "Shift_JIS"
Big5 = "Big5"
Johab = "Johab"
Johab = "KS_C_5601-1987" // IANA name
Koi8R = "KOI8-R"
TIS620 = "TIS-620"

MacCyrillic = "MacCyrillic"
MacRoman = "MacRoman"
MacCyrillic = "MacCyrillic" // Not in IANA
MacRoman = "macintosh" // IANA name

EucTw = "EUC-TW"
EucTw = "EUC-TW" // Not in IANA
EucKr = "EUC-KR"
EucJp = "EUC-JP"

CP932 = "CP932"
CP949 = "CP949"
CP932 = "CP932" // Not in IANA
CP949 = "CP949" // Not in IANA

Windows1250 = "Windows-1250"
Windows1251 = "Windows-1251"
Expand All @@ -61,8 +64,8 @@ const (
ISO2022CN = "ISO-2022-CN"
ISO2022JP = "ISO-2022-JP"
ISO2022KR = "ISO-2022-KR"
UCS43412 = "X-ISO-10646-UCS-4-3412"
UCS42143 = "X-ISO-10646-UCS-4-2143"
UCS43412 = "X-ISO-10646-UCS-4-3412" // Testing “Unusual Octet” byte order
UCS42143 = "X-ISO-10646-UCS-4-2143" // Testing “Unusual Octet” byte order

IBM855 = "IBM855"
IBM866 = "IBM866"
Expand Down
6 changes: 5 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
module github.com/joshtechnologygroup/chardet

go 1.20
go 1.24.0

toolchain go1.24.4

require golang.org/x/text v0.32.0 // indirect
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU=
golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY=
61 changes: 61 additions & 0 deletions lookup/lookup.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
package lookup

import (
"strings"

"golang.org/x/text/encoding"
"golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/ianaindex"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/encoding/unicode/utf32"
)

// LookupEncoding looks up a `golang.org/x/text/encoding` encoding by name.
//
// Recognizes all encodings returned by this library for which a corresponding
// or compatible superset encoder exists. The name is matched
// case-insensitively.
//
// Like `golang.org/x/text/encoding/ianaindex.IANA.Encoding`, this returns
// `encoding, nil` on success, `nil, err` on error and `nil, nil` if the name
// was correct but has no corresponding encoder.
func LookupEncoding(name string) (encoding.Encoding, error) {
	// Normalize case so the fallback switch below can match on lowercase
	// literals.
	name = strings.ToLower(name)

	// First try the IANA index. The result is named `enc` (not `encoding`) so
	// that it does not shadow the imported `encoding` package.
	enc, err := ianaindex.IANA.Encoding(name)
	if enc != nil {
		return enc, nil
	}

	// The index had no encoder for this name (either it is unknown to the
	// index, or known but unimplemented); consult our own fallback table.
	switch name {
	// UTF-32 family appears to be an omission
	case "utf-32", "csutf32":
		return utf32.UTF32(utf32.BigEndian, utf32.UseBOM), nil
	case "utf-32be", "csutf32be":
		return utf32.UTF32(utf32.BigEndian, utf32.IgnoreBOM), nil
	case "utf-32le", "csutf32le":
		return utf32.UTF32(utf32.LittleEndian, utf32.IgnoreBOM), nil

	// GB2312 is a subset of GBK which in turn is a subset of GB18030, so the
	// GB18030 codec is handed out as a compatible superset for all of them.
	// ("csgb2312" is the lowercased IANA alias csGB2312.)
	case "gb2312", "csgb2312",
		"gbk", "cp936", "ms936", "windows-936", "csgbk":
		return simplifiedchinese.GB18030, nil

	// MacCyrillic is missing IANA designation
	case "maccyrillic", "x-mac-cyrillic":
		return charmap.MacintoshCyrillic, nil

	// Not supported and not in IANA are:
	case "euc-tw",
		"cp932", "ms932", "windows-932", "windows-31j", // Similar to Shift-JIS
		"cp949", "ms949", "windows-949": // Extension of EUC-KR
		return nil, nil

	// Not supported but in IANA are (`err` will be `nil` for these):
	//  * ISO-2022-CN
	//  * ISO-2022-KR
	//  * TIS-620
	default:
		// Propagate whatever the IANA index reported: a non-nil error for
		// names it rejected, or nil for names it accepted but has no
		// encoder for.
		return nil, err
	}
}
Loading