diff --git a/filetype.go b/filetype.go index 3753c03..f202b90 100644 --- a/filetype.go +++ b/filetype.go @@ -24,7 +24,7 @@ var ErrUnknownBuffer = errors.New("Unknown buffer type") // AddType registers a new file type func AddType(ext, mime string) types.Type { - return types.NewType(ext, mime) + return types.NewType(ext, mime, 0) } // Is checks if a given buffer matches with the given file type extension diff --git a/fixtures/sample.docx b/fixtures/sample.docx new file mode 100644 index 0000000..dc56c55 Binary files /dev/null and b/fixtures/sample.docx differ diff --git a/fixtures/sample.pptx b/fixtures/sample.pptx new file mode 100644 index 0000000..2e8e813 Binary files /dev/null and b/fixtures/sample.pptx differ diff --git a/fixtures/sample.xlsx b/fixtures/sample.xlsx new file mode 100644 index 0000000..40c47d7 Binary files /dev/null and b/fixtures/sample.xlsx differ diff --git a/match.go b/match.go index 9b6e376..f341e10 100644 --- a/match.go +++ b/match.go @@ -2,6 +2,7 @@ package filetype import ( "io" + "io/ioutil" "os" "gopkg.in/h2non/filetype.v1/matchers" @@ -14,6 +15,9 @@ var Matchers = matchers.Matchers // NewMatcher is an alias to matchers.NewMatcher var NewMatcher = matchers.NewMatcher +// MatcherTypes is an alias to `matchers.MatcherTypes` +var MatcherTypes = matchers.MatcherTypes + // Match infers the file type of a given buffer inspecting its magic numbers signature func Match(buf []byte) (types.Type, error) { length := len(buf) @@ -21,7 +25,9 @@ func Match(buf []byte) (types.Type, error) { return types.Unknown, ErrEmptyBuffer } - for _, checker := range Matchers { + for _, typ := range MatcherTypes { + checker := Matchers[*typ] + match := checker(buf) if match != types.Unknown && match.Extension != "" { return match, nil @@ -49,10 +55,8 @@ func MatchFile(filepath string) (types.Type, error) { // MatchReader is convenient wrapper to Match() any Reader func MatchReader(reader io.Reader) (types.Type, error) { - buffer := make([]byte, 512) - - _, err := reader.Read(buffer) - if err != nil && err != io.EOF { + buffer, err := ioutil.ReadAll(reader) + if err != nil { return types.Unknown, err } diff --git a/match_test.go b/match_test.go index 5dcbb5e..604a499 100644 --- a/match_test.go +++ b/match_test.go @@ -146,6 +146,27 @@ func TestMatchesMap(t *testing.T) { } } +func TestMatchXlsx(t *testing.T) { + kind, _ := MatchFile("./fixtures/sample.xlsx") + if kind.Extension != matchers.TypeXlsx.Extension { + t.Fatalf("Invalid document type: %s != %s", kind.Extension, matchers.TypeDocx.Extension) + } +} + +func TestMatchDocx(t *testing.T) { + kind, _ := MatchFile("./fixtures/sample.docx") + if kind.Extension != matchers.TypeDocx.Extension { + t.Fatalf("Invalid document type: %s != %s", kind.Extension, matchers.TypeDocx.Extension) + } +} + +func TestMatchPptx(t *testing.T) { + kind, _ := MatchFile("./fixtures/sample.pptx") + if kind.Extension != matchers.TypePptx.Extension { + t.Fatalf("Invalid document type: %s != %s", kind.Extension, matchers.TypePptx.Extension) + } +} + // // Benchmarks // @@ -155,6 +176,9 @@ var zipBuffer, _ = ioutil.ReadFile("./fixtures/sample.zip") var jpgBuffer, _ = ioutil.ReadFile("./fixtures/sample.jpg") var gifBuffer, _ = ioutil.ReadFile("./fixtures/sample.gif") var pngBuffer, _ = ioutil.ReadFile("./fixtures/sample.png") +var xlsxBuffer, _ = ioutil.ReadFile("./fixtures/sample.xlsx") +var docxBuffer, _ = ioutil.ReadFile("./fixtures/sample.docx") +var pptxBuffer, _ = ioutil.ReadFile("./fixtures/sample.pptx") func BenchmarkMatchTar(b *testing.B) { for n := 0; n < b.N; n++ { @@ -185,3 +209,21 @@ func BenchmarkMatchPng(b *testing.B) { Match(pngBuffer) } } + +func BenchmarkMatchXlsx(b *testing.B) { + for n := 0; n < b.N; n++ { + Match(xlsxBuffer) + } +} + +func BenchmarkMatchDocx(b *testing.B) { + for n := 0; n < b.N; n++ { + Match(docxBuffer) + } +} + +func BenchmarkMatchPptx(b *testing.B) { + for n := 0; n < b.N; n++ { + Match(pptxBuffer) + } +} diff --git a/matchers/archive.go b/matchers/archive.go index 9c1270f..560d4a7 100644 --- a/matchers/archive.go +++ b/matchers/archive.go @@ -1,30 +1,32 @@ package matchers +const defaultArchivePriority = 10 + var ( - TypeEpub = newType("epub", "application/epub+zip") - TypeZip = newType("zip", "application/zip") - TypeTar = newType("tar", "application/x-tar") - TypeRar = newType("rar", "application/x-rar-compressed") - TypeGz = newType("gz", "application/gzip") - TypeBz2 = newType("bz2", "application/x-bzip2") - Type7z = newType("7z", "application/x-7z-compressed") - TypeXz = newType("xz", "application/x-xz") - TypePdf = newType("pdf", "application/pdf") - TypeExe = newType("exe", "application/x-msdownload") - TypeSwf = newType("swf", "application/x-shockwave-flash") - TypeRtf = newType("rtf", "application/rtf") - TypeEot = newType("eot", "application/octet-stream") - TypePs = newType("ps", "application/postscript") - TypeSqlite = newType("sqlite", "application/x-sqlite3") - TypeNes = newType("nes", "application/x-nintendo-nes-rom") - TypeCrx = newType("crx", "application/x-google-chrome-extension") - TypeCab = newType("cab", "application/vnd.ms-cab-compressed") - TypeDeb = newType("deb", "application/x-deb") - TypeAr = newType("ar", "application/x-unix-archive") - TypeZ = newType("Z", "application/x-compress") - TypeLz = newType("lz", "application/x-lzip") - TypeRpm = newType("rpm", "application/x-rpm") - TypeElf = newType("elf", "application/x-executable") + TypeEpub = newType("epub", "application/epub+zip", defaultArchivePriority) + TypeZip = newType("zip", "application/zip", defaultArchivePriority) + TypeTar = newType("tar", "application/x-tar", defaultArchivePriority) + TypeRar = newType("rar", "application/x-rar-compressed", defaultArchivePriority) + TypeGz = newType("gz", "application/gzip", defaultArchivePriority) + TypeBz2 = newType("bz2", "application/x-bzip2", defaultArchivePriority) + Type7z = newType("7z", "application/x-7z-compressed", defaultArchivePriority) + TypeXz = newType("xz", "application/x-xz", defaultArchivePriority) + TypePdf = newType("pdf", "application/pdf", defaultArchivePriority) + TypeExe = newType("exe", "application/x-msdownload", defaultArchivePriority) + TypeSwf = newType("swf", "application/x-shockwave-flash", defaultArchivePriority) + TypeRtf = newType("rtf", "application/rtf", defaultArchivePriority) + TypeEot = newType("eot", "application/octet-stream", defaultArchivePriority) + TypePs = newType("ps", "application/postscript", defaultArchivePriority) + TypeSqlite = newType("sqlite", "application/x-sqlite3", defaultArchivePriority) + TypeNes = newType("nes", "application/x-nintendo-nes-rom", defaultArchivePriority) + TypeCrx = newType("crx", "application/x-google-chrome-extension", defaultArchivePriority) + TypeCab = newType("cab", "application/vnd.ms-cab-compressed", defaultArchivePriority) + TypeDeb = newType("deb", "application/x-deb", defaultArchivePriority) + TypeAr = newType("ar", "application/x-unix-archive", defaultArchivePriority) + TypeZ = newType("Z", "application/x-compress", defaultArchivePriority) + TypeLz = newType("lz", "application/x-lzip", defaultArchivePriority) + TypeRpm = newType("rpm", "application/x-rpm", defaultArchivePriority) + TypeElf = newType("elf", "application/x-executable", defaultArchivePriority) ) var Archive = Map{ diff --git a/matchers/audio.go b/matchers/audio.go index 7b27caf..49d3060 100644 --- a/matchers/audio.go +++ b/matchers/audio.go @@ -1,13 +1,15 @@ package matchers +const defaultAudioPriority = 800 + var ( - TypeMidi = newType("mid", "audio/midi") - TypeMp3 = newType("mp3", "audio/mpeg") - TypeM4a = newType("m4a", "audio/m4a") - TypeOgg = newType("ogg", "audio/ogg") - TypeFlac = newType("flac", "audio/x-flac") - TypeWav = newType("wav", "audio/x-wav") - TypeAmr = newType("amr", "audio/amr") + TypeMidi = newType("mid", "audio/midi", defaultAudioPriority) + TypeMp3 = newType("mp3", "audio/mpeg", defaultAudioPriority) + TypeM4a = newType("m4a", "audio/m4a", defaultAudioPriority) + TypeOgg = newType("ogg", "audio/ogg", defaultAudioPriority) + TypeFlac = newType("flac", "audio/x-flac", defaultAudioPriority) + TypeWav = newType("wav", "audio/x-wav", defaultAudioPriority) + TypeAmr = newType("amr", "audio/amr", defaultAudioPriority) ) var Audio = Map{ diff --git a/matchers/document.go b/matchers/document.go index cc5ded2..ce11c51 100644 --- a/matchers/document.go +++ b/matchers/document.go @@ -1,14 +1,21 @@ package matchers -import "bytes" +import ( + "bytes" + + "gopkg.in/h2non/filetype.v1/util" +) + +const defaultDocumentPriority = 500 +const officeContentTypesFileName = "[Content_Types].xml" var ( - TypeDoc = newType("doc", "application/msword") - TypeDocx = newType("docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document") - TypeXls = newType("xls", "application/vnd.ms-excel") - TypeXlsx = newType("xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet") - TypePpt = newType("ppt", "application/vnd.ms-powerpoint") - TypePptx = newType("pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation") + TypeDoc = newType("doc", "application/msword", defaultDocumentPriority) + TypeDocx = newType("docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", defaultDocumentPriority) + TypeXls = newType("xls", "application/vnd.ms-excel", defaultDocumentPriority) + TypeXlsx = newType("xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", defaultDocumentPriority) + TypePpt = newType("ppt", "application/vnd.ms-powerpoint", defaultDocumentPriority) + TypePptx = newType("pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation", defaultDocumentPriority) ) var Document = Map{ @@ -29,10 +36,20 @@ func Doc(buf []byte) bool { } func Docx(buf []byte) bool { - return len(buf) > 3 && - buf[0] == 0x50 && buf[1] == 0x4B && - buf[2] == 0x03 && buf[3] == 0x04 && - bytes.Contains(buf[:256], []byte(TypeDocx.MIME.Value)) + if len(buf) <= 3 { + return false + } + if buf[0] != 0x50 || buf[1] != 0x4B || + buf[2] != 0x03 || buf[3] != 0x04 { + return false + } + + xml, err := util.ReadMSOfficeXMLFile(buf, officeContentTypesFileName) + if err != nil { + return false + } + + return bytes.Contains(xml, []byte(TypeDocx.MIME.Value)) } func Xls(buf []byte) bool { @@ -44,10 +61,20 @@ func Xls(buf []byte) bool { } func Xlsx(buf []byte) bool { - return len(buf) > 3 && - buf[0] == 0x50 && buf[1] == 0x4B && - buf[2] == 0x03 && buf[3] == 0x04 && - bytes.Contains(buf[:256], []byte(TypeXlsx.MIME.Value)) + if len(buf) <= 3 { + return false + } + if buf[0] != 0x50 || buf[1] != 0x4B || + buf[2] != 0x03 || buf[3] != 0x04 { + return false + } + + xml, err := util.ReadMSOfficeXMLFile(buf, officeContentTypesFileName) + if err != nil { + return false + } + + return bytes.Contains(xml, []byte(TypeXlsx.MIME.Value)) } func Ppt(buf []byte) bool { @@ -59,8 +86,18 @@ func Ppt(buf []byte) bool { } func Pptx(buf []byte) bool { - return len(buf) > 3 && - buf[0] == 0x50 && buf[1] == 0x4B && - buf[2] == 0x07 && buf[3] == 0x08 && - bytes.Contains(buf[:256], []byte(TypePptx.MIME.Value)) + if len(buf) <= 3 { + return false + } + if buf[0] != 0x50 || buf[1] != 0x4B || + buf[2] != 0x03 || buf[3] != 0x04 { + return false + } + + xml, err := util.ReadMSOfficeXMLFile(buf, officeContentTypesFileName) + if err != nil { + return false + } + + return bytes.Contains(xml, []byte(TypePptx.MIME.Value)) } diff --git a/matchers/font.go b/matchers/font.go index f391716..f65feb7 100644 --- a/matchers/font.go +++ b/matchers/font.go @@ -1,10 +1,12 @@ package matchers +const defaultFontPriority = 700 + var ( - TypeWoff = newType("woff", "application/font-woff") - TypeWoff2 = newType("woff2", "application/font-woff") - TypeTtf = newType("ttf", "application/font-sfnt") - TypeOtf = newType("otf", "application/font-sfnt") + TypeWoff = newType("woff", "application/font-woff", defaultFontPriority) + TypeWoff2 = newType("woff2", "application/font-woff", defaultFontPriority) + TypeTtf = newType("ttf", "application/font-sfnt", defaultFontPriority) + TypeOtf = newType("otf", "application/font-sfnt", defaultFontPriority) ) var Font = Map{ diff --git a/matchers/image.go b/matchers/image.go index bc3378d..640eaff 100644 --- a/matchers/image.go +++ b/matchers/image.go @@ -1,16 +1,18 @@ package matchers +const defaultImagePriority = 1000 + var ( - TypeJpeg = newType("jpg", "image/jpeg") - TypePng = newType("png", "image/png") - TypeGif = newType("gif", "image/gif") - TypeWebp = newType("webp", "image/webp") - TypeCR2 = newType("cr2", "image/x-canon-cr2") - TypeTiff = newType("tif", "image/tiff") - TypeBmp = newType("bmp", "image/bmp") - TypeJxr = newType("jxr", "image/vnd.ms-photo") - TypePsd = newType("psd", "image/vnd.adobe.photoshop") - TypeIco = newType("ico", "image/x-icon") + TypeJpeg = newType("jpg", "image/jpeg", defaultImagePriority) + TypePng = newType("png", "image/png", defaultImagePriority) + TypeGif = newType("gif", "image/gif", defaultImagePriority) + TypeWebp = newType("webp", "image/webp", defaultImagePriority) + TypeCR2 = newType("cr2", "image/x-canon-cr2", defaultImagePriority) + TypeTiff = newType("tif", "image/tiff", defaultImagePriority) + TypeBmp = newType("bmp", "image/bmp", defaultImagePriority) + TypeJxr = newType("jxr", "image/vnd.ms-photo", defaultImagePriority) + TypePsd = newType("psd", "image/vnd.adobe.photoshop", defaultImagePriority) + TypeIco = newType("ico", "image/x-icon", defaultImagePriority) ) var Image = Map{ diff --git a/matchers/matchers.go b/matchers/matchers.go index 4525c02..3265a92 100644 --- a/matchers/matchers.go +++ b/matchers/matchers.go @@ -1,6 +1,10 @@ package matchers -import "gopkg.in/h2non/filetype.v1/types" +import ( + "sort" + + "gopkg.in/h2non/filetype.v1/types" +) // Internal shortcut to NewType var newType = types.NewType @@ -17,6 +21,15 @@ type TypeMatcher func([]byte) types.Type // Store registered file type matchers var Matchers = make(map[types.Type]TypeMatcher) +// MatcherTypes store sorted matcher key +var MatcherTypes = make([]*types.Type, 0) + +type typs []*types.Type + +func (t typs) Len() int { return len(t) } +func (t typs) Swap(i, j int) { t[i], t[j] = t[j], t[i] } +func (t typs) Less(i, j int) bool { return t[i].Priority < t[j].Priority } + // Create and register a new type matcher function func NewMatcher(kind types.Type, fn Matcher) TypeMatcher { matcher := func(buf []byte) types.Type { @@ -27,15 +40,23 @@ func NewMatcher(kind types.Type, fn Matcher) TypeMatcher { } Matchers[kind] = matcher + MatcherTypes = append(MatcherTypes, &kind) + return matcher } +// When iterating over a map with a range loop, +// the iteration order is not specified and is not guaranteed to be the same from one iteration to the next +// If you require a stable iteration order you must maintain a separate data structure that specifies that order +// see: https://blog.golang.org/go-maps-in-action func register(matchers ...Map) { for _, m := range matchers { for kind, matcher := range m { NewMatcher(kind, matcher) } } + + sort.Sort(sort.Reverse(typs(MatcherTypes))) } func init() { diff --git a/matchers/video.go b/matchers/video.go index 7714dce..94fa0b7 100644 --- a/matchers/video.go +++ b/matchers/video.go @@ -1,15 +1,17 @@ package matchers +const defaultVideoPriority = 900 + var ( - TypeMp4 = newType("mp4", "video/mp4") - TypeM4v = newType("m4v", "video/x-m4v") - TypeMkv = newType("mkv", "video/x-matroska") - TypeWebm = newType("webm", "video/webm") - TypeMov = newType("mov", "video/quicktime") - TypeAvi = newType("avi", "video/x-msvideo") - TypeWmv = newType("wmv", "video/x-ms-wmv") - TypeMpeg = newType("mpg", "video/mpeg") - TypeFlv = newType("flv", "video/x-flv") + TypeMp4 = newType("mp4", "video/mp4", defaultVideoPriority) + TypeM4v = newType("m4v", "video/x-m4v", defaultVideoPriority) + TypeMkv = newType("mkv", "video/x-matroska", defaultVideoPriority) + TypeWebm = newType("webm", "video/webm", defaultVideoPriority) + TypeMov = newType("mov", "video/quicktime", defaultVideoPriority) + TypeAvi = newType("avi", "video/x-msvideo", defaultVideoPriority) + TypeWmv = newType("wmv", "video/x-ms-wmv", defaultVideoPriority) + TypeMpeg = newType("mpg", "video/mpeg", defaultVideoPriority) + TypeFlv = newType("flv", "video/x-flv", defaultVideoPriority) ) var Video = Map{ diff --git a/types/defaults.go b/types/defaults.go index bb1ea62..fa41327 100644 --- a/types/defaults.go +++ b/types/defaults.go @@ -1,4 +1,4 @@ package types // Unkown default type -var Unknown = NewType("unknown", "") +var Unknown = NewType("unknown", "", -1) diff --git a/types/type.go b/types/type.go index 5cf7dfc..36264c3 100644 --- a/types/type.go +++ b/types/type.go @@ -4,13 +4,15 @@ package types type Type struct { MIME MIME Extension string + Priority int } // NewType creates a new Type -func NewType(ext, mime string) Type { +func NewType(ext, mime string, p int) Type { t := Type{ MIME: NewMIME(mime), Extension: ext, + Priority: p, } return Add(t) } diff --git a/util/util.go b/util/util.go new file mode 100644 index 0000000..dc82a26 --- /dev/null +++ b/util/util.go @@ -0,0 +1,43 @@ +package util + +import ( + "archive/zip" + "bytes" + "fmt" + "io/ioutil" +) + +// ReadMSOfficeXMLFile Read the contents of the xml file +// contained in the ms office file according to the specified file name. +func ReadMSOfficeXMLFile(content []byte, filename string) ([]byte, error) { + br := bytes.NewReader(content) + zipr, err := zip.NewReader(br, int64(len(content))) + if err != nil { + return nil, err + } + + var file *zip.File + for _, f := range zipr.File { + if f.FileInfo().Name() == filename { + file = f + break + } + } + + if file == nil { + return nil, fmt.Errorf("The specified file could not be found: %s", filename) + } + + rc, err := file.Open() + if err != nil { + return nil, err + } + defer rc.Close() + + buf, err := ioutil.ReadAll(rc) + if err != nil { + return nil, err + } + + return buf, nil +}