-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdir.go
More file actions
1330 lines (1232 loc) · 39.9 KB
/
dir.go
File metadata and controls
1330 lines (1232 loc) · 39.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
package main
/*
Copyright 2023, 2024, 2025, 2026 RoboMac
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Problem: -error may be parsed after the errors have been suppressed.
// Perhaps also should have levels it does/does not apply to.
import (
"archive/tar"
"archive/zip"
"bufio"
"bytes"
"compress/gzip"
_ "embed"
"errors"
"fmt"
"io"
"io/fs"
"math"
"os"
"os/exec"
"path/filepath"
"regexp"
"runtime"
"slices"
"sort"
"strings"
"time"
"github.com/bodgit/sevenzip"
"github.com/gobwas/glob"
)
/* Potential Enhancements: Allow defining the type sort order. mdfind integration on the mac, for wider file type support. */
/* PDF Notes: None of the Go-based PDF libraries worked on newer PDF files, so using pdftotext */
// Run tests with go test ./...
// DO NOT DELETE THIS "COMMENT"; it includes the file.
//
//go:embed dirhelp.txt
var helptext string
const versionDate = "2026-04-15.1"
const (
COLUMN_DATEMODIFIED = "m"
COLUMN_DATECREATED = "c"
COLUMN_DATEACCESSED = "a"
COLUMN_FILESIZE = "s"
COLUMN_MODE = "p" // for permissions
COLUMN_NAME = "n" // filename
COLUMN_LINK = "l" // e.g. symlink target
COLUMN_PATH = "f" // Path == folders, especially for dir <mask> -r
)
var columnDef = "p m (c) s nl" // See above. Spaces and parens, etc, are relevant. This is the default.
type sortfield string
type sortorder struct {
field sortfield
ascending bool
}
type sizeformat int
type Attributes string
type InclusionMod string
type searchtype int
type ArchiveType int
type Filetype int
// Sort keys, size-display formats, text-search modes and archive kinds.
const (
SORT_NAME sortfield = "n"
SORT_DATE sortfield = "d" // Sort by last modified. (Which is "m" in columns)
SORT_CREATED sortfield = COLUMN_DATECREATED
SORT_ACCESSED sortfield = COLUMN_DATEACCESSED
SORT_SIZE sortfield = "s"
SORT_TYPE sortfield = "e" // Uses mod and knowledge of extensions to group, e.g. image, archive, code, document
SORT_EXT sortfield = "x" // Extension in DOS
SORT_NATURAL sortfield = "o" // Don't sort
SIZE_NATURAL sizeformat = 0 // Sizes as unformatted bytes
SIZE_SEPARATOR sizeformat = 1 // Sizes formatted with localconv non-monetary separator
SIZE_QUANTA sizeformat = 2 // Sizes formatted with units/quanta - e.g. GB, TB...
SEARCH_NONE searchtype = 0
SEARCH_CASE searchtype = 1
SEARCH_NOCASE searchtype = 2
SEARCH_REGEX searchtype = 3 // Technically, all but none become REGEX, with NOCASE being modified.
PROGRAM_NOT_FOUND = "program not found"
// NOTE: iota is the index of the ConstSpec within this block, so ARCHIVE_NA is 16 (not 0)
// and the three kinds below continue from 17. These four are untyped integer constants;
// they are not declared with the ArchiveType type.
ARCHIVE_NA = iota
ARCHIVE_ZIP
ARCHIVE_TGZ
ARCHIVE_7Z
)
// File type categories. Filetype.String() indexes a name array by these values,
// so the order here and the order of that array must stay in step.
const (
// Filetypes
NONE Filetype = iota // starts at 0, also used for reset
AUDIO // 1 ...
ARCHIVE
IMAGE
VIDEO
DOCUMENT // Enhanced start here
DATA
CONFIG
CODE
DIRECTORY // No extensions
EXECUTABLE
SYMLINK // No extensions
HIDDEN // Prefix, not suffix. Matches DEFAULT unless set otherwise. Last so other types override on colors.
DEFAULT
)
// String returns the human-readable name of the file type category.
// A value outside the declared Filetype range yields "Filetype(n)" instead of
// panicking with an index-out-of-range error.
func (ft Filetype) String() string {
	names := [...]string{"None", "Audio", "Archive", "Image", "Video", "Document", "Data", "Configuration", "Source Code", "Directory", "Executable", "SymLink", "Hidden", "Default"}
	if ft < 0 || int(ft) >= len(names) {
		return fmt.Sprintf("Filetype(%d)", int(ft))
	}
	return names[ft]
}
// Notes: See https://docs.fileformat.com for a great list. Some are value judgements.
// Each value is a comma-delimited extension list with a leading and a trailing comma
// (presumably so a lookup can search for ",ext," without partial matches - confirm against the lookup code).
// NOTE(review): "ts" is listed under both VIDEO and CODE; which one wins depends on the lookup order.
// NOTE(review): "confit" under CONFIG looks like a typo (perhaps "conf" or "config") - confirm before changing.
var Extensions = map[Filetype]string{
AUDIO: ",aac,au,flac,mid,midi,mka,mp3,mpc,ogg,ra,wav,axa,oga,opus,spx,xspf,",
ARCHIVE: ",7z,ace,apk,arj,bz,bz2,cpio,deb,dmg,dz,gz,jar,lz,lzh,lzma,msi,rar,rpm,rz,tar,taz,tbz,tbz2,tgz,tlz,txz,tz,xz,z,Z,zip,zoo,",
IMAGE: ",bmp,cgm,dib,dl,emf,gif,gl,jpeg,jpg,mng,pbm,pcx,pdn,pgm,png,ppm,svg,svgz,tga,tif,tiff,xbm,xcf,xpm,xwd,",
VIDEO: ",3g2,3gp,anx,asf,avi,axv,flc,fli,flv,m2ts,m2v,m4v,mkv,mov,mp4,mp4v,mpeg,mpg,mts,nuv,ogm,ogv,ogx,qt,rm,rmvb,ts,vob,webm,wmv,yuv,",
// The following are "Enhanced" options.
DOCUMENT: ",doc,docx,ebk,epub,html,htm,markdown,mbox,mbp,md,mht,mhtml,mobi,msg,odt,ofx,one,pages,pdf,ppt,pptx,ps,pub,rtf,tex,txt,vsdx,xls,xlsx,",
DATA: ",cdb,csv,dat,db3,dbf,graphql,json,log,m3u8,rpt,sdf,sql,xml,",
CONFIG: ",adp,ant,cfg,confit,ini,prefs,rc,tcl,yaml,",
CODE: ",ahk,applescript,asm,au3,bas,bash,bat,c,cmake,cmd,coffee,cpp,cs,cxx,dockerfile,elf,es,exe,go,gradle,groovy,gvy,h,hpp,hxx,inc,ino,java,js,kt,ktm,kts,lua,m,mak,mm,perl,ph,php,pl,pp,ps1,psm1,py,rake,rb,rbw,rbuild,rbx,rs,ru,ruby,scpt,sh,ts,tsx,v,vb,vbs,vhd,vhdl,zsh,",
}
// Could use a slice here, since it's indexing in by int, but naming the spots makes it clearer.
var FileTypeSortOrder = map[Filetype]int{DIRECTORY: 0, HIDDEN: 1, NONE: 2, DEFAULT: 3, CODE: 4, EXECUTABLE: 5, CONFIG: 6,
DATA: 7, DOCUMENT: 8, AUDIO: 9, IMAGE: 10, VIDEO: 11, ARCHIVE: 12}
// By convention, but not typically part of LS_COLORS, archives are bold red, audio is cyan, media and some others are bold magenta.
// Colors that get mapped to extensions.
// 00=none, 01=bold, 04=underscore, 05=blink, 07=reverse, 08=concealed.
// FG: 30=black, 31=red, 32=green, 33=yellow, 34=blue, 35=magenta, 36=cyan, 37=white,
// BG: 40=black 41=red 42=green 43=yellow 44=blue 45=magenta 46=cyan 47=white
var FileColors = map[Filetype]string{
NONE: "0", DIRECTORY: "1;36", DEFAULT: "37",
EXECUTABLE: "31", SYMLINK: "35", ARCHIVE: "01;31", IMAGE: "01;35", VIDEO: "01;34", AUDIO: "00;36",
// Extensions
DOCUMENT: "01;32", DATA: "32", CONFIG: "01;37", CODE: "01;34",
}
var ( // Runtime configuration
show_errors = false
debug_messages = false
progress_messages = false // For long searches / recursions.
bare bool = false // Only print filenames
include_path = false // Turn on in bare+ mode
sortby = sortorder{SORT_NAME, true}
directories_first = true
listdirectories bool = true
listfiles bool = true
listInArchives bool = false
listhidden bool = true
only_executables bool = false // Set by -ax, limit to executable files only
listFoundText bool = false // Set by -ct, for list found text. Also implies find ALL matches in a file.
directory_header bool = true // Print name of directory. Usually with size_calculations
pathIsArchive bool = false
size_calculations bool = true // Print directory byte totals
recurse_directories bool = false
mindate time.Time // Filter for min/max date, requires minmaxdatetype
maxdate time.Time
minmaxdatetype string = "m" // May be m = modified, a = accessed, c = created. Only one is allowed.
minsize int64 = -1
maxsize int64 = math.MaxInt64
matcher glob.Glob
start_directory string
file_mask string
filenameParsed bool = false
namePadding int = 0
haveGlobber = false
case_sensitive bool = false
exclude_exts []string // Upper-case list of extensions to ignore.
include_exts []string // Upper-case list of extensions to include. If set, others are excluded.
exclude_dir_globs []glob.Glob // Compiled glob patterns for directories to exclude.
filesizes_format sizeformat = SIZE_NATURAL
use_colors bool = false
use_enhanced_colors bool = true // only applies if use_colors is on.
show_column_headers bool = false // Show column headers (field names) defaults off. If on, only applies to beginning of each dir.
text_search_type searchtype = SEARCH_NONE
text_regex *regexp.Regexp
PdftotextPath string = "*" // Uninitialized
TotalFiles int
TotalBytes int64
ColumnOrder string = ""
pw7zip string = ""
skipArchiveEntryMask bool = false // If true, do not apply outer file mask to files inside an archive.
)
// ternaryString selects between two strings: s1 when condition holds, s2 otherwise.
func ternaryString(condition bool, s1 string, s2 string) string {
	chosen := s2
	if condition {
		chosen = s1
	}
	return chosen
}
/******* HANDLING COLORS *******/
/* General description of the LS_COLORS format: It is a two-letter index and up to three digits separated by semicolons.
Style;foreground color; background color. They occupy different numeric spaces.
Style: 00=none, 01=bold, 04=underscore, 05=blink, 07=reverse, 08=concealed.
Color: 30=black, 31=red, 32=green, 33=yellow, 34=blue, 35=magenta, 36=cyan, 37=white.
*/
// colorSetString builds the ANSI escape sequence that switches the terminal to the
// color configured for ftype in FileColors. A type with no (or an empty) entry
// uses the DEFAULT entry instead.
func colorSetString(ftype Filetype) string {
	code := FileColors[ftype]
	if code == "" {
		code = FileColors[DEFAULT]
	}
	return fmt.Sprintf("\033[%sm", code)
}
// mapColors reads the LS_COLORS environment variable and copies the entries it
// recognises into FileColors. Colors are switched on when LS_COLORS holds more
// than six characters, or always on Windows.
func mapColors() {
	lscolors := os.Getenv("LS_COLORS")
	if runtime.GOOS == "windows" || len(lscolors) > 6 {
		use_colors = true
	}

	// Two-letter LS_COLORS keys that have a counterpart among our file types.
	// Unknown keys are ignored, so the NONE ("reset") entry is never overwritten.
	keyToType := map[string]Filetype{
		"ac": ARCHIVE,
		"au": AUDIO,
		"di": DIRECTORY,
		"ex": EXECUTABLE,
		"fi": DEFAULT,
		"im": IMAGE,
		"vi": VIDEO,
		"ln": SYMLINK,
	}

	for _, directive := range strings.Split(lscolors, ":") {
		parts := strings.Split(directive, "=")
		if len(parts) < 2 {
			continue
		}
		if ft, known := keyToType[parts[0]]; known {
			FileColors[ft] = parts[1]
		}
	}
}
// resolveCommand locates an external helper program and returns its path, or ""
// when it cannot be found. It looks first in the directory holding this
// executable, then falls back to the PATH search done by exec.LookPath.
// PDFText calls it once to find pdftotext and caches the result.
func resolveCommand(cmd string) string {
	// First choice: a copy sitting next to our own binary.
	var dir string
	if exe, exeErr := os.Executable(); exeErr == nil {
		dir = filepath.Dir(exe)
	}
	candidate := filepath.Join(dir, cmd)
	_, statErr := os.Stat(candidate)
	if statErr == nil {
		conditionalPrint(debug_messages, "Found "+cmd+" at "+candidate+".\n")
		return candidate
	}
	conditionalPrint(debug_messages, "No "+cmd+" at "+candidate+".\n")
	if !errors.Is(statErr, os.ErrNotExist) {
		// Something is there, but it could not be inspected.
		conditionalPrint(show_errors, "Found but could not open %s: %s\n", cmd, statErr.Error())
	}

	// Second choice: whatever the PATH offers.
	located, lookErr := exec.LookPath(cmd)
	if lookErr != nil {
		conditionalPrint(debug_messages, "No "+cmd+" at "+located+".\n")
		return ""
	}
	conditionalPrint(debug_messages, "Found "+cmd+" at "+located+".\n")
	return located
}
// fileCheckMeetsConditions is a convenience wrapper for use inside conditions.
// It runs fileMeetsConditions with text searching enabled and, only when the
// item qualifies, stores the text reported by that call in *foundText.
func fileCheckMeetsConditions(target fileitem, foundText *string) bool {
	matched, text := fileMeetsConditions(target, false)
	if !matched {
		return false
	}
	*foundText = text
	return true
}
// archiveNameMatchesMask reports whether name matches the compiled file mask.
// Without a mask (haveGlobber false) nothing matches. Unless case_sensitive is
// set, the name is upper-cased before it is handed to the matcher.
func archiveNameMatchesMask(name string) bool {
	if haveGlobber {
		candidate := name
		if !case_sensitive {
			candidate = strings.ToUpper(name)
		}
		return matcher.Match(candidate)
	}
	return false
}
// fileMeetsConditions decides whether target passes every active filter:
// file/directory selection, extension include/exclude lists, hidden files,
// date range, size range, the file mask, the text search and the
// executables-only switch.
//
// noTextSearch skips the text search even when one is configured (used for the
// first pass over 7z entries). isFound reports the verdict; foundText carries
// the matched excerpts when listFoundText is on and a text search ran.
func fileMeetsConditions(target fileitem, noTextSearch bool) (isFound bool, foundText string) {
	if (!listdirectories) && target.IsDir {
		return false, foundText
	}
	if (!listfiles) && !target.IsDir {
		return false, foundText
	}
	if len(exclude_exts) > 0 && slices.Contains(exclude_exts, target.Extension()) {
		return false, foundText
	}
	if len(include_exts) > 0 && !slices.Contains(include_exts, target.Extension()) {
		return false, foundText
	}
	filename := target.Name
	// HasPrefix instead of filename[0]: an empty name must not panic.
	if (!listhidden) && strings.HasPrefix(filename, ".") {
		return false, foundText
	}
	// Check date ranges - there are three possibilities
	if !mindate.IsZero() {
		switch minmaxdatetype {
		case "m":
			if target.Modified.Before(mindate) {
				return false, foundText
			}
		case "c":
			if target.Created.Before(mindate) {
				return false, foundText
			}
		default:
			if target.Accessed.Before(mindate) { // Default a
				return false, foundText
			}
		}
	}
	if !maxdate.IsZero() {
		switch minmaxdatetype {
		case "m":
			if target.Modified.After(maxdate) {
				return false, foundText
			}
		case "c":
			if target.Created.After(maxdate) {
				return false, foundText
			}
		default:
			if target.Accessed.After(maxdate) { // Default a
				return false, foundText
			}
		}
	}
	if target.Size < minsize || target.Size > maxsize {
		return false, foundText
	}
	// Without a globber every name passes. Archive entries skip the mask when
	// skipArchiveEntryMask is set.
	if haveGlobber && !(target.InArchive && skipArchiveEntryMask) {
		testString := ternaryString(case_sensitive, filename, strings.ToUpper(filename))
		if !matcher.Match(testString) {
			return false, foundText
		}
	}
	t_ext := target.Extension()
	// Only text search if there is one and it isn't overridden. It's overridden for 7z initial checks.
	if (text_search_type != SEARCH_NONE) && !noTextSearch {
		if target.IsDir {
			return false, foundText
		}
		// If caller targeted archive container names (e.g. encrypted.7z), keep those
		// containers in the root listing and apply text filtering to the archive entries.
		if listInArchives && !target.InArchive && target.IsArchive() && archiveNameMatchesMask(target.Name) {
			return true, foundText
		}
		if target.InArchive {
			return archiveFileTextSearch(target)
		} else if t_ext == "DOCX" || t_ext == "PPTX" || t_ext == "XLSX" || t_ext == "VSDX" {
			conditionalPrint(progress_messages, "Embedded Zip text search on %s.\n", target.Name)
			embeddedFiles, err := filesInZipArchive(filepath.Join(target.Path, target.Name), false)
			if err != nil {
				conditionalPrint(show_errors, "Could not unzip %s: %s\n", target.Name, err.Error())
				return false, foundText
			}
			found := false
			// In a docx or office file, the "files" are things like "word/theme/theme1.xml", "word/document.xml",
			// "word/_rels/document.xml.rels", "_rels/.rels", "[Content_Types].xml", settings.xml, etc.
			for _, f := range embeddedFiles.MatchedFiles {
				var data []byte
				data, err = extractZipFileBytes(f.Path, f.Name, 0, int(f.Size))
				if !listFoundText { // Stop at the first matching file
					found = text_regex.Match(data)
					if found {
						break
					}
				} else { // Do all files
					newfound, newFoundText := matchTextBuffer(text_regex, data, listFoundText)
					if newfound {
						found = true
						foundText += newFoundText
					}
				}
			}
			// err holds the result of the last extraction only; when that one failed, try brute force.
			if err != nil {
				found, foundText = diskFileTextSearch(target)
			}
			if !found {
				return false, foundText
			}
			// We want to fall through to brute-force on any error. Error may be PROGRAM_NOT_FOUND
		} else if s, e := PDFText(filepath.Join(target.Path, target.Name), false); e == nil {
			if !listFoundText { // Could just call matchTextBuffer as is, but for speed...
				if !text_regex.Match([]byte(s)) {
					return false, foundText
				}
			} else {
				// Returns directly: the executables-only check below is not applied on this path.
				return matchTextBuffer(text_regex, []byte(s), listFoundText)
			}
		} else {
			var f bool
			f, foundText = diskFileTextSearch(target)
			if !f {
				return false, foundText
			}
		}
	}
	// Check for executable flag on *nix if set.
	if only_executables {
		info, err := os.Stat(filepath.Join(target.Path, target.Name))
		if err != nil {
			return false, foundText
		}
		mode := info.Mode()
		if runtime.GOOS != "windows" {
			if mode&0111 == 0 {
				return false, foundText
			}
		} else {
			// On Windows, check for .exe, .bat, .cmd, .com extensions
			ext := strings.ToUpper(target.Extension())
			if ext != "EXE" && ext != "BAT" && ext != "CMD" && ext != "COM" {
				return false, foundText
			}
		}
	}
	return true, foundText
}
// PDFText runs the external pdftotext utility on the file and returns the text
// it writes to stdout.
//
// Unless ignoreExtension is set, a path whose extension is not "pdf" (in any
// case) is rejected with an error. The utility is located on first use via
// resolveCommand and the result is cached in PdftotextPath: "*" means not yet
// looked up, "" means the lookup failed, in which case the error text is
// PROGRAM_NOT_FOUND. An error is also returned when the utility fails to run.
func PDFText(filepath string, ignoreExtension bool) (string, error) {
	if !ignoreExtension {
		dot := strings.LastIndex(filepath, ".")
		if strings.ToUpper(filepath[dot+1:]) != "PDF" {
			return "", errors.New("not a pdf file")
		}
	}

	switch PdftotextPath {
	case "":
		// An earlier lookup already came up empty.
		return "", errors.New(PROGRAM_NOT_FOUND)
	case "*":
		// First use: find the utility now.
		tool := "pdftotext"
		if runtime.GOOS == "windows" {
			tool = "pdftotext.exe"
		}
		PdftotextPath = resolveCommand(tool)
		if PdftotextPath == "" {
			conditionalPrint(debug_messages, "Could not find pdftotext. PDF text will not be found.\n")
			conditionalPrint(show_errors, "Could not find pdftotext. PDF text will not be found.\n")
			return "", errors.New(PROGRAM_NOT_FOUND)
		}
	}

	// A trailing "-" tells pdftotext to write to stdout.
	var stdout, stderr bytes.Buffer
	cmd := exec.Command(PdftotextPath, filepath, "-")
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr
	if runErr := cmd.Run(); runErr != nil {
		conditionalPrint(debug_messages, "Could not run pdftotext on "+filepath+"; "+runErr.Error()+"\n")
		return "", errors.New("could not run pdftotext on " + filepath + "; " + runErr.Error())
	}
	if stderr.Len() > 0 {
		fmt.Printf("Got errors: %s on file %s\n", stderr.String(), filepath)
	}
	return stdout.String(), nil
}
// archiveFileBytes loads the contents of one archive entry into memory.
// target.Path names the archive and target.Name the entry inside it. Entries
// larger than 1,000,000 bytes and archive types without an extractor are
// rejected with an error.
func archiveFileBytes(target fileitem) ([]byte, error) {
	if target.Size > 1000000 {
		return nil, errors.New("archive entry too large")
	}
	conditionalPrint(progress_messages, "- Recursing into archive file: "+target.Path+"/"+target.Name+"\n")

	// Pick the extractor that matches the archive's extension.
	var extract func(string, string, int, int) ([]byte, error)
	switch FileIsArchiveType(target.Path) {
	case ARCHIVE_ZIP:
		extract = extractZipFileBytes
	case ARCHIVE_7Z:
		extract = extract7ZFileBytes
	case ARCHIVE_TGZ:
		extract = extractTgzFileBytes
	default:
		return nil, errors.New("unsupported archive type")
	}

	contents, err := extract(target.Path, target.Name, 0, int(target.Size))
	if err != nil {
		return nil, err
	}
	return contents, nil
}
// archiveFileTextSearchFromData searches the already-loaded contents of an
// archive entry for text_regex and returns whether it matched plus, when
// listFoundText is on, the matched excerpts.
//
// Plain entries are searched directly. Office (DOCX/PPTX/XLSX/VSDX) and PDF
// entries are first written to a temporary file so they can be unzipped or
// passed to pdftotext. If the temporary file cannot be created or written, the
// raw bytes are searched instead, so a truncated file is never searched.
func archiveFileTextSearchFromData(target fileitem, data []byte) (bool, string) {
	t_ext := target.Extension()
	isOffice := t_ext == "DOCX" || t_ext == "PPTX" || t_ext == "XLSX" || t_ext == "VSDX"
	if !isOffice && t_ext != "PDF" {
		return matchTextBuffer(text_regex, data, listFoundText)
	}

	pfile, err := os.CreateTemp("", sanitizeTempPattern(target.Name))
	if err != nil {
		// Temp file creation failed.
		conditionalPrint(show_errors, "Could not create temp file for text search on %s: %s\n", target.Name, err.Error())
		return matchTextBuffer(text_regex, data, listFoundText)
	}
	pfilename := pfile.Name()
	defer os.Remove(pfilename)

	_, err = pfile.Write(data)
	if closeErr := pfile.Close(); err == nil {
		err = closeErr
	}
	if err != nil {
		conditionalPrint(show_errors, "Could not write temp file for text search on %s: %s\n", target.Name, err.Error())
		return matchTextBuffer(text_regex, data, listFoundText)
	}

	if t_ext == "PDF" {
		s, e := PDFText(pfilename, true)
		if e != nil {
			return false, ""
		}
		return matchTextBuffer(text_regex, []byte(s), listFoundText)
	}

	// Office files are zip containers: search each embedded file.
	embeddedFiles, err := filesInZipArchive(pfilename, false)
	if err != nil {
		return false, ""
	}
	found := false
	var allFoundText strings.Builder
	for _, f := range embeddedFiles.MatchedFiles {
		entry, err := extractZipFileBytes(f.Path, f.Name, 0, int(f.Size))
		if err != nil {
			continue
		}
		newfound, newFoundText := matchTextBuffer(text_regex, entry, listFoundText)
		if !newfound {
			continue
		}
		if !listFoundText {
			return true, "" // The first hit is enough when excerpts are not wanted.
		}
		found = true
		allFoundText.WriteString(newFoundText)
	}
	return found, allFoundText.String()
}
// sanitizeTempPattern turns an archive entry name into a pattern usable with
// os.CreateTemp. Entry names may contain nested path segments, and the pattern
// must not contain separators, so "/" and "\" become "_". A name that is empty
// or just "." after trimming whitespace yields the generic "archive-entry-*".
func sanitizeTempPattern(name string) string {
	flattened := strings.NewReplacer("/", "_", "\\", "_").Replace(name)
	switch flattened = strings.TrimSpace(flattened); flattened {
	case "", ".":
		return "archive-entry-*"
	}
	return flattened
}
// archiveFileTextSearch loads one archive entry and searches its contents.
// An entry that cannot be loaded counts as "not found".
func archiveFileTextSearch(target fileitem) (bool, string) {
	if data, err := archiveFileBytes(target); err == nil {
		return archiveFileTextSearchFromData(target, data)
	}
	return false, ""
}
// diskFileTextSearch searches a file on disk for text_regex, reading it in
// chunks of 20,000 bytes so large files are never held in memory whole.
//
// It returns true if the text was found, and false on error or when there is no
// match. With listFoundText set, every chunk is searched and the excerpts
// reported by matchTextBuffer are concatenated into the second result;
// otherwise the search stops at the first match and the string is empty.
//
// Chunk boundaries: normally the last 400 bytes of each window are carried to
// the front of the next one, so a match straddling a boundary is still seen.
// When collecting excerpts, the window is instead cut after its last newline
// (if that leaves at most 400 bytes over) and the partial line is carried
// unsearched, so no line is reported twice. Only when a line is too long for
// that does the fixed overlap apply, and a match inside it may then be listed
// twice.
func diskFileTextSearch(target fileitem) (bool, string) {
	const (
		chunkSize   = 20000
		overlapSize = 400
	)

	file, err := os.Open(filepath.Join(target.Path, target.Name))
	if err != nil {
		conditionalPrint(show_errors, "Could not open file for text search: %s - %s\n", target.Name, err.Error())
		return false, ""
	}
	defer file.Close()

	textFound := false
	var allFoundText strings.Builder
	buffer := make([]byte, overlapSize+chunkSize)
	carried := 0             // bytes at the front of buffer kept from the previous window
	carriedSearched := false // whether those carried bytes were already searched

	for {
		// ReadFull only comes back short at end of file, so a short read is a reliable EOF signal.
		n, readErr := io.ReadFull(file, buffer[carried:carried+chunkSize])
		atEOF := readErr == io.EOF || readErr == io.ErrUnexpectedEOF
		if readErr != nil && !atEOF {
			conditionalPrint(show_errors, "Could not read file for text search: %s - %s\n", target.Name, readErr.Error())
			return false, allFoundText.String()
		}
		if n == 0 && carriedSearched {
			break // Nothing new, and the carried tail was already searched.
		}

		// Only the bytes actually present are searched, never stale buffer contents.
		window := buffer[:carried+n]
		searchEnd := len(window)
		keepFrom := len(window) - overlapSize
		if keepFrom < 0 {
			keepFrom = 0
		}
		lineAligned := false
		if listFoundText && !atEOF {
			if nl := bytes.LastIndexByte(window, '\n'); nl >= 0 && nl+1 >= keepFrom {
				searchEnd, keepFrom, lineAligned = nl+1, nl+1, true
			}
		}

		if listFoundText {
			if found, excerpts := matchTextBuffer(text_regex, window[:searchEnd], true); found {
				textFound = true
				allFoundText.WriteString(excerpts)
			}
		} else if text_regex.Match(window) {
			return true, ""
		}

		if atEOF {
			break
		}
		carried = copy(buffer, window[keepFrom:])
		carriedSearched = !lineAligned
	}
	return textFound, allFoundText.String()
}
// matchTextBuffer scans a buffer line by line for matches to the regex.
//
// If findAll is false it returns (true, "") at the first matching line.
// If findAll is true it returns one excerpt per match, each on its own line:
// the match plus up to 5 bytes before it and up to 60 bytes after it, clipped
// to the line. Offsets are byte offsets, so an excerpt may begin or end in the
// middle of a multi-byte character.
//
// The scanner's token limit is raised to the buffer length. With the default
// 64 KiB limit, one long line (typical of Office XML) would stop the scan
// silently and hide every match from that line onward.
func matchTextBuffer(regex *regexp.Regexp, buffer []byte, findAll bool) (bool, string) {
	var result strings.Builder
	found := false
	scanner := bufio.NewScanner(bytes.NewReader(buffer))
	// No line can be longer than the buffer itself, so the scan cannot fail with ErrTooLong.
	scanner.Buffer(make([]byte, 0, 4096), len(buffer)+1)
	for scanner.Scan() {
		line := scanner.Text()
		matches := regex.FindAllStringIndex(line, -1)
		if len(matches) == 0 {
			continue
		}
		if !findAll {
			return true, ""
		}
		found = true
		for _, loc := range matches {
			lineStart := loc[0] - 5
			if lineStart < 0 {
				lineStart = 0
			}
			lineEnd := loc[1] + 60
			if lineEnd > len(line) {
				lineEnd = len(line)
			}
			result.WriteString(line[lineStart:lineEnd])
			result.WriteString("\n")
		}
	}
	return found, result.String()
}
// ListingSet collects the result of listing one directory or archive:
// the matched items plus running counts and byte totals.
type ListingSet struct {
// Matched files, to sort/format
Subdirs []string // Subdirectories to recurse through
Archives []string // NOTE(review): presumably archives found in this listing - not populated in the code visible here
MatchedFiles []fileitem // Items that passed the filters
Filecount int // Number of matched non-directory items
Directorycount int // Number of matched directories
Bytesfound int64 // Sum of the sizes of the matched non-directory items
}
// extractZipFileBytes returns up to length bytes of the entry named filename
// inside the zip archive zippath, starting offset bytes into the uncompressed
// entry.
//
// The result is shorter than length when the entry ends first. An error is
// returned when the archive cannot be opened, the entry does not exist (the
// error wraps fs.ErrNotExist), offset lies beyond the end of the entry, or
// decompression fails. Zip entries cannot be seeked, so the bytes before
// offset are read and discarded.
func extractZipFileBytes(zippath string, filename string, offset int, length int) ([]byte, error) {
	if offset < 0 || length < 0 {
		return nil, fmt.Errorf("invalid offset %d or length %d for %s", offset, length, filename)
	}
	zipReader, err := zip.OpenReader(zippath)
	if err != nil {
		if show_errors {
			fmt.Printf("Error: Could not open %s. %s\n", filename, err.Error())
		}
		return nil, err
	}
	defer zipReader.Close()

	for _, fileInZip := range zipReader.File {
		if fileInZip.Name != filename {
			continue
		}
		readCloser, err := fileInZip.Open()
		if err != nil {
			return nil, err
		}
		defer readCloser.Close()

		if offset > 0 {
			if _, err := io.CopyN(io.Discard, readCloser, int64(offset)); err != nil {
				return nil, err
			}
		}
		// One Read call may deliver only part of the data; ReadFull keeps reading
		// until the buffer is full or the entry ends.
		buffer := make([]byte, length)
		n, err := io.ReadFull(readCloser, buffer)
		if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
			return nil, err
		}
		return buffer[:n], nil
	}
	return nil, fmt.Errorf("%s not found in %s: %w", filename, zippath, fs.ErrNotExist)
}
// extract7ZFileBytes returns up to length bytes of the entry named filename
// inside the 7z archive zippath, starting offset bytes into the uncompressed
// entry. The archive is opened with the password in pw7zip.
//
// The result is shorter than length when the entry ends first. An error is
// returned when the archive cannot be opened (including a wrong password), the
// entry does not exist (the error wraps fs.ErrNotExist), offset lies beyond the
// end of the entry, or reading fails. 7z does not support seeking, so the bytes
// before offset are read and discarded.
func extract7ZFileBytes(zippath string, filename string, offset int, length int) ([]byte, error) {
	if offset < 0 || length < 0 {
		return nil, fmt.Errorf("invalid offset %d or length %d for %s", offset, length, filename)
	}
	zipReader, err := sevenzip.OpenReaderWithPassword(zippath, pw7zip)
	if err != nil {
		if show_errors {
			var re *sevenzip.ReadError
			if errors.As(err, &re) && re.Encrypted {
				fmt.Printf("Error: Invalid password for %s.\n", filename)
			} else {
				fmt.Printf("Error: Could not open %s. %s\n", filename, err.Error())
			}
		}
		return nil, err
	}
	// Release the archive handle on every return path.
	defer zipReader.Close()

	for _, fileInZip := range zipReader.File {
		if fileInZip.Name != filename {
			continue
		}
		readCloser, err := fileInZip.Open()
		if err != nil {
			return nil, err
		}
		defer readCloser.Close()

		if offset > 0 {
			if _, err := io.CopyN(io.Discard, readCloser, int64(offset)); err != nil {
				return nil, err
			}
		}
		// One Read call may deliver only part of the data; ReadFull keeps reading
		// until the buffer is full or the entry ends.
		buffer := make([]byte, length)
		n, err := io.ReadFull(readCloser, buffer)
		if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
			conditionalPrint(show_errors, "7z Read Error %s : %s: %s\n", zippath, filename, err.Error())
			return nil, err
		}
		return buffer[:n], nil
	}
	return nil, fmt.Errorf("%s not found in %s: %w", filename, zippath, fs.ErrNotExist)
}
// extractTgzFileBytes returns up to length bytes of the entry named filename
// inside the gzip-compressed tar archive zippath, starting offset bytes into
// the entry.
//
// The result is shorter than length when the entry ends first. An error is
// returned when the archive cannot be opened or is not valid gzip/tar, the
// entry does not exist (the error wraps fs.ErrNotExist), offset lies beyond the
// end of the entry, or reading fails. The stream cannot be seeked, so earlier
// entries and the bytes before offset are read and discarded.
func extractTgzFileBytes(zippath string, filename string, offset int, length int) ([]byte, error) {
	if offset < 0 || length < 0 {
		return nil, fmt.Errorf("invalid offset %d or length %d for %s", offset, length, filename)
	}
	reportOpenFailure := func(err error) {
		if show_errors {
			fmt.Printf("Error: Could not open %s. %s\n", filename, err.Error())
		}
	}

	file, err := os.Open(zippath)
	if err != nil {
		reportOpenFailure(err)
		return nil, err
	}
	defer file.Close()
	gzReader, err := gzip.NewReader(file)
	if err != nil {
		reportOpenFailure(err)
		return nil, err
	}
	defer gzReader.Close()
	tarReader := tar.NewReader(gzReader)

	// Advance to the requested entry; running off the end means it is not there.
	for {
		head, err := tarReader.Next()
		if err == io.EOF {
			return nil, fmt.Errorf("%s not found in %s: %w", filename, zippath, fs.ErrNotExist)
		}
		if err != nil {
			return nil, err
		}
		if head.Name == filename {
			break
		}
	}

	if offset > 0 {
		if _, err := io.CopyN(io.Discard, tarReader, int64(offset)); err != nil {
			return nil, err
		}
	}
	// One Read call may deliver only part of the data; ReadFull keeps reading
	// until the buffer is full or the entry ends.
	buffer := make([]byte, length)
	n, err := io.ReadFull(tarReader, buffer)
	if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
		return nil, err
	}
	return buffer[:n], nil
}
// FileIsArchiveType classifies a file by its extension, ignoring case:
// ".zip" is ARCHIVE_ZIP, ".tgz" and ".gz" are ARCHIVE_TGZ, ".7z" is ARCHIVE_7Z,
// anything else is ARCHIVE_NA.
//
// Only a real extension (the text after the last dot of the final path element)
// counts, so an extensionless file that happens to be named "zip" or "gz" is
// not treated as an archive.
func FileIsArchiveType(filename string) ArchiveType {
	switch strings.ToLower(filepath.Ext(filename)) {
	case ".zip":
		return ARCHIVE_ZIP
	case ".tgz", ".gz":
		return ARCHIVE_TGZ
	case ".7z":
		return ARCHIVE_7Z
	}
	return ARCHIVE_NA
}
// filesInZipArchive lists the entries of the zip archive filename.
// With checkConditions set, only entries accepted by fileCheckMeetsConditions
// are kept; otherwise every entry is returned. File and directory counts and
// the byte total of kept files are accumulated in the returned ListingSet.
// An error is returned only when the archive cannot be opened.
func filesInZipArchive(filename string, checkConditions bool) (ListingSet, error) {
	var listing ListingSet
	archive, openErr := zip.OpenReader(filename)
	if openErr != nil {
		if show_errors {
			fmt.Printf("Error: Could not open %s. %s\n", filename, openErr.Error())
		}
		return listing, openErr
	}
	defer archive.Close()

	for _, entry := range archive.File {
		// Path is the archive itself and Name the entry inside it.
		item := fileitem{filename, entry.Name, int64(entry.UncompressedSize64), entry.ModTime(), time.Time{}, time.Time{},
			entry.FileInfo().IsDir(), entry.Mode(), "", true, NONE, ""}
		var foundText string
		if checkConditions && !fileCheckMeetsConditions(item, &foundText) {
			continue
		}
		item.FoundText = foundText
		listing.MatchedFiles = append(listing.MatchedFiles, item)
		if item.IsDir {
			listing.Directorycount++
			continue
		}
		listing.Filecount++
		listing.Bytesfound += item.Size
	}
	return listing, nil
}
// filesIn7ZArchive lists the entries of the 7z archive filename that pass
// fileCheckMeetsConditions, opening it with the password in pw7zip.
// This is fast when file contents are not inspected. With a text search active
// each entry is loaded separately, which is slow on large archives because 7z
// cannot seek; linearFilesIn7ZArchive is the optimized variant for that case.
// An error is returned only when the archive cannot be opened.
func filesIn7ZArchive(filename string) (ListingSet, error) {
	var listing ListingSet
	archive, openErr := sevenzip.OpenReaderWithPassword(filename, pw7zip)
	if openErr != nil {
		if show_errors {
			var readErr *sevenzip.ReadError
			if errors.As(openErr, &readErr) && readErr.Encrypted {
				fmt.Printf("Error: Invalid password for %s.\n", filename)
			} else {
				fmt.Printf("Error: Could not open %s. %s\n", filename, openErr.Error())
			}
		}
		return listing, openErr
	}
	defer archive.Close()

	for _, entry := range archive.File {
		// Path is the archive itself and Name the entry inside it.
		item := fileitem{filename, entry.Name, entry.FileInfo().Size(),
			entry.Modified, time.Time{}, time.Time{}, entry.FileInfo().IsDir(), entry.Mode(), "", true, NONE, ""}
		var foundText string
		if !fileCheckMeetsConditions(item, &foundText) {
			continue
		}
		item.FoundText = foundText
		listing.MatchedFiles = append(listing.MatchedFiles, item)
		if item.IsDir {
			listing.Directorycount++
			continue
		}
		listing.Filecount++
		listing.Bytesfound += item.Size
	}
	return listing, nil
}
/***************************************************/
/* 7z SevenZip linear optimization for large files */
/***************************************************/
// SevenZSkipMode selects what the iterator does when an entry is skipped.
type SevenZSkipMode int
const (
// SevenZSkipNoop: NOTE(review) presumably skipping does nothing - confirm in SevenZSkip.
SevenZSkipNoop SevenZSkipMode = iota
// SevenZSkipDrain: NOTE(review) presumably the entry is read and discarded - confirm in SevenZSkip.
SevenZSkipDrain
)
// SevenZIterator walks the entries of an open 7z archive one at a time.
type SevenZIterator struct {
zr *sevenzip.ReadCloser // The open archive
index int // Starts at 0 in SevenZOpenIterator; presumably the position of the next entry - confirm in SevenZNext
skipMode SevenZSkipMode // How skipped entries are handled
}
// linearFilesIn7ZArchive lists the matching files of a 7z archive in one pass over its entries.
// Each entry is first checked against the non-text conditions; its contents are read and
// searched only if it passes them and a text search is active. Directory entries are not listed.
// Returns the matches collected so far and an error if the archive cannot be opened or an
// entry cannot be read or skipped.
func linearFilesIn7ZArchive(filename string) (ListingSet, error) {
var ls ListingSet
it, err := SevenZOpenIterator(filename, pw7zip, SevenZSkipNoop)
if err != nil {
var re *sevenzip.ReadError
if show_errors {
if errors.As(err, &re) && re.Encrypted {
fmt.Printf("Error: Invalid password for %s.\n", filename)
} else {
fmt.Printf("Error: Could not open %s. %s\n", filename, err.Error())
}
}
return ls, err
}
defer it.Close()
for {
fileInZip, ok := it.SevenZNext()
if !ok {
break
}
if fileInZip.FileInfo().IsDir() {
continue // Safe to skip, because the filenames will include the folder names when we get to them.
}
var item fileitem = fileitem{filename, fileInZip.Name, fileInZip.FileInfo().Size(),
fileInZip.Modified, time.Time{}, time.Time{}, fileInZip.FileInfo().IsDir(), fileInZip.Mode(), "", true, NONE, ""}
// Check file without text search first.
meetsNonTextConditions, _ := fileMeetsConditions(item, true)
// With SEARCH_NONE a passing entry is added straight away; otherwise its contents are searched first.
if meetsNonTextConditions {
var foundText string
if text_search_type != SEARCH_NONE {
contents, err := it.SevenZReadAll(fileInZip)
if err != nil {
return ls, err
}
matched, textMatches := archiveFileTextSearchFromData(item, contents)
if !matched {
continue
}
foundText = textMatches
}
item.FoundText = foundText
ls.MatchedFiles = append(ls.MatchedFiles, item)
conditionalPrint(progress_messages, " Matched: %s\n", item.ToString())
ls.Filecount++
ls.Bytesfound += item.Size
} else {
// Entry rejected by the non-text conditions: let the iterator skip it.
if err := it.SevenZSkip(fileInZip); err != nil {
return ls, err
}
}
}
// err here is the outer variable from SevenZOpenIterator (the errors inside the loop are shadowed), so it is nil.
return ls, err
}
func SevenZOpenIterator(path string, password string, skipMode SevenZSkipMode) (*SevenZIterator, error) {
zr, err := sevenzip.OpenReaderWithPassword(path, password)
if err != nil {
return nil, err
}
return &SevenZIterator{
zr: zr,
index: 0,
skipMode: skipMode,
}, nil