Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# FSlint
Linux file system lint checker/cleaner with database support to reduce time and operations when you need to scan the same files in the same folders or subfolders more than once.

Each hashing algorithm saves the hash values of the files it calculates in a dedicated database. In addition to the hash value, it saves the file's creation and last-modification times. If either of these two parameters has changed since the last execution of FSlint, the hashing algorithm re-calculates the hash value; otherwise it considers the existing hash value of the file valid.

To accomplish this, the fslint/supprt/database script has been created and the fslint/findup script has been modified to pass the necessary information to the database script and reduce the execution time of the hash calculation. Moreover, with database support, the duplicate-file search can be stopped and re-started without re-calculating the hashes from the beginning.

The usage of the command line or GUI has not been modified, so you can use the FSlint as normal.
6 changes: 4 additions & 2 deletions doc/FAQ
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,10 @@ A. In "Advanced Search Parameters"->"Paths to exclude", add
"*/CVS" to exclude all CVS dirs for example

Q. In the GUI how do I exclude files matching a pattern?
A. In "Advanced Search Parameters"->"Extra find parameters", add
"\( ! -name COPYING* -a ! -name LICENSE \)" to exclude licence files for e.g.
A. In "Advanced Search Parameters"->"Extra find parameters", add:
\( ! -name COPYING* -a ! -name LICENSE \)
to exclude licence files, for example. See `man find` (section 'EXPRESSION')
for more guidance on the proper format for queries.

Q. From the command line how do I exclude paths matching a pattern?
A. To exclude directories: findup \( -path "*/.svn" \) -prune -o
Expand Down
13 changes: 11 additions & 2 deletions fslint/findup
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@

script_dir=$(dirname "$0") #directory of this script
script_dir=$(readlink -f "$script_dir") #Make sure absolute path
database_dir="$HOME/.fslint/databases" # Databases directory

# Quote the expansion: an unquoted $database_dir word-splits and globs if
# $HOME contains spaces or special characters.  mkdir -p is a no-op when the
# directory already exists, but keep the test to avoid the subshell/fork.
if [ ! -d "$database_dir" ]; then
    mkdir -p "$database_dir" 2> /dev/null
fi

. "$script_dir"/supprt/fslver

Expand Down Expand Up @@ -180,7 +185,8 @@ tr '\0\1\n' ' \t\0' |#reset any space & tabs etc and delimit names with \0
# even more so if they are large. This usually adds a small amount of
# runtime, however it can save a large amount of time in certain situations.
if "$script_dir"/supprt/md5sum_approx </dev/null 2>/dev/null; then
xargs -r0 "$script_dir"/supprt/md5sum_approx |
#xargs -r0 "$script_dir"/supprt/md5sum_approx |
xargs -r0 "$script_dir"/supprt/database "$database_dir"/md5sum512.sqlite "$script_dir"/supprt/md5sum_approx |
sort | #group duplicate files together
uniq --all-repeated -w32 | #pick just duplicates
cut -d' ' -f3- | #get filenames
Expand All @@ -191,7 +197,8 @@ else
fi |

# This block selects duplicates using md5sum of whole file
xargs -r0 md5sum -- | #calculate md5sums for possible duplicates
#xargs -r0 md5sum -- | #calculate md5sums for possible duplicates
xargs -r0 "$script_dir"/supprt/database "$database_dir"/md5sum.sqlite md5sum | #calculate md5sums for possible duplicates
cleanup_sum | #undo any backslash escaping
sort | #group duplicate files together
uniq --all-repeated=$sep_mode -w32 | #pick just duplicates
Expand All @@ -203,6 +210,8 @@ cut -s -d' ' -f3- | #get filenames
sort | #sort by paths to try to minimise disk seeks
tr '\n' '\0' | #delimit names with \0
xargs -r0 sha1sum -- | #to be sure to be sure
# Disabled so that this final sha1sum verification pass, run just before
# showing the results, always re-hashes the files instead of using the cache
#xargs -r0 "$script_dir"/supprt/database "$database_dir"/sha1sum.sqlite sha1sum | #to be sure to be sure
cleanup_sum | #undo any backslash escaping
sort | #group duplicate files together
uniq --all-repeated=$sep_mode -w40 | #pick just duplicates
Expand Down
115 changes: 115 additions & 0 deletions fslint/supprt/database
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#!/usr/bin/env python2
# encoding: utf8

import sys, os, subprocess
import sqlite3


# External hashing command to run per file; set from argv[2] when the script
# is executed (see the __main__ section below).
HASH_CMD = None
# When True, print diagnostic progress messages.
# NOTE(review): these messages go to stdout, which the findup pipeline
# consumes — keep disabled for normal runs.
DEBUG = False
# Number of INSERT/UPDATE statements to batch between database commits.
COMMIT_COUNT = 100

def hash_file(path):
cmd_output = subprocess.check_output([HASH_CMD, path.encode('utf-8')]) #.decode("utf-8")
hash = cmd_output.split(' ')[0]
return hash


def _get_path_files(files_list):
    """Return the distinct parent directories of the given files.

    Each entry ends with '/' so it can be used directly as a SQL LIKE
    prefix ("<dir>%").  A directory is skipped when an already-collected
    entry is a prefix of it, so a parent subsumes its subdirectories.

    Fixes the original implementation, which appended to paths_list while
    iterating over it and used an inverted substring test, producing
    duplicate entries and an unconditional debug print on stdout.
    """
    paths_list = []
    for file_path in files_list:
        dirname = "%s/" % (os.path.dirname(os.path.abspath(file_path)))
        # Only keep this directory if no previously seen path covers it.
        if not any(dirname.startswith(p) for p in paths_list):
            paths_list.append(dirname)

    if DEBUG: print("Folders: " + ", ".join(paths_list))
    return paths_list





if __name__ == "__main__":
    # Usage: database <database path> <hashing command> <file>...
    # Maintains a per-command hash cache in a SQLite database and prints one
    # "<hash>  <path>" line per input file, in md5sum-compatible format.
    if len(sys.argv) < 3:
        print("Error, provide the <database path>, <hashing command> and <files list with \\0> ")
        # Fix: the original fell through and crashed on sys.argv[1] below.
        sys.exit(1)

    db_path = sys.argv[1]
    HASH_CMD = sys.argv[2]
    files_list = [f.decode("utf-8") for f in sys.argv[3:]]

    # Open the database, creating it (and its parent directory) on first use.
    # Fix: the original print(directory) wrote to stdout, which the findup
    # pipeline consumes, corrupting the hash stream on first run.
    if not os.path.exists(db_path):
        directory = os.path.dirname(db_path)
        if directory and not os.path.exists(directory):
            os.makedirs(directory)
        db = sqlite3.connect(db_path)
        cursor = db.cursor()
        cursor.execute(''' CREATE TABLE files(id INTEGER PRIMARY KEY, path TEXT, ctime VARCHAR(50), mtime VARCHAR(50), hash TEXT ) ''')
        db.commit()
    else:
        db = sqlite3.connect(db_path)
        cursor = db.cursor()

    # Load the whole cache into memory, keyed by file path.
    # @TODO Optimize by selecting only the folders being scanned
    # (see _get_path_files).
    cursor.execute('''SELECT * FROM files ;''')
    filesdb = dict()
    for row in cursor.fetchall():
        filesdb[row[1]] = {'id': row[0], 'name': row[1],
                           'ctime': row[2], 'mtime': row[3], 'hash': row[4]}

    files_hashes = dict()
    commit_count = 0  # statements executed since the last commit
    for file_path in files_list:
        if file_path in filesdb:
            cached = filesdb[file_path]
            try:
                ctime = str(os.path.getctime(file_path))
                mtime = str(os.path.getmtime(file_path))
                if ctime == cached['ctime'] and mtime == cached['mtime']:
                    # Timestamps unchanged since the last run: trust the cache.
                    if DEBUG: print("Found %s" % (file_path))
                    files_hashes[file_path] = cached['hash']
                else:
                    if DEBUG: print("Update %s" % (file_path))
                    digest = hash_file(file_path)
                    cursor.execute('''UPDATE files SET ctime=?, mtime=?, hash=? WHERE path=?''',
                                   (ctime, mtime, digest, file_path))
                    commit_count += 1
                    files_hashes[file_path] = digest

            except Exception:
                # Best effort: stat or DB update failed — still emit a hash
                # so the pipeline keeps working; the cache is just not updated.
                files_hashes[file_path] = hash_file(file_path)

        else:
            # New file: hash it and cache the result.
            try:
                if DEBUG: print("Insert %s" % (file_path))
                digest = hash_file(file_path)
                cursor.execute('''INSERT INTO files (path, ctime, mtime, hash) VALUES (?,?,?,?)''',
                               (str(file_path), str(os.path.getctime(file_path)), str(os.path.getmtime(file_path)), digest))
                commit_count += 1
                files_hashes[file_path] = digest

            except Exception:
                files_hashes[file_path] = hash_file(file_path)

        # Commit in batches so an interrupted run keeps most of its work.
        if commit_count >= COMMIT_COUNT:
            commit_count = 0
            db.commit()

    db.commit()
    db.close()

    # Fix: two spaces between hash and name — the md5sum/sha1sum output
    # format that findup's downstream `cut -d' ' -f3-` expects; with a single
    # space, field 3 is empty and the filenames are lost.
    for file_path in files_hashes:
        sys.stdout.write("%s  %s\n" % (files_hashes[file_path], file_path.encode('utf-8')))