From e959588703c92310720f6ea26d672f202d1b7f1a Mon Sep 17 00:00:00 2001 From: Salvatore Dipietro Date: Sat, 4 Aug 2018 01:08:49 +0100 Subject: [PATCH 01/12] database support --- README.md | 8 +++++ fslint/findup | 6 ++-- fslint/supprt/database | 72 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 3 deletions(-) create mode 100644 README.md create mode 100755 fslint/supprt/database diff --git a/README.md b/README.md new file mode 100644 index 0000000..f8c976d --- /dev/null +++ b/README.md @@ -0,0 +1,8 @@ +# FSlint +Linux file system lint checker/cleaner with database support to reduce time and operations if you need to scan twice the same files in the folders or subfolders. + +Each hashing algorithm saves the hash values of the files that it calculates on a specific database. In addition to the hash value, it saves the file creation and last modification time. If one of these two parameters changed from the last execution of FSlint, the hashing algorithm re-calculates the hash value otherwise it considers the existing hash value of the file as valid. + +To accomplish it, the fslint/supprt/database file has been created and the fslint/findup file has been modified to pass the information to the database script and reduce the execution time for the hashing calculation. Moreover, with the database support, the find duplication file process can be stopped and re-started avoiding to re-calculate the hash from the beginning. + +The usage of the command line or GUI has not been modified, so you can use the FSlint as normal. \ No newline at end of file diff --git a/fslint/findup b/fslint/findup index fe8aee8..18403e0 100755 --- a/fslint/findup +++ b/fslint/findup @@ -180,7 +180,7 @@ tr '\0\1\n' ' \t\0' |#reset any space & tabs etc and delimit names with \0 # even more so if they are large. This usually adds a small amount of # runtime, however it can save a large amount of time in certain situations. 
if "$script_dir"/supprt/md5sum_approx /dev/null; then - xargs -r0 "$script_dir"/supprt/md5sum_approx | + xargs -r0 "$script_dir"/supprt/database "$script_dir"/../databases/md5sum512.sqlite "$script_dir"/supprt/md5sum_approx | sort | #group duplicate files together uniq --all-repeated -w32 | #pick just duplicates cut -d' ' -f3- | #get filenames @@ -191,7 +191,7 @@ else fi | # This block selects duplicates using md5sum of whole file -xargs -r0 md5sum -- | #calculate md5sums for possible duplicates +xargs -r0 "$script_dir"/supprt/database "$script_dir"/../databases/md5sum.sqlite md5sum | #calculate md5sums for possible duplicates cleanup_sum | #undo any backslash escaping sort | #group duplicate files together uniq --all-repeated=$sep_mode -w32 | #pick just duplicates @@ -202,7 +202,7 @@ uniq --all-repeated=$sep_mode -w32 | #pick just duplicates cut -s -d' ' -f3- | #get filenames sort | #sort by paths to try to minimise disk seeks tr '\n' '\0' | #delimit names with \0 -xargs -r0 sha1sum -- | #to be sure to be sure +xargs -r0 "$script_dir"/supprt/database "$script_dir"/../databases/sha1sum.sqlite sha1sum | #to be sure to be sure cleanup_sum | #undo any backslash escaping sort | #group duplicate files together uniq --all-repeated=$sep_mode -w40 | #pick just duplicates diff --git a/fslint/supprt/database b/fslint/supprt/database new file mode 100755 index 0000000..0e1f957 --- /dev/null +++ b/fslint/supprt/database @@ -0,0 +1,72 @@ +#!/usr/bin/env python2 +import sys, os, subprocess +import sqlite3 + +HASH_CMD = None +DEBUG = False + +def hash_file(path): + cmd_output = subprocess.check_output([HASH_CMD, path]).decode("utf-8") + hash = cmd_output.split(' ')[0] + return hash + + + + + + + + +if __name__ == "__main__": + if len(sys.argv) < 3: + print("Error, provide the , and ") + + db_path = sys.argv[1] + HASH_CMD = sys.argv[2] + files_list = sys.argv[3:] + files_hashes = dict() + + if not os.path.exists(db_path): + directory = os.path.dirname(db_path) + print(directory) + 
if not os.path.exists(directory): + os.makedirs(directory) + db = sqlite3.connect(db_path) + cursor = db.cursor() + cursor.execute(''' CREATE TABLE files(id INTEGER PRIMARY KEY, path TEXT, ctime VARCHAR(50), mtime VARCHAR(50), hash TEXT ) ''') + db.commit() + else: + db = sqlite3.connect(db_path) + cursor = db.cursor() + + + for file_path in files_list: + cursor.execute('''SELECT * FROM files WHERE path=?''', (file_path,)) + file_record = cursor.fetchone() + if file_record is not None and len(file_record) > 0: + if str(os.path.getctime(file_path)) == file_record[2] and str(os.path.getmtime(file_path)) == file_record[3]: + if DEBUG: print("Found") + files_hashes[file_path] = file_record[4] + else: + if DEBUG: print("Update") + hash = hash_file(file_path) + cursor.execute('''UPDATE files SET ctime=?, mtime=?, hash=? WHERE path=?''', + (str(os.path.getctime(file_path)), str(os.path.getmtime(file_path)), hash, file_path)) + db.commit() + files_hashes[file_path] = hash + else: + if DEBUG: print("Insert") + hash = hash_file(file_path) + cursor.execute('''INSERT INTO files (path, ctime, mtime, hash) VALUES (?,?,?,?)''', + (str(file_path), str(os.path.getctime(file_path)), str(os.path.getmtime(file_path)), hash)) + db.commit() + files_hashes[file_path] = hash + + db.commit() + db.close() + + + for file_hash in files_hashes: + sys.stdout.write("%s %s\n" % (files_hashes[file_hash], file_hash)) + + From 00a1e3d8eb27c7053116949a80e664ef8164cab2 Mon Sep 17 00:00:00 2001 From: Salvatore Dipietro Date: Sat, 4 Aug 2018 01:41:51 +0100 Subject: [PATCH 02/12] added "Execution Time" in Duplicates (GUI) --- fslint-gui | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fslint-gui b/fslint-gui index c961e5f..cff86e1 100755 --- a/fslint-gui +++ b/fslint-gui @@ -1038,6 +1038,8 @@ on your system at present.")) return (str(row) + _(" files"), pe) def findup(self, clist_dups): + import time + start_time = time.time() po, pe = self.get_fslint("./findup --gui" + 
self.findParams) numdups = 0 @@ -1088,9 +1090,10 @@ on your system at present.")) clist_dups.set_row_data(row,file[2]) #mtime row += 1 + execTime = str(time.time() - start_time) return (human_num(byteWaste,1000).strip() + 'B' + _(" wasted in ") + str(numWaste) + _(" files (in ") + str(len(alldups)) + - _(" groups)"), pe) + _(" groups).") + " Execution time: " + execTime + " sec.", pe) find_dispatch = (findup,findpkgs,findnl,findsn,findtf, findbl,findid,finded,findns,findrs) #order NB From 469b79a17d0300e44706f965a651060068fef7a4 Mon Sep 17 00:00:00 2001 From: Salvatore Dipietro Date: Wed, 8 Aug 2018 20:50:00 +0100 Subject: [PATCH 03/12] database file single select query --- fslint/supprt/database | 62 +++++++++++++++++++++++++++++++----------- 1 file changed, 46 insertions(+), 16 deletions(-) diff --git a/fslint/supprt/database b/fslint/supprt/database index 0e1f957..c76f4f5 100755 --- a/fslint/supprt/database +++ b/fslint/supprt/database @@ -1,7 +1,10 @@ #!/usr/bin/env python2 +# encoding: utf8 + import sys, os, subprocess import sqlite3 + HASH_CMD = None DEBUG = False @@ -40,26 +43,53 @@ if __name__ == "__main__": cursor = db.cursor() - for file_path in files_list: - cursor.execute('''SELECT * FROM files WHERE path=?''', (file_path,)) - file_record = cursor.fetchone() - if file_record is not None and len(file_record) > 0: - if str(os.path.getctime(file_path)) == file_record[2] and str(os.path.getmtime(file_path)) == file_record[3]: - if DEBUG: print("Found") - files_hashes[file_path] = file_record[4] - else: - if DEBUG: print("Update") + #Existing Files + if len(files_list) > 0: + cursor.execute('''SELECT * FROM files WHERE path IN ({seq})'''.format(seq=','.join(['?']*len(files_list))), ([str(f) for f in files_list])) + commit_count = 0 + for file_record in cursor.fetchall(): + file_path = str(file_record[1]) + try: + if str(os.path.getctime(file_path)) == file_record[2] and str(os.path.getmtime(file_path)) == file_record[3]: + if DEBUG: print("Found") + 
files_hashes[file_path] = file_record[4] + files_list.remove(file_path) + else: + if DEBUG: print("Update") + hash = hash_file(file_path) + cursor.execute('''UPDATE files SET ctime=?, mtime=?, hash=? WHERE path=?''', + (str(os.path.getctime(file_path)), str(os.path.getmtime(file_path)), hash, file_path)) + commit_count += 1 + files_hashes[file_path] = hash + + if commit_count == 1000: + commit_count = 0 + db.commit() + + # except UnicodeDecodeError as ude: + except Exception as e: hash = hash_file(file_path) - cursor.execute('''UPDATE files SET ctime=?, mtime=?, hash=? WHERE path=?''', - (str(os.path.getctime(file_path)), str(os.path.getmtime(file_path)), hash, file_path)) - db.commit() files_hashes[file_path] = hash - else: + + + #New files + for file_path in files_list: + file_path = str(file_path) + try: if DEBUG: print("Insert") hash = hash_file(file_path) cursor.execute('''INSERT INTO files (path, ctime, mtime, hash) VALUES (?,?,?,?)''', - (str(file_path), str(os.path.getctime(file_path)), str(os.path.getmtime(file_path)), hash)) - db.commit() + (str(file_path), str(os.path.getctime(file_path)), str(os.path.getmtime(file_path)), hash)) + commit_count += 1 + files_hashes[file_path] = hash + + if commit_count == 1000: + commit_count = 0 + db.commit() + + #except UnicodeDecodeError as ude: + except Exception as e: + hash = hash_file(file_path) files_hashes[file_path] = hash db.commit() @@ -67,6 +97,6 @@ if __name__ == "__main__": for file_hash in files_hashes: - sys.stdout.write("%s %s\n" % (files_hashes[file_hash], str(file_hash))) + sys.stdout.write("%s %s\n" % (files_hashes[file_hash], file_hash.encode('utf-8'))) From 4ba300583f4c73285756b57c02c8e5f284e33f17 Mon Sep 17 00:00:00 2001 From: Salvatore Dipietro Date: Thu, 9 Aug 2018 20:28:22 +0100 Subject: [PATCH 04/12] database file: support UTF-8 filename --- fslint/supprt/database | 55 +++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/fslint/supprt/database 
b/fslint/supprt/database index c76f4f5..46710d0 100755 --- a/fslint/supprt/database +++ b/fslint/supprt/database @@ -9,7 +9,7 @@ HASH_CMD = None DEBUG = False def hash_file(path): - cmd_output = subprocess.check_output([HASH_CMD, path]).decode("utf-8") + cmd_output = subprocess.check_output([HASH_CMD, path.encode('utf-8')]) #.decode("utf-8") hash = cmd_output.split(' ')[0] return hash @@ -26,7 +26,7 @@ if __name__ == "__main__": db_path = sys.argv[1] HASH_CMD = sys.argv[2] - files_list = sys.argv[3:] + files_list = [f.decode("utf-8") for f in sys.argv[3:]] files_hashes = dict() if not os.path.exists(db_path): @@ -35,6 +35,7 @@ if __name__ == "__main__": if not os.path.exists(directory): os.makedirs(directory) db = sqlite3.connect(db_path) + #db.text_factory = str cursor = db.cursor() cursor.execute(''' CREATE TABLE files(id INTEGER PRIMARY KEY, path TEXT, ctime VARCHAR(50), mtime VARCHAR(50), hash TEXT ) ''') db.commit() @@ -44,37 +45,35 @@ if __name__ == "__main__": #Existing Files - if len(files_list) > 0: - cursor.execute('''SELECT * FROM files WHERE path IN ({seq})'''.format(seq=','.join(['?']*len(files_list))), ([str(f) for f in files_list])) - commit_count = 0 - for file_record in cursor.fetchall(): - file_path = str(file_record[1]) - try: - if str(os.path.getctime(file_path)) == file_record[2] and str(os.path.getmtime(file_path)) == file_record[3]: - if DEBUG: print("Found") - files_hashes[file_path] = file_record[4] - files_list.remove(file_path) - else: - if DEBUG: print("Update") - hash = hash_file(file_path) - cursor.execute('''UPDATE files SET ctime=?, mtime=?, hash=? 
WHERE path=?''', - (str(os.path.getctime(file_path)), str(os.path.getmtime(file_path)), hash, file_path)) - commit_count += 1 - files_hashes[file_path] = hash - - if commit_count == 1000: - commit_count = 0 - db.commit() - - # except UnicodeDecodeError as ude: - except Exception as e: + cursor.execute('''SELECT * FROM files WHERE path IN ({seq})'''.format(seq=','.join(['?']*len(files_list))), ([f for f in files_list])) + commit_count = 0 + for file_record in cursor.fetchall(): + file_path = str(file_record[1]) + try: + if str(os.path.getctime(file_path)) == file_record[2] and str(os.path.getmtime(file_path)) == file_record[3]: + if DEBUG: print("Found") + files_hashes[file_path] = file_record[4] + files_list.remove(file_path) + else: + if DEBUG: print("Update") hash = hash_file(file_path) + cursor.execute('''UPDATE files SET ctime=?, mtime=?, hash=? WHERE path=?''', + (str(os.path.getctime(file_path)), str(os.path.getmtime(file_path)), hash, file_path)) + commit_count += 1 files_hashes[file_path] = hash + if commit_count == 1000: + commit_count = 0 + db.commit() + + # except UnicodeDecodeError as ude: + except Exception as e: + hash = hash_file(file_path) + files_hashes[file_path] = hash + #New files for file_path in files_list: - file_path = str(file_path) try: if DEBUG: print("Insert") hash = hash_file(file_path) @@ -97,6 +96,6 @@ if __name__ == "__main__": for file_hash in files_hashes: - sys.stdout.write("%s %s\n" % (files_hashes[file_hash], str(file_hash))) + sys.stdout.write("%s %s\n" % (files_hashes[file_hash], file_hash.encode('utf-8'))) From 8e923de96eb4ddf19880681cb54e7c8a25429f8a Mon Sep 17 00:00:00 2001 From: Salvatore Date: Sun, 29 Sep 2019 14:07:45 +0100 Subject: [PATCH 05/12] Update findup --- fslint/findup | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fslint/findup b/fslint/findup index 18403e0..432119b 100755 --- a/fslint/findup +++ b/fslint/findup @@ -202,7 +202,8 @@ uniq --all-repeated=$sep_mode -w32 | #pick just duplicates 
cut -s -d' ' -f3- | #get filenames sort | #sort by paths to try to minimise disk seeks tr '\n' '\0' | #delimit names with \0 -xargs -r0 "$script_dir"/supprt/database "$script_dir"/../databases/sha1sum.sqlite sha1sum | #to be sure to be sure +# Disabled to do the last test before show the results +#xargs -r0 "$script_dir"/supprt/database "$script_dir"/../databases/sha1sum.sqlite sha1sum | #to be sure to be sure cleanup_sum | #undo any backslash escaping sort | #group duplicate files together uniq --all-repeated=$sep_mode -w40 | #pick just duplicates From b11f423dc37fe8f74881f739e073bc0d76e03d1b Mon Sep 17 00:00:00 2001 From: Salvatore Date: Wed, 13 Nov 2019 10:41:11 +0000 Subject: [PATCH 06/12] fix permission issue databases folder fix permission issue databases folder --- fslint/findup | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fslint/findup b/fslint/findup index 432119b..4a10d42 100755 --- a/fslint/findup +++ b/fslint/findup @@ -55,6 +55,11 @@ script_dir=$(dirname "$0") #directory of this script script_dir=$(readlink -f "$script_dir") #Make sure absolute path +database_dir="$HOME/.fslint/databases" # Databases directory + +if [ ! -d $database_dir ]; then + mkdir -p $database_dir 2> /dev/null +fi . "$script_dir"/supprt/fslver @@ -180,7 +185,7 @@ tr '\0\1\n' ' \t\0' |#reset any space & tabs etc and delimit names with \0 # even more so if they are large. This usually adds a small amount of # runtime, however it can save a large amount of time in certain situations. 
if "$script_dir"/supprt/md5sum_approx /dev/null; then - xargs -r0 "$script_dir"/supprt/database "$script_dir"/../databases/md5sum512.sqlite "$script_dir"/supprt/md5sum_approx | + xargs -r0 "$script_dir"/supprt/database "$database_dir"/md5sum512.sqlite "$script_dir"/supprt/md5sum_approx | sort | #group duplicate files together uniq --all-repeated -w32 | #pick just duplicates cut -d' ' -f3- | #get filenames @@ -191,7 +196,7 @@ else fi | # This block selects duplicates using md5sum of whole file -xargs -r0 "$script_dir"/supprt/database "$script_dir"/../databases/md5sum.sqlite md5sum | #calculate md5sums for possible duplicates +xargs -r0 "$script_dir"/supprt/database "$database_dir"/md5sum.sqlite md5sum | #calculate md5sums for possible duplicates cleanup_sum | #undo any backslash escaping sort | #group duplicate files together uniq --all-repeated=$sep_mode -w32 | #pick just duplicates @@ -203,7 +208,7 @@ cut -s -d' ' -f3- | #get filenames sort | #sort by paths to try to minimise disk seeks tr '\n' '\0' | #delimit names with \0 # Disabled to do the last test before show the results -#xargs -r0 "$script_dir"/supprt/database "$script_dir"/../databases/sha1sum.sqlite sha1sum | #to be sure to be sure +#xargs -r0 "$script_dir"/supprt/database "$database_dir"/sha1sum.sqlite sha1sum | #to be sure to be sure cleanup_sum | #undo any backslash escaping sort | #group duplicate files together uniq --all-repeated=$sep_mode -w40 | #pick just duplicates From 49e62a5f9dbd4441eb3f05e2eec1d979c16f70bc Mon Sep 17 00:00:00 2001 From: Salvatore Dipietro Date: Sun, 17 Nov 2019 17:31:31 +0000 Subject: [PATCH 07/12] fix database support --- doc/FAQ | 6 ++++-- fslint-gui | 5 +---- fslint/findup | 3 +++ 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/doc/FAQ b/doc/FAQ index 3a191fc..463e70c 100644 --- a/doc/FAQ +++ b/doc/FAQ @@ -49,8 +49,10 @@ A. In "Advanced Search Parameters"->"Paths to exclude", add "*/CVS" to exclude all CVS dirs for example Q. 
In the GUI how do I exclude files matching a pattern? -A. In "Advanced Search Parameters"->"Extra find parameters", add - "\( ! -name COPYING* -a ! -name LICENSE \)" to exclude licence files for e.g. +A. In "Advanced Search Parameters"->"Extra find parameters", add: + \( ! -name COPYING* -a ! -name LICENSE \) + to exclude licence files, for example. See `man find` (section 'EXPRESSION') + for more guidance on the proper format for queries. Q. From the command line how do I exclude paths matching a pattern? A. To exclude directories: findup \( -path "*/.svn" \) -prune -o diff --git a/fslint-gui b/fslint-gui index cff86e1..c961e5f 100755 --- a/fslint-gui +++ b/fslint-gui @@ -1038,8 +1038,6 @@ on your system at present.")) return (str(row) + _(" files"), pe) def findup(self, clist_dups): - import time - start_time = time.time() po, pe = self.get_fslint("./findup --gui" + self.findParams) numdups = 0 @@ -1090,10 +1088,9 @@ on your system at present.")) clist_dups.set_row_data(row,file[2]) #mtime row += 1 - execTime = str(time.time() - start_time) return (human_num(byteWaste,1000).strip() + 'B' + _(" wasted in ") + str(numWaste) + _(" files (in ") + str(len(alldups)) + - _(" groups).") + " Execution time: " + execTime + " sec.", pe) + _(" groups)"), pe) find_dispatch = (findup,findpkgs,findnl,findsn,findtf, findbl,findid,finded,findns,findrs) #order NB diff --git a/fslint/findup b/fslint/findup index 4a10d42..8b4e9b5 100755 --- a/fslint/findup +++ b/fslint/findup @@ -185,6 +185,7 @@ tr '\0\1\n' ' \t\0' |#reset any space & tabs etc and delimit names with \0 # even more so if they are large. This usually adds a small amount of # runtime, however it can save a large amount of time in certain situations. 
if "$script_dir"/supprt/md5sum_approx /dev/null; then + #xargs -r0 "$script_dir"/supprt/md5sum_approx | xargs -r0 "$script_dir"/supprt/database "$database_dir"/md5sum512.sqlite "$script_dir"/supprt/md5sum_approx | sort | #group duplicate files together uniq --all-repeated -w32 | #pick just duplicates @@ -196,6 +197,7 @@ else fi | # This block selects duplicates using md5sum of whole file +#xargs -r0 md5sum -- | #calculate md5sums for possible duplicates xargs -r0 "$script_dir"/supprt/database "$database_dir"/md5sum.sqlite md5sum | #calculate md5sums for possible duplicates cleanup_sum | #undo any backslash escaping sort | #group duplicate files together @@ -207,6 +209,7 @@ uniq --all-repeated=$sep_mode -w32 | #pick just duplicates cut -s -d' ' -f3- | #get filenames sort | #sort by paths to try to minimise disk seeks tr '\n' '\0' | #delimit names with \0 +xargs -r0 sha1sum -- | #to be sure to be sure # Disabled to do the last test before show the results #xargs -r0 "$script_dir"/supprt/database "$database_dir"/sha1sum.sqlite sha1sum | #to be sure to be sure cleanup_sum | #undo any backslash escaping From b1387d0079c4c85bb393f2ee0675fb3399a9a443 Mon Sep 17 00:00:00 2001 From: Salvatore Dipietro Date: Sun, 17 Nov 2019 17:37:32 +0000 Subject: [PATCH 08/12] decrease COMMIT_COUNT --- fslint/supprt/database | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fslint/supprt/database b/fslint/supprt/database index 46710d0..954d731 100755 --- a/fslint/supprt/database +++ b/fslint/supprt/database @@ -7,6 +7,7 @@ import sqlite3 HASH_CMD = None DEBUG = False +COMMIT_COUNT = 100 def hash_file(path): cmd_output = subprocess.check_output([HASH_CMD, path.encode('utf-8')]) #.decode("utf-8") @@ -62,7 +63,7 @@ if __name__ == "__main__": commit_count += 1 files_hashes[file_path] = hash - if commit_count == 1000: + if commit_count == COMMIT_COUNT: commit_count = 0 db.commit() @@ -82,7 +83,7 @@ if __name__ == "__main__": commit_count += 1 files_hashes[file_path] = hash 
- if commit_count == 1000: + if commit_count == COMMIT_COUNT: commit_count = 0 db.commit() From d501561f09286cacbe0772b2b535c05e4bb32a8d Mon Sep 17 00:00:00 2001 From: Salvatore Dipietro Date: Tue, 19 Nov 2019 11:40:34 +0000 Subject: [PATCH 09/12] new query model and improved DEBUG --- fslint/supprt/database | 87 +++++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 40 deletions(-) diff --git a/fslint/supprt/database b/fslint/supprt/database index 954d731..d33bdde 100755 --- a/fslint/supprt/database +++ b/fslint/supprt/database @@ -15,7 +15,14 @@ def hash_file(path): return hash +def _get_path_files(files_list): + paths_list = list() + for file in files_list: + dirname = os.path.dirname(os.path.abspath(file)) + if dirname not in paths_list: + paths_list.append(dirname) + return paths_list @@ -28,7 +35,8 @@ if __name__ == "__main__": db_path = sys.argv[1] HASH_CMD = sys.argv[2] files_list = [f.decode("utf-8") for f in sys.argv[3:]] - files_hashes = dict() + paths_list = _get_path_files(files_list) + if not os.path.exists(db_path): directory = os.path.dirname(db_path) @@ -46,52 +54,51 @@ if __name__ == "__main__": #Existing Files - cursor.execute('''SELECT * FROM files WHERE path IN ({seq})'''.format(seq=','.join(['?']*len(files_list))), ([f for f in files_list])) + filesdb = dict() + if DEBUG: print('''SELECT * FROM files WHERE path LIKE {seq}'''.format(seq=' OR '.join(['?']*len(paths_list))), ([ "%s/%%" % (f) for f in paths_list])) + cursor.execute('''SELECT * FROM files WHERE path LIKE {seq}'''.format(seq=' OR '.join(['?']*len(paths_list))), ([ "%s/%%" % (f) for f in paths_list])) + for f in cursor.fetchall(): + filesdb[f[1]] = {'id': f[0], 'name': f[1], 'ctime': f[2], 'mtime': f[3], 'hash': f[4]} + + files_hashes = dict() commit_count = 0 - for file_record in cursor.fetchall(): - file_path = str(file_record[1]) - try: - if str(os.path.getctime(file_path)) == file_record[2] and str(os.path.getmtime(file_path)) == file_record[3]: - if DEBUG: 
print("Found") - files_hashes[file_path] = file_record[4] - files_list.remove(file_path) - else: - if DEBUG: print("Update") - hash = hash_file(file_path) - cursor.execute('''UPDATE files SET ctime=?, mtime=?, hash=? WHERE path=?''', - (str(os.path.getctime(file_path)), str(os.path.getmtime(file_path)), hash, file_path)) + for file in files_list: + if file in filesdb.keys(): + filedb = filesdb[file] + try: + if str(os.path.getctime(file)) == filedb['ctime'] and str(os.path.getmtime(file)) == filedb['mtime']: + if DEBUG: print("Found %s" % (file)) + files_hashes[file] = filedb['hash'] + else: + if DEBUG: print("Update %s" % (file)) + hash = hash_file(file) + cursor.execute('''UPDATE files SET ctime=?, mtime=?, hash=? WHERE path=?''', + (str(os.path.getctime(file)), str(os.path.getmtime(file)), hash, file)) + commit_count += 1 + files_hashes[file] = hash + + except Exception as e: + hash = hash_file(file) + files_hashes[file] = hash + + else: + #New files + try: + if DEBUG: print("Insert %s" % (file)) + hash = hash_file(file) + cursor.execute('''INSERT INTO files (path, ctime, mtime, hash) VALUES (?,?,?,?)''', + (str(file), str(os.path.getctime(file)), str(os.path.getmtime(file)), hash)) commit_count += 1 - files_hashes[file_path] = hash - - if commit_count == COMMIT_COUNT: - commit_count = 0 - db.commit() - - # except UnicodeDecodeError as ude: - except Exception as e: - hash = hash_file(file_path) - files_hashes[file_path] = hash + files_hashes[file] = hash + except Exception as e: + hash = hash_file(file) + files_hashes[file] = hash - #New files - for file_path in files_list: - try: - if DEBUG: print("Insert") - hash = hash_file(file_path) - cursor.execute('''INSERT INTO files (path, ctime, mtime, hash) VALUES (?,?,?,?)''', - (str(file_path), str(os.path.getctime(file_path)), str(os.path.getmtime(file_path)), hash)) - commit_count += 1 - files_hashes[file_path] = hash - - if commit_count == COMMIT_COUNT: + if commit_count >= COMMIT_COUNT: commit_count = 0 
db.commit() - #except UnicodeDecodeError as ude: - except Exception as e: - hash = hash_file(file_path) - files_hashes[file_path] = hash - db.commit() db.close() From 92c3985dc6f043faa4694053082cba766fbbe076 Mon Sep 17 00:00:00 2001 From: Salvatore Dipietro Date: Tue, 19 Nov 2019 12:16:53 +0000 Subject: [PATCH 10/12] reduce number of folders where to search files into db --- fslint/supprt/database | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/fslint/supprt/database b/fslint/supprt/database index d33bdde..d8ff1d3 100755 --- a/fslint/supprt/database +++ b/fslint/supprt/database @@ -6,7 +6,7 @@ import sqlite3 HASH_CMD = None -DEBUG = False +DEBUG = True COMMIT_COUNT = 100 def hash_file(path): @@ -18,10 +18,14 @@ def hash_file(path): def _get_path_files(files_list): paths_list = list() for file in files_list: - dirname = os.path.dirname(os.path.abspath(file)) - if dirname not in paths_list: - paths_list.append(dirname) - + dirname = "%s/"%(os.path.dirname(os.path.abspath(file))) + if len(paths_list) < 1: paths_list.append(dirname) + for p in paths_list: + if p not in dirname: + paths_list.append(dirname) + print("append") + + if DEBUG: print("Folders: ", ", ".join(paths_list)) return paths_list @@ -55,8 +59,8 @@ if __name__ == "__main__": #Existing Files filesdb = dict() - if DEBUG: print('''SELECT * FROM files WHERE path LIKE {seq}'''.format(seq=' OR '.join(['?']*len(paths_list))), ([ "%s/%%" % (f) for f in paths_list])) - cursor.execute('''SELECT * FROM files WHERE path LIKE {seq}'''.format(seq=' OR '.join(['?']*len(paths_list))), ([ "%s/%%" % (f) for f in paths_list])) + if DEBUG: print('''SELECT * FROM files WHERE path LIKE {seq}'''.format(seq=' OR '.join(['?']*len(paths_list))), ([ "%s%%" % (f) for f in paths_list])) + cursor.execute('''SELECT * FROM files WHERE path LIKE {seq}'''.format(seq=' OR '.join(['?']*len(paths_list))), ([ "%s%%" % (f) for f in paths_list])) for f in cursor.fetchall(): filesdb[f[1]] = {'id': f[0], 
'name': f[1], 'ctime': f[2], 'mtime': f[3], 'hash': f[4]} From 3b2bac0b5a1175c3abb9bbe3ef1c1305a650eeba Mon Sep 17 00:00:00 2001 From: Salvatore Dipietro Date: Tue, 19 Nov 2019 12:44:02 +0000 Subject: [PATCH 11/12] DEBUG=False --- fslint/supprt/database | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fslint/supprt/database b/fslint/supprt/database index d8ff1d3..41c7b93 100755 --- a/fslint/supprt/database +++ b/fslint/supprt/database @@ -6,7 +6,7 @@ import sqlite3 HASH_CMD = None -DEBUG = True +DEBUG = False COMMIT_COUNT = 100 def hash_file(path): From 9b278607d01c934305954e821870677f463d8abd Mon Sep 17 00:00:00 2001 From: Salvatore Dipietro Date: Tue, 19 Nov 2019 13:44:21 +0000 Subject: [PATCH 12/12] change query to read DB --- fslint/supprt/database | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fslint/supprt/database b/fslint/supprt/database index 41c7b93..d52f8b0 100755 --- a/fslint/supprt/database +++ b/fslint/supprt/database @@ -39,7 +39,6 @@ if __name__ == "__main__": db_path = sys.argv[1] HASH_CMD = sys.argv[2] files_list = [f.decode("utf-8") for f in sys.argv[3:]] - paths_list = _get_path_files(files_list) if not os.path.exists(db_path): @@ -58,9 +57,12 @@ if __name__ == "__main__": #Existing Files + # @TODO Optimize downloading only relevant folders from database + #paths_list = _get_path_files(files_list) + #if DEBUG: print('''SELECT * FROM files WHERE path LIKE {seq}'''.format(seq=' OR '.join(['?']*len(paths_list))), ([ "%s%%" % (f) for f in paths_list])) + #cursor.execute('''SELECT * FROM files WHERE path LIKE {seq}'''.format(seq=' OR '.join(['?']*len(paths_list))), ([ "%s%%" % (f) for f in paths_list])) + cursor.execute('''SELECT * FROM files ;''') filesdb = dict() - if DEBUG: print('''SELECT * FROM files WHERE path LIKE {seq}'''.format(seq=' OR '.join(['?']*len(paths_list))), ([ "%s%%" % (f) for f in paths_list])) - cursor.execute('''SELECT * FROM files WHERE path LIKE {seq}'''.format(seq=' OR 
'.join(['?']*len(paths_list))), ([ "%s%%" % (f) for f in paths_list])) for f in cursor.fetchall(): filesdb[f[1]] = {'id': f[0], 'name': f[1], 'ctime': f[2], 'mtime': f[3], 'hash': f[4]}