From e959588703c92310720f6ea26d672f202d1b7f1a Mon Sep 17 00:00:00 2001 From: Salvatore Dipietro Date: Sat, 4 Aug 2018 01:08:49 +0100 Subject: [PATCH 01/12] database support --- README.md | 8 +++++ fslint/findup | 6 ++-- fslint/supprt/database | 72 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 3 deletions(-) create mode 100644 README.md create mode 100755 fslint/supprt/database diff --git a/README.md b/README.md new file mode 100644 index 0000000..f8c976d --- /dev/null +++ b/README.md @@ -0,0 +1,8 @@ +# FSlint +Linux file system lint checker/cleaner with database support to reduce time and operations if you need to scan twice the same files in the folders or subfolders. + +Each hashing algorithm saves the hash values of the files that it calculates on a specific database. In addition to the hash value, it saves the file creation and last modification time. If one of these two parameters changed from the last execution of FSlint, the hashing algorithm re-calculates the hash value otherwise it considers the existing hash value of the file as valid. + +To accomplish it, the fslint/supprt/database file has been created and the fslint/findup file has been modified to pass the information to the database script and reduce the execution time for the hashing calculation. Moreover, with the database support, the find duplication file process can be stopped and re-started avoiding to re-calculate the hash from the beginning. + +The usage of the command line or GUI has not been modified, so you can use the FSlint as normal. \ No newline at end of file diff --git a/fslint/findup b/fslint/findup index fe8aee8..18403e0 100755 --- a/fslint/findup +++ b/fslint/findup @@ -180,7 +180,7 @@ tr '\0\1\n' ' \t\0' |#reset any space & tabs etc and delimit names with \0 # even more so if they are large. This usually adds a small amount of # runtime, however it can save a large amount of time in certain situations. 
if "$script_dir"/supprt/md5sum_approx /dev/null; then - xargs -r0 "$script_dir"/supprt/md5sum_approx | + xargs -r0 "$script_dir"/supprt/database "$script_dir"/../databases/md5sum512.sqlite "$script_dir"/supprt/md5sum_approx | sort | #group duplicate files together uniq --all-repeated -w32 | #pick just duplicates cut -d' ' -f3- | #get filenames @@ -191,7 +191,7 @@ else fi | # This block selects duplicates using md5sum of whole file -xargs -r0 md5sum -- | #calculate md5sums for possible duplicates +xargs -r0 "$script_dir"/supprt/database "$script_dir"/../databases/md5sum.sqlite md5sum | #calculate md5sums for possible duplicates cleanup_sum | #undo any backslash escaping sort | #group duplicate files together uniq --all-repeated=$sep_mode -w32 | #pick just duplicates @@ -202,7 +202,7 @@ uniq --all-repeated=$sep_mode -w32 | #pick just duplicates cut -s -d' ' -f3- | #get filenames sort | #sort by paths to try to minimise disk seeks tr '\n' '\0' | #delimit names with \0 -xargs -r0 sha1sum -- | #to be sure to be sure +xargs -r0 "$script_dir"/supprt/database "$script_dir"/../databases/sha1sum.sqlite sha1sum | #to be sure to be sure cleanup_sum | #undo any backslash escaping sort | #group duplicate files together uniq --all-repeated=$sep_mode -w40 | #pick just duplicates diff --git a/fslint/supprt/database b/fslint/supprt/database new file mode 100755 index 0000000..0e1f957 --- /dev/null +++ b/fslint/supprt/database @@ -0,0 +1,72 @@ +#!/usr/bin/env python2 +import sys, os, subprocess +import sqlite3 + +HASH_CMD = None +DEBUG = False + +def hash_file(path): + cmd_output = subprocess.check_output([HASH_CMD, path]).decode("utf-8") + hash = cmd_output.split(' ')[0] + return hash + + + + + + + + +if __name__ == "__main__": + if len(sys.argv) < 3: + print("Error, provide the , and ") + + db_path = sys.argv[1] + HASH_CMD = sys.argv[2] + files_list = sys.argv[3:] + files_hashes = dict() + + if not os.path.exists(db_path): + directory = os.path.dirname(db_path) + print(directory) + 
if not os.path.exists(directory): + os.makedirs(directory) + db = sqlite3.connect(db_path) + cursor = db.cursor() + cursor.execute(''' CREATE TABLE files(id INTEGER PRIMARY KEY, path TEXT, ctime VARCHAR(50), mtime VARCHAR(50), hash TEXT ) ''') + db.commit() + else: + db = sqlite3.connect(db_path) + cursor = db.cursor() + + + for file_path in files_list: + cursor.execute('''SELECT * FROM files WHERE path=?''', (file_path,)) + file_record = cursor.fetchone() + if file_record is not None and len(file_record) > 0: + if str(os.path.getctime(file_path)) == file_record[2] and str(os.path.getmtime(file_path)) == file_record[3]: + if DEBUG: print("Found") + files_hashes[file_path] = file_record[4] + else: + if DEBUG: print("Update") + hash = hash_file(file_path) + cursor.execute('''UPDATE files SET ctime=?, mtime=?, hash=? WHERE path=?''', + (str(os.path.getctime(file_path)), str(os.path.getmtime(file_path)), hash, file_path)) + db.commit() + files_hashes[file_path] = hash + else: + if DEBUG: print("Insert") + hash = hash_file(file_path) + cursor.execute('''INSERT INTO files (path, ctime, mtime, hash) VALUES (?,?,?,?)''', + (str(file_path), str(os.path.getctime(file_path)), str(os.path.getmtime(file_path)), hash)) + db.commit() + files_hashes[file_path] = hash + + db.commit() + db.close() + + + for file_hash in files_hashes: + sys.stdout.write("%s %s\n" % (files_hashes[file_hash], file_hash)) + + From 00a1e3d8eb27c7053116949a80e664ef8164cab2 Mon Sep 17 00:00:00 2001 From: Salvatore Dipietro Date: Sat, 4 Aug 2018 01:41:51 +0100 Subject: [PATCH 02/12] added "Execution Time" in Duplicates (GUI) --- fslint-gui | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fslint-gui b/fslint-gui index c961e5f..cff86e1 100755 --- a/fslint-gui +++ b/fslint-gui @@ -1038,6 +1038,8 @@ on your system at present.")) return (str(row) + _(" files"), pe) def findup(self, clist_dups): + import time + start_time = time.time() po, pe = self.get_fslint("./findup --gui" + 
self.findParams) numdups = 0 @@ -1088,9 +1090,10 @@ on your system at present.")) clist_dups.set_row_data(row,file[2]) #mtime row += 1 + execTime = str(time.time() - start_time) return (human_num(byteWaste,1000).strip() + 'B' + _(" wasted in ") + str(numWaste) + _(" files (in ") + str(len(alldups)) + - _(" groups)"), pe) + _(" groups).") + " Execution time: " + execTime + " sec.", pe) find_dispatch = (findup,findpkgs,findnl,findsn,findtf, findbl,findid,finded,findns,findrs) #order NB From 469b79a17d0300e44706f965a651060068fef7a4 Mon Sep 17 00:00:00 2001 From: Salvatore Dipietro Date: Wed, 8 Aug 2018 20:50:00 +0100 Subject: [PATCH 03/12] database file single select query --- fslint/supprt/database | 62 +++++++++++++++++++++++++++++++----------- 1 file changed, 46 insertions(+), 16 deletions(-) diff --git a/fslint/supprt/database b/fslint/supprt/database index 0e1f957..c76f4f5 100755 --- a/fslint/supprt/database +++ b/fslint/supprt/database @@ -1,7 +1,10 @@ #!/usr/bin/env python2 +# encoding: utf8 + import sys, os, subprocess import sqlite3 + HASH_CMD = None DEBUG = False @@ -40,26 +43,53 @@ if __name__ == "__main__": cursor = db.cursor() - for file_path in files_list: - cursor.execute('''SELECT * FROM files WHERE path=?''', (file_path,)) - file_record = cursor.fetchone() - if file_record is not None and len(file_record) > 0: - if str(os.path.getctime(file_path)) == file_record[2] and str(os.path.getmtime(file_path)) == file_record[3]: - if DEBUG: print("Found") - files_hashes[file_path] = file_record[4] - else: - if DEBUG: print("Update") + #Existing Files + if len(files_list) > 0: + cursor.execute('''SELECT * FROM files WHERE path IN ({seq})'''.format(seq=','.join(['?']*len(files_list))), ([str(f) for f in files_list])) + commit_count = 0 + for file_record in cursor.fetchall(): + file_path = str(file_record[1]) + try: + if str(os.path.getctime(file_path)) == file_record[2] and str(os.path.getmtime(file_path)) == file_record[3]: + if DEBUG: print("Found") + 
files_hashes[file_path] = file_record[4] + files_list.remove(file_path) + else: + if DEBUG: print("Update") + hash = hash_file(file_path) + cursor.execute('''UPDATE files SET ctime=?, mtime=?, hash=? WHERE path=?''', + (str(os.path.getctime(file_path)), str(os.path.getmtime(file_path)), hash, file_path)) + commit_count += 1 + files_hashes[file_path] = hash + + if commit_count == 1000: + commit_count = 0 + db.commit() + + # except UnicodeDecodeError as ude: + except Exception as e: hash = hash_file(file_path) - cursor.execute('''UPDATE files SET ctime=?, mtime=?, hash=? WHERE path=?''', - (str(os.path.getctime(file_path)), str(os.path.getmtime(file_path)), hash, file_path)) - db.commit() files_hashes[file_path] = hash - else: + + + #New files + for file_path in files_list: + file_path = str(file_path) + try: if DEBUG: print("Insert") hash = hash_file(file_path) cursor.execute('''INSERT INTO files (path, ctime, mtime, hash) VALUES (?,?,?,?)''', - (str(file_path), str(os.path.getctime(file_path)), str(os.path.getmtime(file_path)), hash)) - db.commit() + (str(file_path), str(os.path.getctime(file_path)), str(os.path.getmtime(file_path)), hash)) + commit_count += 1 + files_hashes[file_path] = hash + + if commit_count == 1000: + commit_count = 0 + db.commit() + + #except UnicodeDecodeError as ude: + except Exception as e: + hash = hash_file(file_path) files_hashes[file_path] = hash db.commit() @@ -67,6 +97,6 @@ if __name__ == "__main__": for file_hash in files_hashes: - sys.stdout.write("%s %s\n" % (files_hashes[file_hash], str(file_hash))) + sys.stdout.write("%s %s\n" % (files_hashes[file_hash], file_hash.encode('utf-8'))) From 4ba300583f4c73285756b57c02c8e5f284e33f17 Mon Sep 17 00:00:00 2001 From: Salvatore Dipietro Date: Thu, 9 Aug 2018 20:28:22 +0100 Subject: [PATCH 04/12] database file: support UTF-8 filename --- fslint/supprt/database | 55 +++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/fslint/supprt/database 
b/fslint/supprt/database index c76f4f5..46710d0 100755 --- a/fslint/supprt/database +++ b/fslint/supprt/database @@ -9,7 +9,7 @@ HASH_CMD = None DEBUG = False def hash_file(path): - cmd_output = subprocess.check_output([HASH_CMD, path]).decode("utf-8") + cmd_output = subprocess.check_output([HASH_CMD, path.encode('utf-8')]) #.decode("utf-8") hash = cmd_output.split(' ')[0] return hash @@ -26,7 +26,7 @@ if __name__ == "__main__": db_path = sys.argv[1] HASH_CMD = sys.argv[2] - files_list = sys.argv[3:] + files_list = [f.decode("utf-8") for f in sys.argv[3:]] files_hashes = dict() if not os.path.exists(db_path): @@ -35,6 +35,7 @@ if __name__ == "__main__": if not os.path.exists(directory): os.makedirs(directory) db = sqlite3.connect(db_path) + #db.text_factory = str cursor = db.cursor() cursor.execute(''' CREATE TABLE files(id INTEGER PRIMARY KEY, path TEXT, ctime VARCHAR(50), mtime VARCHAR(50), hash TEXT ) ''') db.commit() @@ -44,37 +45,35 @@ if __name__ == "__main__": #Existing Files - if len(files_list) > 0: - cursor.execute('''SELECT * FROM files WHERE path IN ({seq})'''.format(seq=','.join(['?']*len(files_list))), ([str(f) for f in files_list])) - commit_count = 0 - for file_record in cursor.fetchall(): - file_path = str(file_record[1]) - try: - if str(os.path.getctime(file_path)) == file_record[2] and str(os.path.getmtime(file_path)) == file_record[3]: - if DEBUG: print("Found") - files_hashes[file_path] = file_record[4] - files_list.remove(file_path) - else: - if DEBUG: print("Update") - hash = hash_file(file_path) - cursor.execute('''UPDATE files SET ctime=?, mtime=?, hash=? 
WHERE path=?''', - (str(os.path.getctime(file_path)), str(os.path.getmtime(file_path)), hash, file_path)) - commit_count += 1 - files_hashes[file_path] = hash - - if commit_count == 1000: - commit_count = 0 - db.commit() - - # except UnicodeDecodeError as ude: - except Exception as e: + cursor.execute('''SELECT * FROM files WHERE path IN ({seq})'''.format(seq=','.join(['?']*len(files_list))), ([f for f in files_list])) + commit_count = 0 + for file_record in cursor.fetchall(): + file_path = str(file_record[1]) + try: + if str(os.path.getctime(file_path)) == file_record[2] and str(os.path.getmtime(file_path)) == file_record[3]: + if DEBUG: print("Found") + files_hashes[file_path] = file_record[4] + files_list.remove(file_path) + else: + if DEBUG: print("Update") hash = hash_file(file_path) + cursor.execute('''UPDATE files SET ctime=?, mtime=?, hash=? WHERE path=?''', + (str(os.path.getctime(file_path)), str(os.path.getmtime(file_path)), hash, file_path)) + commit_count += 1 files_hashes[file_path] = hash + if commit_count == 1000: + commit_count = 0 + db.commit() + + # except UnicodeDecodeError as ude: + except Exception as e: + hash = hash_file(file_path) + files_hashes[file_path] = hash + #New files for file_path in files_list: - file_path = str(file_path) try: if DEBUG: print("Insert") hash = hash_file(file_path) @@ -97,6 +96,6 @@ if __name__ == "__main__": for file_hash in files_hashes: - sys.stdout.write("%s %s\n" % (files_hashes[file_hash], str(file_hash))) + sys.stdout.write("%s %s\n" % (files_hashes[file_hash], file_hash.encode('utf-8'))) From 8e923de96eb4ddf19880681cb54e7c8a25429f8a Mon Sep 17 00:00:00 2001 From: Salvatore Date: Sun, 29 Sep 2019 14:07:45 +0100 Subject: [PATCH 05/12] Update findup --- fslint/findup | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fslint/findup b/fslint/findup index 18403e0..432119b 100755 --- a/fslint/findup +++ b/fslint/findup @@ -202,7 +202,8 @@ uniq --all-repeated=$sep_mode -w32 | #pick just duplicates 
cut -s -d' ' -f3- | #get filenames sort | #sort by paths to try to minimise disk seeks tr '\n' '\0' | #delimit names with \0 -xargs -r0 "$script_dir"/supprt/database "$script_dir"/../databases/sha1sum.sqlite sha1sum | #to be sure to be sure +# Disabled to do the last test before show the results +#xargs -r0 "$script_dir"/supprt/database "$script_dir"/../databases/sha1sum.sqlite sha1sum | #to be sure to be sure cleanup_sum | #undo any backslash escaping sort | #group duplicate files together uniq --all-repeated=$sep_mode -w40 | #pick just duplicates From b11f423dc37fe8f74881f739e073bc0d76e03d1b Mon Sep 17 00:00:00 2001 From: Salvatore Date: Wed, 13 Nov 2019 10:41:11 +0000 Subject: [PATCH 06/12] fix permission issue databases folder fix permission issue databases folder --- fslint/findup | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fslint/findup b/fslint/findup index 432119b..4a10d42 100755 --- a/fslint/findup +++ b/fslint/findup @@ -55,6 +55,11 @@ script_dir=$(dirname "$0") #directory of this script script_dir=$(readlink -f "$script_dir") #Make sure absolute path +database_dir="$HOME/.fslint/databases" # Databases directory + +if [ ! -d $database_dir ]; then + mkdir -p $database_dir 2> /dev/null +fi . "$script_dir"/supprt/fslver @@ -180,7 +185,7 @@ tr '\0\1\n' ' \t\0' |#reset any space & tabs etc and delimit names with \0 # even more so if they are large. This usually adds a small amount of # runtime, however it can save a large amount of time in certain situations. 
if "$script_dir"/supprt/md5sum_approx /dev/null; then - xargs -r0 "$script_dir"/supprt/database "$script_dir"/../databases/md5sum512.sqlite "$script_dir"/supprt/md5sum_approx | + xargs -r0 "$script_dir"/supprt/database "$database_dir"/md5sum512.sqlite "$script_dir"/supprt/md5sum_approx | sort | #group duplicate files together uniq --all-repeated -w32 | #pick just duplicates cut -d' ' -f3- | #get filenames @@ -191,7 +196,7 @@ else fi | # This block selects duplicates using md5sum of whole file -xargs -r0 "$script_dir"/supprt/database "$script_dir"/../databases/md5sum.sqlite md5sum | #calculate md5sums for possible duplicates +xargs -r0 "$script_dir"/supprt/database "$database_dir"/md5sum.sqlite md5sum | #calculate md5sums for possible duplicates cleanup_sum | #undo any backslash escaping sort | #group duplicate files together uniq --all-repeated=$sep_mode -w32 | #pick just duplicates @@ -203,7 +208,7 @@ cut -s -d' ' -f3- | #get filenames sort | #sort by paths to try to minimise disk seeks tr '\n' '\0' | #delimit names with \0 # Disabled to do the last test before show the results -#xargs -r0 "$script_dir"/supprt/database "$script_dir"/../databases/sha1sum.sqlite sha1sum | #to be sure to be sure +#xargs -r0 "$script_dir"/supprt/database "$database_dir"/sha1sum.sqlite sha1sum | #to be sure to be sure cleanup_sum | #undo any backslash escaping sort | #group duplicate files together uniq --all-repeated=$sep_mode -w40 | #pick just duplicates From 49e62a5f9dbd4441eb3f05e2eec1d979c16f70bc Mon Sep 17 00:00:00 2001 From: Salvatore Dipietro Date: Sun, 17 Nov 2019 17:31:31 +0000 Subject: [PATCH 07/12] fix database support --- doc/FAQ | 6 ++++-- fslint-gui | 5 +---- fslint/findup | 3 +++ 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/doc/FAQ b/doc/FAQ index 3a191fc..463e70c 100644 --- a/doc/FAQ +++ b/doc/FAQ @@ -49,8 +49,10 @@ A. In "Advanced Search Parameters"->"Paths to exclude", add "*/CVS" to exclude all CVS dirs for example Q. 
In the GUI how do I exclude files matching a pattern? -A. In "Advanced Search Parameters"->"Extra find parameters", add - "\( ! -name COPYING* -a ! -name LICENSE \)" to exclude licence files for e.g. +A. In "Advanced Search Parameters"->"Extra find parameters", add: + \( ! -name COPYING* -a ! -name LICENSE \) + to exclude licence files, for example. See `man find` (section 'EXPRESSION') + for more guidance on the proper format for queries. Q. From the command line how do I exclude paths matching a pattern? A. To exclude directories: findup \( -path "*/.svn" \) -prune -o diff --git a/fslint-gui b/fslint-gui index cff86e1..c961e5f 100755 --- a/fslint-gui +++ b/fslint-gui @@ -1038,8 +1038,6 @@ on your system at present.")) return (str(row) + _(" files"), pe) def findup(self, clist_dups): - import time - start_time = time.time() po, pe = self.get_fslint("./findup --gui" + self.findParams) numdups = 0 @@ -1090,10 +1088,9 @@ on your system at present.")) clist_dups.set_row_data(row,file[2]) #mtime row += 1 - execTime = str(time.time() - start_time) return (human_num(byteWaste,1000).strip() + 'B' + _(" wasted in ") + str(numWaste) + _(" files (in ") + str(len(alldups)) + - _(" groups).") + " Execution time: " + execTime + " sec.", pe) + _(" groups)"), pe) find_dispatch = (findup,findpkgs,findnl,findsn,findtf, findbl,findid,finded,findns,findrs) #order NB diff --git a/fslint/findup b/fslint/findup index 4a10d42..8b4e9b5 100755 --- a/fslint/findup +++ b/fslint/findup @@ -185,6 +185,7 @@ tr '\0\1\n' ' \t\0' |#reset any space & tabs etc and delimit names with \0 # even more so if they are large. This usually adds a small amount of # runtime, however it can save a large amount of time in certain situations. 
if "$script_dir"/supprt/md5sum_approx /dev/null; then + #xargs -r0 "$script_dir"/supprt/md5sum_approx | xargs -r0 "$script_dir"/supprt/database "$database_dir"/md5sum512.sqlite "$script_dir"/supprt/md5sum_approx | sort | #group duplicate files together uniq --all-repeated -w32 | #pick just duplicates @@ -196,6 +197,7 @@ else fi | # This block selects duplicates using md5sum of whole file +#xargs -r0 md5sum -- | #calculate md5sums for possible duplicates xargs -r0 "$script_dir"/supprt/database "$database_dir"/md5sum.sqlite md5sum | #calculate md5sums for possible duplicates cleanup_sum | #undo any backslash escaping sort | #group duplicate files together @@ -207,6 +209,7 @@ uniq --all-repeated=$sep_mode -w32 | #pick just duplicates cut -s -d' ' -f3- | #get filenames sort | #sort by paths to try to minimise disk seeks tr '\n' '\0' | #delimit names with \0 +xargs -r0 sha1sum -- | #to be sure to be sure # Disabled to do the last test before show the results #xargs -r0 "$script_dir"/supprt/database "$database_dir"/sha1sum.sqlite sha1sum | #to be sure to be sure cleanup_sum | #undo any backslash escaping From b1387d0079c4c85bb393f2ee0675fb3399a9a443 Mon Sep 17 00:00:00 2001 From: Salvatore Dipietro Date: Sun, 17 Nov 2019 17:37:32 +0000 Subject: [PATCH 08/12] decrease COMMIT_COUNT --- fslint/supprt/database | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fslint/supprt/database b/fslint/supprt/database index 46710d0..954d731 100755 --- a/fslint/supprt/database +++ b/fslint/supprt/database @@ -7,6 +7,7 @@ import sqlite3 HASH_CMD = None DEBUG = False +COMMIT_COUNT = 100 def hash_file(path): cmd_output = subprocess.check_output([HASH_CMD, path.encode('utf-8')]) #.decode("utf-8") @@ -62,7 +63,7 @@ if __name__ == "__main__": commit_count += 1 files_hashes[file_path] = hash - if commit_count == 1000: + if commit_count == COMMIT_COUNT: commit_count = 0 db.commit() @@ -82,7 +83,7 @@ if __name__ == "__main__": commit_count += 1 files_hashes[file_path] = hash 
- if commit_count == 1000: + if commit_count == COMMIT_COUNT: commit_count = 0 db.commit() From d501561f09286cacbe0772b2b535c05e4bb32a8d Mon Sep 17 00:00:00 2001 From: Salvatore Dipietro Date: Tue, 19 Nov 2019 11:40:34 +0000 Subject: [PATCH 09/12] new query model and improved DEBUG --- fslint/supprt/database | 87 +++++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 40 deletions(-) diff --git a/fslint/supprt/database b/fslint/supprt/database index 954d731..d33bdde 100755 --- a/fslint/supprt/database +++ b/fslint/supprt/database @@ -15,7 +15,14 @@ def hash_file(path): return hash +def _get_path_files(files_list): + paths_list = list() + for file in files_list: + dirname = os.path.dirname(os.path.abspath(file)) + if dirname not in paths_list: + paths_list.append(dirname) + return paths_list @@ -28,7 +35,8 @@ if __name__ == "__main__": db_path = sys.argv[1] HASH_CMD = sys.argv[2] files_list = [f.decode("utf-8") for f in sys.argv[3:]] - files_hashes = dict() + paths_list = _get_path_files(files_list) + if not os.path.exists(db_path): directory = os.path.dirname(db_path) @@ -46,52 +54,51 @@ if __name__ == "__main__": #Existing Files - cursor.execute('''SELECT * FROM files WHERE path IN ({seq})'''.format(seq=','.join(['?']*len(files_list))), ([f for f in files_list])) + filesdb = dict() + if DEBUG: print('''SELECT * FROM files WHERE path LIKE {seq}'''.format(seq=' OR '.join(['?']*len(paths_list))), ([ "%s/%%" % (f) for f in paths_list])) + cursor.execute('''SELECT * FROM files WHERE path LIKE {seq}'''.format(seq=' OR '.join(['?']*len(paths_list))), ([ "%s/%%" % (f) for f in paths_list])) + for f in cursor.fetchall(): + filesdb[f[1]] = {'id': f[0], 'name': f[1], 'ctime': f[2], 'mtime': f[3], 'hash': f[4]} + + files_hashes = dict() commit_count = 0 - for file_record in cursor.fetchall(): - file_path = str(file_record[1]) - try: - if str(os.path.getctime(file_path)) == file_record[2] and str(os.path.getmtime(file_path)) == file_record[3]: - if DEBUG: 
print("Found") - files_hashes[file_path] = file_record[4] - files_list.remove(file_path) - else: - if DEBUG: print("Update") - hash = hash_file(file_path) - cursor.execute('''UPDATE files SET ctime=?, mtime=?, hash=? WHERE path=?''', - (str(os.path.getctime(file_path)), str(os.path.getmtime(file_path)), hash, file_path)) + for file in files_list: + if file in filesdb.keys(): + filedb = filesdb[file] + try: + if str(os.path.getctime(file)) == filedb['ctime'] and str(os.path.getmtime(file)) == filedb['mtime']: + if DEBUG: print("Found %s" % (file)) + files_hashes[file] = filedb['hash'] + else: + if DEBUG: print("Update %s" % (file)) + hash = hash_file(file) + cursor.execute('''UPDATE files SET ctime=?, mtime=?, hash=? WHERE path=?''', + (str(os.path.getctime(file)), str(os.path.getmtime(file)), hash, file)) + commit_count += 1 + files_hashes[file] = hash + + except Exception as e: + hash = hash_file(file) + files_hashes[file] = hash + + else: + #New files + try: + if DEBUG: print("Insert %s" % (file)) + hash = hash_file(file) + cursor.execute('''INSERT INTO files (path, ctime, mtime, hash) VALUES (?,?,?,?)''', + (str(file), str(os.path.getctime(file)), str(os.path.getmtime(file)), hash)) commit_count += 1 - files_hashes[file_path] = hash - - if commit_count == COMMIT_COUNT: - commit_count = 0 - db.commit() - - # except UnicodeDecodeError as ude: - except Exception as e: - hash = hash_file(file_path) - files_hashes[file_path] = hash + files_hashes[file] = hash + except Exception as e: + hash = hash_file(file) + files_hashes[file] = hash - #New files - for file_path in files_list: - try: - if DEBUG: print("Insert") - hash = hash_file(file_path) - cursor.execute('''INSERT INTO files (path, ctime, mtime, hash) VALUES (?,?,?,?)''', - (str(file_path), str(os.path.getctime(file_path)), str(os.path.getmtime(file_path)), hash)) - commit_count += 1 - files_hashes[file_path] = hash - - if commit_count == COMMIT_COUNT: + if commit_count >= COMMIT_COUNT: commit_count = 0 
db.commit() - #except UnicodeDecodeError as ude: - except Exception as e: - hash = hash_file(file_path) - files_hashes[file_path] = hash - db.commit() db.close() From 92c3985dc6f043faa4694053082cba766fbbe076 Mon Sep 17 00:00:00 2001 From: Salvatore Dipietro Date: Tue, 19 Nov 2019 12:16:53 +0000 Subject: [PATCH 10/12] reduce number of folders where to search files into db --- fslint/supprt/database | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/fslint/supprt/database b/fslint/supprt/database index d33bdde..d8ff1d3 100755 --- a/fslint/supprt/database +++ b/fslint/supprt/database @@ -6,7 +6,7 @@ import sqlite3 HASH_CMD = None -DEBUG = False +DEBUG = True COMMIT_COUNT = 100 def hash_file(path): @@ -18,10 +18,14 @@ def hash_file(path): def _get_path_files(files_list): paths_list = list() for file in files_list: - dirname = os.path.dirname(os.path.abspath(file)) - if dirname not in paths_list: - paths_list.append(dirname) - + dirname = "%s/"%(os.path.dirname(os.path.abspath(file))) + if len(paths_list) < 1: paths_list.append(dirname) + for p in paths_list: + if p not in dirname: + paths_list.append(dirname) + print("append") + + if DEBUG: print("Folders: ", ", ".join(paths_list)) return paths_list @@ -55,8 +59,8 @@ if __name__ == "__main__": #Existing Files filesdb = dict() - if DEBUG: print('''SELECT * FROM files WHERE path LIKE {seq}'''.format(seq=' OR '.join(['?']*len(paths_list))), ([ "%s/%%" % (f) for f in paths_list])) - cursor.execute('''SELECT * FROM files WHERE path LIKE {seq}'''.format(seq=' OR '.join(['?']*len(paths_list))), ([ "%s/%%" % (f) for f in paths_list])) + if DEBUG: print('''SELECT * FROM files WHERE path LIKE {seq}'''.format(seq=' OR '.join(['?']*len(paths_list))), ([ "%s%%" % (f) for f in paths_list])) + cursor.execute('''SELECT * FROM files WHERE path LIKE {seq}'''.format(seq=' OR '.join(['?']*len(paths_list))), ([ "%s%%" % (f) for f in paths_list])) for f in cursor.fetchall(): filesdb[f[1]] = {'id': f[0], 
'name': f[1], 'ctime': f[2], 'mtime': f[3], 'hash': f[4]} From 3b2bac0b5a1175c3abb9bbe3ef1c1305a650eeba Mon Sep 17 00:00:00 2001 From: Salvatore Dipietro Date: Tue, 19 Nov 2019 12:44:02 +0000 Subject: [PATCH 11/12] DEBUG=False --- fslint/supprt/database | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fslint/supprt/database b/fslint/supprt/database index d8ff1d3..41c7b93 100755 --- a/fslint/supprt/database +++ b/fslint/supprt/database @@ -6,7 +6,7 @@ import sqlite3 HASH_CMD = None -DEBUG = True +DEBUG = False COMMIT_COUNT = 100 def hash_file(path): From 9b278607d01c934305954e821870677f463d8abd Mon Sep 17 00:00:00 2001 From: Salvatore Dipietro Date: Tue, 19 Nov 2019 13:44:21 +0000 Subject: [PATCH 12/12] change query to read DB --- fslint/supprt/database | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fslint/supprt/database b/fslint/supprt/database index 41c7b93..d52f8b0 100755 --- a/fslint/supprt/database +++ b/fslint/supprt/database @@ -39,7 +39,6 @@ if __name__ == "__main__": db_path = sys.argv[1] HASH_CMD = sys.argv[2] files_list = [f.decode("utf-8") for f in sys.argv[3:]] - paths_list = _get_path_files(files_list) if not os.path.exists(db_path): @@ -58,9 +57,12 @@ if __name__ == "__main__": #Existing Files + # @TODO Optimize downloading only relevant folders from database + #paths_list = _get_path_files(files_list) + #if DEBUG: print('''SELECT * FROM files WHERE path LIKE {seq}'''.format(seq=' OR '.join(['?']*len(paths_list))), ([ "%s%%" % (f) for f in paths_list])) + #cursor.execute('''SELECT * FROM files WHERE path LIKE {seq}'''.format(seq=' OR '.join(['?']*len(paths_list))), ([ "%s%%" % (f) for f in paths_list])) + cursor.execute('''SELECT * FROM files ;''') filesdb = dict() - if DEBUG: print('''SELECT * FROM files WHERE path LIKE {seq}'''.format(seq=' OR '.join(['?']*len(paths_list))), ([ "%s%%" % (f) for f in paths_list])) - cursor.execute('''SELECT * FROM files WHERE path LIKE {seq}'''.format(seq=' OR 
'.join(['?']*len(paths_list))), ([ "%s%%" % (f) for f in paths_list])) for f in cursor.fetchall(): filesdb[f[1]] = {'id': f[0], 'name': f[1], 'ctime': f[2], 'mtime': f[3], 'hash': f[4]}