diff --git a/README.md b/README.md new file mode 100644 index 0000000..f8c976d --- /dev/null +++ b/README.md @@ -0,0 +1,8 @@ +# FSlint +Linux file system lint checker/cleaner with database support to reduce time and operations if you need to scan the same files in the folders or subfolders more than once. + +Each hashing algorithm saves the hash values of the files that it calculates in a specific database. In addition to the hash value, it saves the file creation and last modification time. If one of these two parameters has changed since the last execution of FSlint, the hashing algorithm re-calculates the hash value; otherwise, it considers the existing hash value of the file as valid. + +To accomplish this, the fslint/supprt/database file has been created and the fslint/findup file has been modified to pass the information to the database script and reduce the execution time for the hashing calculation. Moreover, with the database support, the duplicate-file-finding process can be stopped and re-started, avoiding re-calculating the hashes from the beginning. + +The usage of the command line or GUI has not been modified, so you can use FSlint as usual. \ No newline at end of file diff --git a/doc/FAQ b/doc/FAQ index 3a191fc..463e70c 100644 --- a/doc/FAQ +++ b/doc/FAQ @@ -49,8 +49,10 @@ A. In "Advanced Search Parameters"->"Paths to exclude", add "*/CVS" to exclude all CVS dirs for example Q. In the GUI how do I exclude files matching a pattern? -A. In "Advanced Search Parameters"->"Extra find parameters", add - "\( ! -name COPYING* -a ! -name LICENSE \)" to exclude licence files for e.g. +A. In "Advanced Search Parameters"->"Extra find parameters", add: + \( ! -name COPYING* -a ! -name LICENSE \) + to exclude licence files, for example. See `man find` (section 'EXPRESSION') + for more guidance on the proper format for queries. Q. From the command line how do I exclude paths matching a pattern? A. 
To exclude directories: findup \( -path "*/.svn" \) -prune -o diff --git a/fslint/findup b/fslint/findup index fe8aee8..8b4e9b5 100755 --- a/fslint/findup +++ b/fslint/findup @@ -55,6 +55,11 @@ script_dir=$(dirname "$0") #directory of this script script_dir=$(readlink -f "$script_dir") #Make sure absolute path +database_dir="$HOME/.fslint/databases" # Databases directory + +if [ ! -d $database_dir ]; then + mkdir -p $database_dir 2> /dev/null +fi . "$script_dir"/supprt/fslver @@ -180,7 +185,8 @@ tr '\0\1\n' ' \t\0' |#reset any space & tabs etc and delimit names with \0 # even more so if they are large. This usually adds a small amount of # runtime, however it can save a large amount of time in certain situations. if "$script_dir"/supprt/md5sum_approx /dev/null; then - xargs -r0 "$script_dir"/supprt/md5sum_approx | + #xargs -r0 "$script_dir"/supprt/md5sum_approx | + xargs -r0 "$script_dir"/supprt/database "$database_dir"/md5sum512.sqlite "$script_dir"/supprt/md5sum_approx | sort | #group duplicate files together uniq --all-repeated -w32 | #pick just duplicates cut -d' ' -f3- | #get filenames @@ -191,7 +197,8 @@ else fi | # This block selects duplicates using md5sum of whole file -xargs -r0 md5sum -- | #calculate md5sums for possible duplicates +#xargs -r0 md5sum -- | #calculate md5sums for possible duplicates +xargs -r0 "$script_dir"/supprt/database "$database_dir"/md5sum.sqlite md5sum | #calculate md5sums for possible duplicates cleanup_sum | #undo any backslash escaping sort | #group duplicate files together uniq --all-repeated=$sep_mode -w32 | #pick just duplicates @@ -203,6 +210,8 @@ cut -s -d' ' -f3- | #get filenames sort | #sort by paths to try to minimise disk seeks tr '\n' '\0' | #delimit names with \0 xargs -r0 sha1sum -- | #to be sure to be sure +# Disabled to do the last test before show the results +#xargs -r0 "$script_dir"/supprt/database "$database_dir"/sha1sum.sqlite sha1sum | #to be sure to be sure cleanup_sum | #undo any backslash escaping sort | 
#group duplicate files together uniq --all-repeated=$sep_mode -w40 | #pick just duplicates diff --git a/fslint/supprt/database b/fslint/supprt/database new file mode 100755 index 0000000..d52f8b0 --- /dev/null +++ b/fslint/supprt/database @@ -0,0 +1,115 @@ +#!/usr/bin/env python2 +# encoding: utf8 + +import sys, os, subprocess +import sqlite3 + + +HASH_CMD = None +DEBUG = False +COMMIT_COUNT = 100 + +def hash_file(path): + cmd_output = subprocess.check_output([HASH_CMD, path.encode('utf-8')]) #.decode("utf-8") + hash = cmd_output.split(' ')[0] + return hash + + +def _get_path_files(files_list): + paths_list = list() + for file in files_list: + dirname = "%s/"%(os.path.dirname(os.path.abspath(file))) + if len(paths_list) < 1: paths_list.append(dirname) + for p in paths_list: + if p not in dirname: + paths_list.append(dirname) + print("append") + + if DEBUG: print("Folders: ", ", ".join(paths_list)) + return paths_list + + + + + +if __name__ == "__main__": + if len(sys.argv) < 3: + print("Error, provide the , and ") + + db_path = sys.argv[1] + HASH_CMD = sys.argv[2] + files_list = [f.decode("utf-8") for f in sys.argv[3:]] + + + if not os.path.exists(db_path): + directory = os.path.dirname(db_path) + print(directory) + if not os.path.exists(directory): + os.makedirs(directory) + db = sqlite3.connect(db_path) + #db.text_factory = str + cursor = db.cursor() + cursor.execute(''' CREATE TABLE files(id INTEGER PRIMARY KEY, path TEXT, ctime VARCHAR(50), mtime VARCHAR(50), hash TEXT ) ''') + db.commit() + else: + db = sqlite3.connect(db_path) + cursor = db.cursor() + + + #Existing Files + # @TODO Optimize downloading only relevant folders from database + #paths_list = _get_path_files(files_list) + #if DEBUG: print('''SELECT * FROM files WHERE path LIKE {seq}'''.format(seq=' OR '.join(['?']*len(paths_list))), ([ "%s%%" % (f) for f in paths_list])) + #cursor.execute('''SELECT * FROM files WHERE path LIKE {seq}'''.format(seq=' OR '.join(['?']*len(paths_list))), ([ "%s%%" % (f) 
for f in paths_list])) + cursor.execute('''SELECT * FROM files ;''') + filesdb = dict() + for f in cursor.fetchall(): + filesdb[f[1]] = {'id': f[0], 'name': f[1], 'ctime': f[2], 'mtime': f[3], 'hash': f[4]} + + files_hashes = dict() + commit_count = 0 + for file in files_list: + if file in filesdb.keys(): + filedb = filesdb[file] + try: + if str(os.path.getctime(file)) == filedb['ctime'] and str(os.path.getmtime(file)) == filedb['mtime']: + if DEBUG: print("Found %s" % (file)) + files_hashes[file] = filedb['hash'] + else: + if DEBUG: print("Update %s" % (file)) + hash = hash_file(file) + cursor.execute('''UPDATE files SET ctime=?, mtime=?, hash=? WHERE path=?''', + (str(os.path.getctime(file)), str(os.path.getmtime(file)), hash, file)) + commit_count += 1 + files_hashes[file] = hash + + except Exception as e: + hash = hash_file(file) + files_hashes[file] = hash + + else: + #New files + try: + if DEBUG: print("Insert %s" % (file)) + hash = hash_file(file) + cursor.execute('''INSERT INTO files (path, ctime, mtime, hash) VALUES (?,?,?,?)''', + (str(file), str(os.path.getctime(file)), str(os.path.getmtime(file)), hash)) + commit_count += 1 + files_hashes[file] = hash + + except Exception as e: + hash = hash_file(file) + files_hashes[file] = hash + + if commit_count >= COMMIT_COUNT: + commit_count = 0 + db.commit() + + db.commit() + db.close() + + + for file_hash in files_hashes: + sys.stdout.write("%s %s\n" % (files_hashes[file_hash], file_hash.encode('utf-8'))) + +