Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# FSlint
Linux file system lint checker/cleaner with database support to reduce time and operations when you need to scan the same files in the same folders or subfolders more than once.

Each hashing algorithm saves the hash values of the files it calculates in a dedicated database. In addition to the hash value, it saves the file's creation and last-modification times. If either of these two parameters has changed since the last execution of FSlint, the hashing algorithm re-calculates the hash value; otherwise it considers the existing hash value of the file valid.

To accomplish this, the fslint/supprt/database script has been created and the fslint/findup script has been modified to pass the necessary information to the database script and reduce the execution time of the hash calculation. Moreover, with database support, the duplicate-file search can be stopped and re-started without re-calculating the hashes from the beginning.

The usage of the command line or GUI has not been modified, so you can use the FSlint as normal.
6 changes: 4 additions & 2 deletions doc/FAQ
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,10 @@ A. In "Advanced Search Parameters"->"Paths to exclude", add
"*/CVS" to exclude all CVS dirs for example

Q. In the GUI how do I exclude files matching a pattern?
A. In "Advanced Search Parameters"->"Extra find parameters", add
"\( ! -name COPYING* -a ! -name LICENSE \)" to exclude licence files for e.g.
A. In "Advanced Search Parameters"->"Extra find parameters", add:
\( ! -name COPYING* -a ! -name LICENSE \)
to exclude licence files, for example. See `man find` (section 'EXPRESSION')
for more guidance on the proper format for queries.

Q. From the command line how do I exclude paths matching a pattern?
A. To exclude directories: findup \( -path "*/.svn" \) -prune -o
Expand Down
13 changes: 11 additions & 2 deletions fslint/findup
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@

script_dir=$(dirname "$0") #directory of this script
script_dir=$(readlink -f "$script_dir") #Make sure absolute path
database_dir="$HOME/.fslint/databases" # Databases directory

# Quote the expansion: an unquoted $database_dir word-splits and globs if
# $HOME contains spaces or special characters.  mkdir -p is a no-op when the
# directory already exists, but keep the test to avoid the subshell/fork.
if [ ! -d "$database_dir" ]; then
    mkdir -p "$database_dir" 2> /dev/null
fi

. "$script_dir"/supprt/fslver

Expand Down Expand Up @@ -180,7 +185,8 @@ tr '\0\1\n' ' \t\0' |#reset any space & tabs etc and delimit names with \0
# even more so if they are large. This usually adds a small amount of
# runtime, however it can save a large amount of time in certain situations.
if "$script_dir"/supprt/md5sum_approx </dev/null 2>/dev/null; then
xargs -r0 "$script_dir"/supprt/md5sum_approx |
#xargs -r0 "$script_dir"/supprt/md5sum_approx |
xargs -r0 "$script_dir"/supprt/database "$database_dir"/md5sum512.sqlite "$script_dir"/supprt/md5sum_approx |
sort | #group duplicate files together
uniq --all-repeated -w32 | #pick just duplicates
cut -d' ' -f3- | #get filenames
Expand All @@ -191,7 +197,8 @@ else
fi |

# This block selects duplicates using md5sum of whole file
xargs -r0 md5sum -- | #calculate md5sums for possible duplicates
#xargs -r0 md5sum -- | #calculate md5sums for possible duplicates
xargs -r0 "$script_dir"/supprt/database "$database_dir"/md5sum.sqlite md5sum | #calculate md5sums for possible duplicates
cleanup_sum | #undo any backslash escaping
sort | #group duplicate files together
uniq --all-repeated=$sep_mode -w32 | #pick just duplicates
Expand All @@ -203,6 +210,8 @@ cut -s -d' ' -f3- | #get filenames
sort | #sort by paths to try to minimise disk seeks
tr '\n' '\0' | #delimit names with \0
xargs -r0 sha1sum -- | #to be sure to be sure
# Disabled so that this final sha1sum verification pass, run just before
# showing the results, always re-hashes the files instead of using the cache
#xargs -r0 "$script_dir"/supprt/database "$database_dir"/sha1sum.sqlite sha1sum | #to be sure to be sure
cleanup_sum | #undo any backslash escaping
sort | #group duplicate files together
uniq --all-repeated=$sep_mode -w40 | #pick just duplicates
Expand Down
115 changes: 115 additions & 0 deletions fslint/supprt/database
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#!/usr/bin/env python2
# encoding: utf8

import sys, os, subprocess
import sqlite3


# External hashing command to run per file; set from argv[2] when the script
# is executed (see the __main__ section below).
HASH_CMD = None
# When True, print diagnostic progress messages.
# NOTE(review): these messages go to stdout, which the findup pipeline
# consumes — keep disabled for normal runs.
DEBUG = False
# Number of INSERT/UPDATE statements to batch between database commits.
COMMIT_COUNT = 100

def hash_file(path):
cmd_output = subprocess.check_output([HASH_CMD, path.encode('utf-8')]) #.decode("utf-8")
hash = cmd_output.split(' ')[0]
return hash


def _get_path_files(files_list):
    """Return the distinct parent directories of the given files.

    Each entry ends with '/' so it can be used directly as a SQL LIKE
    prefix ("<dir>%").  A directory is skipped when an already-collected
    entry is a prefix of it, so a parent subsumes its subdirectories.

    Fixes the original implementation, which appended to paths_list while
    iterating over it and used an inverted substring test, producing
    duplicate entries and an unconditional debug print on stdout.
    """
    paths_list = []
    for file_path in files_list:
        dirname = "%s/" % (os.path.dirname(os.path.abspath(file_path)))
        # Only keep this directory if no previously seen path covers it.
        if not any(dirname.startswith(p) for p in paths_list):
            paths_list.append(dirname)

    if DEBUG: print("Folders: " + ", ".join(paths_list))
    return paths_list





if __name__ == "__main__":
    # Usage: database <database path> <hashing command> <file>...
    # Maintains a per-command hash cache in a SQLite database and prints one
    # "<hash>  <path>" line per input file, in md5sum-compatible format.
    if len(sys.argv) < 3:
        print("Error, provide the <database path>, <hashing command> and <files list with \\0> ")
        # Fix: the original fell through and crashed on sys.argv[1] below.
        sys.exit(1)

    db_path = sys.argv[1]
    HASH_CMD = sys.argv[2]
    files_list = [f.decode("utf-8") for f in sys.argv[3:]]

    # Open the database, creating it (and its parent directory) on first use.
    # Fix: the original print(directory) wrote to stdout, which the findup
    # pipeline consumes, corrupting the hash stream on first run.
    if not os.path.exists(db_path):
        directory = os.path.dirname(db_path)
        if directory and not os.path.exists(directory):
            os.makedirs(directory)
        db = sqlite3.connect(db_path)
        cursor = db.cursor()
        cursor.execute(''' CREATE TABLE files(id INTEGER PRIMARY KEY, path TEXT, ctime VARCHAR(50), mtime VARCHAR(50), hash TEXT ) ''')
        db.commit()
    else:
        db = sqlite3.connect(db_path)
        cursor = db.cursor()

    # Load the whole cache into memory, keyed by file path.
    # @TODO Optimize by selecting only the folders being scanned
    # (see _get_path_files).
    cursor.execute('''SELECT * FROM files ;''')
    filesdb = dict()
    for row in cursor.fetchall():
        filesdb[row[1]] = {'id': row[0], 'name': row[1],
                           'ctime': row[2], 'mtime': row[3], 'hash': row[4]}

    files_hashes = dict()
    commit_count = 0  # statements executed since the last commit
    for file_path in files_list:
        if file_path in filesdb:
            cached = filesdb[file_path]
            try:
                ctime = str(os.path.getctime(file_path))
                mtime = str(os.path.getmtime(file_path))
                if ctime == cached['ctime'] and mtime == cached['mtime']:
                    # Timestamps unchanged since the last run: trust the cache.
                    if DEBUG: print("Found %s" % (file_path))
                    files_hashes[file_path] = cached['hash']
                else:
                    if DEBUG: print("Update %s" % (file_path))
                    digest = hash_file(file_path)
                    cursor.execute('''UPDATE files SET ctime=?, mtime=?, hash=? WHERE path=?''',
                                   (ctime, mtime, digest, file_path))
                    commit_count += 1
                    files_hashes[file_path] = digest

            except Exception:
                # Best effort: stat or DB update failed — still emit a hash
                # so the pipeline keeps working; the cache is just not updated.
                files_hashes[file_path] = hash_file(file_path)

        else:
            # New file: hash it and cache the result.
            try:
                if DEBUG: print("Insert %s" % (file_path))
                digest = hash_file(file_path)
                cursor.execute('''INSERT INTO files (path, ctime, mtime, hash) VALUES (?,?,?,?)''',
                               (str(file_path), str(os.path.getctime(file_path)), str(os.path.getmtime(file_path)), digest))
                commit_count += 1
                files_hashes[file_path] = digest

            except Exception:
                files_hashes[file_path] = hash_file(file_path)

        # Commit in batches so an interrupted run keeps most of its work.
        if commit_count >= COMMIT_COUNT:
            commit_count = 0
            db.commit()

    db.commit()
    db.close()

    # Fix: two spaces between hash and name — the md5sum/sha1sum output
    # format that findup's downstream `cut -d' ' -f3-` expects; with a single
    # space, field 3 is empty and the filenames are lost.
    for file_path in files_hashes:
        sys.stdout.write("%s  %s\n" % (files_hashes[file_path], file_path.encode('utf-8')))