diff --git a/Projects/duplicate_files_remover/README.md b/Projects/duplicate_files_remover/README.md
new file mode 100644
index 0000000..69a0b24
--- /dev/null
+++ b/Projects/duplicate_files_remover/README.md
@@ -0,0 +1,13 @@
+# Duplicate Files Remover
+A script that searches for duplicate files and deletes them to free up storage.
+
+## Setup instructions
+```
+pip install -r requirements.txt
+python duplicate_files_remover.py
+```
+
+## Detailed explanation of script, if needed
+The script crawls a given directory and collects all the files in it. It then generates a SHA-256 hash for every file
+and stores the paths and hashes in a pandas DataFrame hashtable. Finally, it looks for identical hashes in the hashtable
+and deletes every file that shares a hash with another file, keeping only the original copy of each.
diff --git a/Projects/duplicate_files_remover/duplicate_files_remover.py b/Projects/duplicate_files_remover/duplicate_files_remover.py
new file mode 100644
index 0000000..18e5c5a
--- /dev/null
+++ b/Projects/duplicate_files_remover/duplicate_files_remover.py
@@ -0,0 +1,88 @@
+import os
+import sys
+import hashlib
+import pandas as pd
+
+
+# get a list of all files in a directory
+def file_list(folder):
+    path = os.path.abspath(folder)
+    files = [entry.path for entry in os.scandir(path) if entry.is_file()]
+    print(f'[+] Found {len(files)} files in {folder}.')
+
+    return files
+
+
+# calculate the SHA-256 hash of a file, reading it in 64 KiB blocks
+def get_hash(filename):
+    block_size = 65536
+
+    with open(filename, 'rb') as f:
+        m = hashlib.sha256()
+        block = f.read(block_size)
+        while len(block) > 0:
+            m.update(block)
+            block = f.read(block_size)
+        digest = m.hexdigest()
+
+    return digest
+
+
+# hash every file in the list; unreadable files get None so they are
+# never treated as duplicates of each other
+def hashtable(files):
+    if not isinstance(files, list):
+        files = [files]
+
+    hash_identifier = []
+    for f in files:
+        try:
+            hash_identifier.append(get_hash(f))
+        except OSError:
+            hash_identifier.append(None)
+
+    return hash_identifier
+
+
+# crawl through a directory and return the hashes of each file as a
+# pandas dataframe
+def create_hashtable(folder):
+    files = file_list(folder)
+
+    df = pd.DataFrame({'file': files, 'hash': hashtable(files)})
+    print('[+] Generated all hashes.')
+
+    return df
+
+
+# list every duplicate file, keeping the first copy of each hash as
+# the original
+def list_duplicates(folder):
+    duplicates_files = create_hashtable(folder)
+    duplicates_files = duplicates_files.dropna(subset=['hash'])
+    # keep='first' marks every occurrence after the first as a duplicate,
+    # so the original copy of each file is never listed for deletion
+    duplicates_files = duplicates_files[duplicates_files['hash'].duplicated(
+        keep='first')]
+    duplicates_files = duplicates_files.sort_values(by='hash')
+    print(f'[+] Found {len(duplicates_files)} duplicates.\n')
+    print(duplicates_files)
+
+    return duplicates_files
+
+
+if __name__ == '__main__':
+    folder = input('Folder full path (eg: C:/Users/bob/Desktop): ')
+    if not os.path.exists(folder):
+        print('Folder does not exist.')
+        sys.exit(1)
+
+    duplicates = list_duplicates(folder)
+    delete = input('\n[!] Do you want to delete the duplicates (y/n): ')
+    print('\n')
+    if delete.lower() == 'y':
+        for f in duplicates['file'].tolist():
+            os.remove(f)
+            print(f'Deleted {f}')
+    else:
+        print('[X] Exiting...')
diff --git a/Projects/duplicate_files_remover/requirements.txt b/Projects/duplicate_files_remover/requirements.txt
new file mode 100644
index 0000000..3fd8886
--- /dev/null
+++ b/Projects/duplicate_files_remover/requirements.txt
@@ -0,0 +1 @@
+pandas==1.2.0
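
Below is a minimal, self-contained sketch of the hash-then-group idea the script relies on: hash every file with SHA-256, put the paths and hashes in a pandas DataFrame, and treat every row whose hash has already been seen as a deletable copy. The temporary directory and the file names `a.txt`, `b.txt`, `c.txt` are made up for illustration; only `hashlib`, `os`, `tempfile`, and `pandas` are assumed.

```
import hashlib
import os
import tempfile

import pandas as pd


def sha256_of(path, block_size=65536):
    """Stream a file through SHA-256 in fixed-size blocks."""
    m = hashlib.sha256()
    with open(path, 'rb') as f:
        for block in iter(lambda: f.read(block_size), b''):
            m.update(block)
    return m.hexdigest()


# Build a tiny hypothetical directory: two identical files, one unique file.
with tempfile.TemporaryDirectory() as folder:
    for name, text in [('a.txt', 'same'), ('b.txt', 'same'), ('c.txt', 'other')]:
        with open(os.path.join(folder, name), 'w') as f:
            f.write(text)

    files = [entry.path for entry in os.scandir(folder) if entry.is_file()]
    df = pd.DataFrame({'file': files, 'hash': [sha256_of(f) for f in files]})

    # keep='first' flags every copy after the first occurrence of a hash,
    # which is exactly the set of files that is safe to delete.
    to_delete = df[df['hash'].duplicated(keep='first')]
    print(to_delete['file'].tolist())  # only one of a.txt / b.txt appears here
```

The same grouping could be done with `df.groupby('hash')`, but `duplicated(keep='first')` expresses "everything except the first copy" in one step, which is why the script uses that form.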