-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathread_and_hash.py
More file actions
46 lines (37 loc) · 1.49 KB
/
read_and_hash.py
File metadata and controls
46 lines (37 loc) · 1.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import hashlib
import os
def read_and_hash(file_path: str, single_segment_size: int = 512,
                  hash_func=hashlib.sha1, first_segment=False):
    """
    Hash the first segment of a file, or the whole file.

    Files of the same size whose first-segment hashes match are potential
    duplicates; hashing the whole file then confirms or rejects the match.

    Parameters:
    -----------
    file_path: str
        path-like object, the full path of the file to be read.
    single_segment_size: int
        the size in bytes of the segment read when ``first_segment`` is
        true. It is passed straight to ``file.read``.
    hash_func: object
        callable returning a new hash object that provides ``update`` and
        ``hexdigest`` (for example ``hashlib.sha1``).
    first_segment: bool
        when true, only the first segment of the file is read and hashed;
        otherwise the entire file is hashed.

    Returns:
    -------
    str
        The hexadecimal digest of the data that was read.
    """
    # Upper bound on the bytes held in memory at once while hashing a
    # whole file.
    chunk_size = 1024 * 1024

    hashed_object = hash_func()
    with open(file_path, "rb") as f:
        if first_segment:
            hashed_object.update(f.read(single_segment_size))
        else:
            # Read until EOF in bounded chunks instead of loading the file
            # in one call sized by a separate stat: memory use stays
            # constant, and files whose reported size is zero or stale are
            # still hashed completely.
            for chunk in iter(lambda: f.read(chunk_size), b""):
                hashed_object.update(chunk)
    return hashed_object.hexdigest()