forked from elisemercury/Duplicate-Image-Finder
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathduplicate_generator.py
More file actions
342 lines (291 loc) · 14.7 KB
/
duplicate_generator.py
File metadata and controls
342 lines (291 loc) · 14.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
"""
Use this file to generate test cases for duplicates. Two modes are supported:
- Partition
- Copy
## Partition Mode.
Partition mode is used to test for dir_a and dir_b. It has a probability to duplicate a file in both directories
and a probability with which a file is moved to the dir_b.
probability of duplication = 0.001 (is evaluated first)
probability of moving to dir_b = 0.5
The files can be moved or symlinked.
## Copy Mode
In Copy Mode, the files are copied with a probability to a secondary directory. The files in the initial directory are
always left in place, leading to a directory of duplicates and a directory of originals.
"""
import os
import random
import warnings
from typing import Tuple
import shutil
import argparse
def remove_prefix(text, prefix):
    """
    Strip a leading prefix from a string.
    :param text: The string to inspect
    :param prefix: The prefix to strip when text starts with it
    :return: text without the leading prefix, or text unchanged if it does not start with prefix
    """
    has_prefix = text.startswith(prefix)
    return text[len(prefix):] if has_prefix else text
def partition(source: str,
              dir_a: str,
              dir_b: str,
              verbose: bool,
              pd: float = 0.001,
              pb: float = 0.5,
              op: str = "MOVE",
              limit: int = 40000) -> Tuple[int, int, int]:
    """
    Partition mode to generate duplicates. Walks the source directory recursively and places every file into
    dir_a, dir_b or both, mirroring the relative directory structure of the source.
    For every file, a duplicate into both partitions is decided first (random.random() < pd); otherwise the file
    goes to dir_b if random.random() < pb and to dir_a if not.
    :param source: The source directory
    :param dir_a: The first directory
    :param dir_b: The second directory
    :param verbose: Verbose output
    :param pd: The probability of duplication
    :param pb: The probability of moving to dir_b
    :param op: The operation to perform (MOVE, COPY, LINK), case-insensitive
    :param limit: The limit of files to process. Scanning stops once either partition holds more than limit files.
        None disables the check.
    :return: Number of files in dir a, number of files in dir b, number of duplicates
    :raises ValueError: If op is not one of MOVE, COPY, LINK
    """
    op = op.upper()
    if op not in ["MOVE", "COPY", "LINK"]:
        raise ValueError(f"Operation {op} not supported")

    abs_src = os.path.abspath(source)
    abs_a = os.path.abspath(dir_a)
    abs_b = os.path.abspath(dir_b)

    def ensure_directory(path: str) -> None:
        """Create the directory (including parents) if it doesn't exist yet."""
        if not os.path.exists(path):
            if verbose:
                print(f"Creating Directory {path}")
            os.makedirs(path)

    def transfer(origin: str, target: str) -> None:
        """Place origin at target using the selected operation."""
        if op == "LINK":
            os.symlink(origin, target)
        elif op == "MOVE":
            shutil.move(origin, target)
        else:
            shutil.copy(origin, target)

    def walk(rel: str, a: int, b: int, d: int) -> Tuple[int, int, int]:
        """
        Process one directory level.
        :param rel: The current directory relative to the source directory
        :param a: Current number of files in partition a
        :param b: Current number of files in partition b
        :param d: Current number of files in both partitions
        :return: Updated number of files in dir a, in dir b and number of duplicates
        """
        cur_src = os.path.join(abs_src, rel)
        cur_a = os.path.join(abs_a, rel)
        cur_b = os.path.join(abs_b, rel)

        for name in os.listdir(cur_src):
            # The limit is checked before every entry, so a partition can exceed it by one file.
            if limit is not None and (a > limit or b > limit):
                return a, b, d

            origin = os.path.join(cur_src, name)

            # If it's a directory, we need to recurse
            if os.path.isdir(origin):
                a, b, d = walk(os.path.join(rel, name), a, b, d)

            elif os.path.isfile(origin):
                in_a = os.path.join(cur_a, name)
                in_b = os.path.join(cur_b, name)

                # Duplicate the file into both partitions
                if random.random() < pd:
                    ensure_directory(cur_a)
                    ensure_directory(cur_b)
                    if verbose:
                        print(f"Duplicate {op} {origin} to {in_a} and {in_b}")
                    if op == "MOVE":
                        # After the move the original is gone, so the second copy is taken from partition a.
                        shutil.move(origin, in_a)
                        shutil.copy(in_a, in_b)
                    else:
                        transfer(origin, in_a)
                        transfer(origin, in_b)
                    a, b, d = a + 1, b + 1, d + 1

                # Only partition b receives the file
                elif random.random() < pb:
                    ensure_directory(cur_b)
                    if verbose:
                        print(f"Partitioning {op} {origin} to {in_b}")
                    transfer(origin, in_b)
                    b += 1

                # Only partition a receives the file
                else:
                    ensure_directory(cur_a)
                    if verbose:
                        print(f"Partitioning {op} {origin} to {in_a}")
                    transfer(origin, in_a)
                    a += 1

            # Neither a directory nor a regular file
            else:
                print(f"Skipping {name}")

        return a, b, d

    return walk("", 0, 0, 0)
def duplicate(src: str,
              dst: str,
              verbose: bool,
              pc: float = 0.5,
              op: str = "COPY",
              limit: int = None) -> Tuple[int, int]:
    """
    Duplicate mode to generate duplicates. Walks the src directory recursively and copies (or symlinks) each file
    with probability pc into dst, mirroring the relative directory structure of src. Files in src are never touched.
    :param src: The directory to get the files from
    :param dst: The directory to place the duplicates into
    :param verbose: Verbose output
    :param pc: The probability of Duplicating. Duplicate iff random.random() < pc
    :param op: The operation to perform (COPY, LINK), case-insensitive
    :param limit: The limit of files to process (max number of duplicates). None means no limit.
    :return: Number of files scanned, number of files duplicated into dst
    :raises ValueError: If op is not one of COPY, LINK
    """
    op = op.upper()
    if op not in ["COPY", "LINK"]:
        raise ValueError(f"Operation {op} not supported")

    abs_src = os.path.abspath(src)
    abs_dst = os.path.abspath(dst)

    def walk(rel: str, scanned: int, copied: int) -> Tuple[int, int]:
        """
        Process one directory level.
        :param rel: The current directory relative to src, replicated below dst
        :param scanned: Number of files scanned so far
        :param copied: Number of files duplicated into dst so far
        :return: Updated number of files scanned, number of files duplicated
        """
        cur_src = os.path.join(abs_src, rel)
        cur_dst = os.path.join(abs_dst, rel)

        for name in os.listdir(cur_src):
            # Stop as soon as the requested number of duplicates exists, so limit is a true maximum.
            if limit is not None and copied >= limit:
                return scanned, copied

            origin = os.path.join(cur_src, name)

            # If it's a directory, we need to recurse
            if os.path.isdir(origin):
                scanned, copied = walk(os.path.join(rel, name), scanned, copied)

            # If it's a file, we may need to duplicate it
            elif os.path.isfile(origin):
                scanned += 1
                if random.random() < pc:
                    target = os.path.join(cur_dst, name)

                    # Create directory in dst
                    if not os.path.exists(cur_dst):
                        if verbose:
                            print(f"Creating Directory {cur_dst}")
                        os.makedirs(cur_dst)

                    if op == "LINK":
                        if verbose:
                            print(f"LINKING {origin} to {target}")
                        os.symlink(origin, target)
                    else:
                        if verbose:
                            print(f"COPYING {origin} to {target}")
                        shutil.copy(origin, target)
                    copied += 1

            # We've got something we don't know.
            else:
                print(f"Skipping {name}")

        # A fully processed directory must hand its counters back to the caller.
        return scanned, copied

    return walk("", 0, 0)
# Command line entry point: parse the arguments, validate them and run the selected mode.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Generate duplicate images")

    # Required
    parser.add_argument("mode", type=str,
                        help="Mode to run. PARTITION distributes the source files over two partitions, "
                             "DUPLICATE places duplicates of the source files into one directory",
                        choices=["PARTITION", "DUPLICATE"])
    parser.add_argument("-o", "--operation", type=str,
                        help="Operation to perform. Can be COPY, LINK and only for partition MOVE",
                        choices=["COPY", "LINK", "MOVE"],
                        default="COPY",
                        required=False)
    parser.add_argument("-s", "--source", type=str,
                        help="Directory to get images from",
                        required=True)
    parser.add_argument("-a", "--partition_a", type=str,
                        help="First Partition in PARTITION mode and destination for duplicates in DUPLICATE mode",
                        required=True)

    # Optionals
    parser.add_argument("-b", "--partition_b", type=str,
                        help="Second Partition in PARTITION")
    parser.add_argument("-d", "--duplication", type=float, default=0.001,
                        help="Probability to generate a duplicate. Uses a random float between 0 and 1. "
                             "A file is duplicated iff random.random() < pc")
    parser.add_argument("-l", "--limit", type=int,
                        help="Limit number of files to process, in PARTITION mode limits the number of files in either "
                             "partition (so if size(part_a) > limit or size(part_b) > limit stop), in DUPLICATE mode "
                             "it limits the number of duplicates generated!")
    parser.add_argument("-p", "--probability_b", type=float, default=0.5,
                        help="Probability of a file going into partition b in PARTITION mode. Goes into Partition B iff "
                             "random.random() < p. Has no effect on DUPLICATE mode")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Print info about every action taken.")

    args = parser.parse_args()

    # Validate arguments. Both probabilities must lie strictly between 0 and 1.
    if not 0 < args.duplication < 1:
        raise ValueError("Duplication value must be between 0 and 1")

    if not 0 < args.probability_b < 1:
        raise ValueError("Probability value must be between 0 and 1")

    if args.mode == "DUPLICATE":
        # MOVE would remove the originals, so it is only offered in PARTITION mode.
        if args.operation == "MOVE":
            raise ValueError("Duplicate mode not supported for MOVE")

        if args.partition_b is not None:
            warnings.warn("Duplicate mode doesn't use partition_b, argument will be ignored")

        # In DUPLICATE mode partition_a is the destination of the duplicates.
        res = duplicate(args.source, args.partition_a, args.verbose, args.duplication, args.operation, args.limit)
        print(f"Scanned {res[0]} files and duplicated {res[1]} files.")
    else:
        if args.partition_b is None:
            raise ValueError("PARTITION mode needs partition_b")

        res = partition(args.source, args.partition_a, args.partition_b, args.verbose,
                        args.duplication, args.probability_b, args.operation, args.limit)
        print(f"Files in Partition A: {res[0]}, Files in Partition B: {res[1]}, Files in both Partitions: {res[2]}")