forked from TeaWithLucas/Sort-Photos
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sortphotos.py
executable file
·573 lines (465 loc) · 24 KB
/
sortphotos.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
#!/usr/bin/env python
# encoding: utf-8
"""
sortphotos.py
Version 1.3 Alpha
"""
from __future__ import print_function
from __future__ import with_statement
import os
import shutil
import sys
# Prefer the standard-library json module; fall back to simplejson only when
# json is genuinely unavailable (a bare except would also hide unrelated errors).
try:
    import json
except ImportError:
    import simplejson as json
import filecmp
import datetime as dt
import re
import locale
sys.path.insert(0, './exiftool')
import exiftool
# Switch every locale category to the user's default locale (the empty string
# asks for the environment's setting).
locale.setlocale(locale.LC_ALL, '')
# Path of the ExifTool executable, looked up in an 'exiftool-10.82' folder that
# sits next to this script. NOTE(review): the file name is a Windows .exe, so
# other platforms presumably need a different value - confirm.
exiftool_location = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'exiftool-10.82', 'exiftool(-k).exe')
# -------- convenience methods -------------
def parse_date_exif(date_string, disable_time_zone_adjust):
    """
    Extract date info from EXIF data and return it as a naive datetime.

    Accepted layouts:
    YYYY:MM:DD HH:MM:SS
    or YYYY:MM:DD HH:MM:SS+HH:MM
    or YYYY:MM:DD HH:MM:SS-HH:MM
    or YYYY:MM:DD HH:MM:SSZ

    Parameters
    ---------------
    date_string : object
        the raw tag value; it is converted with str() before parsing
    disable_time_zone_adjust : bool
        True to ignore a trailing +HH:MM / -HH:MM offset instead of adding it
        to the parsed time

    Returns
    ---------------
    datetime.datetime or None
        None when the value is empty, has no usable YYYY:MM:DD part, does not
        form a valid calendar date/time, or cannot be formatted by strftime.

    Raises
    ---------------
    ValueError
        when a date, time or offset component is not an integer
    """
    # split into date and time
    elements = str(date_string).strip().split()  # ['YYYY:MM:DD', 'HH:MM:SS']
    if len(elements) < 1:
        return None

    # parse year, month, day
    date_entries = elements[0].split(':')  # ['YYYY', 'MM', 'DD']

    # check if three entries, nonzero data, and no decimal (which occurs for
    # timestamps with only time but no date)
    if len(date_entries) != 3 or not date_entries[0] > '0000' or '.' in ''.join(date_entries):
        return None
    year = int(date_entries[0])
    month = int(date_entries[1])
    day = int(date_entries[2])

    # parse hour, min, second
    time_zone_offset = None
    hour = 12  # defaulting to noon if no time data provided
    minute = 0
    second = 0

    if len(elements) > 1:
        time_entries = re.split(r'(\+|-|Z)', elements[1])  # ['HH:MM:SS', '+', 'HH:MM']
        time = time_entries[0].split(':')  # ['HH', 'MM', 'SS']

        if len(time) == 3:
            hour = int(time[0])
            minute = int(time[1])
            second = int(time[2].split('.')[0])  # drop fractional seconds
        elif len(time) == 2:
            hour = int(time[0])
            minute = int(time[1])

        # adjust for time-zone if needed
        if len(time_entries) > 2:
            time_zone = time_entries[2].split(':')  # ['HH', 'MM']

            if len(time_zone) == 2:
                time_zone_offset = dt.timedelta(hours=int(time_zone[0]), minutes=int(time_zone[1]))
                # a '-' sign applies to the whole offset, minutes included
                if time_entries[1] == '-':
                    time_zone_offset = -time_zone_offset

    # form date object
    try:
        date = dt.datetime(year, month, day, hour, minute, second)
    except ValueError:
        return None  # errors in time format

    # try converting it (some "valid" dates are way before 1900 and cannot be
    # parsed by strftime later)
    try:
        date.strftime('%Y/%m-%b')  # any format with year, month, day, would work here.
    except ValueError:
        return None  # errors in time format

    # adjust for time zone if necessary
    if not disable_time_zone_adjust and time_zone_offset is not None:
        date += time_zone_offset

    return date
def get_prioritized_timestamp(data, prioritized_groups, prioritized_tags, additional_groups_to_ignore,
                              additional_tags_to_ignore, disable_time_zone_adjust=False):
    """
    Find a timestamp using the user's prioritized tags first, then prioritized groups.

    Parameters
    ---------------
    data : dict
        metadata of one file; must contain the key 'SourceFile'
    prioritized_groups : list(str) or None
        group prefixes, tried in order; a key belongs to a group when it starts with the prefix
    prioritized_tags : list(str) or None
        tag names, tried in order; a key matches when the tag is a substring of it
    additional_groups_to_ignore : list(str)
        forwarded to get_oldest_timestamp
    additional_tags_to_ignore : list(str)
        forwarded to get_oldest_timestamp
    disable_time_zone_adjust : bool
        forwarded to get_oldest_timestamp

    Returns
    ---------------
    tuple
        (src_file, date, keys) for the first tag or group that yields a date;
        (src_file, None, []) when nothing matches.
    """
    # save src file
    src_file = data['SourceFile']

    def oldest_within(subdata):
        # re-use get_oldest_timestamp on a slice of the metadata; it needs SourceFile
        subdata['SourceFile'] = src_file
        return get_oldest_timestamp(subdata, additional_groups_to_ignore,
                                    additional_tags_to_ignore, disable_time_zone_adjust)

    # start with tags as they are more specific
    for tag in prioritized_tags or []:
        # dict.items() works on both Python 2 and 3 (iteritems() is Python 2 only)
        subdata = {key: value for key, value in data.items() if tag in key}
        if not subdata:
            continue
        src_file, date, keys = oldest_within(subdata)
        if date:
            # return as soon as a match is found
            return src_file, date, keys

    # if no matching tags are found, look for matching groups
    for group in prioritized_groups or []:
        subdata = {key: value for key, value in data.items() if key.startswith(group)}
        if not subdata:
            continue
        src_file, date, keys = oldest_within(subdata)
        if date:
            return src_file, date, keys

    # reaching here means no matches were found
    return src_file, None, []
def get_oldest_timestamp(data, additional_groups_to_ignore, additional_tags_to_ignore, disable_time_zone_adjust=False,
                         print_all_tags=False):
    """
    Return the oldest plausible timestamp found in one file's metadata.

    Parameters
    ---------------
    data : dict
        data as dictionary from json. Should contain only time stamps except SourceFile
    additional_groups_to_ignore : list(str) or None
        groups (the part of a key before the first ':') to skip, in addition to 'ICC_Profile'
    additional_tags_to_ignore : list(str) or None
        full keys to skip, in addition to 'SourceFile' and 'XMP:HistoryWhen'
    disable_time_zone_adjust : bool
        forwarded to parse_date_exif
    print_all_tags : bool
        True to print every key/value pair that is considered

    Returns
    ---------------
    tuple
        (src_file, oldest_date, oldest_keys); oldest_date is None and oldest_keys is
        empty when no value parses to a date after 1901-01-01 and before the current time.
        Keys containing 'GPS' are always skipped.
    """
    # save only the oldest date; "now" is the starting upper bound, so dates that
    # are not in the past are never selected
    date_available = False
    oldest_date = dt.datetime.now()
    sanity_date = dt.datetime(year=1901, month=1, day=1)
    oldest_keys = []

    # save src file
    src_file = data['SourceFile']

    # setup tags to ignore (None is accepted as "nothing additional")
    ignore_groups = ['ICC_Profile'] + list(additional_groups_to_ignore or [])
    ignore_tags = ['SourceFile', 'XMP:HistoryWhen'] + list(additional_tags_to_ignore or [])

    if print_all_tags:
        print('All relevant tags:')

    # run through all keys
    for key, date in data.items():
        # check if this key needs to be ignored
        if key in ignore_tags or key.split(':')[0] in ignore_groups or 'GPS' in key:
            continue

        if print_all_tags:
            print(str(key) + ', ' + str(date))

        # (rare) multiple dates returned in a list: use the first entry
        if isinstance(date, list):
            if not date:
                continue  # an empty list carries no timestamp
            date = date[0]

        # deliberately broad: poorly formed exif data must not stop the run
        try:
            exifdate = parse_date_exif(date, disable_time_zone_adjust)
        except Exception:
            exifdate = None

        if not exifdate or exifdate <= sanity_date:
            continue

        if exifdate < oldest_date:
            date_available = True
            oldest_date = exifdate
            oldest_keys = [key]
        elif exifdate == oldest_date:
            oldest_keys.append(key)

    if not date_available:
        oldest_date = None

    if print_all_tags:
        print()

    return src_file, oldest_date, oldest_keys
def check_for_early_morning_photos(date, day_begins):
    """
    Shift a timestamp taken before `day_begins` back onto the previous calendar day.

    The shift subtracts (hour + 1) hours, so the returned time falls in the 23rd
    hour of the previous day. A timestamp at or after `day_begins` is returned unchanged.
    """
    if date.hour >= day_begins:
        return date

    print('moving this photo to the previous day for classification purposes (day_begins=' + str(day_begins) + ')')
    hours_back = date.hour + 1  # one hour past midnight is enough to land on the day before
    return date - dt.timedelta(hours=hours_back)
def _list_source_files(src_dir, recursive):
    """Return the UTF-8 encoded paths of the files in src_dir (sub-directories only if recursive)."""
    found = []
    for root, _sub_dirs, sub_files in os.walk(src_dir):
        for name in sub_files:
            found.append(os.path.join(root, name).encode('utf-8'))
        if not recursive:
            break  # os.walk yields src_dir itself first; stop before descending
    return found


def sortPhotos(src_dir, dest_dir, sort_format, rename_format, recursive=False, day_begins=0, copy_files=False,
               test=False, remove_duplicates=True, keep_filename=False, verbose=True, disable_time_zone_adjust=False,
               ignore_file_types=[], additional_groups_to_ignore=['File'], additional_tags_to_ignore=[],
               use_only_groups=None, use_only_tags=None, rename_with_camera_model=False, show_warnings=True,
               src_file_regex=None, src_file_extension=[], prioritize_groups=None, prioritize_tags=None):
    """
    This function is a convenience wrapper around ExifTool based on common usage scenarios for sortphotos.py

    Note: the list defaults in the signature are never mutated by this function.

    Parameters
    ---------------
    src_dir : str
        directory containing files you want to process
    dest_dir : str
        directory where you want to move/copy the files to
    sort_format : str
        date format code for how you want your photos sorted
        (https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior)
    rename_format : str
        date format code for how you want your files renamed
        (https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior)
        None to not rename file
    recursive : bool
        True if you want src_dir to be searched recursively for files (False to search only in top-level of src_dir)
    copy_files : bool
        True if you want files to be copied over from src_dir to dest_dir rather than moved
    test : bool
        True if you just want to simulate how the files will be moved without actually doing any moving/copying
    ignore_file_types : list(str)
        file types to be ignored. By default, hidden files (.*) are ignored
    remove_duplicates : bool
        True to skip a file when an identical file (per filecmp.cmp) already exists at the destination name
    keep_filename : bool
        True to append original filename in case of duplicates instead of increasing number
    disable_time_zone_adjust : bool
        True to disable time zone adjustments
    day_begins : int
        what hour of the day you want the day to begin (only for classification purposes). Defaults at 0 as midnight.
        Can be used to group early morning photos with the previous day. must be a number between 0-23
    additional_groups_to_ignore : list(str)
        tag groups that will be ignored when searching for file data. By default File is ignored
    additional_tags_to_ignore : list(str)
        specific tags that will be ignored when searching for file data.
    use_only_groups : list(str)
        a list of groups that will be exclusively searched across for date info
        (accepted for interface compatibility; not used by this implementation)
    use_only_tags : list(str)
        a list of tags that will be exclusively searched across for date info
        (accepted for interface compatibility; not used by this implementation)
    rename_with_camera_model : bool
        True if you want to append the camera model or brand name to the renamed file. Does nothing if rename_format
        is None. (MHB: added this)
    show_warnings : bool
        True if you want to see warnings
    src_file_regex: str
        pick your source file using regex (accepted; not used by this implementation)
    src_file_extension: list(str)
        Limit your script to process only specific files (accepted; not used by this implementation)
    prioritize_groups : list(str)
        a list of groups that will be prioritized for date info
    prioritize_tags : list(str)
        a list of tags that will be prioritized for date info
    verbose : bool
        True if you want to see details of file processing

    Raises
    ---------------
    Exception
        when src_dir does not exist
    """
    # some error checking
    if not os.path.exists(src_dir):
        raise Exception('Source directory does not exist')

    files = _list_source_files(src_dir, recursive)

    print('Preprocessing with ExifTool. May take a while for a large number of files.')
    print(files)

    # get all metadata
    # NOTE(review): paths are handed over as UTF-8 bytes; presumably that is what
    # the bundled exiftool wrapper expects - confirm.
    with exiftool.ExifTool(exiftool_location) as et:
        metadata = et.get_metadata_batch(files)

    # setup output to screen
    num_files = len(metadata)
    print("number of files:", num_files)

    # in test mode: simulated destination -> source file that would be placed there
    test_file_dict = {}

    # upper-cased once here instead of once per file
    ignored_types = {file_type.upper() for file_type in (ignore_file_types or [])}

    # track files modified/skipped
    files_modified = []
    files_skipped = []

    # parse output extracting oldest relevant date
    for idx, data in enumerate(metadata):

        # extract timestamp date for photo
        date = None
        if prioritize_groups or prioritize_tags:
            src_file, date, keys = get_prioritized_timestamp(data, prioritize_groups, prioritize_tags,
                                                             additional_groups_to_ignore, additional_tags_to_ignore,
                                                             disable_time_zone_adjust)
        # fall back to the oldest timestamp over all tags
        if not date:
            src_file, date, keys = get_oldest_timestamp(data, additional_groups_to_ignore, additional_tags_to_ignore,
                                                        disable_time_zone_adjust)

        if verbose:
            # write out which photo we are at
            ending = ']'
            if test:
                ending = '] (TEST - no files are being moved/copied)'
            print('[' + str(idx + 1) + '/' + str(num_files) + ending)
            print('Source: ' + src_file)
        else:
            # progress bar
            numdots = int(20.0 * (idx + 1) / num_files)
            sys.stdout.write('\r')
            sys.stdout.write('[%-20s] %d of %d ' % ('=' * numdots, idx + 1, num_files))
            sys.stdout.flush()

        # check if no valid date found
        if not date:
            if show_warnings:
                print('No valid dates were found using the specified tags. File will remain where it is.')
                print()
            files_skipped.append(src_file)
            continue

        # ignore hidden files
        if os.path.basename(src_file).startswith('.'):
            print('hidden file. will be skipped')
            print()
            files_skipped.append(src_file)
            continue

        # ignore specified file extensions
        fileextension = os.path.splitext(src_file)[-1].upper().replace('.', '')
        if fileextension in ignored_types:
            print(fileextension + ' files ignored. will be skipped')
            print()
            files_skipped.append(src_file)
            continue

        if verbose:
            print('Date/Time: ' + str(date))
            print('Corresponding Tags: ' + ', '.join(keys))

        # early morning photos can be grouped with previous day (depending on user setting)
        date = check_for_early_morning_photos(date, day_begins)

        # create folder structure ('/' separates levels regardless of OS)
        dest_file = dest_dir
        for thedir in date.strftime(sort_format).split('/'):
            dest_file = os.path.join(dest_file, thedir)
            if not os.path.exists(dest_file) and not test:
                os.makedirs(dest_file)

        # rename file if necessary
        filename = os.path.basename(src_file)

        if rename_format is not None:
            _, ext = os.path.splitext(filename)
            # MHB: get camera model (else camera brand) and add to file name if specified
            camera_model = data.get('EXIF:Model', None)
            if not camera_model:
                camera_model = data.get('EXIF:Make', None)

            filename = date.strftime(rename_format)
            if rename_with_camera_model and camera_model:
                # keep only letters and digits of the model name
                filename += '_' + ''.join([c for c in camera_model if c.isalnum()])
            filename += ext.lower()

        # setup destination file
        dest_file = os.path.join(dest_file, filename)
        root, ext = os.path.splitext(dest_file)

        if verbose:
            name = 'Destination '
            if copy_files:
                name += '(copy): '
            else:
                name += '(move): '
            print(name + dest_file)

        # check for collisions
        append = 1
        fileIsIdentical = False

        while True:
            name_taken = (not test and os.path.isfile(dest_file)) or (test and dest_file in test_file_dict)
            if not name_taken:
                break

            # in test mode compare against the source that would have been placed there
            if test:
                dest_compare = test_file_dict[dest_file]
            else:
                dest_compare = dest_file

            if remove_duplicates and filecmp.cmp(src_file, dest_compare):  # check for identical files
                fileIsIdentical = True
                if show_warnings:
                    print("Identical file already exists. Duplicate will be ignored.\n"
                          "Source: " + src_file + "\n"
                          "Dest: " + dest_file)
                break

            # name is same, but file is different
            if keep_filename:
                orig_filename = os.path.splitext(os.path.basename(src_file))[0]
                dest_file = root + '_' + orig_filename + '_' + str(append) + ext
            else:
                dest_file = root + '_' + str(append) + ext
            append += 1
            if show_warnings:
                print('Same name already exists...renaming to: ' + dest_file)

        # finally move or copy the file
        if test:
            test_file_dict[dest_file] = src_file

        if fileIsIdentical:
            files_skipped.append(src_file)
            continue  # ignore identical files

        files_modified.append(dest_file)
        if not test:
            if copy_files:
                shutil.copy2(src_file, dest_file)
            else:
                if not os.path.exists(src_file):
                    print(os.path.dirname(src_file))
                shutil.move(src_file, dest_file)

        if verbose:
            print()

    if not verbose:
        print()

    print('Files modified (' + str(len(files_modified)) + '): ')
    for modified in files_modified:
        print('\t' + str(modified))
    print('Files skipped (' + str(len(files_skipped)) + '): ')
    for skipped in files_skipped:
        print('\t' + str(skipped))
def main():
    """Parse the command line and run sortPhotos with the corresponding keyword arguments."""
    import argparse

    # setup command line parsing
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description='Sort files (primarily photos and videos) into folders by date\nusing EXIF and other metadata')
    parser.add_argument('src_dir', type=str, help='source directory')
    parser.add_argument('dest_dir', type=str, help='destination directory')
    parser.add_argument('-r', '--recursive', action='store_true', help='search src_dir recursively', default=False)
    parser.add_argument('-c', '--copy', action='store_true', help='copy files instead of move', default=False)
    parser.add_argument('-s', '--silent', action='store_true', help='don\'t display parsing details.', default=False)
    parser.add_argument('-t', '--test', action='store_true',
                        help='run a test. files will not be moved/copied\n'
                             'instead you will just get a list of what would happen',
                        default=False)
    parser.add_argument('-z', '--disable-time-zone-adjust', action='store_true',
                        help='disables time zone adjust\n'
                             'useful for devices that store local time + time zone instead of UTC + time zone',
                        default=False)
    # '%%' is required because argparse applies %-formatting to help strings
    parser.add_argument('--sort', type=str, default='%Y/%m-%b',
                        help="choose destination folder structure using datetime format \n"
                             "https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior. \n"
                             "Use forward slashes / to indicate subdirectory(ies) (independent of your OS convention). \n"
                             "The default is '%%Y/%%m-%%b', which separates by year then month \n"
                             "with both the month number and name (e.g., 2012/02-Feb).")
    parser.add_argument('--rename', type=str, default=None,
                        help="rename file using format codes \n"
                             "https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior. \n"
                             "default is None which just uses original filename")
    parser.add_argument('--keep-filename', action='store_true',
                        help='In case of duplicated output filenames an increasing number and the original file name will be appended',
                        default=False)
    parser.add_argument('--ignore-file-types', type=str, nargs='+', default=[],
                        help="ignore file types\n"
                             "default is to only ignore hidden files (.*)")
    parser.add_argument('--keep-duplicates', action='store_true',
                        help='If file is a duplicate keep it anyway (after renaming).', default=False)
    parser.add_argument('--day-begins', type=int, default=0,
                        help='hour of day that new day begins (0-23), \n'
                             'defaults to 0 which corresponds to midnight. Useful for grouping pictures with previous day.')
    parser.add_argument('--ignore-groups', type=str, nargs='+', default=[],
                        help="a list of tag groups that will be ignored for date informations.\n"
                             "list of groups and tags here: http://www.sno.phy.queensu.ca/~phil/exiftool/TagNames/\n"
                             "by default the group 'File' is ignored which contains file timestamp data")
    parser.add_argument('--ignore-tags', type=str, nargs='+', default=[],
                        help='a list of tags that will be ignored for date informations.\n'
                             'list of groups and tags here: http://www.sno.phy.queensu.ca/~phil/exiftool/TagNames/\n'
                             'the full tag name needs to be included (e.g., EXIF:CreateDate)')
    parser.add_argument('--use-only-groups', type=str, nargs='+', default=None,
                        help='specify a restricted set of groups to search for date information\n'
                             'e.g., EXIF')
    parser.add_argument('--use-only-tags', type=str, nargs='+', default=None,
                        help='specify a restricted set of tags to search for date information\n'
                             'e.g., EXIF:CreateDate')
    parser.add_argument('--prioritize-groups', type=str, nargs='+', default=None,
                        help='specify a prioritized set of groups to search for date information\n'
                             'e.g., EXIF File')
    parser.add_argument('--prioritize-tags', type=str, nargs='+', default=None,
                        help='specify a prioritized set of tags to search for date information\n'
                             'e.g., EXIF:CreateDate EXIF:ModifyDate')
    # MHB
    parser.add_argument('--rename-with-camera-model', action='store_true', default=False,
                        help='append the camera model or brand name to the renamed file.')
    # NOTE(review): default=True combined with store_true means this flag can
    # never switch warnings off - confirm the intended behaviour.
    parser.add_argument('-w', '--show-warnings', default=True, action='store_true', help='display warnings.')
    # sasi07eee
    parser.add_argument('-x', '--src-file-regex', type=str, default=None, help='source file regular expression')
    parser.add_argument('-e', '--src-file-extension', type=str, default=[], nargs='+',
                        help='source file extension(s) (space separated)')

    # parse command line arguments
    temp_args = parser.parse_args()

    # the help text promises an hour of the day; reject anything else up front
    if not 0 <= temp_args.day_begins <= 23:
        parser.error('--day-begins must be an hour between 0 and 23')

    # map command-line names onto the sortPhotos keyword arguments
    args = {
        'src_dir': temp_args.src_dir,
        'dest_dir': temp_args.dest_dir,
        'sort_format': temp_args.sort,
        'rename_format': temp_args.rename,
        'recursive': temp_args.recursive,
        'day_begins': temp_args.day_begins,
        'copy_files': temp_args.copy,
        'test': temp_args.test,
        'remove_duplicates': not temp_args.keep_duplicates,
        'verbose': not temp_args.silent,
        'disable_time_zone_adjust': temp_args.disable_time_zone_adjust,
        'ignore_file_types': temp_args.ignore_file_types,
        'additional_groups_to_ignore': temp_args.ignore_groups,
        'additional_tags_to_ignore': temp_args.ignore_tags,
        'use_only_groups': temp_args.use_only_groups,
        'use_only_tags': temp_args.use_only_tags,
        'rename_with_camera_model': temp_args.rename_with_camera_model,
        'show_warnings': temp_args.show_warnings,
        'src_file_regex': temp_args.src_file_regex,
        'src_file_extension': temp_args.src_file_extension,
        'prioritize_groups': temp_args.prioritize_groups,
        'prioritize_tags': temp_args.prioritize_tags,
        'keep_filename': temp_args.keep_filename,
    }

    sortPhotos(**args)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()