File: tagpix/tagpix.py
#!/usr/bin/python """ ========================================================================== tagpix - combine your photos for easy viewing and archiving. Website: https://learning-python.com/tagpix.html Author: © M. Lutz (http://learning-python.com) 2013-2020. License: provided freely, but with no warranties of any kind. Summary: This script, run with console inputs and no command-line arguments, organizes the content of multiple camera-card or other photo-collection folders for fast and convenient access. It: - Transfers all the photos in an entire folder tree to a single folder - Renames photos with date-taken prefixes for uniqueness and sorts - Discards duplicate content and makes duplicate filenames unique - Transfers any movies and other files in the tree to their own folders - Optionally groups merged items of all types into by-year subfolders - Transfers files by either moves or copies (with optional deletes) The result is a single folder that combines all your photos in one location. tagpix runs with either Python 3.X or 2.X, and on all major platforms - including Windows, Mac OS, Linux, and Android. To download this program, visit its website (above). For configuration settings, see file user_configs.py. For the complete story on this program's roles and usage, open this package's UserGuide.html. Versions: 2.3, Sep 2020 - patched to silence spurious Pillow DOS warning 2.2, Jun 2020 - repackaged with documentation changes only 2.2, Dec 2018 - use+drop redundant dates in Android photo filenames 2.1, Oct 2018 - copy modes, more dups, folder skips, verify deletes 2.0, Oct 2017 - year groups, list-only, dup skips, mime, console See release notes in UserGuide.html for complete change logs. *CAUTION*: By design, this script's default operation moves and renames all photos and other files in an entire source folder tree. No automated method for undoing the changes it makes is provided, and no warranty is included with this program. 
Although tagpix has been tested and used successfully on large photo collections, please read all usage details in UserGuide.html carefully before running it on yours. It is strongly recommended to preview changes with list-only mode before applying them; and either run tagpix on a temporary copy of your source folder tree, or enable its copy-only transfer mode in file user_configs.py to avoid source-tree changes. [All other usage and version documentation now resides in UserGuide.html] ========================================================================== """ from __future__ import print_function # py 2.X import os, sys, pprint, datetime, time, mimetypes, shutil, re # Exif tag extraction. # Uses Pillow/PIL; EXIF alternative failed for more files in testing. # from PIL import Image from PIL.ExifTags import TAGS # tag #id => name #import EXIF # [2.3] Sep-2020: silence a harmless but excessive Pillow-library warning # now issued stupidly for all large images. This includes perfectly valid # 108MP images shot on a Note20 Ultra smartphone, among other >89M image # devices. This also impacted thumbspage, shrinkpix, and PyPhoto, requiring # program rereleases - a typical open-source-agenda result, and an example # of the pitfalls of "batteries included" development. Fix, please. # More complete coverage (and diatribe): UserGuide.html#pillowdoswarning. # Update: Pillow makes this an error _exception_ at limit*2: disable too. # Image.MAX_IMAGE_PIXELS = None # stop both warning, and error at limit*2 # in case the preceding fails if hasattr(Image, 'DecompressionBombWarning'): # not until 2014+ Pillows import warnings warnings.simplefilter('ignore', Image.DecompressionBombWarning) # Py 2.X compatibility. # This and the __future__ import above are all that differ for 2.X. # 3.X's re.ASCII is defined in 2.X so it can be named in flags (0=none). # Update: the first is now moot, given the input() stderr redef below. 
#
if sys.version_info[0] == 2:
    #input = raw_input       # don't eval() input string
    re.ASCII = 0             # no-op in 2.X (has re.UNICODE, not re.ASCII)


# A few globals in this file.
#
sepln = '-' * 80             # report-section separator
tracemore = False            # show extra program-trace output?
workdir = '.'                # location of default source and destination folders


# [2.1] Get user configurations: more easily changed than this script's code.
# See user_configs.py for more on these options and settings.
#
from user_configs import IgnoreFoldersPattern    # folder-skip names regex
from user_configs import CopyInsteadOfMove       # copy-only or copy-and-delete modes
from user_configs import DeleteAfterCopy         # copy-and-delete mode (both True)

# [2.2] additions
from user_configs import UseAndroidFilenameDates            # use when no Exif date tag?
from user_configs import DropAndroidFilenameDates           # drop redundant dates?
from user_configs import KeepDifferingAndroidFilenameDates  # drop iff tagpix==Android?


# All patterns below are raw strings: '\d' and '\.' are regex escapes, not
# Python string escapes, and non-raw spellings trigger invalid-escape
# warnings in newer Pythons.  The compiled patterns are unchanged.

# prior-run-date pattern, compile just once (e.g, '2017-10-13__2017-10-13__xxx')
dupprefixpattern = re.compile(r'(?:\d{4}-\d{2}-\d{2}__){2}', re.ASCII)

# [2.1] folder-skips pattern (in user_configs.py), precompile string for speed
ignorefolderspattern = re.compile(IgnoreFoldersPattern)

# [2.2] redundant dates pattern (e.g., '2018-02-05__20180205_154910.jpg')
redundantdatepattern = re.compile(r'(\d{4}-\d{2}-\d{2})__(\d{8})_\d{6}\..*')

# [2.2] android dates pattern, pre-tagpix (e.g., '20180205_154910.jpg')
filenamedatepattern = re.compile(r'(\d{8})_\d{6}\..*')


# Newer camera video types.
# Not hardcoded in py module, but may come from local files on some platforms
# even if not set here.  E.g., on Mac OS 10.11, the module auto-loads types from
# /etc/apache2/mime.types; on Windows, it tries the registry's MIME database.
# Some cameras save AVCHD videos as '.mts', which may map to MIME model/vnd.mts.
#
mimetypes.add_type('video/mp2t', '.mts')      # need video/ here
mimetypes.add_type('video/mp2t', '.m2ts')     # ditto
mimetypes.add_type('video/3gpp', '.3gp')      # or auto-loaded (probably)


# Route input() prompts to stderr.
# This allows normal stdout prints to be redirected to a file or pipe.
# Also make sure to flush stdout so Unix can watch with a 'tail -f'.
# [2.3] User-friendly exit on ctrl-c at prompt, not exception trace.
#
def input(prompt):
    """
    Prompt on stderr, so stdout report can be piped to a file.

    Returns the line read from stdin without its trailing newline ('' on
    Enter or end-of-file).  On ctrl-c at the prompt, prints a notice and
    exits the script with status 0.
    """
    if sys.stderr.isatty():
        sys.stderr.write(prompt)              # no eoln at console
    else:
        sys.stderr.write(prompt + '\n')       # else eoln (e.g., PyEdit)
    sys.stderr.flush()                        # always: prompt must show before the read
    try:
        return sys.stdin.readline().rstrip('\n')
    except KeyboardInterrupt:
        print('\nScript not run: no changes made.')   # [2.3] friendly exit
        sys.exit(0)


builtin_print = print

def print(*pargs, **kargs):
    """Same as the built-in print(), but always flushes stdout afterwards."""
    builtin_print(*pargs, **kargs)
    sys.stdout.flush()                        # flush=True only in some Py 3.Xs


#=========================================================================
# Get run parameters from console
#=========================================================================


def yes(prompt):
    """Ask a question; return True only if the reply starts with 'y' or 'Y'."""
    reply = input(prompt + ' ')
    return reply.lower()[:1] == 'y'           # Enter=no


# [2.1] say copy if copy-only, but moves unchanged
copyonly = CopyInsteadOfMove and not DeleteAfterCopy
xfermode = 'move' if not copyonly else 'copie'    # + 's' or 'd' in prompts
xferverb = xfermode.replace('ie', 'y')            # 'move' or 'copy'

# don't run accidentally (e.g., clicks)
if not yes('tagpix renames and %ss photos to a merged folder; proceed?' % xfermode):
    print('Script not run: no changes made.')
    sys.exit(0)

# from dir
SourceDir = input('Source - pathname of folder with photos to be %sd? ' % xfermode)
if not SourceDir:
    SourceDir = os.path.join(workdir, 'SOURCE')   # prior/default: copy here

# [2.3] now done asap: verify from-dir
if not os.path.isdir(SourceDir):
    print('Script not run: source folder does not exist, no changes made.')
    sys.exit(0)

# to dir
destdir = input('Destination - pathname of folder to %s items to? ' % xferverb)
if not destdir:
    destdir = workdir

# target dirs (unknowns folder dropped)
FlatPhotoDir = os.path.join(destdir, 'MERGED', 'PHOTOS')
FlatMovieDir = os.path.join(destdir, 'MERGED', 'MOVIES')
FlatOtherDir = os.path.join(destdir, 'MERGED', 'OTHERS')

# group into by-year subdirs?
YearFolders = yes('Group items into by-year subfolders?')

# show target names but don't rename/move
ListOnly = yes('List only: show target names, but do not rename or %s?' % xferverb)


#=========================================================================
# Initial setup
#=========================================================================


def configdirs():
    """
    ----------------------------------------------------------------------
    Verify input folder, create or clean (optionally) output folders.

    Does nothing in list-only mode.  Otherwise, each of the three output
    folders is created if missing (exiting the script if that fails), or,
    if it exists and is non-empty, emptied only after the user confirms
    twice at the console.
    ----------------------------------------------------------------------
    """
    # verify from-dir - now done earlier [2.3]
    # if not os.path.isdir(SourceDir):
    #     print('Not run: source folder does not exist.')
    #     sys.exit()

    # make no changes in list-only mode
    if ListOnly:
        return

    # make or empty to-dirs
    for subdir in (FlatPhotoDir, FlatMovieDir, FlatOtherDir):
        if not os.path.exists(subdir):
            try:
                os.makedirs(subdir)           # all path items, as needed
            except Exception:                 # not bare: let ctrl-c and exits through
                print('Script not run: cannot make an output folder, no images changed.')
                sys.exit()
        else:
            if (len(os.listdir(subdir)) >= 1                            # even if just a .DS_Store
                    and yes('Delete all prior-run outputs in "%s"?' % subdir)
                    and yes('....About to delete: ARE YOU SURE?')):     # [2.1] verify!
                for tempname in os.listdir(subdir):
                    temppath = os.path.join(subdir, tempname)
                    if os.path.isfile(temppath):
                        os.remove(temppath)       # simple photo or other file
                    else:
                        shutil.rmtree(temppath)   # else a year subfolder


#=========================================================================
# Analysis phase
#=========================================================================


def isMovieFileName(filename):
    """
    ----------------------------------------------------------------------
    Detect videos by filename extension's mimetype (not hardcoded set).
    Returns True if the guessed MIME type's main type is 'video'.
    ----------------------------------------------------------------------
    """
    mimetype = mimetypes.guess_type(filename)[0]      # (type?, encoding?)
    return (mimetype is not None and
            mimetype.split('/')[0] == 'video')        # e.g., 'video/mpeg'


def isExifImageFileName(filename):
    """
    ----------------------------------------------------------------------
    Detect images by filename extension's mimetype (not hardcoded set).
    This currently is True for JPEGs and TIFFs (of any extension type),
    because these are the only image types defined to contain Exif tags.
    Hence, these are considered 'photos' by tagpix; others go to OTHERS.
    ----------------------------------------------------------------------
    """
    exiftypes = ['jpeg', 'tiff']                      # of any extension
    mimetype = mimetypes.guess_type(filename)[0]      # (type?, encoding?)
    return (mimetype is not None and
            mimetype.split('/')[0] == 'image' and     # e.g., 'image/jpeg'
            mimetype.split('/')[1] in exiftypes)      # type does exif tags?


def getExifTags(filepath):
    """
    ----------------------------------------------------------------------
    Collect image-file metadata in new dict, if any (PIL code + try+if).
    Returns {name: value} holding all Exif tags in image, and uses the
    TAGS table in PIL (Pillow) to map tag numeric ids to mnemonic names.

    Never raises for an unusable image: any error is reported on stdout
    and whatever tags were collected so far (normally none) are returned.
    The image is closed explicitly before returning, rather than relying
    on garbage collection, as done for files in samecontent().
    ----------------------------------------------------------------------
    """
    nametoval = {}
    image = None
    try:
        image = Image.open(filepath)
        info = image._getexif()                       # not all have Exif tags
        if info is None:
            raise LookupError('No tags found')        # else items() bombs
        for tag, value in info.items():               # for all tags in photo file
            decoded = TAGS.get(tag, tag)              # map tag's numeric id to name
            nametoval[decoded] = value                # or use id if not in table
    except Exception as E:
        print('***Unusable Exif tags skipped: "%s" for' % E, filepath)
    finally:
        closer = getattr(image, 'close', None)        # absent if open failed or old PIL
        if closer is not None:
            try:
                closer()
            except Exception:
                pass                                  # best effort: tags already read
    return nametoval


def looksLikeDate(datestr):
    """
    ----------------------------------------------------------------------
    Return true if datestr seems to be a valid date.  datestr is a string
    of form "YYYYMMDD".  If it is a reasonable date, returns a tuple of
    3 ints (YYYY, MM, DD), which is true; else returns False.

    This is used on filename dates after pattern matching, to discount
    unrelated strings that have a date-like structure coincidentally.
    It is assumed that tagpix probably won't be widely used after 2100...
    The day is only range-checked (1..31), not validated per month.
    ----------------------------------------------------------------------
    """
    assert len(datestr) == 8 and datestr.isdigit()    # callers pass regex '\d{8}' groups
    year, month, day = [int(x) for x in (datestr[0:4], datestr[4:6], datestr[6:8])]
    if ((1900 <= year <= 2100) and
            (1 <= month <= 12) and
            (1 <= day <= 31)):
        return (year, month, day)
    else:
        return False


def getFileNameDate(filename):
    """
    ----------------------------------------------------------------------
    Get an Android-style date from a photo's filename itself, if any.
    Used for images with no Exif tags, or Exifs but no date-taken tag.
    The former can happen for Android photos edited in tools that drop
    all tags; the latter can happen in Samsung front (selfie) cameras
    that record no date-taken tag (probably a temp bug, but widespread).

    In general, tries tags, then Android filenames, then file moddate.
    looksLikeDate() tries to avoid false positives, but is heuristic.

    Returns a 'yyyy-mm-dd' string, or None if this option is disabled in
    user configs or the filename holds no plausible date.
    ----------------------------------------------------------------------
    """
    filenamedate = None
    if UseAndroidFilenameDates:                       # enabled in user configs?
        match = filenamedatepattern.match(filename)   # "yyyymmdd_hhmmss.*"?
        if match:
            datepart = match.group(1)
            validate = looksLikeDate(datepart)        # date str is valid date?
            if validate:
                year, month, day = validate
                filenamedate = '%4d-%02d-%02d' % (year, month, day)
    return filenamedate


def getFileModDate(filepath):
    """
    ----------------------------------------------------------------------
    Get any file's modification-date string, or a default if unavailable.
    This is used as last resort tagpix date if there is no Exif or Android
    filename date, and reflects either file creation if the file was not
    edited, or else the most-recent edit.  Note that getctime() creation
    date is not used, because it is dependent on both operating system and
    filesystem, is generally unavailable on Unix, and may be irrelevant.

    Returns 'yyyy-mm-dd', or 'unknown' if the date cannot be fetched.
    ----------------------------------------------------------------------
    """
    try:
        filemodtime = os.path.getmtime(filepath)
        filemoddate = str(datetime.date.fromtimestamp(filemodtime))   # 'yyyy-mm-dd'
    except Exception:                                 # not bare: let ctrl-c through
        filemoddate = 'unknown'                       # sort together
        #filemoddate = str(datetime.date.fromtimestamp(time.time()))  # or use today?
    return filemoddate


def _fallbackDate(filename, filepath):
    """Return the non-Exif date: Android-style filename date, else file moddate."""
    return getFileNameDate(filename) or getFileModDate(filepath)


def _exifDateTaken(pictags):
    """Return 'yyyy-mm-dd' from the Exif date-taken tags, or '' if absent/blank."""
    fulltaken = ''
    for trythis in ('DateTimeOriginal', 'DateTimeDigitized'):
        try:
            fulltaken = pictags[trythis]              # normal: use 1st
        except KeyError:                              # tag may be absent
            pass
        if fulltaken.strip():                         # bursts: 1st='  '
            break                                     # stop if nonempty
    splittaken = fulltaken.split()                    # fmt='date time'
    datetaken = splittaken[0] if splittaken else ''   # [0]='yyyy:mm:dd'
    return datetaken.replace(':', '-')                # use 'yyyy-mm-dd'


def classify(sourcedir):
    """
    ----------------------------------------------------------------------
    For each file item in the sourcedir tree, create a (date, name, path)
    tuple, and add it to photo, movie, or other lists according to its type.
    The lists have item photo-tag or file-mod dates, to be added by moves.

    Dates are chosen as follows: photos try Exif date-taken tags first,
    then an Android-style filename date, then the file's moddate; movies
    and others try the filename date, then the moddate.  Folders matching
    the user-configured skip pattern and files whose names start with '.'
    are reported and left in the source tree.

    subshere.remove() can't mod loop's list (and py 2.X has no list.copy()).

    TBD: the .* filename skips could be generalized for Windows cruft too;
    foldername skips are now in user_configs.py, but filenames are not.

    Returns (photos, movies, others): three lists of (date, name, path).
    ----------------------------------------------------------------------
    """
    print(sepln)
    print('Analyzing source tree')
    photos, movies, others = [], [], []

    for (dirpath, subshere, fileshere) in os.walk(sourcedir):
        for subname in subshere[:]:                   # copy: can't mod in-place [2.1]
            subpath = os.path.join(dirpath, subname)

            # skip Unix hidden and thumbs subfolders
            if ignorefolderspattern.match(subname) is not None:
                print('Skipping folder:', subpath)    # old PyPhoto, new thumbspage, etc
                subshere.remove(subname)              # don't scan, leave in source tree

        for filename in fileshere:
            filepath = os.path.join(dirpath, filename)

            # skip Mac .DS_Store, and other Unix hidden files
            if filename.startswith('.'):
                print('Skipping file:', filepath)     # and will remain in source tree
                continue

            if not isExifImageFileName(filename):
                #
                # nonphoto: try filename date, then file moddate
                #
                datefile = _fallbackDate(filename, filepath)    # tagdate='yyyy-mm-dd'
                if isMovieFileName(filename):
                    movies.append((datefile, filename, filepath))   # all video types
                else:
                    others.append((datefile, filename, filepath))   # pngs, gifs, text, etc.
            else:
                #
                # photo: try Exif tags first, then filename, then file moddate
                #
                pictags = getExifTags(filepath)       # empty dict if no usable tags
                datetaken = _exifDateTaken(pictags) if pictags else ''
                if not datetaken:                     # sans exif tags, or sans exif date
                    datetaken = _fallbackDate(filename, filepath)
                photos.append((datetaken, filename, filepath))

    return (photos, movies, others)                   # lists of (date, name, path)


#=========================================================================
# File-moves phase
#=========================================================================


def stripPriorRunDate(filename):
    """
    ----------------------------------------------------------------------
    Drop a prior run's "yyyy-mm-dd__" date prefix if present, so that
    results of prior merges can be used as source items for new reruns.
    Also ensures dates are the same; if not, it's not a tagpix prefix.

    Note that there's no need to use the looksLikeDate() test here,
    because the filename has already been prepended with a true date.

    Also note that this does not remove __N suffixes added to duplicate
    names of differing content, but the suffix is still useful in reruns,
    and moveone() will ensure that the new name is unique in any event.

    Returns the filename with one of two identical leading date prefixes
    removed, or the filename unchanged if there is no such duplication.
    ----------------------------------------------------------------------
    """
    if (dupprefixpattern.match(filename) is None or   # no duplicate dates?
            filename[:12] != filename[12:24]):        # not the same dates?
        return filename                               # not a tagpix prefix dup
    else:
        tense = 'will be' if ListOnly else 'was'
        print('***A prior run\'s date prefix %s stripped:' % tense, filename)   # [2.2]
        prefix, stripped = filename[:12], filename[12:]
        assert prefix == stripped[:12], 'Prior and new dates differ'
        return stripped


def stripAndroidDate(filename):
    """
    ----------------------------------------------------------------------
    [2.2] Drop redundant Android dates in image filenames if present.
    This must be run _after_ stripPriorRunDate(), due to the pattern.

    Android (and perhaps other) cameras add a date in image filenames
    which is redundant with that added by tagpix in moveall() below
    (e.g., '2018-02-05__20180205_154910.jpg').  Rename the renamed image
    file to drop the extra Android date and keep the tagpix date (e.g.,
    '2018-02-05__20180205_154910.jpg' => '2018-02-05__154910.jpg').

    This step can be disabled in user_configs.py to always keep the
    extra dates, and can be specialized to drop Android dates only if
    they are the same as the tagpix date (in rare cases, the two dates
    may differ if an image is edited in tools that discard Exif
    creation-date tags).

    looksLikeDate() tries to avoid false positives, but is heuristic.
    See also the on-demand _drop-redundant-dates.py utility script.
    ----------------------------------------------------------------------
    """
    if not DropAndroidFilenameDates:                  # enabled in user_configs.py?
        return filename
    else:
        matched = redundantdatepattern.match(filename)    # redundant date present?
        if matched is None:
            return filename
        else:
            tagpixdate = matched.group(1)             # YYYY-MM-DD__date2_time.jpg
            sourcedate = matched.group(2)             # date1__YYYYMMDD_time.jpg
            if not looksLikeDate(sourcedate):         # bail if not a valid date
                return filename
            samedate = tagpixdate.replace('-', '') == sourcedate
            if (not samedate and KeepDifferingAndroidFilenameDates):
                return filename
            else:
                # [0:12]='yyyy-mm-dd__', [12:21]='yyyymmdd_', [21:]='hhmmss.ext'
                stripped = filename[0:12] + filename[21:]   # drop 2nd/redundant date2
                return stripped                             # no message here: common


def samecontent(filepath1, filepath2, chunksize=1*(1024*1024)):
    """
    ----------------------------------------------------------------------
    Return True if two files' content is byte-for-byte identical.
    Reads up to chunksize bytes on each loop, till bytes differ or eof
    encountered on either/both (which returns an empty '').
    This tests POSIX file content (the 'data' fork in Mac OS lingo).

    Both files are closed on every exit path, including read errors;
    errors opening or reading either file propagate to the caller.
    ----------------------------------------------------------------------
    """
    with open(filepath1, 'rb') as file1:              # closed even on exceptions,
        with open(filepath2, 'rb') as file2:          # and for non-cpython too
            while True:                               # read in chunks for huge files
                chunk1 = file1.read(chunksize)        # at most this many more bytes
                chunk2 = file2.read(chunksize)
                if chunk1 != chunk2:
                    return False                      # eof on one or bytes differ
                if not chunk1:
                    return True                       # eof on both: entirely same


def moveone(filename, filepath, flatdir, moved):
    """
    ----------------------------------------------------------------------
    Transfer one already-renamed file to its destination folder in the
    merged result, or skip it if it has the same name and content as a
    file already transferred.  filename already has a tagpix date prefix,
    filepath=original name: FROM=filepath, TO=flatdir(/year)?/filename.
    'moved' is used for ListOnly mode; os.path.exists() handles all dups.

    This adds the year folder level to the path; skips true content
    duplicates; and creates unique names for same-name/diff-content.
    The while loop here ensures that the unique-name suffix is unique,
    and tests for same content among all the filename's variants [2.1].
    Now does copy-and-delete and copy-only modes, not just moves [2.1].

    Transfer errors are reported on stdout and do not end the run.
    ----------------------------------------------------------------------
    """
    #
    # group by years, if selected
    #
    if YearFolders:
        # Take the year from the date prefix only: splitting the whole name
        # on '-' gave the entire filename (or an arbitrary piece of it) as
        # the folder name for 'unknown' dates.  Now 'yyyy' or 'unknown'.
        year = filename.split('__')[0].split('-')[0]
        yearsub = os.path.join(flatdir, year)         # add year subfolder to dest path
        if not os.path.exists(yearsub) and not ListOnly:
            os.mkdir(yearsub)
        flatpath = os.path.join(yearsub, filename)    # year-subfolder/prefixed-name
    else:
        flatpath = os.path.join(flatdir, filename)    # flat-dest-folder/prefixed-name

    #
    # skip or rename duplicates (report in ListOnly mode)
    #
    if ListOnly:
        # nothing is transferred in this mode, so 'moved' stands in for it
        if os.path.exists(flatpath) or flatpath in moved:    # dup from this run or other?
            print('***Duplicate name will be resolved:', flatpath)   # don't resolve now

    elif os.path.exists(flatpath):
        # Only an existing file is a dup here.  'moved' is not consulted: it
        # also lists names whose transfer failed, and comparing content to a
        # nonexistent file raised an uncaught error that ended the run.
        front, ext = os.path.splitext(flatpath)       # ext = last '.' to end, or ''
        suffixid = 1                                  # per-file numeric id [2.1]
        while True:                                   # till skipped or unique
            if samecontent(filepath, flatpath):
                # same name and byte-for-byte content: don't move
                print('***Duplicate content was skipped:', filepath, '==', flatpath)
                return

            # same date-prefixed name, diff content: add id to name and recheck
            print('***Duplicate filename made unique:', flatpath)
            flatpath = '%s__%s%s' % (front, suffixid, ext)    # add id suffix before ext
            if not os.path.exists(flatpath):          # id used by prior run? [2.1]
                break                                 # no: use this unique name
            suffixid += 1                             # else try again with next id

    #
    # transfer unique file with date prefix from source to dest
    #
    print(filepath, '=>', flatpath)
    moved[flatpath] = True
    if not ListOnly:
        try:
            if not CopyInsteadOfMove:
                # move to merged result: original, default, recommended, faster
                os.rename(filepath, flatpath)
            else:
                # copy to result, leave in source? (e.g., across drives) [2.1]
                shutil.copyfile(filepath, flatpath)
                shutil.copystat(filepath, flatpath)   # same as copy2() but EIBTI
                if DeleteAfterCopy:
                    os.remove(filepath)               # else files may accumulate

        except Exception as why:
            # e.g., permissions, path length, lock, diff dev/filesystem
            message = ('***Error moving: %s\n'
                       'It was not renamed or moved, but the run continued'
                       ' and all non-error items were transferred.\n'
                       'Resolve the issue and rerun tagpix on your source folder'
                       ' to transfer this item too.\n'
                       'The Python error message follows:\n'
                       '%s => %s')
            print(message % (filepath, why.__class__.__name__, why))


def moveall(photos, movies, others):
    """
    ----------------------------------------------------------------------
    Add date prefix to filenames, and move photos, movies, and others.
    Each argument is a list of (date, name, path) tuples from classify().

    [2.1] Refactored three loops into one here; they differed slightly
    conceptually, but did identical work, and have not diverged in some
    five years - all handle duplicates and prior-run dates the same way.
    ----------------------------------------------------------------------
    """
    moved = {}    # for duplicates in ListOnly mode
    xfermode = 'Moving' if (not CopyInsteadOfMove) or DeleteAfterCopy else 'Copying'

    categories = [('PHOTOS', photos, FlatPhotoDir),   # redundancy kills (code)
                  ('MOVIES', movies, FlatMovieDir),   # refactored from 3 loops
                  ('OTHERS', others, FlatOtherDir)]

    for (catname, catitems, catdest) in categories:
        print(sepln)
        print('%s %s:' % (xfermode, catname), len(catitems))
        for (datetag, filename, filepath) in catitems:    # ids per file (not cat, run)
            filename = '%s__%s' % (datetag, filename)     # add date-taken-or-mod prefix
            filename = stripPriorRunDate(filename)        # drop any prior-run prefix
            filename = stripAndroidDate(filename)         # drop any extra Android date
            moveone(filename, filepath, catdest, moved)   # handle dups, move or copy
    print(sepln)


def unmoved(sourcedir):
    """
    ----------------------------------------------------------------------
    Find and report any files missed in the sourcedir folder, post-moves.
    This includes duplicates, errors, hiddens, and skipped-folder items.
    In copy-only mode nothing leaves the source tree, so only a note is
    printed instead of the list.
    ----------------------------------------------------------------------
    """
    if CopyInsteadOfMove and not DeleteAfterCopy:
        # nothing was moved or deleted: source content is moot [2.1]
        print('Nothing was removed from the source tree')
    else:
        # original: show all files left behind by skips and errors
        missed = []
        for (dirpath, subshere, fileshere) in os.walk(sourcedir):   # skips, errs
            for filename in fileshere:                               # ignore dirs
                missed.append(os.path.join(dirpath, filename))
        print('Missed:', len(missed))
        pprint.pprint(missed, width=200)
    print(sepln)


#=========================================================================
# Main logic
#=========================================================================


if __name__ == '__main__':
    """
    ----------------------------------------------------------------------
    Setup, classify, rename/move, and verify.
    ----------------------------------------------------------------------
    """
    configdirs()
    photos, movies, others = classify(SourceDir)      # plan moves
    if tracemore:
        pprint.pprint(photos, width=200); print(sepln)
        pprint.pprint(movies, width=200); print(sepln)
        pprint.pprint(others, width=200); print(sepln)
    moveall(photos, movies, others)                   # execute moves
    if not ListOnly:
        unmoved(SourceDir)                            # report skips
    print('Bye.')