File: mergeall-products/unzipped/test/expected-output-3.0/optimizations-3.0/prior-diffall.py
"""
################################################################################
Usage:
    [py[thon]] diffall.py dir1 dir2 [-recent [days=90]] [-skipcruft]

Recursive directory tree comparison: report unique files that exist in only
dir1 or dir2, report files of the same name in dir1 and dir2 with differing
contents, report instances of same name but different type in dir1 and dir2,
and do the same for all subdirectories of the same names in and below dir1
and dir2.  A summary of diffs appears at end of output, but search redirected
output for "*DIFFERS" and "*UNIQUE" strings for further details (per Sep-2016).

In sum, diffall compares full, byte-by-byte file content to verify that files
are truly the same.  It does not compare file modification times, as these are
not relevant to content equivalence.  See mergeall for a quicker but shallower
alternative that checks modification times but not full content to detect file
changes that warrant synchronization.

New: (3E) limit reads to 1M for large files, (3E) catch same name=file/dir,
(4E) avoid extra os.listdir() calls in dirdiff.comparedirs() by passing
results here along.

New March-2015, for mergeall 2.0: add "-recent [days]" limited comparisons
option to compare just files changed in last days (else compares all files),
plus simple stats at end of report.  Also for 2.0, added explicit file.close
calls, for use outside CPython; read errors are still not handled here (any
exception kills the script), but both files are now closed on the way out.

New Jan-2016: change incorrect "dirdiff" in usage message to "diffall".
Also print total diffall runtime for speed analysis and drive comparisons.
Caveat: may run quicker with os.scandir() instead of os.listdir() in Python
3.5+ (only!), but runtime is likely dominated by the exhaustive file reads
here, not listings; see mergeall.py for os.scandir() alternative in action.

New Sep-2016: changed difference labels slightly, so users can search the
report for uppercase '*UNIQUE' and '*DIFFERS' to jump to differences quickly.

New Sep-2016: use mergeall's extended OPEN() to support long file pathnames
on Windows.

New Sep-2016: the '-skipcruft' mode, also added to mergeall, skips system
cruft files in both folders so they do not register as differences (and can
clutter the report to the point of near unusability on some platforms that
rhyme with "Mac").  See mergeall_configs.py for more on cruft metadata.
################################################################################
"""

from __future__ import print_function         # ADDED: Py 2.X compatibility

import os, time, sys
import dirdiff
from sys import argv
from fixlongpaths import OPEN                 # [2.5] or 'as open', but too obscure
from skipcruft import filterCruftNames        # [2.5] filter out metadata files

blocksize = 1024 * 1024                       # up to 1M per read
numdir = numfile = numskip = 0                # [2.0] a few stats

# [jan16] python/platform-specific current time (secs);
# the conditional is lazy, so time.clock is never touched where it is absent
gettime = (time.perf_counter if hasattr(time, 'perf_counter') else
          (time.clock if sys.platform.startswith('win') else time.time))


def intersect(seq1, seq2):
    """
    Return all items in both seq1 and seq2, in seq1's order.

    The result is built by walking seq1, so any platform-dependent directory
    order is kept (a plain set(seq1) & set(seq2) would lose it).  A set is
    used only for the membership test, which keeps this linear for large
    directory listings; if any item is unhashable, fall back to scanning
    seq2 directly.
    """
    try:
        lookup = set(seq2)
        return [item for item in seq1 if item in lookup]
    except TypeError:
        # unhashable items on either side: use the slower sequence scan
        return [item for item in seq1 if item in seq2]


def recentlychanged(path1, path2, numdays=90):
    """
    [mergeall 2.0] return True if either path1 or path2 was modified in
    last "days" days (default 90, if not passed, or not listed in the
    command-line).  This is really days-worth-of-seconds, but close enough.
    In large archives, most files will not have been changed recently, so
    this test can speed limited comparisons.

    Library calls used here:
    --------------------------------------------------------------------
    >>> t1 = os.path.getmtime('python')
    >>> t2 = time.time()
    >>> t1, t2
    (1390862766.9136598, 1426117651.752781)
    >>> time.ctime(t1), time.ctime(t2)
    ('Mon Jan 27 14:46:06 2014', 'Wed Mar 11 15:47:31 2015')
    --------------------------------------------------------------------
    """
    modtime1 = os.path.getmtime(path1)        # in seconds since epoch
    modtime2 = os.path.getmtime(path2)        # float in 3.X, int in 2.X?
    cutoff = time.time() - numdays * (24 * 60 * 60)
    return (modtime1 > cutoff) or (modtime2 > cutoff)


def _samecontent(path1, path2):
    """
    Return True if the files at path1 and path2 hold identical bytes.

    Reads both files in binary mode, blocksize bytes at a time, and stops at
    the first differing block.  Both files are closed even if an open or a
    read raises; the exception itself still propagates to the caller.
    """
    file1 = OPEN(path1, 'rb')
    try:
        file2 = OPEN(path2, 'rb')
        try:
            while True:
                bytes1 = file1.read(blocksize)
                bytes2 = file2.read(blocksize)
                if (not bytes1) and (not bytes2):
                    return True               # both at EOF with no mismatch
                if bytes1 != bytes2:
                    return False              # also covers differing lengths
        finally:
            file2.close()
    finally:
        file1.close()                         # [2.0] explicit close, non-CPython


def comparetrees(dir1, dir2, diffs,
                 recent=False, numdays=0, skipcruft=False, verbose=False):
    """
    Compare all subdirectories and files in two directory trees;
    uses binary files to prevent Unicode decoding and endline transforms,
    as trees might contain arbitrary binary files as well as arbitrary text;
    may need bytes listdir arg for undecodable filenames on some platforms;

    [2.0] compare only files changed in last "numdays" days if "recent";
    [2.5] ignore system metadata files in dir1 and dir2 if skipcruft is True;

    Differences are appended to the "diffs" list in-place and also printed;
    the module-level numdir/numfile/numskip counters are updated as a side
    effect.  Returns None.
    """
    global numdir, numfile, numskip           # [2.0]

    # compare file name lists
    numdir += 1
    print('-' * 20)
    names1 = os.listdir(dir1)
    names2 = os.listdir(dir2)
    if skipcruft:                             # [2.5] ignore metadata files
        names1 = filterCruftNames(names1)
        names2 = filterCruftNames(names2)

    if not dirdiff.comparedirs(dir1, dir2, names1, names2):
        diffs.append('items UNIQUE at [%s] - [%s]' % (dir1, dir2))

    print('Comparing contents')
    common = intersect(names1, names2)
    missed = common[:]                        # names not yet matched as file/dir

    # compare contents of files in common
    for name in common:
        path1 = os.path.join(dir1, name)
        path2 = os.path.join(dir2, name)
        if os.path.isfile(path1) and os.path.isfile(path2):
            missed.remove(name)
            if recent and (not recentlychanged(path1, path2, numdays)):   # [2.0]
                numskip += 1                                              # [2.0]
                if verbose: print(name, 'skipped')
            else:
                numfile += 1
                if _samecontent(path1, path2):
                    if verbose: print(name, 'matches')
                else:
                    diffs.append('files DIFFER at [%s] - [%s]' % (path1, path2))
                    print('*DIFFERS:', name)

    # recur to compare directories in common
    for name in common:
        path1 = os.path.join(dir1, name)
        path2 = os.path.join(dir2, name)
        if os.path.isdir(path1) and os.path.isdir(path2):
            missed.remove(name)
            comparetrees(path1, path2, diffs, recent, numdays, skipcruft, verbose)

    # same name but not both files or dirs?
    for name in missed:
        diffs.append('items MISSED at [%s] - [%s]: [%s]' % (dir1, dir2, name))
        print('*MISSED:', name)


def getargs():
    """
    [2.0] Args for command-line mode.

    Parses sys.argv and returns (dir1, dir2, recent, numdays, skipcruft).
    dir1 and dir2 must be the first two arguments and must be existing
    directories.  '-skipcruft' may appear anywhere and is removed from
    sys.argv; '-recent [days]' must follow the two directories.

    On any usage error, prints the usage message (plus a detail line for an
    invalid directory) and exits with status 1.  Validation raises explicit
    exceptions instead of using assert, so it still runs under "python -O".
    """
    extramsg = None
    recent, numdays = False, 90               # defaults
    skipcruft = False
    try:
        dir1, dir2 = sys.argv[1:3]            # first 2 command-line args
        if not os.path.isdir(dir1):           # exists and is a dir [2.0] [2.5]
            extramsg = 'dir1 is invalid'
            raise ValueError(extramsg)
        if not os.path.isdir(dir2):           # exists and is a dir [2.0] [2.5]
            extramsg = 'dir2 is invalid'
            raise ValueError(extramsg)

        if '-skipcruft' in sys.argv:
            skipcruft = True                  # [2.5] skip metadata files
            sys.argv.remove('-skipcruft')

        if len(sys.argv) > 3:
            if sys.argv[3] != '-recent':      # [2.0] last N days only
                raise ValueError('unrecognized argument')
            recent = True
            if len(sys.argv) > 4:
                numdays = int(sys.argv[4])    # listed else 90
    except ValueError:
        # too few args, bad directory, unknown option, or non-integer days
        print('Usage: '
              '[py[thon]] diffall.py dir1 dir2 [-recent [days=90]] [-skipcruft]')
        if extramsg: print('Additional details:', extramsg)
        sys.exit(1)
    else:
        return (dir1, dir2, recent, numdays, skipcruft)


if __name__ == '__main__':
    dir1, dir2, recent, numdays, skipcruft = getargs()

    # walk, compare, change diffs in-place
    diffs = []
    starttime = gettime()
    comparetrees(dir1, dir2, diffs, recent, numdays, skipcruft, True)
    tottime = gettime() - starttime

    # report time [jan16], stats [2.0]
    hours = tottime // (60*60); tottime -= hours * (60*60)
    minutes = tottime // 60;    tottime -= minutes * 60
    print('=' * 80)
    print('Runtime hrs:mins:secs = %.0f:%.0f:%.2f' % (hours, minutes, tottime))
    print('Dirs checked %d, Files checked: %d, Files skipped: %d' %
          (numdir, numfile, numskip))
    if skipcruft:
        print('System metadata (cruft) files were skipped')

    # report collected diffs list
    if not diffs:
        print('No diffs found.')
    else:
        print('Diffs found:', len(diffs))
        for diff in diffs: print('-', diff)
    print('End of report.')