File: mergeall-products/unzipped/test/expected-output-3.0/optimizations-3.0/prior-diffall.py

"""
################################################################################
Usage:
    [py[thon]] diffall.py dir1 dir2 [-recent [days=90]] [-skipcruft]
    
Recursive directory tree comparison: report unique files that exist in only
dir1 or dir2, report files of the same name in dir1 and dir2 with differing 
contents, report instances of same name but different type in dir1 and dir2,
and do the same for all subdirectories of the same names in and below dir1 
and dir2.  A summary of diffs appears at end of output, but search redirected
output for "*DIFFERS" and "*UNIQUE" strings for further details (per Sep-2016).

In sum, diffall compares full, byte-by-byte file content to verify that files
are truly the same.  It does not compare file modification times, as these
are not relevant to content equivalence.  See mergeall for a quicker but
shallower alternative that checks modification times but not full content
to detect file changes that warrant synchronization.

New: (3E) limit reads to 1M for large files, (3E) catch same name=file/dir,
(4E) avoid extra os.listdir() calls in dirdiff.comparedirs() by passing
results here along.

New March-2015, for mergeall 2.0: add "-recent [days]" limited comparisons
option to compare just files changed in last days (else compares all files),
plus simple stats at end of report.  Also for 2.0, added explicit file.close
calls, for use outside CPython; we don't care about catching exceptions here,
as any kill the script, and we're just reading in any event.

New Jan-2016: change incorrect "dirdiff" in usage message to "diffall".
Also print total diffall runtime for speed analysis and drive comparisons.
Caveat: may run quicker with os.scandir() instead of os.listdir() in Python
3.5+ (only!), but runtime is likely dominated by the exhaustive file reads
here, not listings; see mergeall.py for os.scandir() alternative in action.

New Sep-2016: changed difference labels slightly, so users can search the
report for uppercase '*UNIQUE' and '*DIFFERS' to jump to differences quickly.

New Sep-2016: use mergeall's extended OPEN() to support long file pathnames
on Windows.

New Sep-2016: the '-skipcruft' mode, also added to mergeall, skips system
cruft files in both folders so they do not register as differences (and
can clutter the report to the point of near unusability on some platforms
that rhyme with "Mac").  See mergeall_configs.py for more on cruft metadata.
################################################################################
"""

from __future__ import print_function     # ADDED: Py 2.X compatibility

import os, time, sys, dirdiff
from sys import argv
from fixlongpaths import OPEN             # [2.5] or 'as open', but too obscure 
from skipcruft import filterCruftNames    # [2.5] filter out metadata files

blocksize = 1024 * 1024                   # up to 1M per read
numdir = numfile = numskip = 0            # [2.0] a few sats

# [jan16] python/platform-specific current time (secs)
gettime = (time.perf_counter if hasattr(time, 'perf_counter') else
          (time.clock if sys.platform.startswith('win') else time.time)) 


def intersect(seq1, seq2):
    """
    Return all items in both seq1 and seq2;
    a set(seq1) & set(seq2) would work too, but sets are randomly 
    ordered, so any platform-dependent directory order would be lost
    """
    return [item for item in seq1 if item in seq2]


def recentlychanged(path1, path2, numdays=90):
    """
    [mergeall 2.0] return True if either path1 or path2 was modified
    in last "days" days (default 90, if not passed, or not listed in the
    command-line).  This is really days-worth-of-seconds, but close enough.
    In large achives, most files will not have been changed recently, so
    this test can speed limited comparisons.  Library calls used here:
    --------------------------------------------------------------------
    >>> t1 = os.path.getmtime('python')
    >>> t2 = time.time()
    >>> t1, t2
    (1390862766.9136598, 1426117651.752781)
    >>> time.ctime(t1), time.ctime(t2)
    ('Mon Jan 27 14:46:06 2014', 'Wed Mar 11 15:47:31 2015')
    --------------------------------------------------------------------
    """
    modtime1 = os.path.getmtime(path1)      # in seconds since epoch 
    modtime2 = os.path.getmtime(path2)      # float in 3.X, int in 2.X?
    nowtime  = time.time()
    secsback = numdays * (24 * 60 * 60)
    return (modtime1 > nowtime - secsback) or (modtime2 > nowtime - secsback)

    
def comparetrees(dir1, dir2, diffs,
                 recent=False, numdays=0,
                 skipcruft=False,
                 verbose=False):
    """
    Compare all subdirectories and files in two directory trees;
    uses binary files to prevent Unicode decoding and endline transforms,
    as trees might contain arbitrary binary files as well as arbitrary text;
    may need bytes listdir arg for undecodable filenames on some platforms;
    [2.0] compare only files changed in last "numdays" days if "recent";
    [2.5] ignore system metadata files in dir1 and dir2 if skipcruft is True;
    """
    global numdir, numfile, numskip   # [2.0]
    
    # compare file name lists
    numdir += 1
    print('-' * 20)
    names1 = os.listdir(dir1)
    names2 = os.listdir(dir2)
    if skipcruft:
        # [2.5] ignore metadata files
        names1 = filterCruftNames(names1) 
        names2 = filterCruftNames(names2) 
        
    if not dirdiff.comparedirs(dir1, dir2, names1, names2):
        diffs.append('items UNIQUE at [%s] - [%s]' % (dir1, dir2))

    print('Comparing contents')
    common = intersect(names1, names2)
    missed = common[:]

    # compare contents of files in common
    for name in common:
        path1 = os.path.join(dir1, name)
        path2 = os.path.join(dir2, name)
        if os.path.isfile(path1) and os.path.isfile(path2):
            missed.remove(name)
            if recent and (not recentlychanged(path1, path2, numdays)):   # [2.0]
                numskip += 1                                              # [2.0]
                if verbose: print(name, 'skipped')
            else:
                numfile += 1
                file1 = OPEN(path1, 'rb')
                file2 = OPEN(path2, 'rb')
                while True:
                    bytes1 = file1.read(blocksize)
                    bytes2 = file2.read(blocksize)
                    if (not bytes1) and (not bytes2):
                        if verbose: print(name, 'matches')
                        break
                    if bytes1 != bytes2:
                        diffs.append('files DIFFER at [%s] - [%s]' % (path1, path2))
                        print('*DIFFERS:', name)
                        break
                file1.close()
                file2.close()  # [2.0]

    # recur to compare directories in common
    for name in common:
        path1 = os.path.join(dir1, name)
        path2 = os.path.join(dir2, name)
        if os.path.isdir(path1) and os.path.isdir(path2):
            missed.remove(name)
            comparetrees(path1, path2, diffs,
                         recent, numdays, skipcruft, verbose)

    # same name but not both files or dirs?
    for name in missed:
        diffs.append('items MISSED at [%s] - [%s]: [%s]' % (dir1, dir2, name))
        print('*MISSED:', name)


def getargs():
    "[2.0] Args for command-line mode"
    try:
        extramsg = None
        recent, numdays = False, 90         # defaults
        skipcruft = False
        
        dir1, dir2 = sys.argv[1:3]          # first 2 command-line args
        if not os.path.isdir(dir1):         # exists and is a dir [2.0] [2.5]
            extramsg = 'dir1 is invalid'
            assert False
        if not os.path.isdir(dir2):         # exists and is a dir [2.0] [2.5]
            extramsg = 'dir2 is invalid'    # was: assert os.path.isdir(dir2)
            assert False
        if '-skipcruft' in sys.argv:
            skipcruft = True                # [2.5] skip metadata files
            sys.argv.remove('-skipcruft')
        if len(argv) > 3:
            assert argv[3] == '-recent'     # [2.0] last N days only
            recent = True
            if len(argv) > 4: numdays = int(argv[4])   # listed else 90
    except:
        print('Usage: '
            '[py[thon]] diffall.py dir1 dir2 [-recent [days=90]] [-skipcruft]')
        if extramsg: print('Additional details:', extramsg)
        sys.exit(1)
    else:
        return (dir1, dir2, recent, numdays, skipcruft)


if __name__ == '__main__':
    dir1, dir2, recent, numdays, skipcruft = getargs()

    # walk, compare, change diffs in-place
    diffs = []
    starttime = gettime()                                  
    comparetrees(dir1, dir2, diffs, recent, numdays, skipcruft, True) 
    tottime = gettime() - starttime 

    # report time [jan6], stats [2.0]
    hours   = tottime // (60*60); tottime -= hours * (60*60)
    minutes = tottime //  60;     tottime -= minutes * 60
    print('=' * 80)
    print('Runtime hrs:mins:secs = %.0f:%.0f:%.2f'
                      % (hours, minutes, tottime))          
    print('Dirs checked %d, Files checked: %d, Files skipped: %d'
                      % (numdir, numfile, numskip))
    if skipcruft: print('System metadata (cruft) files were skipped')

    # report collected diffs list
    if not diffs:
        print('No diffs found.')
    else:
        print('Diffs found:', len(diffs))
        for diff in diffs: print('-', diff)
    print('End of report.')



[Home page] Books Code Blog Python Author Train Find ©M.Lutz