File: diffall-SAFE.py

"""
################################################################################
Usage: "python diffall.py dir1 dir2".
Recursive directory tree comparison: report unique files that exist in only
dir1 or dir2, report files of the same name in dir1 and dir2 with differing 
contents, report instances of same name but different type in dir1 and dir2,
and do the same for all subdirectories of the same names in and below dir1 
and dir2.  A summary of diffs appears at end of output, but search redirected
output for "DIFF" and "unique" strings for further details.  New: (3E) limit 
reads to 1M for large files, (3E) catch same name=file/dir, (4E) avoid extra 
os.listdir() calls in dirdiff.comparedirs() by passing results here along.
################################################################################
"""

import os, dirdiff
blocksize = 1024 * 1024              # up to 1M per read

def tryprint(*args):
    """
    Added Oct-27-11, post publication and post examples 1.3:
    Don't fail with an exception for unprintable filenames;
    See pages 279-282, and the similar tryprint on page 276;
    Started failing for non-ASCII filenames in email test dirs;
    In general, any filename printers in 3.X may require this,
    unless PYTHONIOENCODING is set as needed (e.g., to utf-8);
    """
    try:
        print(*args)                 # filenames might fail to encode
    except UnicodeEncodeError:
        print('--UNPRINTABLE FILE NAME--', *(arg.encode() for arg in args))

def intersect(seq1, seq2):
    """
    Return all items in both seq1 and seq2;
    a set(seq1) & set(seq2) woud work too, but sets are randomly 
    ordered, so any platform-dependent directory order would be lost
    """
    return [item for item in seq1 if item in seq2]

def comparetrees(dir1, dir2, diffs, verbose=False):
    """
    Compare all subdirectories and files in two directory trees;
    uses binary files to prevent Unicode decoding and endline transforms,
    as trees might contain arbitrary binary files as well as arbitrary text;
    may need bytes listdir arg for undecodable filenames on some platforms
    """
    # compare file name lists
    print('-' * 20)
    names1 = os.listdir(dir1)
    names2 = os.listdir(dir2)
    if not dirdiff.comparedirs(dir1, dir2, names1, names2):
        diffs.append('unique files at %s - %s' % (dir1, dir2))

    print('Comparing contents')
    common = intersect(names1, names2)
    missed = common[:]

    # compare contents of files in common
    for name in common:
        path1 = os.path.join(dir1, name)
        path2 = os.path.join(dir2, name)
        if os.path.isfile(path1) and os.path.isfile(path2):
            missed.remove(name)
            file1 = open(path1, 'rb')
            file2 = open(path2, 'rb')
            while True:
                bytes1 = file1.read(blocksize)
                bytes2 = file2.read(blocksize)
                if (not bytes1) and (not bytes2):
                    if verbose: tryprint(name, 'matches')
                    break
                if bytes1 != bytes2:
                    diffs.append('files differ at %s - %s' % (path1, path2))
                    tryprint(name, 'DIFFERS')
                    break

    # recur to compare directories in common
    for name in common:
        path1 = os.path.join(dir1, name)
        path2 = os.path.join(dir2, name)
        if os.path.isdir(path1) and os.path.isdir(path2):
            missed.remove(name)
            comparetrees(path1, path2, diffs, verbose)

    # same name but not both files or dirs?
    for name in missed:
        diffs.append('files missed at %s - %s: %s' % (dir1, dir2, name))
        tryprint(name, 'DIFFERS')


if __name__ == '__main__':
    dir1, dir2 = dirdiff.getargs()
    diffs = []
    comparetrees(dir1, dir2, diffs, True)      # changes diffs in-place
    print('=' * 40)                            # walk, report diffs list
    if not diffs:
        print('No diffs found.')
    else:
        print('Diffs found:', len(diffs))
        for diff in diffs: tryprint('-', diff)