#!/usr/bin/env python3 """ ======================================================================= Demo Python's hashlib, by comparing the relative speeds of modtime and MD5-hash (a.k.a. checksum) alternatives for file-change detection in content-sync programs like Mergeall (learning-python.com/mergeall). To compare, this walks a folder tree and emulates both schemes' code for each file along the way. Pass one command-line argument - the path of the root of the folder tree to walk. Attrib: © M. Lutz (learning-python.com), June 2022 License: Provided freely, but with no warranties of any kind. -------- FINDINGS -------- Both modtimes and hashes support change detection without full content compares. However, while modtimes have interoperability issues, hashes are wholly unusable for Mergeall, for two reasons: Accuracy Hashes are probabilistic. Though very rare, they may yield duplicates (collisions) that cause changes to go unnoticed. Speed Hashes are prohibitively expensive. They require full reads that make them far too slow to be used for larger content collections. The second of these is a full showstopper for Mergeall. Per the 200G content test's output at the end of this file, hashes can be ~100x slower than modtimes--requiring some 10 minutes (600 seconds) for a tree scan, versus modtimes' 6 seconds. This is impractical for on-demand syncs of non-trivial content, of the sort Mergeall is designed to handle. In their defense, hashes may suffice for smaller content and single files transferred over a network, and might be acceptable for syncs run overnight. Moreover, modtimes have some well-known portability caveats, including FAT32 skew, and subpar support in some contexts. Even so, modtimes' 6-second showing still easily beats waiting a full 10 minutes for a hashes-based differences scan, especially on phones where battery power is a crucial resource. Takeaways: Mergeall is fast because it uses modtimes instead of checksums. Platforms must support modtimes to enable fast syncs. --------- MORE INFO --------- For more on Mergeall's use case, as well as typical modtime issues in FAT32, some drives, older Androids, and Linux, try these: https://learning-python.com/ mergeall-products/unzipped/UserGuide.html#what mergeall-products/unzipped/UserGuide.html#dst mergeall-android-scripts/_README.html#toc22 mergeall-android-scripts/_README.html#toc21 post-release-updates.html#mergealllinuxmodtimes For more on hashlib and checksums in general: https://docs.python.org/3/library/hashlib.html https://en.wikipedia.org/wiki/Cryptographic_hash_function (and search the web for "md5", "checksum," and the like) ======================================================================= """ import time, hashlib, sys, os def walker(op, root): """ ----------------------------------------------- Walk folder tree at root, running function op on each file in the tree along the way. op takes one arg - a file's pathname. Returns total walk time, last file's op result, #files. This skips symlinks; real code would read them. ----------------------------------------------- """ visit = 0 start = time.perf_counter() for (dir0, subs0, files0) in os.walk(root): for file in files0: path = os.path.join(dir0, file) if not os.path.islink(path): visit += 1 last = op(path) elap = time.perf_counter() - start return elap, last, visit def mtime(path): """ ----------------------------------------------- Emulate modtime-based diff check - fetch and compare FROM and TO files' modtime floats. This requires just two quick os.stat calls. Much simpler and ~100x faster than cksum(), though OS caching might help modestly here. This is roughly what Mergeall does (e.g., it uses os.lstat() to avoid os.path.*() calls), but it also uses a 2-second granularity in modtime checks to support FAT32 dives; applies Unicode normalization to filenames; and checks file sizes from the already-fetched stat result only when modtimes are the same. ----------------------------------------------- """ # check modtimes of both sides s1 = os.stat(path) s2 = os.stat(path) test = (s1.st_mtime == s2.st_mtime) # modtimes test = (s1.st_size == s2.st_size) # filesizes return s2.st_mtime def cksum(path, hasher=hashlib.md5, csize=4096, cache=True): """ ----------------------------------------------- Emulate cksum-based diff check - test a FROM file's saved prior hash against its new hash (alternative: compare new hash to TO's prior). This requires reading one of the two trees in full, and is essentially half a diffall scan. If cache, emulates prior-hash fetch by reading a file; real code may index a shelve/pickle. Set hasher to try alts; e.g., hashlib.sha256 may be less prone to hash collisions, but was also ~25% slower in testing. Use csize/cache to vary IO ops; the speed diff seems trivial. ----------------------------------------------- """ hash = hasher() temp = __file__ + '.hash' # get prior hash of this file if cache and os.path.exists(temp): prior = open(temp, 'rb').read() else: prior = b'0' * hash.digest_size # calc this file's new hash file = open(path, 'rb') while True: chunk = file.read(csize) if not chunk: break hash.update(chunk) code = hash.digest() test = (code == prior) # save new hash of this file if cache: save = open(temp, 'wb') save.write(code) save.close() return code #---------------------------------------------- # Main - apply two alts to same folder tree. # Print bytes hashes nicely, by 4-byte groups. #---------------------------------------------- if __name__ == '__main__': assert len(sys.argv) == 2 and os.path.isdir(sys.argv[1]) stuff = sys.argv[1] print('walking:', stuff) elaps = [] for test in (mtime, cksum): elap, last, num = walker(test, stuff) print('\n', test.__name__, '=>') print('\telapsed time =', '%.4f' % elap) print('\tnumber files =', num) print('\tlast result =', last.hex('_', 4) if isinstance(last, bytes) else last) elaps.append(elap) print('\ncksum:modtime ratio = %.2f' % (elaps[1] / elaps[0])) #---------------------------------------------- # Output - macos, 200G 174k-file ~/MY-STUFF. # Note: cache=False has no noticeable effect. #---------------------------------------------- """ $ cd ~/MY-STUFF/Code/etc $ python3 cksums.py ~/MY-STUFF walking: /Users/me/MY-STUFF mtime => elapsed time = 5.4100 number files = 174639 last result = 1641914113.1329055 cksum => elapsed time = 586.3275 number files = 174639 last result = 07120ede_83dbad58_46b0f6cf_1ca45325 cksum:modtime ratio = 108.38 $ python3 cksums.py ~/MY-STUFF walking: /Users/me/MY-STUFF mtime => elapsed time = 7.2121 number files = 174639 last result = 1641914113.1329055 cksum => elapsed time = 587.9185 number files = 174639 last result = 07120ede_83dbad58_46b0f6cf_1ca45325 cksum:modtime ratio = 81.52 """