#!/usr/bin/python3
"""
=======================================================================
Version 1.1: search for "Feb2014" for version changes -- Unicode file
content, local file same name as that in remote site link, >1 ignores.
License: provide freely, but with no warranties of any kind.

Synopsis: use python's html and url parser libs to try to isolate and
move unused files in a (flat) web site directory.  Run me in the
directory of the site's root html file(s) (default=[index.html]).

This is heuristic: it assumes that referenced files are in this site if
they exist here.  It also may incorrectly classify some files as unused
if they are referenced only from files which cause Python's html parser
to fail -- you should inspect the run log and unused file directory
manually after a run, to see if parse failures occurred.

More lenient html parsers exist for Python, but many seem 2.X-only;
other parse options might avoid failures too: re.findall() pattern
matches for '(?s)href="(.*?)"' and 'src=...'? (see Example 19-9).

See PP4E Chapters 19+14 for html parsers, Chapter 13 for url parsing;
this sort of code could be adapted to do web site search and similar.

TBD: extend me to delete the unused files from remote site via ftp;
not done because unused files require verification if parse failures.

CAVEAT: assumes site is one dir, doesn't handle subdirs (improve me);
=======================================================================
"""

import os
import sys
import html.parser
import urllib.parse


# html.parser.HTMLParseError was removed in Python 3.5.  Naming it directly
# in an "except" clause raises AttributeError the moment feed() fails, so
# resolve it once here.  Where it is gone, fall back to the exceptions the
# stdlib parser machinery appears to raise on unrecoverable markup
# (NOTE(review): this set varies by Python release -- confirm for yours).
_HTMLParseError = getattr(html.parser, 'HTMLParseError', None)
if _HTMLParseError is not None:
    _PARSE_ERRORS = (_HTMLParseError,)
else:
    _PARSE_ERRORS = (AssertionError, NotImplementedError)


def findUnusedFiles(rootFiles=('index.html',),  # site roots to scan for links
                    dirUnused='Unused',         # where to move unused files
                    skipFiles=(),               # skip these even if reached
                    thisSite=()):               # skip links to other servers
    """
    --------------------------------------------------------------------
    main function: find files referenced by rootFiles and by any html
    they reach, ignoring any filenames in skipFiles, and ignoring files
    at sites other than thisSite, and move unused files to dirUnused;

    rootFiles: iterable of html filenames in the current directory
    dirUnused: directory that receives every file not found to be used
    skipFiles: container of filenames never counted as used
    thisSite:  container of server names whose links count as local
    returns:   the set of used filenames (always includes rootFiles)

    defaults are tuples rather than lists so that no mutable object is
    shared between calls; lists are still accepted from callers;
    --------------------------------------------------------------------
    """
    usedfiles = set(rootFiles)                  # changed in-place
    for rootfile in rootFiles:
        parseFileRefs(rootfile, usedfiles, skipFiles, thisSite, indent=0)
    moveUnusedFiles(usedfiles, dirUnused)
    return usedfiles


def moveUnusedFiles(usedFiles, dirUnused, trace=print):
    """
    --------------------------------------------------------------------
    after finding all used files, move unused files to a temp directory;

    every plain file in the current directory whose name is not in
    usedFiles is renamed into dirUnused; non-file entries (including
    dirUnused itself) are only reported, never moved;
    --------------------------------------------------------------------
    """
    print('-' * 80)
    os.makedirs(dirUnused, exist_ok=True)       # tbd: clean if present?
    for filename in os.listdir('.'):
        if filename in usedFiles:
            continue
        if not os.path.isfile(filename):
            print('Not a file:', filename)
        else:
            trace('Moving...', filename)
            os.rename(filename, os.path.join(dirUnused, filename))


def _readText(path):
    """
    read file path as bytes and decode to str for the parser, trying
    ascii, then utf8, then latin1 (which accepts any byte sequence);
    returns (text, name of the encoding that worked);
    """
    with open(path, 'rb') as stream:            # closed even on error
        raw = stream.read()
    for encoding in ('ascii', 'utf8'):
        try:
            return raw.decode(encoding), encoding
        except UnicodeDecodeError:
            pass                                # try the next candidate
    return raw.decode('latin1'), 'latin1'


def parseFileRefs(htmlFile, usedFiles, skipFiles, thisSite, indent,
                  trace=print):
    """
    --------------------------------------------------------------------
    find files referenced in root named htmlFile, recur for html files;
    called initially, and via indirect recursion from html parser class;

    usedFiles is updated in-place by the parser; indent is the nesting
    depth used only to format trace output; a parse failure is printed
    and swallowed, so that file's references may be missed;
    --------------------------------------------------------------------
    """
    trace('%sParsing:' % ('.' * indent), htmlFile)
    parser = MyParser(usedFiles, skipFiles, thisSite, indent)

    # Feb2014: default Unicode encoding can fail: make this explicit
    text, encoding = _readText(htmlFile)
    if encoding != 'ascii':                     # log only when ascii failed
        trace('%sEncoding =' % ('+' * indent), encoding)

    try:
        parser.feed(text)
    except _PARSE_ERRORS as exc:
        print('==>FAILED:', exc)                # file's refs may be missed!
    parser.close()


class MyParser(html.parser.HTMLParser):
    """
    --------------------------------------------------------------------
    use Python stdlib html parser to scan files, changing usedFiles
    in-place; could nest this in parseFileRefs for enclosing scope,
    but would remake class per call;
    --------------------------------------------------------------------
    """

    # tag -> attribute holding the referenced url
    # Feb2014: for 'link' too? TBD
    linkAttrs = {'a': 'href', 'img': 'src'}

    def __init__(self, usedFiles, skipFiles, thisSite, indent):
        self.usedFiles = usedFiles
        self.skipFiles = skipFiles              # doesn't vary
        self.thisSite = thisSite                # doesn't vary
        self.indent = indent
        super().__init__()          # vs html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """
        callback on tag open during parse: check links and images
        """
        wanted = self.linkAttrs.get(tag)
        if wanted is None:
            return
        urls = [value for (name, value) in attrs if name.lower() == wanted]
        if urls:
            self.notefile(urls[0])              # first match only

    def notefile(self, url):
        """
        note used file found, and recur to a nested parse if html;
        url may be None or empty for a valueless attribute: ignored
        """
        if not url:
            return
        urlparts = urllib.parse.urlparse(url)
        (scheme, server, filepath, parms, query, frag) = urlparts
        # links are percent-encoded ("my%20notes.html") but local file
        # names are not: decode before comparing with the directory
        filename = os.path.basename(urllib.parse.unquote(filepath))

        if (os.path.exists(filename) and                # is it here?
            filename not in self.skipFiles and          # ignore it?
            filename not in self.usedFiles and          # skip repeats?
            (not server or server in self.thisSite)):   # Feb2014: site?

            self.usedFiles.add(filename)                # add in-place
            if filename.endswith(('.html', '.htm')):    # recur for html
                parseFileRefs(
                    filename,
                    self.usedFiles, self.skipFiles, self.thisSite,
                    self.indent + 3)


def deleteUnusedRemote(localUnusedDir, ftpsite, ftpuser, ftppswd, ftpdir='.'):
    """
    ------------------------------------------------------------------------
    TBD: delete unused files from remote site too? see Chapter 13 for ftp;
    not used because unused dir requires manual inspection if parse failures

    logs in to ftpsite, changes to ftpdir, and deletes there every name
    listed in localUnusedDir; the connection is closed on exit, even if
    a delete fails;
    ------------------------------------------------------------------------
    """
    from ftplib import FTP
    with FTP(ftpsite) as connection:
        connection.login(ftpuser, ftppswd)
        connection.cwd(ftpdir)
        for filename in os.listdir(localUnusedDir):
            connection.delete(filename)


def siteDefaults(choice):
    """
    map the interactive site answer to (thissite, ignore) lists;
    only the first non-blank character matters, in either case:
    't' = training site, 'b' = books site;
    raises ValueError for an empty or unrecognized answer, instead of
    failing later on an index error or an unset name;
    """
    key = choice.strip().lower()[:1]
    if key == 't':
        return ['learning-python.com'], []
    if key == 'b':
        return (['rmi.net', 'www.rmi.net'],
                ['whatsnew.html', 'about-me.html', 'self.html', 'pic22.html'])
    raise ValueError('unknown site %r: expected B(ooks) or T(raining)' % choice)


def main():
    """
    script entry point: argv[1] is the root html file (default
    index.html), argv[2] is the directory for unused files (default
    PossiblyUnused); prompts for the site, runs the scan, and prints
    a summary of moved and used files;
    """
    # Feb2014:
    # 1) thissite: ignore local file if link is to same name at diff site.
    # 2) allow > 1 ignore via splits; now only hard-coded in this script,
    #    though could be a quoted and .split() command-line argument if useful.
    # edit me for your site defaults (this script is for personal use);
    # trial-and-error scheme: add ignores till unused set is large enough

    htmlroot = sys.argv[1] if len(sys.argv) > 1 else 'index.html'
    moveto = sys.argv[2] if len(sys.argv) > 2 else 'PossiblyUnused'

    # edit or generalize me
    try:
        thissite, ignore = siteDefaults(input('B(ooks) or T(raining)? '))
    except ValueError as exc:
        sys.exit(str(exc))                      # nothing has been moved yet
    print('ignore =', ignore, ', thissite =', thissite)

    # parse files, find and follow links, move unused files
    usedFiles = findUnusedFiles([htmlroot], moveto, ignore, thissite)
    moveFiles = os.listdir(moveto)              # includes any earlier runs

    # report on results
    print('-' * 80)
    print('**Summary**\n')
    print('%d unused files moved to:\n\t%s\n' %
          (len(moveFiles), os.path.abspath(moveto)))
    print('%d used files in this site: ' % len(usedFiles))
    for F in sorted(usedFiles):
        print('\t', F)

    # disabled (see deleteUnusedRemote); note that an empty answer would
    # also pass the "in 'yY'" test if this is ever enabled:
    #
    # if input('delete remotely?') in 'yY':
    #     deleteUnusedRemote(moveto, input('site?'), input('user?'), input('pswd?'))


if __name__ == '__main__':
    main()