"""
Version 1.1: search for "Feb2014" for version changes -- Unicode file
content, local file same name as that in remote site link, >1 ignores.
License: provide freely, but with no warranties of any kind.

Synopsis: use python's html and url parser libs to try to isolate
and move unused files in a (flat) web site directory.  Run me in
the directory of the site's root html file(s) (default=[index.html]).

This is heuristic: it assumes that referenced files are in this site
if they exist here.  It also may incorrectly classify some files as
unused if they are referenced only from files which cause Python's
html parser to fail -- you should inspect the run log and unused file
directory manually after a run, to see if parse failures occurred.
More lenient html parsers exist for Python, but many seem 2.X-only;
other parse options might avoid failures too: re.findall() pattern
matches for '(?s)href="(.*?)"' and 'src=...'? (see Example 19-9).

See PP4E Chapters 19+14 for html parsers, Chapter 13 for url parsing;
this sort of code could be adapted to do web site search and similar.
TBD: extend me to delete the unused files from remote site via ftp;
not done because unused files require verification if parse failures.
CAVEAT: assumes site is one dir, doesn't handle subdirs (improve me);
"""

import os, sys, html.parser, urllib.parse

def findUnusedFiles(rootFiles=['index.html'],     # site roots to scan for links
                    dirUnused='Unused',           # where to move unused files
                    skipFiles=[],                 # skip these even if reached
                    thisSite=[]):                 # skip links to other servers
    """
    main function: find files referenced by rootFiles and by any html
    they reach, ignoring any filenames in skipFiles, and ignoring files
    at sites other than thisSite, and move unused files to dirUnused;

    rootFiles: names of the html files to start scanning from; they are
               always counted as used;
    dirUnused: directory that receives every file not found to be used;
    skipFiles: filenames never recorded as used, even when linked;
    thisSite:  server names whose absolute links count as local files;

    returns the set of used filenames (roots plus everything reached);
    the list defaults are only read here, never mutated;
    """
    usedfiles = set(rootFiles)   # changed in-place by the parse step
    for rootfile in rootFiles:
        parseFileRefs(rootfile, usedfiles, skipFiles, thisSite, indent=0)
    moveUnusedFiles(usedfiles, dirUnused)
    return usedfiles

def moveUnusedFiles(usedFiles, dirUnused, trace=print):
    """
    after finding all used files, move unused files to a temp directory

    usedFiles: collection of filenames (in the current directory) to keep;
    dirUnused: target directory; created here if it does not exist yet;
    trace:     print-like callable used to report each file moved;

    only plain files are moved: directories (including dirUnused itself)
    and other non-file entries are reported and left in place;
    """
    print('-' * 80)
    if not os.path.exists(dirUnused):             # tbd: clean if present?
        os.mkdir(dirUnused)
    for filename in os.listdir('.'):
        if filename in usedFiles:
            continue
        if not os.path.isfile(filename):
            print('Not a file:', filename)
        else:
            trace('Moving...', filename)
            os.rename(filename, os.path.join(dirUnused, filename))

def parseFileRefs(htmlFile, usedFiles, skipFiles, thisSite, indent, trace=print):
    """
    find files referenced in root named htmlFile, recur for html files;
    called initially, and via indirect recursion from html parser class;

    htmlFile:  name of the html file to read and parse;
    usedFiles: set of used filenames, extended in-place by the parser;
    skipFiles, thisSite: passed through to the parser unchanged;
    indent:    nesting depth, used only to prefix trace messages;
    trace:     print-like callable for progress messages;
    """
    trace('%sParsing:' % ('.' * indent), htmlFile)
    parser = MyParser(usedFiles, skipFiles, thisSite, indent)

    # Feb2014: default Unicode encoding can fail: make this explicit
    try:
        with open(htmlFile, encoding='ascii') as file:
            text = file.read()
    except UnicodeDecodeError:
        with open(htmlFile, 'rb') as file:       # try decoding bytes
            raw = file.read()
        for encoding in ('utf8', 'latin1'):      # parser requires str text
            try:
                text = raw.decode(encoding)
            except UnicodeDecodeError:
                continue
            break                                # first decoding that works wins
        # latin1 maps every byte, so text is always set by the loop above
        trace('%sEncoding =' % ('+' * indent), encoding)

    # HTMLParseError was removed from html.parser in Python 3.5; naming it
    # directly in the except clause would raise AttributeError on newer
    # Pythons, so look it up and fall back to "catch nothing"
    parseErrors = getattr(html.parser, 'HTMLParseError', ())
    try:
        parser.feed(text)
        parser.close()                           # flush any buffered markup
    except parseErrors as E:
        print('==>FAILED:', E)                   # file's refs may be missed!

class MyParser(html.parser.HTMLParser):
    """
    use Python stdlib html parser to scan files, changing usedFiles
    in-place; could nest this in parseFileRefs for enclosing scope,
    but would remake class per call;
    """
    # tag name -> attribute that holds the referenced url
    # Feb2014: for 'link' too? TBD
    linkAttrs = {'a': 'href', 'img': 'src'}

    def __init__(self, usedFiles, skipFiles, thisSite, indent):
        """
        usedFiles: set of filenames found so far, extended in-place;
        skipFiles: filenames never to record or follow;
        thisSite:  server names whose absolute urls count as local;
        indent:    trace indentation of the file being parsed;
        """
        self.usedFiles = usedFiles
        self.skipFiles = skipFiles    # doesn't vary
        self.thisSite  = thisSite     # doesn't vary
        self.indent    = indent
        super().__init__()            # vs html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """
        callback on tag open during parse: check links and images
        """
        wanted = self.linkAttrs.get(tag)
        if wanted is None:
            return
        # skip valueless attributes (e.g. <a href>), whose value is None
        urls = [value for (name, value) in attrs
                      if name.lower() == wanted and value]
        if urls:
            self.notefile(urls[0])

    def notefile(self, url):
        """
        note used file found, and recur to a nested parse if html
        """
        urlparts = urllib.parse.urlparse(url)
        (scheme, server, filepath, parms, query, frag) = urlparts
        filename = os.path.basename(filepath)      # flat site: drop any dirs

        if (os.path.exists(filename)       and                # is it here?
            filename not in self.skipFiles and                # ignore it?
            filename not in self.usedFiles and                # skip repeats?
            (not server or server in self.thisSite)):         # Feb2014: site?

            self.usedFiles.add(filename)                      # add in-place
            if filename.endswith(('.html', '.htm')):          # recur for html
                parseFileRefs(
                    filename, self.usedFiles,
                    self.skipFiles, self.thisSite,
                    self.indent + 3)

def deleteUnusedRemote(localUnusedDir, ftpsite, ftpuser, ftppswd, ftpdir='.'):
    """
    TBD: delete unused files from remote site too? see Chapter 13 for ftp;
    not used because unused dir requires manual inspection if parse failures

    localUnusedDir: local directory whose filenames are deleted remotely;
    ftpsite, ftpuser, ftppswd: ftp server name and login credentials;
    ftpdir: remote directory holding the site's files;

    NOTE(review): the cwd and delete calls below were reconstructed (the
    loop body was absent); verify against the intended behavior before
    running this on a live site;
    """
    from ftplib import FTP
    with FTP(ftpsite) as connection:              # closes connection on exit
        connection.login(ftpuser, ftppswd)
        connection.cwd(ftpdir)
        for filename in os.listdir(localUnusedDir):
            connection.delete(filename)

if __name__ == '__main__':

    # Feb2014:
    # 1) thissite: ignore local file if link is to same name at diff site.
    # 2) allow > 1 ignore via splits; now only hard-coded in this script,
    # though could be a quoted and .split() command-line argument if useful.

    # edit me for your site defaults (this script is for personal use);
    # trial-and-error scheme: add ignores till unused set is large enough

    htmlroot = sys.argv[1] if len(sys.argv) > 1 else 'index.html'
    moveto   = sys.argv[2] if len(sys.argv) > 2 else 'PossiblyUnused'

    # edit or generalize me
    # NOTE(review): the thissite entries are empty strings here, presumably
    # placeholders for the sites' server names -- fill in before relying on
    # the other-server check (an empty entry never matches a real server)
    site = input('B(ooks) or T(raining)? ').strip().lower()
    if site.startswith('t'):
        thissite = ['']
        ignore   = []
    elif site.startswith('b'):
        thissite = ['', '']
        ignore   = ['whatsnew.html', 'about-me.html', 'self.html', 'pic22.html']
    else:
        # empty or unknown reply: stop before any files are moved
        sys.exit('Unknown site choice: %r (expected B or T)' % site)
    print('ignore =', ignore, ', thissite =', thissite)

    # parse files, find and follow links, move unused files
    usedFiles = findUnusedFiles([htmlroot], moveto, ignore, thissite)
    moveFiles = os.listdir(moveto)

    # report on results
    print('-' * 80)
    print('%d unused files moved to:\n\t%s\n' %
              (len(moveFiles), os.path.abspath(moveto)))
    print('%d used files in this site: ' % len(usedFiles))
    for F in sorted(usedFiles): print('\t', F)

    # require an explicit yes: a substring test against 'yY' is also true
    # for an empty reply, which would start remote deletes on a bare Enter
    if input('delete remotely?').strip().lower().startswith('y'):
        deleteUnusedRemote(moveto, input('site?'), input('user?'), input('pswd?'))

# [Home] Books Programs Blog Python Author Training Search Email ©M.Lutz