File: cleansite11.py

#!/usr/bin/python3
"""
=======================================================================
Version 1.1: search for "Feb2014" for version changes -- Unicode file
content, local file same name as that in remote site link, >1 ignores.
License: provide freely, but with no warranties of any kind.

Synopsis: use python's html and url parser libs to try to isolate
and move unused files in a (flat) web site directory.  Run me in
the directory of the site's root html file(s) (default=[index.html]).

This is heuristic: it assumes that referenced files are in this site 
if they exist here.  It also may incorrectly classify some files as 
unused if they are referenced only from files which cause Python's
html parser to fail -- you should inspect the run log and unused file
directory manually after a run, to see if parse failures occurred.
More lenient html parsers exist for Python, but many seem 2.X-only; 
other parse options might avoid failures too: re.findall() pattern 
matches for '(?s)href="(.*?)"' and 'src=...'? (see Example 19-9).

See PP4E Chapters 19+14 for html parsers, Chapter 13 for url parsing;
this sort of code could be adapted to do web site search and similar.
TBD: extend me to delete the unused files from the remote site via ftp;
not done because the unused files require manual verification if any
parse failures occurred.
CAVEAT: assumes the site is one dir; doesn't handle subdirs (improve me).
=======================================================================
"""

import os, sys, html.parser, urllib.parse


def findUnusedFiles(rootFiles=('index.html',),    # site roots to scan for links
                    dirUnused='Unused',           # where to move unused files
                    skipFiles=(),                 # skip these even if reached
                    thisSite=()):                 # skip links to other servers
    """
    --------------------------------------------------------------------
    main function: find files referenced by rootFiles and by any html
    they reach, ignoring any filenames in skipFiles, and ignoring files
    at sites other than thisSite, and move unused files to dirUnused;

    rootFiles: iterable of html filenames to start from (a single
               filename string is accepted too);
    dirUnused: directory that receives the files never referenced;
    skipFiles: filenames never counted as used, even if linked;
    thisSite:  server names whose absolute links count as local;
    returns the set of filenames found to be in use (roots included);

    defaults are immutable tuples rather than lists, so no mutable
    object is shared between calls;
    --------------------------------------------------------------------
    """
    if isinstance(rootFiles, str):       # a bare name would be split into chars
        rootFiles = [rootFiles]
    usedfiles = set(rootFiles)           # changed in-place by the parse calls
    for rootfile in rootFiles:
        parseFileRefs(rootfile, usedfiles, skipFiles, thisSite, indent=0)
    moveUnusedFiles(usedfiles, dirUnused)
    return usedfiles


def moveUnusedFiles(usedFiles, dirUnused, trace=print):
    """
    --------------------------------------------------------------------
    after finding all used files, move unused files to a temp directory

    usedFiles: container of filenames (in the current directory) to keep;
    dirUnused: directory receiving every other plain file; created,
               including any missing parent directories, if absent;
    trace:     print-like callable that receives all progress messages;

    only plain files are moved: other entries are reported and left in
    place, except dirUnused itself, which is skipped silently;
    --------------------------------------------------------------------
    """
    trace('-' * 80)
    os.makedirs(dirUnused, exist_ok=True)         # tbd: clean if present?
    unusedPath = os.path.abspath(dirUnused)

    # os.listdir returns a snapshot list, so renaming while looping is safe
    for filename in os.listdir('.'):
        if filename in usedFiles:
            continue
        if os.path.isfile(filename):
            trace('Moving...', filename)
            os.rename(filename, os.path.join(dirUnused, filename))
        elif os.path.abspath(filename) != unusedPath:
            trace('Not a file:', filename)        # subdirs etc. are left alone

def parseFileRefs(htmlFile, usedFiles, skipFiles, thisSite, indent, trace=print):
    """
    --------------------------------------------------------------------
    find files referenced in root named htmlFile, recur for html files;
    called initially, and via indirect recursion from html parser class;

    htmlFile:  name of the html file to read and scan;
    usedFiles: set of used filenames, extended in-place by the parser;
    skipFiles, thisSite: passed through to the parser unchanged;
    indent:    nesting depth, used only to prefix trace messages;
    trace:     print-like callable that receives all progress messages;

    raises OSError (e.g. FileNotFoundError) if htmlFile cannot be read;
    --------------------------------------------------------------------
    """
    trace('%sParsing:' % ('.' * indent), htmlFile)
    parser = MyParser(usedFiles, skipFiles, thisSite, indent)

    # Feb2014: default Unicode encoding can fail: make this explicit
    try:
        with open(htmlFile, encoding='ascii') as file:
            text = file.read()
    except UnicodeDecodeError:
        with open(htmlFile, 'rb') as file:       # try decoding bytes
            raw = file.read()
        for encoding in ('utf8', 'latin1'):      # parser requires str text
            try:
                text = raw.decode(encoding)
            except UnicodeDecodeError:
                continue
            break                                # latin1 accepts any bytes
        trace('%sEncoding =' % ('+' * indent), encoding)

    # HTMLParseError exists only in older Pythons (removed in 3.5); naming it
    # directly in the except clause would itself raise AttributeError there.
    # An empty tuple makes the clause match nothing when the class is absent.
    parseError = getattr(html.parser, 'HTMLParseError', ())
    try:
        parser.feed(text)
    except parseError as E:
        trace('==>FAILED:', E)                   # file's refs may be missed!
    parser.close()


class MyParser(html.parser.HTMLParser):
    """
    --------------------------------------------------------------------
    use Python stdlib html parser to scan files, changing usedFiles
    in-place; could nest this in parseFileRefs for enclosing scope,
    but would remake class per call;

    usedFiles: set of filenames found so far, extended in-place;
    skipFiles: filenames that are never recorded as used;
    thisSite:  server names whose absolute urls count as local files;
    indent:    nesting depth, passed on (plus 3) to nested parses;
    --------------------------------------------------------------------
    """
    # tag -> attribute that carries the referenced url
    _urlAttrs = {'a': 'href', 'img': 'src'}   # Feb2014: for 'link' too? TBD

    def __init__(self, usedFiles, skipFiles, thisSite, indent):
        self.usedFiles = usedFiles
        self.skipFiles = skipFiles    # doesn't vary
        self.thisSite  = thisSite     # doesn't vary
        self.indent    = indent
        super().__init__()            # vs html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """
        callback on tag open during parse: check links and images;
        only the first matching attribute of a tag is considered
        """
        wanted = self._urlAttrs.get(tag)
        if wanted is None:
            return
        for (name, value) in attrs:
            if name.lower() == wanted:
                if value:                       # '<a href>' has no value: skip
                    self.notefile(value)
                break

    def notefile(self, url):
        """
        note used file found, and recur to a nested parse if html;
        the url's path is reduced to its basename (flat site assumed)
        """
        urlparts = urllib.parse.urlparse(url)
        server   = urlparts.netloc
        rawname  = os.path.basename(urlparts.path)

        # urls may be percent-encoded ('my%20file.gif' names 'my file.gif');
        # fall back on the literal text if only that names a local file
        filename = urllib.parse.unquote(rawname)
        if not os.path.exists(filename):
            filename = rawname

        if (os.path.exists(filename)       and                # is it here?
            filename not in self.skipFiles and                # ignore it?
            filename not in self.usedFiles and                # skip repeats?
            (not server or server in self.thisSite)):         # Feb2014: site?

            self.usedFiles.add(filename)                      # add in-place
            if filename.lower().endswith(('.html', '.htm')):  # recur for html
                parseFileRefs(
                    filename,
                    self.usedFiles,
                    self.skipFiles, self.thisSite,
                    self.indent + 3)


def deleteUnusedRemote(localUnusedDir, ftpsite, ftpuser, ftppswd, ftpdir='.'):
    """
    ------------------------------------------------------------------------
    TBD: delete unused files from remote site too? see Chapter 13 for ftp;
    not used because unused dir requires manual inspection if parse failures

    localUnusedDir: local directory whose entry names are deleted remotely;
    ftpsite, ftpuser, ftppswd: server name and login credentials;
    ftpdir: remote directory to change to before deleting;

    the connection is closed on exit, even if a delete fails part way
    ------------------------------------------------------------------------
    """
    from ftplib import FTP
    filenames = os.listdir(localUnusedDir)    # fail early, before connecting
    with FTP(ftpsite) as connection:          # closes the connection on exit
        connection.login(ftpuser, ftppswd)
        connection.cwd(ftpdir)
        for filename in filenames:
            connection.delete(filename)


if __name__ == '__main__':

    # Feb2014:
    # 1) thissite: ignore local file if link is to same name at diff site.
    # 2) allow > 1 ignore via splits; now only hard-coded in this script,
    # though could be a quoted and .split() command-line argument if useful.

    # edit me for your site defaults (this script is for personal use);
    # trial-and-error scheme: add ignores till unused set is large enough

    # usage: script [roothtmlfile [unuseddir]]
    htmlroot = sys.argv[1] if len(sys.argv) > 1 else 'index.html'
    moveto   = sys.argv[2] if len(sys.argv) > 2 else 'PossiblyUnused'

    # edit or generalize me
    site = input('B(ooks) or T(raining)? ').strip().lower()
    if site.startswith('t'):
        thissite = ['learning-python.com']
        ignore   = []
    elif site.startswith('b'):
        thissite = ['rmi.net', 'www.rmi.net']
        ignore   = ['whatsnew.html', 'about-me.html', 'self.html', 'pic22.html']
    else:
        # empty or unrecognized reply: stop before any files are touched
        sys.exit('Unknown site %r: please answer B or T' % site)
    print('ignore =', ignore, ', thissite =', thissite)

    # parse files, find and follow links, move unused files
    usedFiles = findUnusedFiles([htmlroot], moveto, ignore, thissite)
    moveFiles = os.listdir(moveto)       # counts everything now in moveto

    # report on results
    print('-' * 80)
    print('**Summary**\n')
    print('%d unused files moved to:\n\t%s\n' %
              (len(moveFiles), os.path.abspath(moveto)))
    print('%d used files in this site: ' % len(usedFiles))
    for F in sorted(usedFiles):
        print('\t', F)

    # disabled: remote deletion needs manual inspection of the moved files
    #
    # if input('delete remotely?') in 'yY':
    #     deleteUnusedRemote(moveto, input('site?'), input('user?'), input('pswd?'))