# File: cleansite11.py
#!/usr/bin/python3
"""
=======================================================================
Version 1.1: search for "Feb2014" to find this version's changes -- Unicode
file content; ignoring a local file whose name matches that in a link to a
different site; and support for more than one ignored file.
License: provide freely, but with no warranties of any kind.
Synopsis: use python's html and url parser libs to try to isolate
and move unused files in a (flat) web site directory. Run me in
the directory of the site's root html file(s) (default=[index.html]).
This is heuristic: it assumes that referenced files are in this site
if they exist here. It also may incorrectly classify some files as
unused if they are referenced only from files which cause Python's
html parser to fail -- you should inspect the run log and unused file
directory manually after a run, to see if parse failures occurred.
More lenient html parsers exist for Python, but many seem 2.X-only;
other parse options might avoid failures too: re.findall() pattern
matches for '(?s)href="(.*?)"' and 'src=...'? (see Example 19-9).
See PP4E Chapters 19+14 for html parsers, Chapter 13 for url parsing;
this sort of code could be adapted to do web site search and similar.
TBD: extend me to delete the unused files from remote site via ftp;
not done because unused files require verification if parse failures.
CAVEAT: assumes site is one dir, doesn't handle subdirs (improve me);
=======================================================================
"""
import os, sys, html.parser, urllib.parse
def findUnusedFiles(rootFiles=('index.html',),   # site roots to scan for links
                    dirUnused='Unused',           # where to move unused files
                    skipFiles=(),                 # skip these even if reached
                    thisSite=()):                 # skip links to other servers
    """
    --------------------------------------------------------------------
    main function: find files referenced by rootFiles and by any html
    they reach, ignoring any filenames in skipFiles, and ignoring files
    at sites other than thisSite, and move unused files to dirUnused;

    rootFiles: iterable of html filenames to start the scan from;
    dirUnused: directory that receives files never found to be used;
    skipFiles: collection of filenames not recorded as used even if
               a link to them is reached;
    thisSite:  collection of server names whose links count as local;

    returns the set of used filenames (the roots plus all files noted
    during the parse); defaults are immutable tuples so that no list
    object is shared between calls;
    --------------------------------------------------------------------
    """
    usedfiles = set(rootFiles)              # changed in-place by the parse
    for rootfile in rootFiles:
        parseFileRefs(rootfile, usedfiles, skipFiles, thisSite, indent=0)
    moveUnusedFiles(usedfiles, dirUnused)
    return usedfiles
def moveUnusedFiles(usedFiles, dirUnused, trace=print):
    """
    --------------------------------------------------------------------
    after finding all used files, move unused files to a temp directory;

    usedFiles: collection of filenames to leave in place;
    dirUnused: directory to move all other files into; it is created
               if missing, including any missing parent directories;
    trace:     callable invoked with a message per file moved;

    scans the current working directory only; entries that are not
    regular files (such as subdirectories) are reported, not moved;
    --------------------------------------------------------------------
    """
    print('-' * 80)
    # create-if-missing in one step: no separate exists() test to go stale,
    # and nested holding paths such as 'hold/unused' work too
    os.makedirs(dirUnused, exist_ok=True)   # tbd: clean if present?
    for filename in os.listdir('.'):
        if filename in usedFiles:
            continue                        # referenced: leave it alone
        if not os.path.isfile(filename):
            print('Not a file:', filename)
        else:
            trace('Moving...', filename)
            os.rename(filename, os.path.join(dirUnused, filename))
def parseFileRefs(htmlFile, usedFiles, skipFiles, thisSite, indent, trace=print):
    """
    --------------------------------------------------------------------
    find files referenced in root named htmlFile, recur for html files;
    called initially, and via indirect recursion from html parser class;

    htmlFile:  name of the html file to read and parse;
    usedFiles: set of used filenames, handed to the parser object;
    skipFiles: filenames to ignore, handed to the parser object;
    thisSite:  server names treated as this site, handed to the parser;
    indent:    nesting depth, used to prefix the trace messages;
    trace:     callable used for progress messages;

    the file is decoded as ascii first, then utf8, then latin1;
    --------------------------------------------------------------------
    """
    trace('%sParsing:' % ('.' * indent), htmlFile)
    parser = MyParser(usedFiles, skipFiles, thisSite, indent)

    # Feb2014: default Unicode encoding can fail: make this explicit
    try:
        with open(htmlFile, encoding='ascii') as file:
            text = file.read()
    except UnicodeDecodeError:
        with open(htmlFile, 'rb') as file:            # try decoding bytes
            raw = file.read()
        for encoding in ('utf8', 'latin1'):           # parser requires str text
            try:
                text = raw.decode(encoding)
                break
            except UnicodeDecodeError:                # latin1 accepts any byte,
                pass                                  # so the loop always breaks
        trace('%sEncoding =' % ('+' * indent), encoding)

    # HTMLParseError was removed from html.parser in Python 3.5; naming it
    # directly in the except clause raised AttributeError whenever any
    # exception left feed(); an empty tuple matches no exception at all
    parseError = getattr(html.parser, 'HTMLParseError', ())
    try:
        parser.feed(text)
    except parseError as E:
        print('==>FAILED:', E)                        # file's refs may be missed!
    parser.close()
class MyParser(html.parser.HTMLParser):
    """
    --------------------------------------------------------------------
    stdlib html parser subclass which records the local files named by
    anchor and image tags, adding them to the usedFiles set in-place;
    kept at module level rather than nested in parseFileRefs, so the
    class is not rebuilt on every call;
    --------------------------------------------------------------------
    """
    # tag name => attribute holding the reference (Feb2014: 'link' too? TBD)
    _refAttrs = {'a': 'href', 'img': 'src'}

    def __init__(self, usedFiles, skipFiles, thisSite, indent):
        super().__init__()             # vs html.parser.HTMLParser.__init__(self)
        self.indent    = indent
        self.usedFiles = usedFiles     # shared set, grown during the parse
        self.skipFiles = skipFiles     # doesn't vary
        self.thisSite  = thisSite      # doesn't vary

    def handle_starttag(self, tag, attrs):
        """
        parser callback for each opening tag: pass the first href of
        an anchor, or the first src of an image, along to notefile
        """
        wanted = self._refAttrs.get(tag)
        if wanted is None:
            return                                     # not a tag we track
        for (name, value) in attrs:
            if name.lower() == wanted:
                self.notefile(value)
                break                                  # first match only

    def notefile(self, url):
        """
        record the file named by url as used if it exists in the current
        directory, and start a nested parse if it is an html file
        """
        parts    = urllib.parse.urlparse(url)
        server   = parts.netloc
        filename = os.path.basename(parts.path)

        if not os.path.exists(filename):               # is it here?
            return
        if filename in self.skipFiles:                 # ignore it?
            return
        if filename in self.usedFiles:                 # skip repeats?
            return
        if server and server not in self.thisSite:     # Feb2014: site?
            return

        self.usedFiles.add(filename)                   # add in-place
        if filename.endswith(('.html', '.htm')):       # recur for html
            parseFileRefs(filename,
                          self.usedFiles,
                          self.skipFiles,
                          self.thisSite,
                          self.indent + 3)
def deleteUnusedRemote(localUnusedDir, ftpsite, ftpuser, ftppswd, ftpdir='.'):
    """
    ------------------------------------------------------------------------
    TBD: delete unused files from remote site too? see Chapter 13 for ftp;
    not used because unused dir requires manual inspection if parse failures

    localUnusedDir: local directory whose entry names are deleted remotely;
    ftpsite:        host name passed to ftplib.FTP;
    ftpuser:        login user name;
    ftppswd:        login password;
    ftpdir:         remote directory to change to before deleting;

    any ftplib error raised by login, cwd, or delete propagates to the
    caller; the connection is closed whether or not an error occurs
    ------------------------------------------------------------------------
    """
    from ftplib import FTP
    with FTP(ftpsite) as connection:          # closes the connection on exit
        connection.login(ftpuser, ftppswd)
        connection.cwd(ftpdir)
        for filename in os.listdir(localUnusedDir):
            connection.delete(filename)
if __name__ == '__main__':
    # Feb2014:
    # 1) thissite: ignore local file if link is to same name at diff site.
    # 2) allow > 1 ignore via splits; now only hard-coded in this script,
    #    though could be a quoted and .split() command-line argument if useful.

    # edit me for your site defaults (this script is for personal use);
    # trial-and-error scheme: add ignores till unused set is large enough

    # usage: script [roothtmlfile [unuseddir]]
    htmlroot = sys.argv[1] if len(sys.argv) > 1 else 'index.html'
    moveto   = sys.argv[2] if len(sys.argv) > 2 else 'PossiblyUnused'

    # edit or generalize me
    # ask again on an empty or unrecognized reply: indexing an empty reply
    # raised IndexError, and any other reply left thissite/ignore unset
    while True:
        site = input('B(ooks) or T(raining)? ').strip().lower()
        if site.startswith('t'):
            thissite = ['learning-python.com']
            ignore   = []
            break
        if site.startswith('b'):
            thissite = ['rmi.net', 'www.rmi.net']
            ignore   = ['whatsnew.html', 'about-me.html', 'self.html', 'pic22.html']
            break
        print('Please reply with B or T')
    print('ignore =', ignore, ', thissite =', thissite)

    # parse files, find and follow links, move unused files
    usedFiles = findUnusedFiles([htmlroot], moveto, ignore, thissite)
    moveFiles = os.listdir(moveto)

    # report on results
    print('-' * 80)
    print('**Summary**\n')
    print('%d unused files moved to:\n\t%s\n' %
          (len(moveFiles), os.path.abspath(moveto)))
    print('%d used files in this site: ' % len(usedFiles))
    for F in sorted(usedFiles):
        print('\t', F)

    # TBD: remote deletion is disabled, because the unused directory needs
    # manual inspection if any parse failures occurred:
    #
    # if input('delete remotely?') in 'yY':
    #     deleteUnusedRemote(moveto, input('site?'), input('user?'), input('pswd?'))