File: headcode.py

#!/usr/bin/python
"""
========================================================================================
headcode.py - July 2016, Python 3.X + 2.X
Synopsis: a simple HTML file <head> code insertion/deletion script
Author:   M. Lutz, learning-python.com
License:  provided freely but with no warranties of any kind

    UPDATE: for a more-recent tool which achieves some similar goals, see 
    also https://learning-python.com/thumbspage/build/insert-analytics.py

Adds (if no args) or removes (if any arg) analytics or other <head>-section code
block *IN-PLACE* in all HTML files in a directory tree.  Change your tree root path
(HTMLDIR) and insertion code (INSERT) below as desired, and run with 0 args to add
or 1 to remove.  In the target use case, each run of this script saves editing 70
files manually - a prime example of the sort of tactical tasks Python excels at.

For an example of this script in action, see:
    http://learning-python.com/books/headcode-LOG.txt

Assumes your HTML files have either a "</head>" or "</HEAD>" to use as a replacement
target.  Also assumes your bytes INSERT is encoded per a Unicode scheme that is 
compatible with the content of all your HTML files -- it's ASCII by default if 
you use a bytes literal and the os.linesep encoding here, and ASCII is a subset of
most encodings (e.g., latin1, utf8); but manually encode a str otherwise if needed.

The INSERT encoding constraint derives from this script's use of binary files and
bytes to sidestep Unicode encodings - it processes still-encoded text to avoid
having to know each file's (and hence the INSERT's) encoding.  Alternatives:
  1) Guess file encodings by trying to open and decode with each of a set of options
  2) Look for an encoding type in a <meta> tag (if present) by parsing HTML
  3) Attempt to guess each file's encoding by inspecting its contents' bytes
Each of these seems overkill for this simple utility (but see genhtml for #1).
========================================================================================
"""
from __future__ import print_function  # 2.X compatibility
import sys, os, glob

# Root of the HTML tree to process.  Both settings are kept on purpose: the
# later assignment wins, so reorder or comment out a line to switch targets.
HTMLDIR = r'C:\MY-STUFF\Training\cd\training-cd-2.7-3.5--oct015'     # live
HTMLDIR = r'C:\Users\me\Desktop\training-cd-2.7-3.5--oct015'         # test

# Script name alone on the command line means add the code block;
# anything else (i.e., any argument at all) means take it back out.
if len(sys.argv) == 1:
    mode = 'insert'
else:
    mode = 'restore'
print('MODE:', mode)

# The code block to add, as bytes: assumed ASCII-compatible with the content
# of every file processed; encode a str manually here if that won't do.
INSERT = b"""
<!--+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-->
<script>
  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
  })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');

  ga('create', 'UA-XXXXXXXX-1', 'auto');
  ga('send', 'pageview');

</script>
<!--+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-->

"""
# Files are handled in binary mode, so no newline translation happens for us:
# swap each '\n' of the literal for this platform's line separator by hand.
INSERT = os.linesep.encode('ascii').join(INSERT.split(b'\n'))

# Walk the whole tree under HTMLDIR and rewrite each .htm/.html file in place.
# Files are read and written as raw bytes, so their Unicode encodings never
# need to be known; INSERT just has to be byte-compatible with their content.
cnvcnt = 0
for (dirpath, dirshere, fileshere) in os.walk(HTMLDIR):
    for filehere in fileshere:
        if not filehere.endswith(('.htm', '.html')):
            continue
        htmlfile = os.path.join(dirpath, filehere)
        print(htmlfile)

        # context managers close the file even if a read or write fails
        with open(htmlfile, 'rb') as oldfile:
            oldtext = oldfile.read()

        # Build the new content *before* reopening for write: 'wb' truncates
        # the file, so a failure after that point would lose its content.
        if mode == 'insert':
            # either spelling of the closing tag may serve as the target
            newtext = oldtext.replace(b'</head>', INSERT + b'</head>')
            newtext = newtext.replace(b'</HEAD>', INSERT + b'</HEAD>')
        elif mode == 'restore':
            newtext = oldtext.replace(INSERT, b'')
        else:
            # never write an undefined result over a file
            raise ValueError('unknown mode: %r' % (mode,))

        with open(htmlfile, 'wb') as newfile:
            newfile.write(newtext)
        cnvcnt += 1

print('DONE: %d files converted.' % cnvcnt)



[Home page] Books Code Blog Python Author Train Find ©M.Lutz