File: findem.py

#!/usr/bin/env python3
"""
==================================================================================
findem.py - locate text that may truncate web pages in Android Opera
Origin:  January 2022, M. Lutz, https://learning-python.com
License: provided freely but with no warranties of any kind

Use HTML parsing to find all <pre> or <code> blocks containing a "/*" character 
sequence, among all the HTML files in a website tree.  This sequence makes some 
Android Operas truncate some pages.  This is a browser bug that impacts 3 out of
11 pages with this pattern at the host website, in many Opera 5x and 6x versions.

To work around: search for occurrences here, and replace "/" with "&#47;" HTML
escapes (these are converted to "/" in <pre> and <code> by all browsers tested).
About the Opera bug: learning-python.com/about-this-site.html#androidbrowsernews.

Update: it's not just <pre>.  A "/*" in a <code> block or an HTML <!-- --> comment
can truncate pages in Android Opera too.  This now handles <code>: edit findin's
default below or pass it to walker().  Comments are ignored here, and the bug
varies by page.

Subtlety: Python's HTML parser converts already-changed &#47; charrefs to a "/"
unless convert_charrefs is False.  If you want to see all files that have or had
a "/*", set this to True; the False preset shows only files still having a "/*".
==================================================================================
"""

import os, sys, html.parser

# EDIT ME: search defaults; walker() uses these as its argument defaults
findit = '/*'       # text to look for inside <findin>...</findin> blocks
findin = 'pre'      # tag whose content is searched for findit: pre|code
dumpit = False      # verbose mode: True=also print each matched block's full text

# EDIT ME: passed to the HTML parser as its convert_charrefs argument
convert_charrefs = False     # False=show only remaining findits


class Parser(html.parser.HTMLParser):
    """
    Find findit text inside <findin>...</findin> blocks of HTML text.

    One instance may be reused for many files: call parse(text, path) once
    per file.  Each matching block is reported with print() as it is found,
    and tallied in self.found, a dict of {file path: number of matching blocks}.
    Extra keyword arguments (e.g., convert_charrefs) go to HTMLParser.
    """

    def __init__(self, findit, findin, dumpit, **others):
        self.findit = findit
        self.findin = findin
        self.dumpit = dumpit
        self.found = {}                # count {files: matches}
        self.path = None               # file being parsed, set by parse()
        self.content = ''              # text collected for the open block
        self.saving = False            # True while inside a <findin> block
        super().__init__(**others)     # pass others to superclass

    def parse(self, text, path):
        """
        Parse one file's text, recording matches under path in self.found.
        """
        # Drop anything a prior file left behind in the parser: unprocessed
        # buffered text or an unterminated <script>/<style> (CDATA mode)
        # would otherwise bleed into this file's parse.
        self.reset()
        self.path = path
        self.content = ''              # don't rematch a prior file's block
        self.saving = False
        self.feed(text)
        self.close()                   # force processing of buffered data

    def handle_starttag(self, tag, attrs):
        if tag == self.findin:
            self.content = ''
            self.saving = True

    def handle_data(self, data):
        if self.saving:
            self.content += data

    def handle_endtag(self, tag):
        # Ignore other tags, and stray end tags that have no open block:
        # the latter would recheck (and recount) the last block's content.
        if tag != self.findin or not self.saving:
            return
        self.saving = False
        if self.findit in self.content:
            message = 'Found <%s> "%s" in: %s'
            print(message % (self.findin, self.findit, self.path))
            self.found[self.path] = self.found.get(self.path, 0) + 1
            if self.dumpit:
                print('-'*80, self.content, '-'*80, sep='\n')


def walker(rootpath='.', findit=findit, findin=findin, dumpit=dumpit):
    """
    Search all the HTML files in the tree at rootpath for findit text
    inside <findin>...</findin> blocks, using a single shared Parser.

    Files are selected by a '.html' or '.htm' filename extension (any case),
    and are read as UTF-8 text, falling back on Latin-1 if that decoding
    fails.  Files that cannot be read are reported with print() and skipped.

    Returns a tuple (numhtml, found): the number of HTML files parsed, and
    the parser's {file path: number of matching blocks} dict.
    """
    numhtml = 0
    parser = Parser(findit, findin, dumpit, convert_charrefs=convert_charrefs)
    for (dirhere, _subs, files) in os.walk(rootpath):
        for file in files:
            if not file.lower().endswith(('.html', '.htm')):
                continue
            path = os.path.join(dirhere, file)
            try:
                try:
                    with open(path, encoding='utf8') as htmlfile:
                        text = htmlfile.read()
                except UnicodeDecodeError:
                    # not UTF-8: Latin-1 accepts any byte sequence
                    with open(path, encoding='latin1') as htmlfile:
                        text = htmlfile.read()
            except (OSError, UnicodeError):
                # unreadable file: report and move on, but don't trap
                # unrelated exceptions such as KeyboardInterrupt
                print('decoding error skipped:', path)
                continue
            numhtml += 1
            parser.parse(text, path)
    return numhtml, parser.found


if __name__ == '__main__':
    # Command line: an optional website root folder; default=current dir
    cmdargs = sys.argv[1:]
    if cmdargs:
        rootpath = cmdargs[0]
    else:
        rootpath = os.getcwd()

    # Walk the tree, then summarize: files parsed, matches, files matched
    numhtml, found = walker(rootpath, findit, findin)
    nummatches = sum(found.values())
    numfiles = len(found)
    print('HTML files parsed:', numhtml)
    print('Found %d matches in %d files' % (nummatches, numfiles))


"""
=================================================================================
EXAMPLE OUTPUT:
~/MY-STUFF/Websites$ python3 $C/_etc/findem.py UNION 
Found <pre> "/*" in: UNION/pyedit-linux-file-associations.html
Found <pre> "/*" in: UNION/errata-supplements.html
Found <pre> "/*" in: UNION/errata-supplements.html
Found <pre> "/*" in: UNION/extract_py.html
Found <pre> "/*" in: UNION/talk.html
Found <pre> "/*" in: UNION/extract-calendar_py.html
Found <pre> "/*" in: UNION/talkmore.html
Found <pre> "/*" in: UNION/talkmore.html
Found <pre> "/*" in: UNION/talkmore.html
Found <pre> "/*" in: UNION/newex.html
Found <pre> "/*" in: UNION/newex.html
Found <pre> "/*" in: UNION/newex.html
Found <pre> "/*" in: UNION/newex.html
Found <pre> "/*" in: UNION/pymailgui-products/unzipped/_README.html
Found <pre> "/*" in: UNION/ziptools/ziptools/_README.html
Found <pre> "/*" in: UNION/ziptools/ziptools/_README.html
Found <pre> "/*" in: UNION/ziptools/ziptools/_README.html
Found <pre> "/*" in: UNION/mergeall-android-scripts/_README.html
Found <pre> "/*" in: UNION/mergeall-products/unzipped/test/ziptools/_README.html
Found <pre> "/*" in: UNION/mergeall-products/unzipped/test/ziptools/_README.html
Found <pre> "/*" in: UNION/mergeall-products/unzipped/test/ziptools/_README.html
HTML files parsed: 2213
Found 21 matches in 11 files
=================================================================================
"""