#!/usr/bin/env python3
"""
==================================================================================
findem.py - locate text that may truncate web pages in Android Opera

Origin: January 2022, M. Lutz, https://learning-python.com
License: provided freely but with no warranties of any kind

Use HTML parsing to find all <pre> or <code> blocks containing a "/*" character
sequence, among all the HTML files in a website tree. This sequence makes some
Android Operas truncate some pages. This is a browser bug that impacts 3 out of
11 pages with this pattern at the host website, in many Opera 5x and 6x versions.
To work around: search for occurrences here, and replace "/" with "&#47;" HTML
escapes (these are converted to "/" in <pre> and <code> by all browsers tested).
About the Opera bug: learning-python.com/about-this-site.html#androidbrowsernews.
Update: it's not just <pre>. A "/*" in a <code> or comment can truncate
pages in Android Opera too. This now handles <code>; edit findin's default below
or pass to walker(). Comments are ignored here, and the bug varies by page.
Subtlety: Python's HTML parser converts already-changed &#47; charrefs to a "/"
unless convert_charrefs is False. If you want to see all files that have or had
a "/*", set this to True; the False preset shows only files still having a "/*".
==================================================================================
"""
import os, sys, html.parser
# EDIT ME: per-parser defaults, used by walker() when arguments are omitted
findit = "/*"     # the text to look for ...
findin = "pre"    # ... inside blocks of this tag: pre|code
dumpit = False    # verbose mode: also print each matching block's full text

# EDIT ME: see the charrefs subtlety described in the module docstring
convert_charrefs = False    # False=report only findits still present
class Parser(html.parser.HTMLParser):
    """
    Scan HTML text for <findin> blocks whose text contains findit.

    One instance may be reused for many files: call parse() once per file.
    Each match is printed as it is found and tallied by file in self.found.
    """

    def __init__(self, findit, findin, dumpit, **others):
        """
        findit: text to search for inside blocks.
        findin: name of the tag whose blocks are searched.
        dumpit: if true, also print the full text of each matching block.
        others: keyword arguments passed through to HTMLParser.
        """
        self.findit = findit
        self.findin = findin
        self.dumpit = dumpit
        self.found = {}                # count {files: matches}
        self.path = None               # file being parsed; set by parse()
        self.saving = False            # True while inside a findin block
        self.content = ''              # text collected for the current block
        super().__init__(**others)     # pass others to superclass

    def parse(self, text, path):
        """Parse one file's text, reporting any matches under path."""
        # Start clean: without reset(), an incomplete construct at the end of
        # a prior file stays buffered and is prepended to this file's text.
        self.reset()
        self.path = path
        self.saving = False
        self.content = ''
        self.feed(text)
        self.close()                   # force processing of buffered leftovers

    def handle_starttag(self, tag, attrs):
        """Begin collecting text when a findin block opens."""
        if tag == self.findin:
            self.content = ''
            self.saving = True

    def handle_data(self, data):
        """Accumulate text, but only while inside a findin block."""
        if self.saving:
            self.content += data

    def handle_endtag(self, tag):
        """Test and report the collected text when a findin block closes."""
        if tag != self.findin or not self.saving:
            # Other tags are irrelevant; a stray closer with no opener has no
            # block of its own, so it must not re-test stale or missing text.
            return
        self.saving = False
        if self.findit in self.content:
            message = 'Found <%s> "%s" in: %s'
            print(message % (self.findin, self.findit, self.path))
            self.found[self.path] = self.found.get(self.path, 0) + 1
            if self.dumpit:
                print('-'*80, self.content, '-'*80, sep='\n')
        self.content = ''              # never carry a block's text forward
def _read_html(path):
    """Return path's text decoded as UTF-8, else Latin-1; None if unreadable."""
    for encoding in ('utf8', 'latin1'):
        try:
            with open(path, encoding=encoding) as htmlfile:   # always closed
                return htmlfile.read()
        except (OSError, UnicodeError):
            pass                       # fall through to the next encoding
    return None


def walker(rootpath='.', findit=findit, findin=findin, dumpit=dumpit):
    """
    Parse every .html/.htm file in the tree at rootpath with one Parser.

    rootpath: directory at which the os.walk() traversal starts.
    findit, findin, dumpit: passed to Parser; default to the module settings.

    Files that cannot be read in either encoding are reported and skipped.
    Returns a tuple: (number of HTML files parsed, {path: match count}).
    """
    numhtml = 0
    parser = Parser(findit, findin, dumpit, convert_charrefs=convert_charrefs)
    for (dirhere, subs, files) in os.walk(rootpath):
        for file in files:
            if not file.lower().endswith(('.html', '.htm')):
                continue
            path = os.path.join(dirhere, file)
            text = _read_html(path)
            if text is None:
                print('decoding error skipped:', path)
                continue
            numhtml += 1
            parser.parse(text, path)
    return numhtml, parser.found
if __name__ == '__main__':
    # Tree to scan: the first command-line argument if given, else the cwd.
    if len(sys.argv) > 1:
        rootpath = sys.argv[1]
    else:
        rootpath = os.getcwd()

    numhtml, found = walker(rootpath, findit, findin)

    # Summary: files parsed, then total matches and the number of files hit.
    print('HTML files parsed:', numhtml)
    summary = (sum(found.values()), len(found))
    print('Found %d matches in %d files' % summary)
"""
=================================================================================
EXAMPLE OUTPUT:
~/MY-STUFF/Websites$ python3 $C/_etc/findem.py UNION
Found <pre> "/*" in: UNION/pyedit-linux-file-associations.html
Found <pre> "/*" in: UNION/errata-supplements.html
Found <pre> "/*" in: UNION/errata-supplements.html
Found <pre> "/*" in: UNION/extract_py.html
Found <pre> "/*" in: UNION/talk.html
Found <pre> "/*" in: UNION/extract-calendar_py.html
Found <pre> "/*" in: UNION/talkmore.html
Found <pre> "/*" in: UNION/talkmore.html
Found <pre> "/*" in: UNION/talkmore.html
Found <pre> "/*" in: UNION/newex.html
Found <pre> "/*" in: UNION/newex.html
Found <pre> "/*" in: UNION/newex.html
Found <pre> "/*" in: UNION/newex.html
Found <pre> "/*" in: UNION/pymailgui-products/unzipped/_README.html
Found <pre> "/*" in: UNION/ziptools/ziptools/_README.html
Found <pre> "/*" in: UNION/ziptools/ziptools/_README.html
Found <pre> "/*" in: UNION/ziptools/ziptools/_README.html
Found <pre> "/*" in: UNION/mergeall-android-scripts/_README.html
Found <pre> "/*" in: UNION/mergeall-products/unzipped/test/ziptools/_README.html
Found <pre> "/*" in: UNION/mergeall-products/unzipped/test/ziptools/_README.html
Found <pre> "/*" in: UNION/mergeall-products/unzipped/test/ziptools/_README.html
HTML files parsed: 2213
Found 21 matches in 11 files
=================================================================================
"""