""" ========================================================================================= temp implementation notes: --DONE: re.findall subsumes page.count(term) for literals --DONE: str.lower() not required idf case insensitive match --DONE: combine nouns/verbs automatically --DONE could encode/decode to str per utf-8 (maybe) --DONE: for (score, terms) in [(+1, goodterms), (-1, badterms)]: net += score * len() # orginal coding # assumes byte strings (b'...') to avoid a raw html text decode goodterms = [b'wall (?:street|st) rises', b'stocks rose', b'stocks rise', b'markets rose', b'markets rise', b'market rose', b'market rises', b's&p gains'] # bad is auto converse of good badterms = goodterms.copy() badterms = [term.replace(b'rises', b'falls') for term in badterms] badterms = [term.replace(b'rose', b'fell') for term in badterms] badterms = [term.replace(b'rise', b'fall') for term in badterms] badterms = [term.replace(b'gains', b'loses') for term in badterms] # temp list comp equiv res = [] for term in goodterms: for term2 in goodterms: if term != term2 and term2.startswith(term): break else: res .append(term) # verb pattern effect >>> re.findall('wall street (?:end(?:s)? )?lower', 'cccwall st ends lowerccc') [] >>> re.findall('wall street (?:end(?:s)? )?lower', 'cccwall street ends lowerccc') ['wall street ends lower'] >>> re.findall('wall street (?:end(?:s)? )?lower', 'cccwall street end lowerccc') ['wall street end lower'] >>> re.findall('wall street (?:end(?:s)? )?lower', 'cccwall street lowerccc') ['wall street lower'] ========================================================================================= """