#!/usr/bin/python
"""mxTextTools version of Typographify() in dmTxt2Html.py

The hope here is that this version will be dramatically faster than
the regular-expression based version.
"""
from TextTools import *
import string, re

#-- List to contain all words with adjusted markup
ws = []
head_pos = None     # text position recorded by the previous jump_count() call
loops = 0           # number of jump_count() calls made so far


class InfiniteLoopError(Exception):
    """Raised by jump_count() when a full tagging pass consumed no text."""


#-- Define "emitter" callbacks for each output format
# Every callback takes (tl, txt, l, r, s); only the text `txt` and the slice
# bounds `l`/`r` are used.  The marked-up callbacks drop the first and last
# character of the match (the markup delimiters) before appending to `ws`.
# NOTE(review): the '' prefix/suffix concatenations below are no-ops as
# written; they look like placeholders where per-format markup strings were
# lost -- confirm against dmTxt2Html.py before relying on the output.
def emit_misc(tl,txt,l,r,s):
    """Append the matched text unchanged."""
    ws.append(txt[l:r])

def emit_func(tl,txt,l,r,s):
    """Append a 'function' phrase without its delimiters."""
    ws.append(''+txt[l+1:r-1]+'')

def emit_modl(tl,txt,l,r,s):
    """Append a [module] phrase without its delimiters."""
    ws.append(''+txt[l+1:r-1]+'')

def emit_emph(tl,txt,l,r,s):
    """Append an -emphasized- phrase without its delimiters."""
    ws.append(''+txt[l+1:r-1]+'')

def emit_strg(tl,txt,l,r,s):
    """Append a *strong* phrase without its delimiters."""
    ws.append(''+txt[l+1:r-1]+'')

def emit_titl(tl,txt,l,r,s):
    """Append a _title_ phrase without its delimiters."""
    ws.append(''+txt[l+1:r-1]+'')

def jump_count(tl,txt,l,r,s):
    """Count one pass over the tag table and detect a stalled tagger.

    Increments the global `loops` counter and remembers the current right
    position `r` in the global `head_pos`.

    Raises:
        InfiniteLoopError: if `r` equals the position recorded by the
            previous call, i.e. nothing was consumed since then.  The
            message shows the surrounding text with the stalled character
            wrapped in braces.
    """
    global head_pos, loops
    loops = loops+1
    if head_pos is not None and head_pos == r:
        # Clamp the left context at 0 (a negative start would index from the
        # end of the text) and slice rather than index the current character
        # so that stalling at the very end of the text cannot raise
        # IndexError instead of the intended error.
        context = txt[max(0, l-20):l]+'{'+txt[l:l+1]+'}'+txt[l+1:r+15]
        raise InfiniteLoopError(context)
    head_pos = r

def emit_debug(tl,txt,l,r,s):
    """Append the matched text unchanged and echo it to stdout."""
    ws.append(txt[l:r])
    # Single parenthesized expression: prints identically on Python 2 and 3.
    print('<< '+txt[l:r]+' >>')

#-- What can appear inside, and what can be, markups?
# NOTE(review): with `from TextTools import *` in effect, `set` below is
# presumably mxTextTools' character-set builder rather than the builtin --
# do not replace these calls with set literals without checking.
punctuation = "`!@#$%^&*()_-+=|\\{}[]:;'<>,.?/"+'"'
punct_set = set(punctuation)
markable = alphanumeric+whitespace+"`!@#$%^&()+=|\\{}:;<>,.?/"+'"'
# Each markup kind may contain every markup character except its own
# delimiter(s).
markable_func = set(markable+"*-_[]")
markable_modl = set(markable+"*-_'")
markable_emph = set(markable+"*_'[]")
markable_strg = set(markable+"-_'[]")
markable_titl = set(markable+"*-'[]")
markups = "-*'[]_"
markup_set = set(markups)

# What can precede and follow markup phrases?
darkins = '(/"'
leadins = whitespace+darkins        # might add from "-*'[]_"
darkouts = '/.),:;?!"'
darkout_set = set(darkouts)
leadouts = whitespace+darkouts      # for non-conflicting markup
leadout_set = set(leadouts)

# What can appear inside plain words?
wordish = alphanumeric+'{}/@#$%^&-_+=|\\><'+darkouts
word_set = set(wordish)
wordinit = alphanumeric+"$#+\\<.&{"+darkins
wordinit_set = set(wordinit)

#-- Define the word patterns (global so as to do it only at import)
# Special markup
def markup_struct(lmark, rmark, callback, markables, x_post="-"):
    """Build the tag-table entry for one kind of delimited markup phrase.

    Args:
        lmark: single character that opens the phrase.
        rmark: single character that closes the phrase.
        callback: emitter invoked (via CallTag) when the phrase matches.
        markables: character set allowed between the two markers.
        x_post: extra characters accepted right after the closing marker
            in addition to the normal lead-out characters.

    Returns:
        A (callback, Table+CallTag, subtable) tuple, ready to have jump
        offsets appended when it is placed in a tag table.
    """
    struct = \
      ( callback, Table+CallTag,
        ( (None, Is, lmark),                 # Starts with left marker
          (None, AllInSet, markables),       # All stuff marked (that looks right)
          (None, Is, rmark),                 # Ends with right marker
          (None, IsInSet, leadout_set,+2,+1),# EITHER: postfixed with lead-out
          (None, Skip, -1,+1, MatchOk),      # ..give back trailing lead-out char
          (None, IsIn, x_post, MatchFail),   # OR: special case postfix
          (None, Skip, -1,+1, MatchOk)       # ..give back special trailing char
        )
      )
    return struct

funcs   = markup_struct("'", "'", emit_func, markable_func)
modules = markup_struct("[", "]", emit_modl, markable_modl)
emphs   = markup_struct("-", "-", emit_emph, markable_emph, x_post="")
strongs = markup_struct("*", "*", emit_strg, markable_strg)
titles  = markup_struct("_", "_", emit_titl, markable_titl)

# All the stuff not specially marked
plain_words = \
  ( ws, Table+AppendMatch,           # AppendMatch is only -slightly-
    ( (None, IsInSet,                # faster than emit_misc callback
         wordinit_set, MatchFail),   # Must start with word-initial
      (None, Is, "'",+1),            # May have apostrophe next
      (None, AllInSet, word_set,+1), # May have more word-internal
      (None, Is, "'", +2),           # May have trailing apostrophe
      (None, IsIn, "st",+1),         # May have [ts] after apostrophe
      (None, IsInSet,
         darkout_set,+1, MatchOk),   # Postfixed with dark lead-out
      (None, IsInSet,
         whitespace_set, MatchFail), # Give back trailing whitespace
      (None, Skip, -1)
    )
  )

# Catch some special cases
bullet_point = \
  ( ws, Table+AppendMatch,
    ( (None, Word+CallTag, "* "),    # Asterisk bullet is a word
    )
  )

# NOTE(review): the second entry presumably needs at least one dash beyond
# the first 50 to succeed -- confirm whether exactly 50 dashes should match.
horiz_rule = \
  ( None, Table,
    ( (None, Word, "-"*50),          # 50 dashes in a row
      (None, AllIn, "-"),            # More dashes
    )
  )

into_mark = \
  ( ws, Table+AppendMatch,           # Special case where dark lead-in
    ( (None, IsInSet, set(darkins)), # is followed by markup char
      (None, IsInSet, markup_set),
      (None, Skip, -1)               # Give back the markup char
    )
  )

stray_punct = \
  ( ws, Table+AppendMatch,           # Pickup any cases where multiple
    ( (None, IsInSet, punct_set),    # punctuation character occur
      (None, AllInSet, punct_set),   # alone (followed by whitespace)
      (None, IsInSet, whitespace_set),
      (None, Skip, -1)               # Give back the whitespace
    )
  )

leadout_eater = (ws, AllInSet+AppendMatch, leadout_set)

# Tag all the (possibly marked-up) words
# Every entry in one pass over the table; order matters and `into_mark`
# deliberately appears twice.
_loop_entries = \
  ( bullet_point+(+1,),
    horiz_rule + (+1,),
    into_mark  + (+1,),
    stray_punct+ (+1,),
    emphs   + (+1,),
    funcs   + (+1,),
    strongs + (+1,),
    modules + (+1,),
    titles  + (+1,),
    into_mark+(+1,),
    plain_words +(+1,),              # Since file is mostly plain words, can
    leadout_eater+(+1,-1),           # shortcut by tight looping (with escape)
    (jump_count, Skip+CallTag, 0),   # Check for infinite loop
  )
# When not at EOF, jump back to the first entry.  The offset is derived from
# the table length (it was a hard-coded -13) so that adding or removing an
# entry above cannot silently break the loop.
tag_words = _loop_entries + \
  ( (None, EOF, Here, -len(_loop_entries)),   # Check for EOF
  )

def Typographify(txt):
    """Return `txt` with its typographic markup processed by mxTextTools.

    Runs the `tag_words` table over the whole text, collecting the output
    pieces in the global list `ws`, and returns them joined into one string.

    The stall-detection position `head_pos` is reset on every call;
    otherwise a position left over from a previous call could coincide with
    the first check of this call and raise a spurious infinite-loop error
    (e.g. when the same short text is processed twice).  The `loops`
    counter is left cumulative across calls, as before.
    """
    global ws, head_pos
    ws = []    # clear the list before we proceed
    head_pos = None
    tag(txt, tag_words, 0, len(txt), ws)
    return ''.join(ws)

if __name__ == '__main__':
    import sys, time
    if len(sys.argv) < 2:
        sys.stderr.write('usage: %s FILE\n' % sys.argv[0])
        sys.exit(2)
    # try/finally instead of `with` so the script runs on any Python 2.
    infile = open(sys.argv[1])
    try:
        txt = infile.read()
    finally:
        infile.close()
    start = time.time()
    mx_txt = Typographify(txt)
    sys.stderr.write('*** TextTools processing ***\n')
    sys.stderr.write(sys.argv[1]+' processed in %.3f seconds' % (time.time()-start)+'\n')
    sys.stderr.write('%d full tagging loops\n' % loops)
    print(mx_txt)

    # Just for comparison, let us time the [re] version being replaced
    def reTypographify(txt):
        """Apply the regular-expression markup substitutions to `txt`.

        The substitutions run in the order listed; each replaces a
        delimited phrase with its lead-in, inner text and lead-out.
        NOTE(review): every replacement is just the three groups with
        nothing added, so the delimiters are simply removed -- this looks
        like per-format markup was lost from the replacement strings;
        confirm against dmTxt2Html.py.
        """
        flags = re.M | re.S
        substitutions = (
            # [module] names
            (r"""([\(\s'/">]|^)\[(.*?)\]([<\s\.\),:;'"?!/-])""", '\\1\\2\\3'),
            # *strongly emphasize* words
            (r"""([\(\s'/"]|^)\*(.*?)\*([\s\.\),:;'"?!/-])""", '\\1\\2\\3'),
            # -emphasize- words
            (r"""([\(\s'/"]|^)-(.*?)-([\s\.\),:;'"?!/])""", '\\1\\2\\3'),
            # _Book Title_ citations
            (r"""([\(\s'/"]|^)_(.*?)_([\s\.\),:;'"?!/-])""", '\\1\\2\\3'),
            # 'Function()' names
            (r"""([\(\s/"]|^)'(.*?)'([\s\.\),:;"?!/-])""", "\\1\\2\\3"),
        )
        for pattern, repl in substitutions:
            txt = re.compile(pattern, flags).sub(repl, txt)
        return txt

    start = time.time()
    re_txt = reTypographify(txt)    # result is only needed for the timing
    sys.stderr.write('*** re processing ***\n')
    sys.stderr.write(sys.argv[1]+' processed in %.3f seconds' % (time.time()-start)+'\n')