A simple DeTeX function in python – LaTeX to text

The LaTeX logo, typeset with LaTeX (Photo credit: Wikipedia)
I have implemented a simple DeTeX function in Python. I provide this function below, as is and without any guarantee. If you run it, it should turn the example LaTeX text into “simple” text thanks to the detex() function defined in the code.
It’s a quick and dirty approach: I did not try to implement the full LaTeX syntax. I just applied a few regexps to strip the commands from the text. Feedback will be appreciated in the comment form below 🙂
Beware of the “backslash plague”, as explained in http://docs.python.org/2/howto/regex.html.
#!/usr/bin/python
# -*- coding: UTF-8 -*-
    
import re

# Set to True to print every rule just before it is applied (debugging aid).
testMode = False


def applyRegexps(text, listRegExp):
    """Apply a series of regexp substitutions to a text, in order.

    Args:
        text: the string to transform.
        listRegExp: the rules to apply; each rule is a mapping with a 'left'
            entry (the regexp pattern) and a 'right' entry (the replacement,
            in re.sub() syntax, so group references are allowed).

    Returns:
        The text once every rule has been applied, each rule working on the
        output of the previous one.
    """
    for rule in listRegExp:
        if testMode:
            # Rules are dicts, so they cannot be handed to str.join() directly.
            print('%r -> %r' % (rule['left'], rule['right']))
        text = re.sub(rule['left'], rule['right'], text)
    return text

r"""
     _      _             ____  
  __| | ___| |_ _____  __/ /\ \ 
 / _` |/ _ \ __/ _ \ \/ / |  | |
| (_| |  __/ ||  __/>  <| |  | |
 \__,_|\___|\__\___/_/\_\ |  | |
                         \_\/_/ 
"""

def detex(latexText):
    r"""Transform a LaTeX text into a simple plain text.

    Quick and dirty: a fixed series of regexp substitutions is applied, the
    LaTeX syntax is not parsed. In particular, a command whose argument
    contains nested curly brackets is not matched by the bracket-based rules.

    Passes, in order:
      1. drop the header, up to and including the first \begin{document};
      2. strip comments (an unescaped % and the rest of its line);
      3. replace formatting commands (\emph{...}, \textbf{...}, ...) by
         their argument;
      4. highlight headings as #--title--# and the arguments of \title,
         \author, \thanks, \cite and \ref as [argument];
      5. remove structural commands (\begin, \label, \usepackage, ...)
         together with their options and arguments;
      6. translate symbols, table markup and a few accented letters, drop
         the math delimiters, then tidy spaces, brackets and blank lines.

    Args:
        latexText: the LaTeX source, as a string.

    Returns:
        The simplified text.
    """
    # A command name ends at the first non-letter. Without this guard a rule
    # for \right would also eat the beginning of \rightarrow.
    endOfName = r'(?![A-Za-z])'
    # One argument in curly brackets, without nested brackets; group 1 is
    # the content of the brackets.
    argument = r'\{([^}{]*)\}'

    text = latexText

    # remove all the contents of the header, ie everything up to the first
    # occurrence of "\begin{document}" (a text without one is left untouched)
    text = re.sub(r'(?s).*?\\begin\{document\}', '', text, count=1)

    # remove comments: an unescaped "%" and the rest of its line.
    # The lookbehind keeps "\%" and also catches a comment at position 0.
    text = applyRegexps(text, [{'left': r'(?<!\\)%.*', 'right': r''}])

    # ---------------------------------------------------------------- reduce
    # replace some LaTeX commands by the contents inside curly brackets
    to_reduce = [
        r'\\emph', r'\\textbf', r'\\textit', r'\\text',
        r'\\IEEEauthorblockA', r'\\IEEEauthorblockN',
        r'\\author', r'\\caption', r'\\thanks',
    ]
    regexps = [{'left': tag + argument, 'right': r'\1'} for tag in to_reduce]
    text = applyRegexps(text, regexps)

    # ------------------------------------------------------------- highlight
    # replace some LaTeX commands by the contents inside curly brackets and
    # highlight these contents
    # highlightment pattern: #--content--# (starred variants included)
    headings = [
        r'\\part', r'\\chapter', r'\\section', r'\\subsection',
        r'\\subsubsection', r'\\paragraph',
    ]
    regexps = [{'left': tag + r'[*]*' + argument, 'right': r'\n#--\1--#\n'}
               for tag in headings]
    # highlightment pattern: [content]
    # (\author and \thanks only reach this point when their argument held a
    # nested command during the "reduce" pass)
    bracketed = [r'\\title', r'\\author', r'\\thanks', r'\\cite', r'\\ref']
    regexps += [{'left': tag + argument, 'right': r'[\1]'}
                for tag in bracketed]
    text = applyRegexps(text, regexps)

    # ---------------------------------------------------------------- remove
    # remove completely some LaTeX commands, with their [options] and
    # {arguments}, and put a single space instead
    to_remove = [
        r'\\maketitle', r'\\footnote', r'\\centering',
        r'\\IEEEpeerreviewmaketitle', r'\\includegraphics',
        r'\\IEEEauthorrefmark', r'\\label', r'\\begin', r'\\end',
        r'\\documentclass', r'\\usepackage', r'\\bibliographystyle',
        r'\\bibliography', r'\\cline', r'\\multicolumn',
    ]
    regexps = [{'left': tag + endOfName + r'(\[[^\]]*\])*(\{[^}{]*\})*',
                'right': r' '}
               for tag in to_remove]
    # delimiter sizing commands take no argument: only the command itself is
    # dropped, so that "\left[a,b\right]" keeps its "[a,b]"
    delimiters = [r'\\big', r'\\right', r'\\left']
    regexps += [{'left': tag + endOfName, 'right': r' '}
                for tag in delimiters]
    text = applyRegexps(text, regexps)

    # --------------------------------------------------------------- replace
    regexps = [
        # replace some symbols by their ascii equivalent
        # - common symbols
        {'left': r'\\eg' + endOfName + r'(\{\})* *', 'right': r'e.g., '},
        {'left': r'\\ldots' + endOfName, 'right': r'...'},
        {'left': r'\\Rightarrow' + endOfName, 'right': r'=>'},
        {'left': r'\\rightarrow' + endOfName, 'right': r'->'},
        {'left': r'\\leq?' + endOfName, 'right': r'<='},
        {'left': r'\\geq?' + endOfName, 'right': r'>='},
        {'left': r'\\_', 'right': r'_'},
        {'left': r'\\\\', 'right': r'\n'},
        {'left': r'~', 'right': r' '},
        # column separators must become tabs BEFORE "\&" is unescaped,
        # otherwise the literal ampersands would be turned into tabs too
        {'left': r'(?<!\\)&', 'right': r'\t'},
        {'left': r'\\&', 'right': r'&'},
        {'left': r'\\%', 'right': r'%'},
        {'left': r'\\item' + endOfName, 'right': r'\t- '},
        {'left': r'\\hline[ \t]*\\hline',
         'right': r'============================================='},
        {'left': r'[ \t]*\\hline',
         'right': r'_____________________________________________'},
        # - special letters
        {'left': r"\\'\{?\{e\}\}?", 'right': r'é'},
        {'left': r"\\`\{?\{a\}\}?", 'right': r'à'},
        {'left': r"\\'\{?\{o\}\}?", 'right': r'ó'},
        {'left': r"\\'\{?\{a\}\}?", 'right': r'á'},
        # keep untouched the contents of the equations: only the unescaped
        # equation symbols ($) are removed, then "\$" becomes a plain "$"
        {'left': r'(?<!\\)\$', 'right': r''},
        {'left': r'\\\$', 'right': r'$'},
        # correct spacing problems
        {'left': r' +,', 'right': r','},
        {'left': r' +', 'right': r' '},
        {'left': r' +\)', 'right': r')'},
        {'left': r'\( +', 'right': r'('},
        {'left': r' +\.', 'right': r'.'},
        # remove lonely curly brackets: a closing bracket that no opening
        # bracket precedes, then the remaining unescaped {...} pairs
        {'left': r'^([^{]*)\}', 'right': r'\1'},
        {'left': r'([^\\])\{([^}]*)\}', 'right': r'\1\2'},
        {'left': r'\\\{', 'right': r'{'},
        {'left': r'\\\}', 'right': r'}'},
        # strip white space characters at end of line
        {'left': r'[ \t]*\n', 'right': r'\n'},
        # remove consecutive blank lines (three line ends or more in a row)
        {'left': r'([ \t]*\n){3,}', 'right': r'\n'},
    ]
    # apply all those regexps
    text = applyRegexps(text, regexps)
    # return the modified text
    return text

r"""
                 _       
 _ __ ___   __ _(_)_ __  
| '_ ` _ \ / _` | | '_ \ 
| | | | | | (_| | | | | |
|_| |_| |_|\__,_|_|_| |_|
                         
"""
def main():
    """Run detex() on a sample LaTeX article and print the result.

    Just for debugging: the sample exercises the header removal, comments,
    headings, citations, footnotes, a table, item lists and a figure.
    """
    # NOTE(review): the backslashes of this sample were restored by hand;
    # "\%", "\eg", "\today" and "\now" are best guesses -- confirm against
    # the author's copy. "[email protected]" looks like a redaction
    # placeholder and is kept verbatim.
    latexText = r"""
    % This paper can be formatted using the peerreviewca
    % (instead of conference) mode.
    \documentclass[twocolumn,a4paper]{article}
    %\documentclass[peerreviewca]{IEEEtran}
    % correct bad hyphenation here
    \hyphenation{op-ti-cal net-works semi-con-duc-tor IEEEtran pri-va-cy Au-tho-ri-za-tion}
    % package for printing the date and time (version)
    \usepackage{time}
    \begin{document}
    \title{Next Generation Networks}
    \author{Tot titi\thanks{Network and Security -- test company -- [email protected]}}
    \maketitle
    \begin{abstract}\footnote{Version :  \today ;  \now}
    lorem ipsum(\ldots)\end{abstract}
    \emph{Keywords: IP Multimedia Subsystem, Quality of Service}
    \section{Introduction} \label{sect:introduction}
    lorem ipsum(\ldots) \% of the world population. \cite{TISPAN2006a}. \footnote{Bearer Independent Call Control protocol}. 
    \hline
    \section{Protocols used in IMS} \label{sect:protocols}
    lorem ipsum(\ldots) \cite{rfc2327, rfc3264}.
    \subsection{Authentication, Authorization, and Accounting} \label{sect:protocols_aaa}
    lorem ipsum(\ldots)
    \subsubsection{Additional protocols} \label{sect:protocols_additional}
    lorem ipsum(\ldots)
    \begin{table}
        \begin{center}
            \begin{tabular}{|c|c|c|}
            \hline
                \textbf{Capability}                                 & \textbf{UE} & \textbf{GGSN} \\ \hline
                \emph{DiffServ Edge Function}           & Optional      & Required          \\ \hline
                \emph{RSVP/IntServ}                                 & Optional      & Optional          \\ \hline
                \emph{IP Policy Enforcement Point}  & Optional      & Required          \\ \hline
            \end{tabular}
        \caption{IP Bearer Services Manager capability in the UE and GGSN}
        \label{tab_ue_ggsn}
        \end{center}
    \end{table}
     The main transport layer functions are listed below:
    \begin{my_itemize}
        \item The \emph{Resource Control Enforcement Function} (RCEF) enforces policies under the control of the A-RACF. It opens and closes unidirectional filters called \emph{gates} or \emph{pinholes}, polices traffic and marks IP packets \cite{TISPAN2006c}.
        \item  The \emph{Border Gateway Function} (BGF) performs policy enforcement and Network Address Translation (NAT) functions under the control of the S-PDF. It operates on unidirectional flows related to a particular session (micro-flows) \cite{TISPAN2006c}.
        \item  The \emph{Layer 2 Termination Point} (L2TP) terminates the Layer 2 procedures of the access network \cite{TISPAN2006c}.
    \end{my_itemize}
    Their QoS capabilities are summarized in table \ref{tab_rcef_bgf} \cite{TISPAN2006c}.
    The admission control usually follows a three step procedure:
    \begin{my_enumerate}
        \item Authorization of resources (\eg by the A-RACF)
        \item Resource reservation (\eg by the BGF)
        \item Resource commitment (\eg by the RCEF)
    \end{my_enumerate}
    \begin{figure}
    \centering
    \includegraphics[width=1.5in]{./pictures/RACS_functional_architecture}
    \caption{RACS interaction with transfer functions}
    \label{fig_RACS_functional_architecture}
    \end{figure}
    %\subsection{Example}  \label{sect:qos_example}
    % conference papers do not normally have an appendix
    % use section* for acknowledgement
    \section*{Acknowledgment}
    % optional entry into table of contents (if used)
    %\addcontentsline{toc}{section}{Acknowledgment}
    lorem ipsum(\ldots)
    \bibliographystyle{plain}
    %\bibliographystyle{alpha}
    \bibliography{./mabiblio}
    \end{document}
    """
    text = detex(latexText)
    # single-argument call form: valid as a statement in Python 2 and as a
    # function call in Python 3
    print(text)


# run the demo only when the file is executed as a script
if __name__ == "__main__":
    main()
Enjoy! And feel free to comment below or to put a link to this article on your blog. Thanks!
A simple DeTeX function in python – LaTeX to text

Leave a Reply Cancel reply