The LaTeX logo, typeset with LaTeX
The LaTeX logo, typeset with LaTeX (Photo credit: Wikipedia)

I have implemented a simple DeTeX function in Python. I provide this function below, as is and without any guarantee. If you run it, it should convert the example LaTeX text into “simple” text, thanks to the detex() function defined in the code.

It’s a quick and dirty approach: I did not try to implement the full LaTeX syntax. I just applied a few regexps to strip the commands from the text. Feedback is appreciated in the comment form below 🙂

Beware of the “backslash plague”, as explained in http://docs.python.org/2/howto/regex.html.

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import re

# Set to True to print every substitution rule before it is applied.
testMode = False


def applyRegexps(text, listRegExp):
    """Apply a sequence of regexp substitutions to a text, in order.

    Args:
        text: the string to transform.
        listRegExp: iterable of rules; each rule is a dict whose 'left'
            entry is a regular expression and whose 'right' entry is the
            replacement template (back-references such as \\1 allowed).

    Returns:
        The text after every rule has been applied once, in list order.
        Each rule sees the output of the previous one.
    """
    if testMode:
        # The rules are dicts, so they must be formatted before joining
        # (joining the dicts themselves raises TypeError).
        print('\n'.join(
            '%r -> %r' % (rule['left'], rule['right']) for rule in listRegExp
        ))
    for rule in listRegExp:
        text = re.sub(rule['left'], rule['right'], text)
    return text

"""
_ _ ____
__| | ___| |_ _____ __/ /
/ _` |/ _ __/ _ / / | | |
| (_| | __/ || __/> <| | | |
__,_|___|_____/_/_ | | |
_/_/
"""

def detex(latexText):
    r"""Transform a LaTeX text into an approximate plain-text rendering.

    This is a quick regexp-based filter, not a LaTeX parser: it only knows
    the commands listed below and does not handle nested braces inside a
    command argument.

    The stages run in this order (each stage sees the previous output):
      1. drop everything up to and including the first \begin{document};
      2. remove % comments (an escaped \% is kept);
      3. replace some commands by their argument (\emph{x} -> x);
      4. highlight headings as "#--title--#" and references as "[key]";
      5. remove layout/structure commands together with their arguments;
      6. replace symbols, accents and table markup by plain-text
         equivalents, strip $ signs and lonely braces, tidy whitespace.

    Args:
        latexText: the LaTeX source as a string.

    Returns:
        The simplified text.
    """
    # A command name ends where its letters end. Without this guard
    # r'\\right' also matches the start of \rightarrow, r'\\le' the start
    # of \leq, r'\\end' the start of \endgroup, and so on.
    name_end = r'(?![a-zA-Z])'
    # One brace-delimited argument without nested braces, as group 1.
    braced_arg = r'\{([^}{]*)\}'

    def rule(left, right):
        # Rule format expected by applyRegexps.
        return {'left': left, 'right': right}

    # ---- header ----
    # Remove the preamble, i.e. everything up to and including the first
    # occurrence of \begin{document}. If there is none, nothing matches
    # and the text is left untouched.
    text = re.sub(r'(?s).*?\\begin\{document\}', '', latexText, count=1)

    # ---- comments ----
    # The lookbehind keeps escaped percent signs (\%) and, unlike a
    # consumed leading character, also works at the very start of the text.
    text = applyRegexps(text, [rule(r'(?<!\\)%.*', '')])

    # ---- reduce ----
    # Replace some LaTeX commands by the contents inside curly brackets.
    to_reduce = [r'\\emph', r'\\textbf', r'\\textit', r'\\text',
                 r'\\IEEEauthorblockA', r'\\IEEEauthorblockN',
                 r'\\author', r'\\caption', r'\\thanks']
    text = applyRegexps(
        text, [rule(tag + braced_arg, r'\1') for tag in to_reduce])

    # ---- highlight ----
    # Sectioning commands (optionally starred): #--content--# on its own line.
    headings = [r'\\part', r'\\chapter', r'\\section', r'\\subsection',
                r'\\subsubsection', r'\\paragraph']
    regexps = [rule(tag + r'[*]*' + braced_arg, r'\n#--\1--#\n')
               for tag in headings]
    # Titles and references: [content]
    bracketed = [r'\\title', r'\\author', r'\\thanks', r'\\cite', r'\\ref']
    regexps += [rule(tag + braced_arg, r'[\1]') for tag in bracketed]
    text = applyRegexps(text, regexps)

    # ---- remove ----
    # Remove completely some LaTeX commands, along with any [options] and
    # {arguments} that follow them; each is replaced by a single space.
    to_remove = [r'\\maketitle', r'\\footnote', r'\\centering',
                 r'\\IEEEpeerreviewmaketitle', r'\\includegraphics',
                 r'\\IEEEauthorrefmark', r'\\label', r'\\begin', r'\\end',
                 r'\\big', r'\\right', r'\\left', r'\\documentclass',
                 r'\\usepackage', r'\\bibliographystyle', r'\\bibliography',
                 r'\\cline', r'\\multicolumn']
    options_and_args = r'(\[[^\]]*\])*(\{[^}{]*\})*'
    text = applyRegexps(
        text,
        [rule(tag + name_end + options_and_args, ' ') for tag in to_remove])

    # ---- replace ----
    regexps = [
        # - common symbols
        rule(r'\\eg' + name_end + r'(\{\})* *', 'e.g., '),
        rule(r'\\ldots' + name_end, '...'),
        rule(r'\\Rightarrow' + name_end, '=>'),
        rule(r'\\rightarrow' + name_end, '->'),
        rule(r'\\leq?' + name_end, '<='),
        rule(r'\\geq?' + name_end, '>='),
        rule(r'\\_', '_'),
        rule(r'\\\\', r'\n'),          # LaTeX line break
        rule(r'~', ' '),
        # Column separators must become tabs BEFORE \& is unescaped,
        # otherwise literal ampersands would be turned into tabs too.
        rule(r'(?<!\\)&', r'\t'),
        rule(r'\\&', '&'),
        rule(r'\\%', '%'),
        rule(r'\\item' + name_end, r'\t- '),
        rule(r'\\hline[ \t]*\\hline',
             '============================================='),
        rule(r'[ \t]*\\hline',
             '_____________________________________________'),
        # - special letters (braced accent forms only, e.g. \'{e})
        rule(r"\\'\{?\{e\}\}?", 'é'),
        rule(r"\\`\{?\{a\}\}?", 'à'),
        rule(r"\\'\{?\{o\}\}?", 'ó'),
        rule(r"\\'\{?\{a\}\}?", 'á'),
        # keep untouched the contents of the equations
        rule(r'\$(.)\$', r'\1'),
        rule(r'\$([^$]*)\$', r'\1'),
        # remove the remaining equation symbols ($), except escaped ones
        rule(r'([^\\])\$', r'\1'),
        # correct spacing problems
        rule(r' +,', ','),
        rule(r' +', ' '),
        rule(r' +\)', ')'),
        rule(r'\( +', '('),
        rule(r' +\.', '.'),
        # remove lonely curly brackets
        rule(r'^([^{]*)\}', r'\1'),
        rule(r'([^\\])\{([^}]*)\}', r'\1\2'),
        rule(r'\\\{', '{'),
        rule(r'\\\}', '}'),
        # strip white space characters at end of line
        rule(r'[ \t]*\n', r'\n'),
        # collapse runs of three or more line ends into a single one
        rule(r'([ \t]*\n){3,}', r'\n'),
    ]
    return applyRegexps(text, regexps)

"""
_
_ __ ___ __ _(_)_ __
| '_ ` _ / _` | | '_
| | | | | | (_| | | | | |
|_| |_| |_|__,_|_|_| |_|

"""
def main():
    """Debugging entry point: run detex() on a sample paper and print it.

    The sample exercises the header removal, comments, sectioning
    commands, a table, itemized lists, a figure and the bibliography
    commands.
    """
    latexText = r"""
% This paper can be formatted using the peerreviewca
% (instead of conference) mode.
\documentclass[twocolumn,a4paper]{article}
%\documentclass[peerreviewca]{IEEEtran}
% correct bad hyphenation here
\hyphenation{op-ti-cal net-works semi-con-duc-tor IEEEtran pri-va-cy Au-tho-ri-za-tion}
% package for printing the date and time (version)
\usepackage{time}
\begin{document}
\title{Next Generation Networks}
\author{Tot titi\thanks{Network and Security -- test company -- [email protected]}}
\maketitle
\begin{abstract}\footnote{Version : \today ; \now}
lorem ipsum(\ldots)\end{abstract}
\emph{Keywords: IP Multimedia Subsystem, Quality of Service}
\section{Introduction} \label{sect:introduction}
lorem ipsum(\ldots) % of the world population. \cite{TISPAN2006a}. \footnote{Bearer Independent Call Control protocol}.
\hline
\section{Protocols used in IMS} \label{sect:protocols}
lorem ipsum(\ldots) \cite{rfc2327, rfc3264}.
\subsection{Authentication, Authorization, and Accounting} \label{sect:protocols_aaa}
lorem ipsum(\ldots)
\subsubsection{Additional protocols} \label{sect:protocols_additional}
lorem ipsum(\ldots)
\begin{table}
\begin{center}
\begin{tabular}{|c|c|c|}
\hline
\textbf{Capability} & \textbf{UE} & \textbf{GGSN} \\ \hline
\emph{DiffServ Edge Function} & Optional & Required \\ \hline
\emph{RSVP/IntServ} & Optional & Optional \\ \hline
\emph{IP Policy Enforcement Point} & Optional & Required \\ \hline
\end{tabular}
\caption{IP Bearer Services Manager capability in the UE and GGSN}
\label{tab_ue_ggsn}
\end{center}
\end{table}
The main transport layer functions are listed below:
\begin{my_itemize}
\item The \emph{Resource Control Enforcement Function} (RCEF) enforces policies under the control of the A-RACF. It opens and closes unidirectional filters called \emph{gates} or \emph{pinholes}, polices traffic and marks IP packets \cite{TISPAN2006c}.
\item The \emph{Border Gateway Function} (BGF) performs policy enforcement and Network Address Translation (NAT) functions under the control of the S-PDF. It operates on unidirectional flows related to a particular session (micro-flows) \cite{TISPAN2006c}.
\item The \emph{Layer 2 Termination Point} (L2TP) terminates the Layer 2 procedures of the access network \cite{TISPAN2006c}.
\end{my_itemize}
Their QoS capabilities are summarized in table \ref{tab_rcef_bgf} \cite{TISPAN2006c}.
The admission control usually follows a three step procedure:
\begin{my_enumerate}
\item Authorization of resources (\eg by the A-RACF)
\item Resource reservation (\eg by the BGF)
\item Resource commitment (\eg by the RCEF)
\end{my_enumerate}
\begin{figure}
\centering
\includegraphics[width=1.5in]{./pictures/RACS_functional_architecture}
\caption{RACS interaction with transfer functions}
\label{fig_RACS_functional_architecture}
\end{figure}
%\subsection{Example} \label{sect:qos_example}
% conference papers do not normally have an appendix
% use section* for acknowledgement
\section*{Acknowledgment}
% optional entry into table of contents (if used)
%\addcontentsline{toc}{section}{Acknowledgment}
lorem ipsum(\ldots)
\bibliographystyle{plain}
%\bibliographystyle{alpha}
\bibliography{./mabiblio}
\end{document}
"""
    text = detex(latexText)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(text)


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Enjoy!  And feel free to comment below or to put a link to this article on your blog. Thanks!


Leave a Reply

Your email address will not be published. Required fields are marked *