lib/reLyX/CleanTeX.pm

   1 # This file is part of reLyX
   2 # Copyright (c) 1998-9 Amir Karger karger@post.harvard.edu
   3 # You are free to use and modify this code under the terms of
   4 # the GNU General Public Licence version 2 or later.
   5
   6 package CleanTeX;
   7 # This package prepares a LaTeX file for translation to LyX
   8 # - Translates some local commands (e.g., {\em blah} to {\emph{blah}})
   9 # - Prepares math mode stuff for LyX. LyX reads LaTeX math mode directly,
  10 #      so reLyX can basically copy all math mode exactly, but LyX is a
  11 #      bit stricter than LaTeX. E.g., translate 'x^2' -> 'x^{2}
  12 # - Removes optional arguments if LyX doesn't understand them, e.g. \\
  13
  14 use strict;
  15
  16 use Verbatim;
  17
  18 ######
  19 # Global variables
  20 my $last_eaten; # last token we ate
  21
  22 # List of commands for which LyX doesn't support the optional argument
  23 my @DeleteOptArg = map {"\\$_"} qw(\\ \\*
  24               chapter section subsection subsubsection paragraph subparagraph
  25               );
  26
  27 my $debug_on; # was -d option given?
  28
  29 #########################   PARSER INVOCATION   ################################
  30 sub call_parser {
  31 # This subroutine opens the TeX parser and processes the file.
  32 # Arg0 is the name of the input TeX file
  33 # Arg1 is the name of the output "clean" file
  34
  35     my ($InFileName, $OutFileName) = (shift,shift);
  36
  37     $debug_on = (defined($main::opt_d) && $main::opt_d);
  38     my $zzz=$debug_on ? " TeX file ($InFileName --> $OutFileName)\n" :"... ";
  39     print STDERR "Cleaning$zzz";
  40     open (OUTFILE, ">$OutFileName") or die "problem opening $OutFileName: $!\n";
  41
  42 # Create the list of tokens for the parser
  43 # Parts of the token list are swiped from TeX.pm
  44     my %MyTokens = ( '{' => $Text::TeX::Tokens{'{'},
  45                      '}' => $Text::TeX::Tokens{'}'},
  46                      '$' => $Text::TeX::Tokens{'$'},
  47                      '$$' => $Text::TeX::Tokens{'$$'},
  48                      '\begin' => $Text::TeX::Tokens{'\begin'},
  49                      '\end' => $Text::TeX::Tokens{'\end'},
  50                    );
  51
  52     # Put local tokens, like \em, into %MyTokens
  53     #Note: \cal is "local", although it's found in math mode
  54     # (The "map" just puts a backslash in front of each word in the list)
  55     my @LocalTokens = qw (em rm bf tt sf sc sl it
  56                          rmfamily ttfamily sffamily mdseries bfseries
  57                         upshape itshape slshape scshape cal
  58                         );
  59     foreach (@LocalTokens) {
  60         $MyTokens{"\\$_"} = $Text::TeX::Tokens{'\em'}
  61     }
  62     # Now add any commands
  63     &ReadCommands::Merge(\%MyTokens);
  64
  65 # Create the fileobject
  66     my $file = new Text::TeX::OpenFile
  67            $InFileName,
  68            'defaultact' => \&clean_tex,
  69            'tokens' => \%MyTokens;
  70
  71 # Now actually process the file
  72     $file->process;
  73     close OUTFILE;
  74     #warn "Done cleaning TeX file\n";
  75 } # end sub call_parser
  76
  77
  78 #######################   MAIN TRANSLATING SUBROUTINE   ########################
  79 # Routine called by the TeX-parser to perform token-processing.
  80 sub clean_tex {
  81     my($eaten,$txt) = (shift,shift);
  82     my ($outstr, $type);
  83
  84     # Translation table for TT::Token tokens whose translations should
  85     #    NOT have whitespace after them! See sub translate...
  86     #   Note that tokens of type TT::EndLocal are always translated to '}'. So,
  87     #   any token defined as a local token *must* be translated to something
  88     #   with a '{' (e.g., '\em' -> '\emph{') or we'll have mismatched braces
  89     my %no_ws_transtbl = (
  90                         '\em' => '\emph{',
  91                         '\rm' => '\textrm{',
  92                         '\bf' => '\textbf{',
  93                         '\tt' => '\texttt{',
  94                         '\sf' => '\textsf{',
  95                         '\sc' => '\textsc{',
  96                         '\sl' => '\textsl{',
  97                         '\it' => '\textit{',
  98                         '\rmfamily' => '\textrm{',
  99                         '\ttfamily' => '\texttt{',
 100                         '\sffamily' => '\textsf{',
 101                         '\mdseries' => '\textmd{',
 102                         '\bfseries' => '\textbf{',
 103                         '\upshape' => '\textup{',
 104                         '\itshape' => '\textit{',
 105                         '\slshape' => '\textsl{',
 106                         '\scshape' => '\textsc{',
 107                         '\cal' => '\mathcal{',
 108                         );
 109
 110
 111     # a faux "switch" statement.  sets $_ for later use in pattern
 112     # matching.
 113     $type = ref($eaten);
 114     $type =~ s/^Text::TeX::// or die "Non-Text::TeX object";
 115     my $printstr = ""; # default for undefined printstrs etc.
 116     SWITCH: for ($type) {
 117            # Handle blank lines.
 118            if (/Paragraph/) {
 119                last SWITCH;
 120            }
 121
 122            # Handle the end of a local font command - insert a '}'
 123            if (/EndLocal/) {
 124                $printstr = '}';
 125                last SWITCH;
 126            }
 127
 128            # $eaten->exact_print is undefined for previous environments
 129            $outstr = $eaten->exact_print;
 130            if (! defined $outstr) { # comment at end of paragraph
 131                warn "Weird undefined token $eaten!" unless $eaten->comment;
 132                last SWITCH;
 133             }
 134
 135            # Handle LaTeX tokens
 136            if (/^Token$/) {
 137                my $realtok = $eaten->print; # w/out whitespace
 138                # If a comment is its own paragraph, print nothing
 139                last SWITCH unless defined($realtok);
 140                # Special handling for \verb and \verb*
 141                if ($realtok =~ /^\\verb\*?/) {
 142                    $printstr = &Verbatim::copy_verb($txt,$eaten);
 143                    last SWITCH;
 144                }
 145
 146                # Translate token if necessary, or just print it
 147                # "no_ws" is HACK to remove whitespace, so '\em ' -> '\emph{'
 148                $printstr = &translate($outstr, \%no_ws_transtbl, "no_ws");
 149
 150                # Ignore optional argument(s) if necessary
 151                $printstr .= &handle_opt_args($eaten,$txt);
 152
 153                last SWITCH;
 154            }
 155
 156            # Tokens taking arguments, like '^'
 157            # ADD '{' if there isn't one before the argument!
 158            # TODO can we check whether the command is \label, \include
 159            # and not add the braces in that case?
 160            if (/^BegArgsToken$/) {
 161                $printstr = $outstr;
 162
 163                # Ignore optional argument(s) if necessary
 164                $printstr .= &handle_opt_args($eaten,$txt);
 165
 166                # Add beginning brace before the 1st argument if there isn't one
 167                my $tok = $txt->lookAheadToken;
 168                $printstr .= '{' unless ($tok =~ /\{/);
 169                last SWITCH;
 170            }
 171
 172            # End of one argument, beginning of next
 173            # Note: by default ArgToken,EndArgsToken print nothing
 174            # ADD '}' if there isn't one after the last argument
 175            # Then read and print any optional arguments which may exist
 176            #    between this argument the next (we must do this here or we would
 177            #    add a '{' before an optional argument!)
 178            # ADD '{' if there isn't one before the next argument!
 179            # (just like we do in BegArgsToken and EndArgsToken)
 180            if (/^ArgToken$/) {
 181                $printstr = $outstr; # = ''
 182
 183                # Add '}' after the argument that ended if necessary
 184                $printstr .= '}' unless $last_eaten->print eq "\}";
 185
 186                # Eat and print any optional arguments
 187                $printstr .= &handle_opt_args($eaten,$txt);
 188
 189                # Add '{' before the next argument if necessary
 190                my $tok = $txt->lookAheadToken;
 191                $printstr .= '{' unless ($tok =~ /\{/);
 192                last SWITCH;
 193            }
 194
 195            # End of tokens taking arguments, like '^'
 196            #     ADD '}' if there isn't one after the last argument, i.e.,
 197            # if the previous token *wasn't* a '}'
 198            #     Kludge: for TeX style \input command ("\input foo" with no
 199            # braces) we need to read the whole filename, but parser will have
 200            # read only one char. So read in the rest of the filename before
 201            # printing the '}'.
 202            if (/^EndArgsToken$/) {
 203                $printstr = $outstr; # = ''
 204
 205                unless ($last_eaten->print eq "\}") {
 206                    my $s = $eaten->base_token;
 207                    if ($s->print eq "\\input") {
 208                        my $t = $txt->lookAheadToken;
 209                        # For one-char filename (a.tex) do nothing
 210                        if ($t =~ /^[\w.\-]/) {
 211                            my $u = $txt->eatMultiToken;
 212                            $t = $u->print;
 213                            $t =~ s/\s+//g;
 214                            $printstr .= $t;
 215                         }
 216                         # TeX \input always adds .tex ending
 217                         $printstr .= ".tex";
 218                     }
 219
 220                    $printstr .= '}';
 221                 }
 222
 223                # Don't bother eating optional args coming after the last
 224                # required arg: they'll just be copied as text
 225                last SWITCH;
 226            }
 227
 228            # Handle opening groups, like '{' and '$'.
 229            if (/Begin::Group$/) {
 230                $printstr = $outstr;
 231                last SWITCH;
 232            }
 233
 234            # Handle closing groups, like '}' and '$'.
 235            if (/End::Group$/) {
 236                $printstr = $outstr;
 237                last SWITCH;
 238            }
 239
 240            if (/Begin::Group::Args/) {
 241                my $env = $eaten->environment;
 242                $printstr = $outstr;
 243                if ($env eq "verbatim" || $env eq "reLyXskip") {
 244                    # copy everything up to "\end{foo}"
 245                    $printstr .= &Verbatim::copy_verbatim($txt, $eaten);
 246                }
 247                last SWITCH;
 248            }
 249
 250            if  (/End::Group::Args/) {
 251                $printstr = $outstr;
 252                last SWITCH;
 253            }
 254
 255            if (/Text/) {
 256                $printstr = $outstr;
 257                last SWITCH;
 258            }
 259
 260            # The default action - print the string.
 261            $printstr = $outstr;
 262     } # end SWITCH:for ($type)
 263
 264     # Actually print the string
 265     if (defined $printstr) {
 266         print OUTFILE $printstr;
 267         $last_eaten = $eaten; #save for next time
 268     } else {warn "Undefined printstr";}
 269
 270 } # end sub clean_tex
 271
 272 ####################   TRANSLATOR SUBROUTINES    ###############################
 273 sub translate {
 274 # Replace a string (possibly with whitespace around it) with another
 275 # Arg0 is a string, Arg1 is a reference to a hash containing translations
 276 # If a token not in the table is passed in, do nothing
 277 # If Arg2 is defined AND the token is known, then remove whitespace from
 278 #     the end of the translated token. This is a HACK to do '\em ' -> '\emph{'
 279 # Return the string, possibly modified
 280     my ($tokstr, $transref) = (shift, shift);
 281     my $remove_ws = shift;
 282     my %transtable = %$transref;
 283
 284     # remove whitespace from the string (since transtable doesn't have it)
 285     my $stripstr = $tokstr;
 286     $stripstr =~ s/^\s*(\S+)\s*$/$1/ or warn "couldn't strip token";
 287     if ( exists $transtable{$stripstr} ) {
 288          # use \Q or \, (, $, and [ will be misinterpreted
 289         $tokstr =~ s/\Q$stripstr\E/$transtable{$stripstr}/;
 290
 291         # remove whitespace?
 292         if (defined $remove_ws) {
 293             $tokstr =~ s/\s*$//;
 294         }
 295     }
 296
 297     return $tokstr;
 298 }
 299
 300 sub handle_opt_args {
 301 # read and concatenate OR IGNORE optional arguments
 302 # Arg0 is a BegArgsToken or ArgToken
 303     my ($eaten,$fileobject) = (shift,shift);
 304     my $outstr = "";
 305
 306     # If at end of paragraph, don't bother looking for optArgs
 307     return "" unless $fileobject->lookAheadToken;
 308
 309     # Get the next argument(s) expected for this token == /^o*[rR]?$/
 310     # If there are no args expected, just return
 311     my $curr_args = $eaten->next_args($fileobject) or return "";
 312
 313     # Now print or ignore any optional arguments
 314     # If there's an 'r' in curr_args, we're done for now
 315     my $foo;
 316     my $token_name = $eaten->token_name; # (needed for EndArgsToken, e.g.)
 317     while ($curr_args =~ s/^o//) {
 318         my $opt = $fileobject->eatOptionalArgument;
 319         # Print any initial space before the optional argument
 320         if ($foo = $opt->exact_print) {
 321             if ($foo =~ /^(\s+)/) {
 322                 $outstr .= $1;
 323             }
 324         }
 325
 326         # Print the argument or ignore it
 327         if ($opt->print) {
 328             if (grep /^\Q$token_name\E$/, @DeleteOptArg) {
 329                 print "Optional argument '",$opt->print,
 330                        "' to macro $token_name ignored\n";
 331             } else {
 332                 $outstr .= "[" . $opt->print . "]";
 333             }
 334         } # Was an optional argument found?
 335     }
 336
 337     return $outstr;
 338 } # end sub handle_opt_args
 339
 340 1; # return true value to calling program