lib/reLyX/CleanTeX.pm

   1 # This file is part of reLyX
   2 # Copyright (c) 1998-9 Amir Karger karger@post.harvard.edu
   3 # You are free to use and modify this code under the terms of
   4 # the GNU General Public Licence version 2 or later.
   5
   6 package CleanTeX;
   7 # This package prepares a LaTeX file for translation to LyX
   8 # - Translates some local commands (e.g., {\em blah} to {\emph{blah}})
   9 # - Prepares math mode stuff for LyX. LyX reads LaTeX math mode directly,
  10 #      so reLyX can basically copy all math mode exactly, but LyX is a
  11 #      bit stricter than LaTeX. E.g., translate 'x^2' -> 'x^{2}
  12 # - Removes optional arguments if LyX doesn't understand them, e.g. \\
  13
  14 use strict;
  15
  16 use Verbatim;
  17
  18 ######
  19 # Global variables
  20 my $last_eaten; # last token we ate
  21
  22 # List of commands for which LyX doesn't support the optional argument
  23 my @DeleteOptArg = map {"\\$_"} qw(\\ \\*
  24               chapter section subsection subsubsection paragraph subparagraph
  25               );
  26
  27 my $debug_on; # was -d option given?
  28
  29 #########################   PARSER INVOCATION   ################################
  30 sub call_parser {
  31 # This subroutine opens the TeX parser and processes the file.
  32 # Arg0 is the name of the input TeX file
  33 # Arg1 is the name of the output "clean" file
  34
  35     my ($InFileName, $OutFileName) = (shift,shift);
  36
  37     $debug_on = (defined($main::opt_d) && $main::opt_d);
  38     my $zzz=$debug_on ? " TeX file ($InFileName --> $OutFileName)\n" :"... ";
  39     print STDERR "Cleaning$zzz";
  40     open (OUTFILE, ">$OutFileName") or die "problem opening $OutFileName: $!\n";
  41
  42 # Create the list of tokens for the parser
  43 # Parts of the token list are swiped from TeX.pm
  44     my %MyTokens = ( '{' => $Text::TeX::Tokens{'{'},
  45                      '}' => $Text::TeX::Tokens{'}'},
  46                      '$' => $Text::TeX::Tokens{'$'},
  47                      '$$' => $Text::TeX::Tokens{'$$'},
  48                      '\begin' => $Text::TeX::Tokens{'\begin'},
  49                      '\end' => $Text::TeX::Tokens{'\end'},
  50                    );
  51
  52     # Put local tokens, like \em, into %MyTokens
  53     #Note: \cal is "local", although it's found in math mode
  54     # (The "map" just puts a backslash in front of each word in the list)
  55     my @LocalTokens = qw (em rm bf tt sf sc sl it
  56                          rmfamily ttfamily sffamily mdseries bfseries
  57                         upshape itshape slshape scshape cal
  58                         );
  59     foreach (@LocalTokens) {
  60         $MyTokens{"\\$_"} = $Text::TeX::Tokens{'\em'}
  61     }
  62     # Now add any commands
  63     &ReadCommands::Merge(\%MyTokens);
  64
  65 # Create the fileobject
  66     my $file = new Text::TeX::OpenFile
  67            $InFileName,
  68            'defaultact' => \&clean_tex,
  69            'tokens' => \%MyTokens;
  70
  71 # Now actually process the file
  72     $file->process;
  73     close OUTFILE;
  74     #warn "Done cleaning TeX file\n";
  75 } # end sub call_parser
  76
  77
  78 #######################   MAIN TRANSLATING SUBROUTINE   ########################
  79 # Routine called by the TeX-parser to perform token-processing.
  80 sub clean_tex {
  81     my($eaten,$txt) = (shift,shift);
  82     my ($outstr, $type);
  83
  84     # Sub translate is given a string and one of the translation tables below.
  85     # It returns the translation, or just the string if there's no translation
  86     # Translation table for TT::Begin::Group tokens
  87     my %begtranstbl = (
  88                         '$' => '\(', # LyX math mode doesn't
  89                         '$$' => '\[', # understand \$ or $$
  90                         );
  91
  92     # Translation table for TT::End::Group tokens
  93     my %endtranstbl = (
  94                            '$' => '\)',
  95                            '$$' => '\]',
  96                        );
  97
  98     # Translation table for TT::Token tokens whose translations should
  99     #    NOT have whitespace after them! See sub translate...
 100     #   Note that tokens of type TT::EndLocal are always translated to '}'. So,
 101     #   any token defined as a local token *must* be translated to something
 102     #   with a '{' (e.g., '\em' -> '\emph{') or we'll have mismatched braces
 103     my %no_ws_transtbl = (
 104                         '\em' => '\emph{',
 105                         '\rm' => '\textrm{',
 106                         '\bf' => '\textbf{',
 107                         '\tt' => '\texttt{',
 108                         '\sf' => '\textsf{',
 109                         '\sc' => '\textsc{',
 110                         '\sl' => '\textsl{',
 111                         '\it' => '\textit{',
 112                         '\rmfamily' => '\textrm{',
 113                         '\ttfamily' => '\texttt{',
 114                         '\sffamily' => '\textsf{',
 115                         '\mdseries' => '\textmd{',
 116                         '\bfseries' => '\textbf{',
 117                         '\upshape' => '\textup{',
 118                         '\itshape' => '\textit{',
 119                         '\slshape' => '\textsl{',
 120                         '\scshape' => '\textsc{',
 121                         '\cal' => '\mathcal{',
 122                         );
 123
 124
 125     # a faux "switch" statement.  sets $_ for later use in pattern
 126     # matching.
 127     $type = ref($eaten);
 128     $type =~ s/^Text::TeX::// or die "Non-Text::TeX object";
 129     my $printstr = ""; # default for undefined printstrs etc.
 130     SWITCH: for ($type) {
 131            # Handle blank lines.
 132            if (/Paragraph/) {
 133                last SWITCH;
 134            }
 135
 136            # Handle the end of a local font command - insert a '}'
 137            if (/EndLocal/) {
 138                # we could just say $printstr='}'
 139                $printstr = &translate('}', \%endtranstbl);
 140                last SWITCH;
 141            }
 142
 143            # $eaten->exact_print is undefined for previous environments
 144            $outstr = $eaten->exact_print;
 145            if (! defined $outstr) { # comment at end of paragraph
 146                warn "Weird undefined token $eaten!" unless $eaten->comment;
 147                last SWITCH;
 148             }
 149
 150            # Handle LaTeX tokens
 151            if (/^Token$/) {
 152                my $realtok = $eaten->print; # w/out whitespace
 153                # If a comment is its own paragraph, print nothing
 154                last SWITCH unless defined($realtok);
 155                # Special handling for \verb and \verb*
 156                if ($realtok =~ /^\\verb\*?/) {
 157                    $printstr = &Verbatim::copy_verb($txt,$eaten);
 158                    last SWITCH;
 159                }
 160
 161                # Translate token if necessary, or just print it
 162                # "no_ws" is HACK to remove whitespace, so '\em ' -> '\emph{'
 163                $printstr = &translate($outstr, \%no_ws_transtbl, "no_ws");
 164
 165                # Ignore optional argument(s) if necessary
 166                $printstr .= &handle_opt_args($eaten,$txt);
 167
 168                last SWITCH;
 169            }
 170
 171            # Tokens taking arguments, like '^'
 172            # ADD '{' if there isn't one before the argument!
 173            # TODO can we check whether the command is \label, \include
 174            # and not add the braces in that case?
 175            if (/^BegArgsToken$/) {
 176                $printstr = $outstr;
 177
 178                # Ignore optional argument(s) if necessary
 179                $printstr .= &handle_opt_args($eaten,$txt);
 180
 181                # Add beginning brace before the 1st argument if there isn't one
 182                my $tok = $txt->lookAheadToken;
 183                $printstr .= '{' unless ($tok =~ /\{/);
 184                last SWITCH;
 185            }
 186
 187            # End of one argument, beginning of next
 188            # Note: by default ArgToken,EndArgsToken print nothing
 189            # ADD '}' if there isn't one after the last argument
 190            # Then read and print any optional arguments which may exist
 191            #    between this argument the next (we must do this here or we would
 192            #    add a '{' before an optional argument!)
 193            # ADD '{' if there isn't one before the next argument!
 194            # (just like we do in BegArgsToken and EndArgsToken)
 195            if (/^ArgToken$/) {
 196                $printstr = $outstr; # = ''
 197
 198                # Add '}' after the argument that ended if necessary
 199                $printstr .= '}' unless $last_eaten->print eq "\}";
 200
 201                # Eat and print any optional arguments
 202                $printstr .= &handle_opt_args($eaten,$txt);
 203
 204                # Add '{' before the next argument if necessary
 205                my $tok = $txt->lookAheadToken;
 206                $printstr .= '{' unless ($tok =~ /\{/);
 207                last SWITCH;
 208            }
 209
 210            # End of tokens taking arguments, like '^'
 211            #     ADD '}' if there isn't one after the last argument, i.e.,
 212            # if the previous token *wasn't* a '}'
 213            #     Kludge: for TeX style \input command ("\input foo" with no
 214            # braces) we need to read the whole filename, but parser will have
 215            # read only one char. So read in the rest of the filename before
 216            # printing the '}'.
 217            if (/^EndArgsToken$/) {
 218                $printstr = $outstr; # = ''
 219
 220                unless ($last_eaten->print eq "\}") {
 221                    my $s = $eaten->base_token;
 222                    if ($s->print eq "\\input") {
 223                        my $t = $txt->lookAheadToken;
 224                        # For one-char filename (a.tex) do nothing
 225                        if ($t =~ /^[\w.\-]/) {
 226                            my $u = $txt->eatMultiToken;
 227                            $t = $u->print;
 228                            $t =~ s/\s+//g;
 229                            $printstr .= $t;
 230                         }
 231                         # TeX \input always adds .tex ending
 232                         $printstr .= ".tex";
 233                     }
 234
 235                    $printstr .= '}';
 236                 }
 237
 238                # Don't bother eating optional args coming after the last
 239                # required arg: they'll just be copied as text
 240                last SWITCH;
 241            }
 242
 243            # Handle opening groups, like '{' and '$'.
 244            if (/Begin::Group$/) {
 245                $printstr = &translate($outstr,\%begtranstbl);
 246                last SWITCH;
 247            }
 248
 249            # Handle closing groups, like '}' and '$'.
 250            if (/End::Group$/) {
 251                $printstr = &translate($outstr, \%endtranstbl);
 252                last SWITCH;
 253            }
 254
 255            if (/Begin::Group::Args/) {
 256                my $env = $eaten->environment;
 257                $printstr = $outstr;
 258                if ($env eq "verbatim" || $env eq "reLyXskip") {
 259                    # copy everything up to "\end{foo}"
 260                    $printstr .= &Verbatim::copy_verbatim($txt, $eaten);
 261                }
 262                last SWITCH;
 263            }
 264
 265            if  (/End::Group::Args/) {
 266                $printstr = $outstr;
 267                last SWITCH;
 268            }
 269
 270            if (/Text/) {
 271                $printstr = $outstr;
 272                last SWITCH;
 273            }
 274
 275            # The default action - print the string.
 276            $printstr = $outstr;
 277     } # end SWITCH:for ($type)
 278
 279     # Actually print the string
 280     if (defined $printstr) {
 281         print OUTFILE $printstr;
 282         $last_eaten = $eaten; #save for next time
 283     } else {warn "Undefined printstr";}
 284
 285 } # end sub clean_tex
 286
 287 ####################   TRANSLATOR SUBROUTINES    ###############################
 288 sub translate {
 289 # Replace a string (possibly with whitespace around it) with another
 290 # Arg0 is a string, Arg1 is a reference to a hash containing translations
 291 # If a token not in the table is passed in, do nothing
 292 # If Arg2 is defined AND the token is known, then remove whitespace from
 293 #     the end of the translated token. This is a HACK to do '\em ' -> '\emph{'
 294 # Return the string, possibly modified
 295     my ($tokstr, $transref) = (shift, shift);
 296     my $remove_ws = shift;
 297     my %transtable = %$transref;
 298
 299     # remove whitespace from the string (since transtable doesn't have it)
 300     my $stripstr = $tokstr;
 301     $stripstr =~ s/^\s*(\S+)\s*$/$1/ or warn "couldn't strip token";
 302     if ( exists $transtable{$stripstr} ) {
 303          # use \Q or \, (, $, and [ will be misinterpreted
 304         $tokstr =~ s/\Q$stripstr\E/$transtable{$stripstr}/;
 305
 306         # remove whitespace?
 307         if (defined $remove_ws) {
 308             $tokstr =~ s/\s*$//;
 309         }
 310     }
 311
 312     return $tokstr;
 313 }
 314
 315 sub handle_opt_args {
 316 # read and concatenate OR IGNORE optional arguments
 317 # Arg0 is a BegArgsToken or ArgToken
 318     my ($eaten,$fileobject) = (shift,shift);
 319     my $outstr = "";
 320
 321     # If at end of paragraph, don't bother looking for optArgs
 322     return "" unless $fileobject->lookAheadToken;
 323
 324     # Get the next argument(s) expected for this token == /^o*[rR]?$/
 325     # If there are no args expected, just return
 326     my $curr_args = $eaten->next_args($fileobject) or return "";
 327
 328     # Now print or ignore any optional arguments
 329     # If there's an 'r' in curr_args, we're done for now
 330     my $foo;
 331     my $token_name = $eaten->token_name; # (needed for EndArgsToken, e.g.)
 332     while ($curr_args =~ s/^o//) {
 333         my $opt = $fileobject->eatOptionalArgument;
 334         # Print any initial space before the optional argument
 335         if ($foo = $opt->exact_print) {
 336             if ($foo =~ /^(\s+)/) {
 337                 $outstr .= $1;
 338             }
 339         }
 340
 341         # Print the argument or ignore it
 342         if ($opt->print) {
 343             if (grep /^\Q$token_name\E$/, @DeleteOptArg) {
 344                 print "Optional argument '",$opt->print,
 345                        "' to macro $token_name ignored\n";
 346             } else {
 347                 $outstr .= "[" . $opt->print . "]";
 348             }
 349         } # Was an optional argument found?
 350     }
 351
 352     return $outstr;
 353 } # end sub handle_opt_args
 354
 355 1; # return true value to calling program