lib/reLyX/Text/TeX.pm

   1 package Text::TeX;
   2
   3 # This file is copyright (c) 1997-8 Ilya Zakharevich
   4 # Modifications for reLyX by Amir Karger
   5 # You are free to use and modify this code under the terms of
   6 # the GNU General Public Licence version 2 or later.
   7
   8 #
   9 #use strict;
  10 #use vars qw($VERSION @ISA @EXPORT);
  11
  12 #require Exporter;
  13 #require # AutoLoader;  # To quiet AutoSplit.
  14
  15 # @ISA = qw
  16 # (Exporter AutoLoader);
  17 # Items to export into callers namespace by default. Note: do not export
  18 # names by default without a very good reason. Use EXPORT_OK instead.
  19 # Do not simply export all your public functions/methods/constants.
  20 @EXPORT = qw(
  21
  22 );
  23 $VERSION = '0.01';
  24
  25
  26 # Preloaded methods go here.
  27
  28 # Does not deal with verbatims
  29 # Spaces are treated bad.
  30
  31 #####################      GENERAL NOTES      ##################################
  32 # Each package describes a different sort of token.
  33 # Packages:
  34 #    Chunk        - default, just used as an ISA
  35 #    Text         - plain text, made up of TT::$usualtokenclass stuff
  36 #    Paragraph    - new paragraph starting (cuz you got \n\n in a latex file)
  37 #    Token        - simple token, like ~ or \blah
  38 #    EndLocal     - pseudotoken meaning that the scope of a local command (like
  39 #                   '\large') has ended
  40 #    BegArgsToken - pseudotoken which takes one or more arguments, like \section
  41 #    ArgToken     - pseudotoken returned in between arguments to a BegArgsToken
  42 #    EndArgsToken - pseudotoken returned after we finish getting arguments
  43 #                   to a BegArgsToken
  44 #    LookAhead    - a special kind of EndArgsToken when you want to look ahead
  45 #    BegArgsTokenLookedAhead - special kind of BegArgsToken (see man page)
  46 #    Begin::Group - Beginning of a group, i.e., '{'
  47 #    End::Group   - End of a group, i.e., '}'
  48 #    Begin::Group::Args - begin group but get args first, i.e., '\begin'
  49 #    End::Group::Args   - end group but get args first, i.e., '\end'
  50 #    SelfMatch    - e.g., '$'. Matches itself, but otherwise like a Begin::Group
  51 #    Separator    - e.g., '&' (not used in reLyX)
  52 #    Comment      - (not used in reLyX)
  53 #
  54 # The main package is TT::OpenFile. It contains the subroutines that do
  55 #    most of the parsing work. TT::GetParagraph does some stuff too, but
  56 #    it's not a token you'd expect the code to return
  57 #
  58 # Package subroutines (other than 'new'):
  59 #    refine - takes a token to a more specific kind of token type
  60 #               e.g., '{' goes from TT::Token to TT::Begin::Group
  61 #    digest - extra actions to do once you've eaten the token.
  62 #               e.g., eating arguments of \begin, or popping various
  63 #               stacks when you get to an End::Group
  64 #    print  - how to print the token (e.g., the text making up the token)
  65 #    exact_print - print the token exactly as it appeared in the file.
  66 #               Usually involves adding whitespace
  67 #
  68 # Token and pseudotokens have some more subs:
  69 #    base_token   - the token this token is created from. It's the token
  70 #                   itself for a Token, but not for pseudotokens
  71 #    token_name   - the name of the base_token
  72 #
  73 # Token structure:
  74 # $tok->[0] will usually be the word (e.g., '\blah') the parser read
  75 #      For pseudotokens, it's something more complicated
  76 #      (some tokens, like Paragraph have nothing there, though)
  77 # $tok->[1] will be any comment (usually ignored)
  78 # $tok->[2] will be the exact thing the parser read (usu. [0] plus whitespace)
  79 # $tok->[3] stores arguments for Begin::Group::Args and End::Group::Args
  80 # $tok->[4] stores pointer to beginning token for End::Group::Args
  81 #    A TT::Group is a reference to an array of tokens. Often (but not always),
  82 # the first and last groups are Begin::Group and End::Group tokens respectively.
  83
  84 # Pseudotokens are objects, one of whose fields is a reference to the token
  85 # that created the pseudotoken
  86 # BegArgToken, ArgToken, EndArgToken pseudotokens:
  87 # $tok->[0][0] - token (e.g. a TT::Token) that begins this group
  88 # $tok->[0][1] - number of arguments that that token takes
  89 # $tok->[0][2] - (found only in ArgToken) number of arguments to the token
  90 #                that have been read so far
  91 ################################################################################
  92
  93 ########################   GLOBAL VARIABLES   ##################################
  94 # Sorts of text you find in a LaTeX file. For matching
  95 $notusualtoks = "\\\\" . '\${}^_~&@%'; # Why \\\\? double interpretation!
  96 $notusualtokenclass = "[$notusualtoks]";
  97 $usualtokenclass = "[^$notusualtoks]";
  98
  99 # The $macro RE matches LaTeX macros. Here's exactly what it does:
 100 # $macro = \\\\(?:RE)
 101 # This matches either '\\' or \RE where RE = RE1 or RE2
 102 # RE1 = '\)', so $macro will match the end of a math environment, '\)'
 103 # RE2 = (((RE3 or RE4)\*?)\s*) where
 104 # RE3 and RE4 can each be followed by zero or one asterisks. Either is still
 105 # a macro. Ditto, trailing whitespace is included in the token because that's
 106 # what LaTeX does.
 107 # RE3 = '([^a-zA-Z)])' matches a single non-alphabetic char. We already
 108 # test for '\)', so that is explictly excluded from RE3 because '\)*' is not
 109 # a macro. Rather it is '\)' followed by an asterisk.
 110 # RE4 = '([a-zA-Z]+\*?)'
 111 # Ie, one or more alphabetic chars followed by zero or 1 asterisks
 112 # Eg, \section or \section*
 113 # Putting all this together:
 114 $macro = '\\\\(?:\)|((([^a-zA-Z)])|([a-zA-Z]+))\*?)\s*)';
 115
 116 # active is a backslashed macro or $$ (same as \[) or ^^ followed by a char
 117 #    (^^A means ASCII(1), e.g. See the TeXbook) or a special character like ~
 118 $active = "$macro|\\\$\\\$|\\^\\^.|$notusualtokenclass"; # 1 level of grouping
 119
 120 # In TeX, ^joe is equivalent to ^{j}oe, so sometimes we use tokenpattern
 121 #     instead of multitokenpattern to get just one character
 122 $tokenpattern = "($usualtokenclass)|$active"; # Two levels of grouping
 123 $multitokenpattern = "($usualtokenclass+)|$active"; # Two levels of grouping
 124
 125 # Note: In original (CPAN) version, $commentpattern had "". It needs ''
 126 # or otherwise '\s' gets translated to 's'
 127 $commentpattern = '(?:%.*\n\s*)+'; #one or more comment lines
 128 $whitespaceAndComment = '\s*(%.*\n[ \t]*)+';
 129
 130 # matches either nothing OR an argument in brackets ($1 doesn't include [])
 131 $optionalArgument = "(?:\\[([^]]*)\\])?"; # Contains one level of grouping
 132
 133 # These tokens are built from other tokens, so they're pseudotokens
 134 #    (except BegArgsToken actually does have text!?)
 135 for (qw(Text::TeX::ArgToken Text::TeX::BegArgsToken Text::TeX::EndArgsToken )) {
 136   $pseudo{$_} = 1;
 137 }
 138
 139 # More global variables can be found at the end of the file
 140 # E.g., the main Tokens hash
 141
 142 #######################   Token Packages   #####################################
 143 {
 144   package Text::TeX::Comment;
 145   $ignore = 1;
 146 }
 147
 148 {
 149   package Text::TeX::Chunk;
 150   sub refine {}
 151   sub digest {}
 152   sub collect {$_[0]->[0]}
 153   sub new {
 154     my $class = shift;
 155     bless [@_], $class;
 156   }
 157   sub print {$_[0]->[0]}
 158   # exact_print prints the *exact* text read, including whitespace
 159   #     (but not including comments...)
 160   sub exact_print {$_[0]->[2]}
 161   # print the comment that came before a token
 162   sub comment {$_[0]->[1]}
 163
 164 }
 165
 166 {
 167   package Text::TeX::Token;
 168   @ISA = ('Text::TeX::Chunk');
 169
 170   sub refine {
 171     my $self = shift;
 172     return undef unless defined $self->[0];
 173     my $txt = shift;
 174     my $type;
 175     if (defined ($tok = $txt->{tokens}->{$self->[0]})
 176         and defined $tok->{class}) {
 177       bless $self, $tok->{class};
 178     }
 179   } # end sub refine
 180
 181   # Name of the token. Same as print for Token, but ArgToken and
 182   # EndArgsToken, e.g., print nothing!
 183   sub token_name {
 184       my $tok = shift->base_token;
 185       return $tok->print;
 186   }
 187
 188   sub base_token {
 189   # For pseudotokens, this sub is more complicated, but a token is just a token.
 190       return shift;
 191   }
 192
 193   # return the syntax argument created by reLyX
 194   # Return "" if relyx_args is empty, i.e., if the token takes no args
 195   # Return undef if relyx_args doesn't exist, i.e., if the token is unknown
 196   sub relyx_args {
 197       warn "not enough args to Text::TeX::relyx_args" unless @_==2;
 198       my ($tok,$object) = (shift, shift);
 199       my $name;
 200
 201       # Test copied from TT::OpenFile::eat
 202       if (defined ($name = $tok->token_name)) {
 203           #print "$name is defined\n";
 204           if (defined ($entry = $object->{"tokens"}->{$name})) {
 205               #print "Entry in MyTokens is defined\n";
 206               if (exists ($entry->{"relyx_args"})) { # even if it's empty...
 207                   #print "the args are '",$entry->{"relyx_args"},"'\n";
 208                   return $entry->{"relyx_args"}
 209               }
 210           }
 211       }
 212
 213       # else...
 214       #print "did not exist";
 215       return undef;
 216   } # end sub relyx_args
 217
 218   sub next_args {
 219   # Return the next argument(s) expected by this token.
 220   # For regular Tokens: /^o*$/.
 221   # For BegArgsTokens and ArgTokens: /^o*[rR]$/
 222   # For EndArgsTokens: /^o*/. (in case opt args come after last required arg)
 223     my ($eaten,$fileobject) = (shift,shift);
 224
 225     # Get the number & type of arguments of this token == /^[or]*$/
 226     # If it takes no args, just return
 227     # Will also return if curr_args is called for plain Text for some reason
 228     my $syntax = $eaten->relyx_args($fileobject) or return "";
 229
 230     # If it takes just optional args, return them (it's a plain Token)
 231     return $syntax if $syntax =~ /^o+$/;
 232
 233     # Number of arguments we've already read (== 0 for BegArgsToken)
 234     # Note that we only get here for Beg/EndArgsToken or ArgToken
 235     my $arg_num = $eaten->args_done;
 236
 237     # Split args into single "argument sets", each of which is 0 or more
 238     # optional arguments followed by 0 or 1 required argument.
 239     @args = ($syntax =~ /o*[rR]?/g);
 240     push (@args,""); # necessary for EndArgsToken if $syntax ends with "r"
 241
 242     # Now return the n'th argument set
 243     #    e.g., if 0 args have been eaten, return the 0th element of @args,
 244     # which is the first argument
 245     return $args[$arg_num];
 246   } # end sub curr_args
 247 } # end package Text::TeX::Token
 248
 249 {
 250   package Text::TeX::BegArgsToken;
 251   @ISA = ('Text::TeX::Token');
 252   sub print {
 253       my $tok = shift->base_token; # Token this pseudotoken was made from
 254       return $tok->print;
 255   }
 256
 257   sub exact_print {
 258       my $tok = shift->base_token;
 259       return $tok->exact_print;
 260   }
 261
 262   # How many arguments we've read already.
 263   # Obviously zero before we've begun to read the arguments
 264   sub args_done {return 0}
 265
 266   sub base_token { return shift->[0]->[0] }
 267   sub comment { return shift->base_token->comment }
 268 }
 269
 270 {
 271   package Text::TeX::ArgToken;
 272   @ISA = ('Text::TeX::Token');
 273   # This token isn't made from actual text, so it prints nothing
 274   sub print {return ''}
 275   sub exact_print {return ''}
 276
 277   # How many arguments we've read already.
 278   # Luckily, this number is stored in the ArgToken token
 279   sub args_done { return shift->[0]->[2] }
 280
 281   sub base_token { return shift->[0]->[0] }
 282 }
 283
 284 {
 285   package Text::TeX::EndArgsToken;
 286   @ISA = ('Text::TeX::Token');
 287   # This token isn't made because of real text, so it prints nothing
 288   sub print {return ''}
 289   sub exact_print {return ''}
 290
 291   # How many arguments we've read already.
 292   # Obviously the total number of arguments, since we're done
 293   sub args_done {return shift->[0]->[1]}
 294   sub base_token { return shift->[0]->[0] }
 295 }
 296
 297 {
 298   package Text::TeX::EndLocal;
 299   @ISA = ('Text::TeX::Token');
 300   # No text in this token
 301   sub print {return ''}
 302   sub exact_print {return ''}
 303   sub base_token { return shift->[0] }
 304 }
 305
 306 {
 307   package Text::TeX::Group;
 308   sub new {shift; my $in = shift; bless $in}
 309   sub print {
 310     local @arr; #arr becomes global for called subroutines
 311     foreach (@{ $_[0] }) {
 312       push(@arr, $_->print);
 313     }
 314     "`" . join("',`", @arr) . "'";
 315   }
 316
 317   # exact_print prints w/out the quotes
 318   sub exact_print {
 319     local @arr; #arr becomes global for called subroutines
 320     foreach (@{ $_[0] }) {
 321       push(@arr, $_->exact_print);
 322     }
 323     join("", @arr); # ... and return it
 324   }
 325
 326   # Not created straight from LaTeX, so it'll never have a comment
 327   # (although comments can be in the subtokens in the group)
 328   sub comment {undef}
 329
 330   # Return what's in the group, i.e. strip out the '{' and '}' tokens
 331   # if they exist. Return an array of tokens or just one token
 332   sub contents {
 333       #strip off TT::Begin::Group and TT::End::Group from beginning and end
 334       # if they exist. eatBalanced will return Tokens, so don't worry about
 335       # stripping too much from a group like {{foo} bar}. And eatGroup
 336       # will return Begin::Group, Group, End::Group, so after stripping one,
 337       # don't have to worry about stripping another.
 338       $group = shift;
 339       if (ref($group->[0] ) eq "Text::TeX::Begin::Group" and
 340           ref($group->[-1]) eq "Text::TeX::End::Group")
 341       {
 342           shift @$group;
 343           pop @$group;
 344       }
 345
 346       if (wantarray) {
 347           return @$group;
 348       } elsif (!@$group) { # group was '{}'
 349           return new Text::TeX::Token '','',''; # send back an empty token
 350       } else {
 351           warn "Text::TeX -- more than one token in group!" if $#$group > 1;
 352           return $$group[0];
 353       }
 354   }
 355 }
 356
 357 {
 358   package Text::TeX::End::Group;
 359   @ISA = ('Text::TeX::Chunk');
 360   sub new {shift; my $in = shift; bless \$in}
 361   sub digest {                  # 0: the token, 1: text object
 362     # If there are any EndLocal tokens in $txt->{presynthetic}, do them first
 363     # See TT::OpenFile::check_presynthetic for details
 364     return if $_[1]->check_presynthetic($_[0]); # May change $_[0]
 365     my $wa = $_[1]->curwaitforaction;
 366     my $w = $_[1]->popwait;
 367     warn "Expecting `$w', got `$_[0][0]'=`$_[0][0][0]' in `$ {$_[1]->{paragraph}}'"
 368       if $w ne $_[0]->[0];
 369     &$wa if defined $wa; # i.e., do $txt->{waitforactions}[-1] if it exists
 370   }
 371 }
 372
 373 {
 374   package Text::TeX::End::Group::Args;
 375   @ISA = ('Text::TeX::End::Group');
 376
 377   sub digest {                  # 0: the token, 1: text object
 378     # If there are any EndLocal tokens in $txt->{presynthetic}, do them first
 379     #    (Lamport p. 27 says \em is ended by '\end{blah}', not just '}')
 380     # check_presynthetic will put the End::Group::Args token into pending_in
 381     #    so it'll be read on the next pass through eat. Since sub digest will
 382     #    be called again on this token, don't read the argument to \end{}
 383     #    on the first call to sub digest
 384     # See TT::OpenFile::check_presynthetic for details
 385     return if $_[1]->check_presynthetic($_[0]); # May change $_[0]
 386
 387     my $Token = $_[1]->{tokens}->{$_[0]->[0]};
 388     my $count = $Token->{eatargs};
 389     my ($tok, @arr);
 390     # Read environment you're ending (just like in Begin::Group::Args)
 391     while ($count--) {
 392       $tok = $_[1]->eatGroup(1);
 393       if (@$tok == 3 and $tok->[0]->[0] eq '{') { # Special case for {\a}
 394         $tok = $tok->[1];
 395       }
 396       push(@arr,$tok);
 397     }
 398     #$_[0]->[0] .= ' ' . join ' ', map $_->[0], @arr;
 399     $_[0]->[3] = \@arr;
 400     my $s = $_[1]->starttoken;
 401
 402     # like TT::End::Group
 403     my $wa = $_[1]->curwaitforaction;
 404     my $w = $_[1]->popwait;
 405     # If you got '}' when you wanted '\end'
 406     warn "Expecting `$w', got $_[0]->[0] in `$ {$_[1]->{paragraph}}'"
 407       if $w ne $_[0]->[0];
 408     # If you got \end{foo} when you wanted \end{bar}
 409     if ($Token->{selfmatch} and $s->environment ne $_[0]->environment) {
 410       warn "Expecting `$w" , "{", $s->environment,"}', got $_[0]->[0]",
 411         "{", $_[0]->environment , "} in `$ {$_[1]->{paragraph}}'";
 412     }
 413
 414     # If there was a waitforaction then do it now
 415     &$wa if defined $wa;
 416     $_[0]->[4] = $s;            # Put the start data into the token
 417   }
 418
 419   sub print { # need special print to print name of environment
 420       my $obj = $_[0];
 421       my $env = $obj->environment; # assume we've already digested it
 422       # Use the method for printing a regular old token, but append env. name
 423       return $obj->SUPER::print . "{$env}";
 424   }
 425
 426   sub exact_print {
 427       my $obj = $_[0];
 428       my $env = $obj->environment; # assume we've already digested it
 429       # Use the method for printing a regular old token, but append env. name
 430       return $obj->SUPER::exact_print . "{$env}";
 431   }
 432
 433   sub environment {
 434   # this group's environment
 435       return $_[0]->[3]->[0]->[0];
 436   }
 437 } # end package TT::End::Group::Args
 438
 439 {
 440   package Text::TeX::Begin::Group::Args;
 441   @ISA = ('Text::TeX::Begin::Group');
 442
 443   sub digest {                  # 0: the token, 1: text object
 444     my $Token = $_[1]->{tokens}->{$_[0]->[0]};
 445     my $count = $Token->{eatargs};
 446     my ($tok, @arr);
 447     # Read the arguments, e.g., read "{blah}" for "\begin{blah}"
 448     while ($count--) {
 449       $tok = $_[1]->eatGroup(1);
 450       if (@$tok == 3 and $tok->[0]->[0] eq '{') { # Special case for {\a}
 451         $tok = $tok->[1];
 452       }
 453       push(@arr,$tok);
 454     }
 455     # $_[0]->[0] .= ' ' . join ' ', map $_->[0], @arr;
 456     $_[0]->[3] = \@arr;
 457     $_[0]->SUPER::digest($_[1]); # i.e. do Begin::Group stuff (pushwait)
 458   }
 459
 460   sub print { # need special print to print name of environment
 461       my $obj = $_[0];
 462       my $env = $obj->environment; # assume we've already digested it
 463       # Use the method for printing a regular old token, but append env. name
 464       return $obj->SUPER::print . "{$env}";
 465   }
 466
 467   sub exact_print {
 468       my $obj = $_[0];
 469       my $env = $obj->environment; # assume we've already digested it
 470       # Use the method for printing a regular old token, but append env. name
 471       return $obj->SUPER::exact_print . "{$env}";
 472   }
 473
 474   sub environment {
 475   # this group's environment
 476       return $_[0]->[3]->[0]->[0];
 477   }
 478 } # end package TT::Begin::Group::Args
 479
 480 {
 481   package Text::TeX::Begin::Group;
 482   @ISA = ('Text::TeX::Chunk');
 483   # 0: the token, 1: text object
 484   sub digest {
 485       my ($tok, $txt) = (shift, shift);
 486       # $dummy = the anonymous hash associated with this token in the %Tokens
 487       my $dummy = $txt->{tokens}->{$tok->[0]};
 488
 489       # see if this group requires different actions
 490       my $newaction; # action to do while parsing this group
 491       my $waitaction; # action to do when you hit the matching End::Group
 492       undef $waitaction; undef $newaction;
 493       if (defined $dummy) {
 494           if (exists $dummy->{newaction}) {
 495               $newaction = $dummy->{newaction};
 496           }
 497           if (exists $dummy->{waitaction}) {
 498               $waitaction = $dummy->{waitaction};
 499           }
 500       }
 501
 502       # push stuff onto stacks for this group
 503       $txt->pushwait($tok, $newaction, $waitaction);
 504   }
 505 }
 506
 507 {
 508   package Text::TeX::SelfMatch;
 509   @ISA = ('Text::TeX::Chunk');
 510   sub refine {
 511   # This subroutine is never used. See sub digest below
 512     if ($_[1]->curwait eq $_[0]->[0]) {  #if you match what you're waiting for
 513       bless $_[0], Text::TeX::End::Group;
 514     } else { #you need to BE matched
 515       bless $_[0], Text::TeX::Begin::Group;
 516     }
 517   }
 518   # 0: the token, 1: text object
 519   # Unfortunately, this sub IS necessary, because originally, a '$' (e.g.)
 520   #    is type TT::Token. Calling refine calls Chunk::refine, which blesses
 521   #    it to SelfMatch, but then SelfMatch::refine is never called! -Ak
 522   sub digest {                  # XXXX Should not be needed?
 523     # curwait returns undefined if not waiting for anything
 524     if (defined ($cwt = $_[1]->curwait) && $cwt eq $_[0]->[0]) {
 525       bless $_[0], Text::TeX::End::Group;
 526       $_[0]->Text::TeX::End::Group::digest($_[1]);
 527     } else {
 528       bless $_[0], Text::TeX::Begin::Group;
 529       $_[1]->pushwait($_[0]);
 530     }
 531   }
 532 }
 533
 534 @Text::TeX::Text::ISA = ('Text::TeX::Chunk');
 535 @Text::TeX::Paragraph::ISA = ('Text::TeX::Chunk');
 536 @Text::TeX::BegArgsTokenLookedAhead::ISA = ('Text::TeX::BegArgsToken');
 537 @Text::TeX::LookAhead::ISA = ('Text::TeX::EndArgsToken');
 538 @Text::TeX::Separator::ISA = ('Text::TeX::Chunk');
 539
 540 ########################   MAIN CODE   #########################################
 541 {
 542   package Text::TeX::GetParagraph;
 543   # Get a new paragraph from the LaTeX file
 544   # Get stuff until a non-empty line which follows an empty line
 545   sub new {
 546     shift;
 547     my $file = shift;
 548     my $fh;
 549     $fh = $ {$file->{fhs}}[-1] if @{$file->{fhs}};
 550     return undef if (not defined $fh or eof($fh)) and $file->{readahead} eq "";
 551
 552     # See below: every time we call GetParagraph, we read one extra (non-empty)
 553     #    line, which we store in readahead for next time
 554     my $string = $file->{readahead};
 555     $file->{readahead} = ""; #default in case eof($fh) or !defined($fh)
 556
 557     if (defined $fh) { # i.e., if eof($fh) just return readahead from last time
 558       # Read until an empty line (or eof)
 559       while (defined ($in = <$fh>)  && ($in =~ /\S/)) { # $in undefined at eof
 560         $string .= $in;
 561       }
 562       # $in has the empty line we just read in. Add it for verbatim copying
 563       $string .= $in if defined $in; # add whitespace
 564
 565       # Now read until NON-empty line (or eof)
 566       while (defined ($in = <$fh>) && ($in !~ /\S/)) {
 567         $string .= $in;
 568       }
 569
 570       # Next time, the paragraph will begin with the non-empty line we just read
 571       $file->{readahead} = $in if defined $in; # readahead stays "" at eof
 572     }
 573
 574     bless \$string; # ... and return it
 575   }
 576 }
 577
 578
 579 {
 580   package Text::TeX::OpenFile;
 581
 582   $refgen = "TeXOpenFile0000";
 583
 584   sub new {
 585 # Description of OpenFile object:
 586 # readahead - every time we read a paragraph we read one extra token. This
 587 #             token goes into 'readahead' and is prepended to the next paragraph
 588 #             we read
 589 # paragraph - stores the paragraph we're currently parsing
 590 # actions   - what to do. TT::OpenFile->process calls the function pointed
 591 #             to by actions on each token it eats
 592 # tokens    - reference to a hash describing all tokens that the parser
 593 #             should recognize
 594 # presynthetic - holds pseudotokens to deliver before a block ends.
 595 #             Specifically, it holds EndLocal tokens, so that we know to end
 596 #             a command like \em just before the '}' which ends a group
 597 # synthetic - holds pseudotokens to deliver after block ends - specifically,
 598 #             it holds ArgToken (and EndArgsToken) tokens, which it returns
 599 #             in between arguments (and after all arguments) to a command.
 600 #             (also holds LookAhead tokens, which are like EndArgsTokens)
 601 # pending_in - pseudotokens for input. Stuff is put here from synthetic or
 602 #             from pending_out, and if there's something in pending_in, sub
 603 #             eat doesn't bother eating a new token
 604 # pending_out - pseudotokens for output -- stuff put here from presynthetic
 605 #             If there's anything in pending_out it gets returned or put into
 606 #             pending_in, and sub eat doesn't bother eating a new token
 607     shift; my $file = shift; my %opt = @_;
 608     if (defined $file) {
 609        ++$refgen;
 610        open("::$refgen",$file) || die "Cannot open $file: $!";
 611        die "End of file `$file' during opening" if eof("::$refgen");
 612     }
 613     my $fhs = defined $file ? ["::$refgen"] : [];
 614     bless {  fhs => $fhs,
 615              readahead => ($opt{string} || ""),
 616              files => [$file],
 617              "paragraph" => undef,
 618              "tokens" => ($opt{tokens} || \%Text::TeX::Tokens),
 619              waitfors => [], options => \%opt,
 620              waitforactions => [],
 621              defaultacts => [$opt{defaultact}], # The last element is
 622                                                 # the default action
 623                                                 # for next deeper
 624                                                 # level
 625              actions => [defined $opt{action} ?
 626                          $opt{action} :
 627                          $opt{defaultact}],
 628              waitargcounts => [0],
 629              pending_out => [],
 630              pending_in => [],
 631              synthetic => [[]],
 632              presynthetic => [[]],
 633            };
 634   }
 635   sub DESTROY {
 636     my $in = shift; my $i = 0;
 637     for (@{$in->{fhs}}) {
 638       close($_)
 639         || die "Cannot close $ {$in->{files}}[$i]: $!";
 640       $i++;
 641     }
 642   }
 643
 644 # Return the paragraph we're currently reading
 645 #    If called with an argument, get a new paragraph at end of par, otherwise
 646 # don't. (Useful for looking ahead without affecting the file we're reading)
 647 # Either way, return nothing at end of par.
 648   sub paragraph {
 649     my $in = shift;
 650     my $get_paragraph = defined(shift);
 651     #print "ep.in=$in\n";
 652
 653     # Return something if not at end of par
 654     if ($in->{"paragraph"} and $ {$in->{"paragraph"}} ne "") {
 655       $in->{"paragraph"};
 656     # Done with all files and readahead?
 657     } elsif (@{$in->{fhs}} and eof($ {$in->{fhs}}[-1]) and !$in->{readahead}) {
 658       undef;
 659     # No files and done with readahead?
 660     } elsif (!@{$in->{fhs}} and $in->{readahead} eq '') {
 661       undef;
 662     } else {
 663       if ($get_paragraph) {
 664           #warn "getting new\n";
 665           $in->{"paragraph"} = new Text::TeX::GetParagraph $in;
 666       }
 667       return "";
 668     }
 669   }
 670
 671 # pushwait means don't do stuff you've got waiting (like EndLocal tokens)
 672 #    until you're done with something else
 673 # If Arg2 exists, then the upcoming group will have it as its action
 674 # If Arg3 exists, then we'll do it when we get to the end of the upcoming group
 675   sub pushwait {                # 0: text object, 1: token, 2: ????
 676     push(@{ $_[0]->{starttoken} }, $_[1]);
 677     push(@{ $_[0]->{waitfors} }, $_[0]->{tokens}{$_[1]->[0]}{waitfor});
 678     push(@{ $_[0]->{actions} },
 679          defined $_[2] ? $_[2] : $_[0]->{defaultacts}[-1]);
 680     push(@{ $_[0]->{waitforactions} }, $_[3]);
 681     push(@{ $_[0]->{synthetic} }, []);
 682     push(@{ $_[0]->{presynthetic} }, []); # so that a local argument won't
 683                                  # finish at end of the nested group
 684   }
 685
 686 # You've finished a group, so pop all the stuff pushwait pushed on
 687   sub popwait {
 688     if ($#{ $_[0]->{waitfors} } < 0) {
 689       warn "Got negative depth"; return;
 690     }
 691     my $rest = pop(@{ $_[0]->{synthetic} });
 692     warn "Not enough arguments" if @$rest;
 693     $rest = pop(@{ $_[0]->{presynthetic} });
 694     warn "Presynthetic events remaining" if @$rest;
 695     pop(@{ $_[0]->{starttoken} });
 696     pop(@{ $_[0]->{actions} });
 697     pop(@{ $_[0]->{waitforactions} });
 698     pop(@{ $_[0]->{waitfors} });
 699   }
 700
 701 # If there's anything in synthetic, pop it, reverse it, push it onto pending_out
 702   sub popsynthetic {
 703     my $rest = $ { $_[0]->{synthetic} }[-1];
 704     if (@$rest) {
 705       push @{ $_[0]->{pending_out} }, reverse @{ pop @$rest };
 706     }
 707   }
 708
 709   sub pushsynthetic {           # Add new list of events to do *after* the
 710                                 # next end of group.
 711     my $rest = $ { shift->{synthetic} }[-1];
 712     push @$rest, [@_];
 713   }
 714
 715   sub addpresynthetic {         # Add to the list of events to do *before*
 716                                 # the next end of group $uplevel above.
 717     my ($txt) = (shift);
 718     my $rest = $ { $txt->{presynthetic} }[-1];
 719     push @$rest, @_;
 720 #    if (@$rest) {
 721 #      push @{ @$rest->[-1] }, @_;
 722 #    } else {
 723 #      push @$rest, [@_];
 724 #    }
 725   }
 726
 727 # If anything exists in presynthetic[-1], pop it and CHANGE $_[1] to that.
 728 #    Push $_[1] AND (reverse of) anything else in presynthetic[-1] onto
 729 #    pending_in so that we do it before any more tokens are read.
 730 # Otherwise, just return false.
 731 # BUG?! I don't understand why we do reverse. It makes stuff come out FIFO!
 732   sub check_presynthetic {      # 0: text, 1: end token. Returns true on success
 733     if (@{ $_[0]->{presynthetic}[-1] }) {
 734       my $rest = $_[0]->{presynthetic}[-1];
 735       my $next = pop @$rest;
 736       push @{ $_[0]->{pending_in} }, $_[1], (reverse @$rest);
 737       $#$rest = -1;             # Delete them
 738       $_[1] = $next;
 739       return 1;
 740     }
 741   }
 742
 743
 744   sub curwait {
 745   # return what we're currently waiting for. Returns undef if not waiting
 746     my $ref = $_[0]->{waitfors}; $$ref[-1];
 747   }
 748
 749   sub curwaitforaction {
 750     my $ref = $_[0]->{waitforactions}; $$ref[-1];
 751   }
 752
 753   sub starttoken {
 754     my $ref = $_[0]->{starttoken}; $$ref[-1];
 755   }
 756
 757   # These are default bindings. You probably should override it.
 758
 759 # Eat '[blah]' or nothing. Brackets aren't returned in token's [0]
 760 #    but they are returned in [2], so exact_print will print them.
 761   sub eatOptionalArgument {
 762     # Call with no arg. Don't get new paragraph if at end of par
 763     my $in = shift->paragraph;
 764     return undef unless defined $in;
 765     my $comment = ( $$in =~ s/^\s*($Text::TeX::commentpattern)//o );
 766     if ($$in =~ s/^\s*$Text::TeX::optionalArgument//o) {
 767       new Text::TeX::Token $1, $comment, $&;
 768     } else {
 769       warn "No optional argument found";
 770       if ($comment) {new Text::TeX::Token undef, $comment}
 771       else {undef}
 772     }
 773   }
 774
 775 # eat {blah} when it's an argument to a BegArgsToken.
 776 # Returns a TT::Group of refined tokens
 777 #    This sub calls popsynthetic, so an ArgToken or EndArgsToken will be
 778 # popped from synthetic into pending_in. This means that the ArgToken or
 779 # EndArgsToken will be the next token returned by sub eat!
 780   sub eatRequiredArgument {
 781       my $txt = shift;
 782       my $group = $txt->eatGroup(@_);
 783       $txt->popsynthetic;
 784       return $group;
 785   }
 786
 787   sub eatFixedString {
 788     # Call with no arg. Don't get new paragraph if at end of par
 789     my $in = shift->paragraph;
 790     return undef unless defined $in;
 791     my $str = shift;
 792     my ($comment) = ( $$in =~ s/^\s*($Text::TeX::commentpattern)//o );
 793     if ($$in =~ s/^\s*$str//) {new Text::TeX::Token $&, $comment, $&}
 794     else {
 795       warn "String `$str' expected, not found";
 796       if ($comment) {new Text::TeX::Token undef, $comment}
 797       else {undef}
 798     }
 799   }
 800
 801 # Eat '{blah}'. Braces aren't returned. Stuff is returned as a Group,
 802 #   where each member is an (unrefined) TT::Text or Token
 803   sub eatBalanced {
 804     my $txt = shift;
 805     my ($in);
 806     warn "Did not get `{' when expected", return undef
 807       unless defined ($in = $txt->eatFixedString('{')) && defined ($in->[0]);
 808     $txt->eatBalancedRest;
 809   }
 810
 811 # Eat 'blah}'
 812   sub eatBalancedRest {
 813     my $txt = shift;
 814     my ($count,$in,@in) = (1);
 815   EAT:
 816     {
 817       warn "Unfinished balanced next", last EAT
 818         unless defined ($in = $txt->eatMultiToken) && defined $in->[0];
 819       push(@in,$in);
 820       $count++,redo if $in->[0] eq '{';
 821       $count-- if $in->[0] eq '}';
 822       # if !$count, remove '}' you just read and exit, else keep going
 823       pop(@in), last EAT unless $count;
 824       redo EAT;
 825     }
 826     bless \@in, 'Text::TeX::Group';
 827   }
 828
 829 # Eat stuff, either a token or a group (within {})
 830 #    Tokens will be refined.
 831 #    Braces ARE in the group
 832   sub eatGroup {                # If arg2==1 will eat exactly one
 833                                 # group, otherwise a group or a
 834                                 # multitoken.
 835     my $txt = shift;
 836     local ($in,$r,@in); #Note, this is a stupid way to name variables -Ak
 837     if (defined ($in[0] = $txt->eatMultiToken(shift)) and defined $in[0]->[0]) {
 838       $in[0]->refine($txt);
 839       if (ref $in[0] ne 'Text::TeX::Begin::Group') {
 840         return $in[0];
 841       } else { #it is the beginning of a group. So recurse until End::Group
 842         while (defined ($r=ref($in = $txt->eatGroup)) # Eat many groups
 843                && $r ne 'Text::TeX::End::Group') {
 844           push(@in,$in);
 845         }
 846         if (defined $r) {push(@in,$in)}
 847         else {warn "Uncompleted group"}
 848       } # end if Begin::Group
 849     } else {
 850       warn "Got nothing when argument expected";
 851       return undef;
 852     }
 853     bless \@in, 'Text::TeX::Group';
 854   }
 855
 856   sub eatUntil {                # We suppose that the text to match
 857                                 # fits in a paragraph
 858     my $txt = shift;
 859     my $m = shift;
 860     my ($in,@in);
 861     while ( (!defined $txt->{'paragraph'} || $ {$txt->{'paragraph'}} !~ /$m/)
 862            && defined ($in = $txt->eatGroup(1))) {
 863       push(@in,@$in);
 864     }
 865     ($ {$txt->{'paragraph'}} =~ s/$m//) || warn "Delimiter `$m' not found";
 866     bless \@in, 'Text::TeX::Group';
 867   }
 868
 869 # return next token without eating it. Return '' if end of paragraph
 870   sub lookAheadToken {          # If arg2, will eat one token - WHY!? -Ak
 871     my $txt = shift;
 872     # Call paragraph with no argument to say we're "just looking"
 873     my $in = $txt->paragraph;
 874     return '' unless $in;       # To be able to match without warnings
 875     my $comment = undef;
 876     if ($$in =~
 877         /^(?:\s*)(?:$Text::TeX::commentpattern)?($Text::TeX::tokenpattern)/o) {
 878       if (defined $2) {return $1} #if 1 usualtokenclass char, return it ($1==$2)
 879       elsif (defined $3) {return "\\$3"} # Multiletter (\[a-zA-Z]+)
 880       elsif (defined $1) {return $1} # \" or notusualtokenclass
 881     }
 882     return '';
 883   }
 884
 885 # This is the main subroutine for eating a token.
 886 # It returns a token as either TT::Text or TT::Token.
 887 # Or it returns TT::Paragraph if it had to read a new paragraph in the TeX file.
 888   sub eatMultiToken {           # If arg2, will eat one token
 889     my $txt = shift;
 890     # call paragraph with an arg so it gets new paragraph if necessary
 891     my $in = $txt->paragraph(1);
 892     return undef unless defined $in;
 893     return new Text::TeX::Paragraph unless $in; #i.e., if it's a new paragraph
 894     my $comment = undef;
 895     # eat a comment that comes before the token we're about to read
 896     $comment = $2 if $$in =~ s/^(\s*)($Text::TeX::commentpattern)/$1/o;
 897     my $nomulti = shift; #if arg2, eat one token
 898     # Eat text or a token
 899     # Cannot use if () BLOCK, because $& is local.
 900     $got = $$in =~ s/^\s*($Text::TeX::tokenpattern)//o  if $nomulti;
 901     $got = $$in =~ s/^\s*($Text::TeX::multitokenpattern)//o     unless $nomulti;
 902     # $1 = \[^a-zA-Z] or special char like ~
 903     # $2 = regular text. Return $& to include leading space!
 904     # $3 = [a-zA-Z]+ which followed a backslash, i.e., a 'multiletter' command
 905     if ($got and defined $2) {new Text::TeX::Text $&, $comment, $&}
 906     elsif ($got and defined $3) {new Text::TeX::Token "\\$3", $comment, $&}
 907     elsif ($got and defined $1) {new Text::TeX::Token $1, $comment, $&}
 908     elsif ($comment) {new Text::TeX::Token undef, $comment, ""}
 909     else {undef}
 910   }
 911
 912 # This is the main subroutine for eating the file.
 913 # It eats tokens and returns them. Sometimes it also returns pseudotokens.
 914 # Basic rundown:
 915 #  - if there's stuff in pending_out, return it
 916 #  - otherwise get stuff from pending_in OR eat a new token
 917 #  - refine the token, then digest it
 918 # (- pop stuff from synthetic into pending_out for next time UNLESS
 919 #      you read a new command that takes arguments. E.g. x^\sqrt)
 920 #  - return the token unless it's special & has a 'type'
 921 #  - based on the type, set up one or more tokens to be handled later
 922 #    so that, e.g., type 'report_args' returns BegArgsToken, followed
 923 #    later by some number of ArgToken's, followed by an EndArgsToken
 924 #
 925 #    LookAhead tokens can be used for _^. If you have x^a_b, the EndArgsToken
 926 # for the ^ will be changed to a LookAhead, which notes that a _ is next.
 927 # The _ has a BegArgsLookedAhead token instead of BegArgsToken. If anything
 928 # other than _ or ^ follows the argument to the LookAhead token (for example,
 929 # x^2+b, a regular old EndArgsToken is returned for the ^. reLyX doesn't use
 930 # the LookAhead functionality. (phew!)
 931   sub eat {
 932     my $txt = shift;
 933     if ( @{ $txt->{pending_out} } ) {
 934       my $out = pop @{ $txt->{pending_out} };
 935       # E.g., if you have x^\sqrt2 -- when you pop and return the \sqrt
 936       # EndArgsToken, you need to make sure the ^ EndArgsToken falls out next.
 937       #    But if pending_out is an ArgToken, *don't* pop the next thing
 938       # (next ArgToken or EndArgsToken) out of synthetic yet
 939       # Most often, synthetic will be empty, so popsynthetic will do nothing
 940       $txt->popsynthetic if ref($out) eq 'Text::TeX::EndArgsToken';
 941       if (ref $out eq 'Text::TeX::LookAhead') {
 942         my $in = $txt->lookAheadToken;
 943         if (defined ($res = $out->[0][2]{$in})) {
 944           push @{$out->[0]}, $in, $res;
 945           # actually eat what you looked ahead
 946           $in = $txt->eatMultiToken(1); # XXXX may be wrong if next
 947                                         # token needs to be eaten in
 948                                         # the style `multi', like \left.
 949           # Put it at beginning of pending_in so we do E.g., EndLocals first
 950           splice @{ $txt->{pending_in} },
 951             0, 0, (bless \$in, 'Text::TeX::LookedAhead');
 952           return $out;
 953         } else {
 954           return bless $out, 'Text::TeX::EndArgsToken';
 955         }
 956       } else {
 957         return $out;
 958       }
 959     } # end if pending_out
 960
 961     # We didn't get & return stuff from pending_out. So try to get stuff
 962     #    from pending_in. If there's nothing there, eat a new token.
 963     my $in = pop @{ $txt->{pending_in} };
 964     my $after_lookahead;
 965     if (defined $in) {
 966       # after_lookahead is true if we got a LookedAhead token from pending_out
 967       #    because we looked ahead when there was a LookAhead token
 968       $in = $$in, $after_lookahead = 1
 969         if ref $in eq 'Text::TeX::LookedAhead';
 970     } else {
 971       my $one;
 972       # This will happen if we did pushsynthetic on the last token.
 973       # That happened for report_args tokens, i.e., things that require
 974       #     arguments. \frac, e.g., will read either a character or
 975       #     a token *or* the '{' that begins a group, then popsynthetic below.
 976       # \frac puts *two* tokens in {synthetic} so $one will be set TWICE
 977       $one = 1 if @{ $txt->{synthetic}[-1] }; # Need to eat a group.
 978       $in = $txt->eatMultiToken($one);
 979     }
 980     return undef unless defined $in;
 981     $in->refine($txt);
 982     $in->digest($txt);
 983     my ($Token, $type, @arr);
 984     unless (defined $in
 985             && defined $in->[0]
 986             && $in->[0] =~ /$Text::TeX::active/o
 987             && defined ( $Token = $txt->{tokens}->{$in->[0]} )
 988             && exists ($Token->{"Type"})
 989             ) {
 990         $txt->popsynthetic;
 991         return $in;
 992     }
 993     $type = $Token->{Type};
 994     $txt->popsynthetic unless $type eq 'report_args';
 995
 996     # If the token is special enough that it's got a 'type', do more stuff
 997     my $out = $in;
 998     if ($type eq 'action') {
 999 #      return &{$Token->{sub}}($in);
1000       return &{$Token->{'sub'}}($in); #Without 's it breaks strict refs -Ak
1001     } elsif ($type eq 'argmask') {
1002       # eatWithMask;            # ????
1003     } elsif ($type eq 'args') {
1004       # Args eaten already
1005     } elsif ($type eq 'local') {
1006       $txt->addpresynthetic(new Text::TeX::EndLocal $in);
1007     } elsif ($type eq 'report_args') {
1008       my $count = $Token->{count};
1009       my $ordinal = $count;
1010       my $res;
1011       if ($res = $Token->{lookahead}) {
1012         $txt->pushsynthetic(new Text::TeX::LookAhead [$in, $count, $res]);
1013       } else {
1014         # This will fall out after we read all the args this token needs
1015         $txt->pushsynthetic(new Text::TeX::EndArgsToken [$in, $count]);
1016       }
1017       # One of these tokens will fall out after we finish each arg (except last)
1018       # Push on 3,2,1, so that when we *popsynthetic*, 1 will come off first
1019       # followed by 2, 3
1020       # ArgToken->[0][2] will then be the number of args read so far for
1021       # the token held in ArgToken->[0][0]
1022       while (--$ordinal) {
1023         $txt->pushsynthetic(new Text::TeX::ArgToken [$in, $count, $ordinal]);
1024       }
1025       if ($after_lookahead) {
1026         $out = new Text::TeX::BegArgsTokenLookedAhead [$in, $count];
1027       } else {
1028         $out = new Text::TeX::BegArgsToken [$in, $count];
1029       }
1030     } else {
1031       warn "Format of token data unknown for `", $in->[0], "'";
1032     }
1033     return $out;
1034   }
1035
1036   sub report_arg {
1037     my $n = shift;
1038     my $max = shift;
1039     my $act = shift;
1040     my $lastact = shift;
1041     if ($n == $max) {
1042       &$lastact($n);
1043     } else {
1044       &$act($n,$max);
1045     }
1046   }
1047
1048   sub eatDefine {
1049     my $txt = shift;
1050     my ($args, $body);
1051     warn "No `{' found after defin", return undef
1052       unless $args = $txt->eatUntil('{');
1053     warn "Argument list @$args too complicated", return undef
1054       unless @$args == 1 && $$args[0] =~ /^(\ \#\d)*$/;
1055     warn "No `}' found after defin", return undef
1056       unless $body = $txt->eatBalancedRest;
1057     #my @args=split(/(\#[\d\#])/,$$);       # lipa
1058   }
1059
1060 # This is the main subroutine called by parsing programs. Basically, it
1061 #     keeps eating tokens, then calling $txt->actions on that token
1062   sub process {
1063     my ($txt, $eaten, $act) = (shift);
1064     while (defined ($eaten = $txt->eat)) {
1065       if (defined ($act = $txt->{actions}[-1])) {
1066         &$act($eaten,$txt);
1067       }
1068     }
1069   }
1070 } #END Text::TeX::OpenFile
1071
1072 #####################    MORE GLOBAL STUFF    ##################################
1073 %super_sub_lookahead = qw( ^ 1 _ 0 \\sb 0 \\sp 1 \\Sp 1 \\Sb 0 );
1074
1075 # class => 'where to bless to', Type => how to process
1076 # eatargs => how many args to swallow before digesting
1077
1078 %Tokens = (
1079   '{' => {'class' => 'Text::TeX::Begin::Group', 'waitfor' => '}'},
1080   '}' => {'class' => 'Text::TeX::End::Group'},
1081   "\$" => {'class' => 'Text::TeX::SelfMatch', waitfor => "\$"},
1082   '$$' => {'class' => 'Text::TeX::SelfMatch', waitfor => '$$'},
1083   '\begin' => {class => 'Text::TeX::Begin::Group::Args',
1084                eatargs => 1, 'waitfor' => '\end', selfmatch => 1},
1085   '\end' => {class => 'Text::TeX::End::Group::Args', eatargs => 1, selfmatch => 1},
1086   '\left' => {class => 'Text::TeX::Begin::Group::Args',
1087                eatargs => 1, 'waitfor' => '\right'},
1088   '\right' => {class => 'Text::TeX::End::Group::Args', eatargs => 1},
1089   '\frac' => {Type => 'report_args', count => 2},
1090   '\sqrt' => {Type => 'report_args', count => 1},
1091   '\text' => {Type => 'report_args', count => 1},
1092   '\operatorname' => {Type => 'report_args', count => 1},
1093   '\operatornamewithlimits' => {Type => 'report_args', count => 1},
1094   '^' => {Type => 'report_args', count => 1,
1095           lookahead => \%super_sub_lookahead },
1096   '_' => {Type => 'report_args', count => 1,
1097           lookahead => \%super_sub_lookahead },
1098   '\em' => {Type => 'local'},
1099   '\bold' => {Type => 'local'},
1100   '\it' => {Type => 'local'},
1101   '\rm' => {Type => 'local'},
1102   '\mathcal' => {Type => 'local'},
1103   '\mathfrak' => {Type => 'local'},
1104   '\mathbb' => {Type => 'local'},
1105   '\\\\' => {'class' => 'Text::TeX::Separator'},
1106   '&' => {'class' => 'Text::TeX::Separator'},
1107 );
1108
1109 ##############   I NEVER USE ANYTHING BELOW THIS LINE!! -Ak   ##################
1110 {
1111   my $i = 0;
1112   @symbol = (
1113        (undef) x 8,             # 1st row
1114        (undef) x 8,
1115        (undef) x 8,             # 2nd row
1116        (undef) x 8,
1117        undef, undef, '\forall', undef, '\exists', undef, undef, '\???', # 3rd: symbols
1118        (undef) x 8,
1119        (undef) x 8,     # 4th: numbers and symbols
1120        (undef) x 8,
1121        '\???', ( map {"\\$_"}
1122                  qw(Alpha Beta Chi Delta Epsilon Phi Gamma
1123                  Eta Iota vartheta Kappa Lambda Mu Nu Omicron
1124                  Pi Theta Rho Sigma Tau Ypsilon varsigma Omega
1125                  Xi Psi Zeta)), undef, '\therefore', undef, '\perp', undef,
1126        undef, ( map {"\\$_"}
1127                 qw(alpha beta chi delta varepsilon phi gamma
1128                    eta iota varphi kappa lambda mu nu omicron
1129                    pi theta rho sigma tau ypsilon varpi omega
1130                    xi psi zeta)), undef, undef, undef, undef, undef,
1131        (undef) x 8,             # 9st row
1132        (undef) x 8,
1133        (undef) x 8,             # 10nd row
1134        (undef) x 8,
1135        undef, undef, undef, '\leq', undef, '\infty', undef, undef, # 11th row
1136        undef, undef, undef, undef, '\from', undef, '\to', undef,
1137        '\circ', '\pm', undef, '\geq', '\times', undef, '\partial', '\bullet', # 12th row
1138        undef, '\neq', '\equiv', '\approx', '\dots', '\mid', '\hline', undef,
1139        '\Aleph', undef, undef, undef, '\otimes', '\oplus', '\empty', '\cap', # 13th row
1140        '\cup', undef, undef, undef, undef, undef, '\in', '\notin',
1141        undef, '\nabla', undef, undef, undef, '\prod', undef, '\cdot', # 14th row
1142        undef, '\wedge', '\vee', undef, undef, undef, undef, undef,
1143        undef, '\<', undef, undef, undef, '\sum', undef, undef, # 15th row
1144        (undef) x 8,
1145        undef, '\>', '\int', (undef) x 5, # 16th row
1146        (undef) x 8,
1147       );
1148   for (@symbol) {
1149     $xfont{$_} = ['symbol', chr($i)] if defined $_;
1150     $i++;
1151   }
1152 }
1153
1154 # This list was autogenerated by the following script:
1155 # Some handediting is required since MSSYMB.TEX is obsolete.
1156
1157 ## Usage is like:
1158 ##              extract_texchar.pl  PLAIN.TEX MSSYMB.TEX
1159 ##$family = shift;
1160
1161 #%fonts = (2 => "cmsy", 3 => "cmex", '\\msx@' => msam, '\\msy@' => msbm, );
1162
1163 #while (defined ($_ = <ARGV>)) {
1164 #  $list{$fonts{$2}}[hex $3] = $1
1165 #    if /^\s*\\mathchardef(\\\w+)=\"\d([23]|\\ms[xy]\@)([\da-fA-F]+)\s+/o;
1166 #}
1167
1168 #for $font (keys %list) {
1169 #  print "\@$font = (\n  ";
1170 #  for $i (0 .. $#{$list{$font}}/8) {
1171 #    print join ', ', map {packit($_)} @{$list{$font}}[ 8*$i .. 8*$i+7 ];
1172 #    print ",\n  ";
1173 #  }
1174 #  print ");\n\n";
1175 #}
1176
1177 #sub packit {
1178 #  my $cs = shift;
1179 #  if (defined $cs) {
1180 #    #$cs =~ s/\\\\/\\\\\\\\/g;
1181 #    "'$cs'";
1182 #  } else {
1183 #    'undef';
1184 #  }
1185 #}
1186
1187 @cmsy = (
1188   undef, '\cdotp', '\times', '\ast', '\div', '\diamond', '\pm', '\mp',
1189   '\oplus', '\ominus', '\otimes', '\oslash', '\odot', '\bigcirc', '\circ', '\bullet',
1190   '\asymp', '\equiv', '\subseteq', '\supseteq', '\leq', '\geq', '\preceq', '\succeq',
1191   '\sim', '\approx', '\subset', '\supset', '\ll', '\gg', '\prec', '\succ',
1192   '\leftarrow', '\rightarrow', '\uparrow', '\downarrow', '\leftrightarrow', '\nearrow', '\searrow', '\simeq',
1193   '\Leftarrow', '\Rightarrow', '\Uparrow', '\Downarrow', '\Leftrightarrow', '\nwarrow', '\swarrow', '\propto',
1194   '\prime', '\infty', '\in', '\ni', '\bigtriangleup', '\bigtriangledown', '\not', '\mapstochar',
1195   '\forall', '\exists', '\neg', '\emptyset', '\Re', '\Im', '\top', '\perp',
1196   '\aleph', undef, undef, undef, undef, undef, undef, undef,
1197   undef, undef, undef, undef, undef, undef, undef, undef,
1198   undef, undef, undef, undef, undef, undef, undef, undef,
1199   undef, undef, undef, '\cup', '\cap', '\uplus', '\wedge', '\vee',
1200   '\vdash', '\dashv', undef, undef, undef, undef, undef, undef,
1201   '\langle', '\rangle', '\mid', '\parallel', undef, undef, '\setminus', '\wr',
1202   undef, '\amalg', '\nabla', '\smallint', '\sqcup', '\sqcap', '\sqsubseteq', '\sqsupseteq',
1203   undef, '\dagger', '\ddagger', undef, '\clubsuit', '\diamondsuit', '\heartsuit', '\spadesuit',
1204   );
1205
1206 @cmex = (
1207   undef, undef, undef, undef, undef, undef, undef, undef, # 0-7
1208   undef, undef, undef, undef, undef, undef, undef, undef, # 8-15
1209   undef, undef, undef, undef, undef, undef, undef, undef, # 16-23
1210   undef, undef, undef, undef, undef, undef, undef, undef, # 24-31
1211   undef, undef, undef, undef, undef, undef, undef, undef, # 32-39
1212   undef, undef, undef, undef, undef, undef, undef, undef, # 40-47
1213   undef, undef, undef, undef, undef, undef, undef, undef, # 48-55
1214   undef, undef, undef, undef, undef, undef, undef, undef, # 56-64
1215   undef, undef, undef, undef, undef, undef, '\bigsqcup', undef, # 64-71
1216   '\ointop', undef, '\bigodot', undef, '\bigoplus', undef, '\bigotimes', undef, # 72-79
1217   '\sum', '\prod', '\intop', '\bigcup', '\bigcap', '\biguplus', '\bigwedge', '\bigvee', # 80-87
1218   undef, undef, undef, undef, undef, undef, undef, undef,
1219   '\coprod', undef, undef, undef, undef, undef, undef, undef,
1220   );
1221
1222 @msam = (
1223   '\boxdot', '\boxplus', '\boxtimes', '\square', '\blacksquare', '\centerdot', '\lozenge', '\blacklozenge',
1224   '\circlearrowright', '\circlearrowleft', '\rightleftharpoons', '\leftrightharpoons', '\boxminus', '\Vdash', '\Vvdash', '\vDash',
1225   '\twoheadrightarrow', '\twoheadleftarrow', '\leftleftarrows', '\rightrightarrows', '\upuparrows', '\downdownarrows', '\upharpoonright', '\downharpoonright',
1226   '\upharpoonleft', '\downharpoonleft', '\rightarrowtail', '\leftarrowtail', '\leftrightarrows', '\rightleftarrows', '\Lsh', '\Rsh',
1227   '\rightsquigarrow', '\leftrightsquigarrow', '\looparrowleft', '\looparrowright', '\circeq', '\succsim', '\gtrsim', '\gtrapprox',
1228   '\multimap', '\therefore', '\because', '\doteqdot', '\triangleq', '\precsim', '\lesssim', '\lessapprox',
1229   '\eqslantless', '\eqslantgtr', '\curlyeqprec', '\curlyeqsucc', '\preccurlyeq', '\leqq', '\leqslant', '\lessgtr',
1230   '\backprime', undef, '\risingdotseq', '\fallingdotseq', '\succcurlyeq', '\geqq', '\geqslant', '\gtrless',
1231   '\sqsubset', '\sqsupset', '\vartriangleright', '\vartriangleleft', '\trianglerighteq', '\trianglelefteq', '\bigstar', '\between',
1232   '\blacktriangledown', '\blacktriangleright', '\blacktriangleleft', undef, undef, '\vartriangle', '\blacktriangle', '\triangledown',
1233   '\eqcirc', '\lesseqgtr', '\gtreqless', '\lesseqqgtr', '\gtreqqless', '\yen', '\Rrightarrow', '\Lleftarrow',
1234   '\checkmark', '\veebar', '\barwedge', '\doublebarwedge', '\angle', '\measuredangle', '\sphericalangle', '\varpropto',
1235   '\smallsmile', '\smallfrown', '\Subset', '\Supset', '\Cup', '\Cap', '\curlywedge', '\curlyvee',
1236   '\leftthreetimes', '\rightthreetimes', '\subseteqq', '\supseteqq', '\bumpeq', '\Bumpeq', '\lll', '\ggg',
1237   '\ulcorner', '\urcorner', '\circledR', '\circledS', '\pitchfork', '\dotplus', '\backsim', '\backsimeq',
1238   '\llcorner', '\lrcorner', '\maltese', '\complement', '\intercal', '\circledcirc', '\circledast', '\circleddash',
1239   );
1240
1241 @msbm = (
1242   '\lvertneqq', '\gvertneqq', '\nleq', '\ngeq', '\nless', '\ngtr', '\nprec', '\nsucc',
1243   '\lneqq', '\gneqq', '\nleqslant', '\ngeqslant', '\lneq', '\gneq', '\npreceq', '\nsucceq',
1244   '\precnsim', '\succnsim', '\lnsim', '\gnsim', '\nleqq', '\ngeqq', '\precneqq', '\succneqq',
1245   '\precnapprox', '\succnapprox', '\lnapprox', '\gnapprox', '\nsim', '\ncong', undef, undef,
1246   '\varsubsetneq', '\varsupsetneq', '\nsubseteqq', '\nsupseteqq', '\subsetneqq', '\supsetneqq', '\varsubsetneqq', '\varsupsetneqq',
1247   '\subsetneq', '\supsetneq', '\nsubseteq', '\nsupseteq', '\nparallel', '\nmid', '\nshortmid', '\nshortparallel',
1248   '\nvdash', '\nVdash', '\nvDash', '\nVDash', '\ntrianglerighteq', '\ntrianglelefteq', '\ntriangleleft', '\ntriangleright',
1249   '\nleftarrow', '\nrightarrow', '\nLeftarrow', '\nRightarrow', '\nLeftrightarrow', '\nleftrightarrow', '\divideontimes', '\varnothing',
1250   '\nexists', undef, undef, undef, undef, undef, undef, undef,
1251   undef, undef, undef, undef, undef, undef, undef, undef,
1252   undef, undef, undef, undef, undef, undef, undef, undef,
1253   undef, undef, undef, undef, undef, undef, undef, undef,
1254   undef, undef, undef, undef, undef, undef, '\mho', '\eth',
1255   '\eqsim', '\beth', '\gimel', '\daleth', '\lessdot', '\gtrdot', '\ltimes', '\rtimes',
1256   '\shortmid', '\shortparallel', '\smallsetminus', '\thicksim', '\thickapprox', '\approxeq', '\succapprox', '\precapprox',
1257   '\curvearrowleft', '\curvearrowright', '\digamma', '\varkappa', undef, '\hslash', '\hbar', '\backepsilon',
1258   );
1259
1260 # Temporary workaround against Tk's \n (only cmsy contains often-used \otimes):
1261
1262 $cmsy[ord "\n"] = undef;
1263
1264 for $font (qw(cmsy cmex msam msbm)) {
1265   for $num (0 .. $#{$font}) {
1266     $xfont{$$font[$num]} = [$font, chr($num)] if defined $$font[$num];
1267   }
1268 }
1269
1270 %aliases = qw(
1271               \int \intop \oint \ointop \restriction \upharpoonright
1272               \Doteq \doteqdot \doublecup \Cup \doublecap \Cap
1273               \llless \lll \gggtr \ggg \lnot \neg \land \wedge
1274               \lor \vee \le \leq \ge \geq \owns \ni \gets \leftarrow
1275               \to \rightarrow \< \langle \> \rangle \| \parallel
1276              );
1277
1278 for $from (keys %aliases) {
1279   $xfont{$from} = $xfont{$aliases{$from}} if exists $xfont{$aliases{$from}};
1280 }
1281
1282
1283 # Autoload methods go after =cut, and are processed by the autosplit program.
1284
1285 1;
1286 __END__
1287
1288 =head1 NAME
1289
1290 Text::TeX -- Perl module for parsing of C<TeX>.
1291
1292 =head1 SYNOPSIS
1293
1294   use Text::TeX;
1295
1296   sub report {
1297     my($eaten,$txt) = (shift,shift);
1298     print "Comment: `", $eaten->[1], "'\n" if defined $eaten->[1];
1299     print "@{$txt->{waitfors}} ", ref $eaten, ": `", $eaten->[0], "'";
1300     if (defined $eaten->[3]) {
1301       my @arr = @{ $eaten->[3] };
1302       foreach (@arr) {
1303         print " ", $_->print;
1304       }
1305     }
1306     print "\n";
1307   }
1308
1309   my $file = new Text::TeX::OpenFile 'test.tex',
1310     'defaultact' => \&report;
1311   $file->process;
1312
1313 =head1 DESCRIPTION
1314
1315 A new C<TeX> parser is created by
1316
1317   $file = new Text::TeX::OpenFile $filename, attr1 => $val1, ...;
1318
1319 $filename may be C<undef>, in this case the text to parse may be
1320 specified in the attribute C<string>.
1321
1322 Recognized attributes are:
1323
1324 =over 12
1325
1326 =item C<string>
1327
1328 contains the text to parse before parsing $filename.
1329
1330 =item C<defaultact>
1331
1332 denotes a procedure to submit C<output tokens> to.
1333
1334 =item C<tokens>
1335
1336 gives a hash of C<descriptors> for C<input token>. A sane default is
1337 provided.
1338
1339 =back
1340
1341 A call to the method C<process> launches the parser.
1342
1343 =head2 Tokenizer
1344
1345 When the parser is running, it processes input stream by splitting it
1346 into C<input tokens> using some I<heuristics> similar to the actual
1347 rules of TeX tokenizer. However, since it does not use I<the exact
1348 rules>, the resulting tokens may be wrong if some advanced TeX command
1349 are used, say, the character classes are changed.
1350
1351 This should not be of any concern if the stream in question is a
1352 "user" file, but is important for "packages".
1353
1354 =head2 Digester
1355
1356 The processed C<input tokens> are handled to the digester, which
1357 handles them according to the provided C<tokens> attribute.
1358
1359 =head2 C<tokens> attribute
1360
1361 This is a hash reference which describes how the C<input tokens>
1362 should be handled. A key to this hash is a literal like C<^> or
1363 C<\fraction>. A value should be another hash reference, with the
1364 following keys recognized:
1365
1366 =over 7
1367
1368 =item class
1369
1370 Into which class to bless the token. Several predefined classes are
1371 provided. The default is C<Text::TeX::Token>.
1372
1373 =item Type
1374
1375 What kind of special processing to do with the input after the
1376 C<class> methods are called. Recognized C<Type>s are:
1377
1378 =over 10
1379
1380 =item report_args
1381
1382 When the token of this C<Type> is encountered, it is converted into
1383 C<Text::Tex::BegArgsToken>. Then the arguments are processed as usual,
1384 and an C<output token> of type C<Text::Tex::ArgToken> is inserted
1385 between them. Finally, after all the arguments are processed, an
1386 C<output token> C<Text::Tex::EndArgsToken> is inserted.
1387
1388 The first element of these simulated C<output tokens> is an array
1389 reference with the first element being the initial C<output token>
1390 which generated this sequence. The second element of the internal
1391 array is the number of arguments required by the C<input token>. The
1392 C<Text::Tex::ArgToken> token has a third element, which is the ordinal
1393 of the argument which ends immediately before this token.
1394
1395 If requested, a token C<Text::Tex::LookAhead> may be returned instead
1396 of C<Text::Tex::EndArgsToken>. The additional elements of
1397 C<$token->[0]> are: the reference to the corresponding C<lookahead>
1398 attribute, the relevant key (text of following token) and the
1399 corresponding value.
1400
1401 In such a case the input token which was looked-ahead would generate
1402 an output token of type C<Text::Tex::BegArgsTokenLookedAhead> (if it
1403 usually generates C<Text::Tex::BegArgsToken>).
1404
1405 =item local
1406
1407 Means that these macro introduces a local change, which should be
1408 undone at the end of enclosing block. At the end of the block an
1409 output event C<Text::TeX::EndLocal> is delivered, with C<$token->[0]>
1410 being the output token for the I<local> event starting.
1411
1412 Useful for font switching.
1413
1414 =back
1415
1416 =back
1417
1418 Some additional keys may be recognized by the code for the particular
1419 C<class>.
1420
1421 =over 12
1422
1423 =item C<count>
1424
1425 number of arguments to the macro.
1426
1427 =item C<waitfor>
1428
1429 gives the matching token for a I<starting delimiter> token.
1430
1431 =item C<eatargs>
1432
1433 number of tokens to swallow literally and put into the relevant slot
1434 of the C<output token>. The surrounding braces are stripped.
1435
1436 =item C<selfmatch>
1437
1438 is used with C<eatargs==1>. Denotes that the matching token is also
1439 C<eatargs==1>, and the swallowed tokens should coinside (like with
1440 C<\begin{blah} ... \end{blah}>).
1441
1442 =item C<lookahead>
1443
1444 is a hash with keys being texts of tokens which need to be treated
1445 specially after the end of arguments for the current token. If the
1446 corresponding text follows the token indeed, a token
1447 C<Text::Tex::LookAhead> is returned instead of
1448 C<Text::Tex::EndArgsToken>.
1449
1450 =back
1451
1452 =head2 Symbol font table
1453
1454 The hash %Text::TeX::xfont contains the translation table from TeX
1455 tokens into the corresponding font elements. The values are array
1456 references of the form C<[fontname, char]>, Currently the only font
1457 supported is C<symbol>.
1458
1459 =cut
1460
1461 =head1 AUTHOR
1462
1463 Ilya Zakharevich, ilya@math.ohio-state.edu
1464
1465 =head1 SEE ALSO
1466
1467 perl(1).
1468
1469 =cut