/* Update to Inform v6.42 -- [inform.git] / src / lexer.c */
1 /* ------------------------------------------------------------------------- */
2 /*   "lexer" : Lexical analyser                                              */
3 /*                                                                           */
4 /*   Part of Inform 6.42                                                     */
5 /*   copyright (c) Graham Nelson 1993 - 2024                                 */
6 /*                                                                           */
7 /* Inform is free software: you can redistribute it and/or modify            */
8 /* it under the terms of the GNU General Public License as published by      */
9 /* the Free Software Foundation, either version 3 of the License, or         */
10 /* (at your option) any later version.                                       */
11 /*                                                                           */
12 /* Inform is distributed in the hope that it will be useful,                 */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of            */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the              */
15 /* GNU General Public License for more details.                              */
16 /*                                                                           */
17 /* You should have received a copy of the GNU General Public License         */
18 /* along with Inform. If not, see https://gnu.org/licenses/                  */
19 /*                                                                           */
20 /* ------------------------------------------------------------------------- */
21
22 #include "header.h"
23
/* Lexer state shared with the rest of the compiler.                         */

int total_source_line_count,            /* Number of source lines so far     */

    no_hash_printed_yet,                /* Have not yet printed the first #  */
    hash_printed_since_newline,         /* A hash has been printed since the
                                           most recent new-line was printed
                                           (generally as a result of an error
                                           message or the start of pass)     */
    dont_enter_into_symbol_table,       /* Return names as text (with
                                           token type UQ_TT) and not as
                                           entries in the symbol table,
                                           when TRUE. If -2, only the
                                           keyword table is searched.        */
    return_sp_as_variable;              /* When TRUE, the word "sp" denotes
                                           the stack pointer variable
                                           (used in assembly language only)  */
int next_token_begins_syntax_line;      /* When TRUE, start a new syntax
                                           line (for error reporting, etc.)
                                           on the source code line where
                                           the next token appears            */

int32 last_mapped_line;  /* Last syntax line reported to debugging file      */
46 /* ------------------------------------------------------------------------- */
47 /*   The lexer's output is a sequence of structs, each called a "token",     */
48 /*   representing one lexical unit (or "lexeme") each.  Instead of providing */
49 /*   "lookahead" (that is, always having available the next token after the  */
50 /*   current one, so that syntax analysers higher up in Inform can have      */
51 /*   advance knowledge of what is coming), the lexer instead has a system    */
52 /*   where tokens can be read in and then "put back again".                  */
53 /*   The meaning of the number (and to some extent the text) supplied with   */
54 /*   a token depends on its type: see "header.h" for the list of types.      */
55 /*   For example, the lexeme "$1e3" is understood by Inform as a hexadecimal */
56 /*   number, and translated to the token:                                    */
57 /*     type NUMBER_TT, value 483, text "$1e3"                                */
58 /* ------------------------------------------------------------------------- */
59 /*   These three variables are set to the current token on a call to         */
60 /*   get_next_token() (but are not changed by a call to put_token_back()).   */
61 /*   (It would be tidier to use a token_data structure, rather than having   */
62 /*   get_next_token() unpack three values. But this is the way it is.)       */
63 /* ------------------------------------------------------------------------- */
64
int token_type;     /* One of the *_TT token types (see "header.h")          */
int32 token_value;  /* Meaning depends on token_type; see the comment above  */
char *token_text;   /* Text of the current token                             */
68
69 /* ------------------------------------------------------------------------- */
70 /*   The next two variables are the head and tail of a singly linked list.   */
71 /*   The tail stores the portion most recently read from the current         */
72 /*   lexical block; its end values therefore describe the location of the    */
73 /*   current token, and are updated whenever the three variables above are   */
74 /*   via set_token_location(...).  Earlier vertices, if any, represent the   */
75 /*   regions of lexical blocks read beforehand, where new vertices are       */
76 /*   only introduced by interruptions like a file inclusion or an EOF.       */
77 /*   Vertices are deleted off of the front of the list once they are no      */
78 /*   longer referenced by pending debug information records.                 */
79 /* ------------------------------------------------------------------------- */
80
/* Head (oldest vertex) and tail (most recent) of the singly linked list
   of token locations described above.                                       */
static debug_locations *first_token_locations;
static debug_locations *last_token_location;
83
84 extern debug_location get_token_location(void)
85 {   debug_location result;
86     debug_location *location = &(last_token_location->location);
87     result.file_index = location->file_index;
88     result.beginning_byte_index = location->end_byte_index;
89     result.end_byte_index = location->end_byte_index;
90     result.beginning_line_number = location->end_line_number;
91     result.end_line_number = location->end_line_number;
92     result.beginning_character_number = location->end_character_number;
93     result.end_character_number = location->end_character_number;
94     result.orig_file_index = location->orig_file_index;
95     result.orig_beg_line_number = location->orig_beg_line_number;
96     result.orig_beg_char_number = location->orig_beg_char_number;
97     return result;
98 }
99
100 extern debug_locations get_token_locations(void)
101 {   debug_locations result;
102     result.location = get_token_location();
103     result.next = NULL;
104     result.reference_count = 0;
105     return result;
106 }
107
108 static void set_token_location(debug_location location)
109 {   if (location.file_index == last_token_location->location.file_index)
110     {   last_token_location->location.end_byte_index =
111             location.end_byte_index;
112         last_token_location->location.end_line_number =
113             location.end_line_number;
114         last_token_location->location.end_character_number =
115             location.end_character_number;
116         last_token_location->location.orig_file_index =
117             location.orig_file_index;
118         last_token_location->location.orig_beg_line_number =
119             location.orig_beg_line_number;
120         last_token_location->location.orig_beg_char_number =
121             location.orig_beg_char_number;
122     } else
123     {   debug_locations*successor =
124             my_malloc
125                 (sizeof(debug_locations),
126                  "debug locations of recent tokens");
127         successor->location = location;
128         successor->next = NULL;
129         successor->reference_count = 0;
130         last_token_location->next = successor;
131         last_token_location = successor;
132     }
133 }
134
135 extern debug_location_beginning get_token_location_beginning(void)
136 {   debug_location_beginning result;
137     ++(last_token_location->reference_count);
138     result.head = last_token_location;
139     result.beginning_byte_index =
140         last_token_location->location.end_byte_index;
141     result.beginning_line_number =
142         last_token_location->location.end_line_number;
143     result.beginning_character_number =
144         last_token_location->location.end_character_number;
145     result.orig_file_index = last_token_location->location.orig_file_index;
146     result.orig_beg_line_number = last_token_location->location.orig_beg_line_number;
147     result.orig_beg_char_number = last_token_location->location.orig_beg_char_number;
148
149     return result;
150 }
151
152 static void cleanup_token_locations(debug_location_beginning*beginning)
153 {   if (first_token_locations)
154     {   while (first_token_locations &&
155                !first_token_locations->reference_count)
156         {   debug_locations*moribund = first_token_locations;
157             first_token_locations = moribund->next;
158             my_free(&moribund, "debug locations of recent tokens");
159             if (beginning &&
160                 (beginning->head == moribund || !first_token_locations))
161             {   compiler_error
162                     ("Records needed by a debug_location_beginning are no "
163                      "longer allocated, perhaps because of an invalid reuse "
164                      "of this or an earlier beginning");
165             }
166         }
167     } else
168     {   if (beginning)
169         {   compiler_error
170                 ("Attempt to use a debug_location_beginning when no token "
171                  "locations are defined");
172         } else
173         {   compiler_error
174                 ("Attempt to clean up token locations when no token locations "
175                  "are defined");
176         }
177     }
178 }
179
180 extern void discard_token_location(debug_location_beginning beginning)
181 {   --(beginning.head->reference_count);
182 }
183
/* Close a range opened by get_token_location_beginning(): release the
   reference pinning the beginning's head vertex and return a
   debug_locations value spanning from the remembered beginning to the
   current end of that vertex (or its successor; see below).                 */
extern debug_locations get_token_location_end
    (debug_location_beginning beginning)
{   debug_locations result;
    cleanup_token_locations(&beginning);
    --(beginning.head->reference_count);
    /* Sometimes we know what we'll read before we switch to the lexical block
       where we'll read it.  In such cases the beginning will be placed in the
       prior block and last exactly zero bytes there.  It's misleading to
       include such ranges, so we gobble them. */
    if (beginning.head->location.end_byte_index ==
          beginning.beginning_byte_index &&
        beginning.head->next)
    {   /* Zero-length stub in the prior block: report the range as starting
           at the very top of the following block instead. */
        beginning.head = beginning.head->next;
        result.location = beginning.head->location;
        result.location.beginning_byte_index = 0;
        result.location.beginning_line_number = 1;
        result.location.beginning_character_number = 1;
    } else
    {   result.location = beginning.head->location;
        result.location.beginning_byte_index =
            beginning.beginning_byte_index;
        result.location.beginning_line_number =
            beginning.beginning_line_number;
        result.location.beginning_character_number =
            beginning.beginning_character_number;
    }

    /* The original-source coordinates always come from the beginning. */
    result.location.orig_file_index =
        beginning.orig_file_index;
    result.location.orig_beg_line_number =
        beginning.orig_beg_line_number;
    result.location.orig_beg_char_number =
        beginning.orig_beg_char_number;

    result.next = beginning.head->next;
    result.reference_count = 0;
    return result;
}
222
223 /* ------------------------------------------------------------------------- */
224 /*   In order to be able to put tokens back efficiently, the lexer stores    */
225 /*   tokens in a "circle": the variable circle_position ranges between       */
226 /*   0 and CIRCLE_SIZE-1.  We only need a circle size as large as the        */
227 /*   maximum number of tokens ever put back at once, plus 1 (in effect, the  */
228 /*   maximum token lookahead ever needed in syntax analysis, plus 1).        */
229 /*                                                                           */
230 /*   Note that the circle struct type is lexeme_data, whereas the expression */
231 /*   code all works in token_data. They have slightly different needs. The   */
232 /*   data is exported through the token_text, token_value, token_type        */
233 /*   globals, so there's no need to use the same struct at both levels.      */
234 /*                                                                           */
235 /*   Unlike some compilers, Inform does not have a context-free lexer: in    */
236 /*   fact it has 12288 different possible states.  However, the context only */
237 /*   affects the interpretation of "identifiers": lexemes beginning with a   */
238 /*   letter and containing up to 32 chars of alphanumeric and underscore     */
239 /*   chars.  (For example, "default" may refer to the directive or statement */
240 /*   of that name, and which token values are returned depends on the        */
241 /*   current lexical context.)                                               */
242 /*                                                                           */
243 /*   Along with each token, we also store the lexical context it was         */
244 /*   translated under; because if it is called for again, there may need     */
245 /*   to be a fresh interpretation of it if the context has changed.          */
246 /* ------------------------------------------------------------------------- */
247
#define CIRCLE_SIZE 6

/*   (The worst case for token lookahead is distinguishing between an
     old-style "objectloop (a in b)" and a new "objectloop (a in b ...)".)   */

static int circle_position;             /* Index into circle[], in the range
                                           0 to CIRCLE_SIZE-1 (see above)    */
static lexeme_data circle[CIRCLE_SIZE]; /* The ring of recent lexemes        */
255
256 /* ------------------------------------------------------------------------- */
257 /*   A complication, however, is that the text of some lexemes needs to be   */
258 /*   held in Inform's memory for much longer periods: for example, a         */
259 /*   dictionary word lexeme (like "'south'") must have its text preserved    */
260 /*   until the code generation time for the expression it occurs in, when    */
261 /*   the dictionary reference is actually made. We handle this by keeping    */
262 /*   all lexeme text until the end of the statement (or, for top-level       */
263 /*   directives, until the end of the directive). Then we call               */
264 /*   release_token_texts() to start over. The lextexts array will therefore  */
265 /*   grow to the largest number of lexemes in a single statement or          */
266 /*   directive.                                                              */
267 /* ------------------------------------------------------------------------- */
268
/* One retained lexeme-text buffer (see the comment block above).            */
typedef struct lextext_s {
    char *text;  /* The lexeme text itself                                   */
    size_t size; /* Allocated size (including terminal null)                 */
} lextext;

static lextext *lextexts; /* Allocated to no_lextexts */
static memory_list lextexts_memlist;
static int no_lextexts;   /* Number of lextexts allocated so far             */

static int cur_lextexts;    /* Number of lextexts in current use
                               (cur_lextexts <= no_lextexts)                 */

static int lex_index;       /* Index of lextext being written to             */
static int lex_pos;         /* Current write position in that lextext        */
283
284 /* ------------------------------------------------------------------------- */
285 /*   The lexer itself needs up to 3 characters of lookahead (it uses an      */
286 /*   LR(3) grammar to translate characters into tokens).                     */
287 /*                                                                           */
288 /*   Past the end of the stream, we fill in zeros. This has the awkward      */
289 /*   side effect that a zero byte in a source file will silently terminate   */
290 /*   it, rather than producing an "illegal source character" error.          */
291 /*   On the up side, we can compile veneer routines (which are null-         */
292 /*   terminated strings) with no extra work.                                 */
293 /* ------------------------------------------------------------------------- */
294
/* Number of characters of lookahead kept beyond "current" (see above).      */
#define LOOKAHEAD_SIZE 3

static int current, lookahead,          /* The latest character read, and    */
    lookahead2, lookahead3;             /* the three characters following it */
                                        /* (zero means end-of-stream)        */

static int pipeline_made;               /* Whether or not the pipeline of
                                           characters has been constructed
                                           yet (this pass)                   */

static int (* get_next_char)(void);     /* Routine for reading the stream of
                                           characters: the lexer does not
                                           need any "ungetc" routine for
                                           putting them back again.  End of
                                           stream is signalled by returning
                                           zero.                             */

static char *source_to_analyse;         /* The current lexical source:
                                           NULL for "load from source files",
                                           otherwise this points to a string
                                           containing Inform code            */

static int tokens_put_back;             /* Count of the number of backward
                                           moves made from the last-read
                                           token                             */
/* This gets called for both token_data and lexeme_data structs. It prints
   a description of the common part (the text, value, type fields).
   No trailing newline is printed; callers supply their own.
*/
extern void describe_token_triple(const char *text, int32 value, int type)
{
    /*  Many of the token types are not set in this file, but later on in
        Inform's higher stages (for example, in the expression evaluator);
        but this routine describes them all.                                 */

    printf("{ ");

    switch(type)
    {
        /*  The following token types occur in lexer output:                 */

        case SYMBOL_TT:          printf("symbol ");
                                 describe_symbol(value);
                                 break;
        case NUMBER_TT:          printf("literal number %d", value);
                                 break;
        case DQ_TT:              printf("string \"%s\"", text);
                                 break;
        case SQ_TT:              printf("string '%s'", text);
                                 break;
        case UQ_TT:              printf("barestring %s", text);
                                 break;
        case SEP_TT:             printf("separator '%s'", text);
                                 break;
        case EOF_TT:             printf("end of file");
                                 break;

        case STATEMENT_TT:       printf("statement name '%s'", text);
                                 break;
        case SEGMENT_MARKER_TT:  printf("object segment marker '%s'", text);
                                 break;
        case DIRECTIVE_TT:       printf("directive name '%s'", text);
                                 break;
        case CND_TT:             printf("textual conditional '%s'", text);
                                 break;
        case OPCODE_NAME_TT:     printf("opcode name '%s'", text);
                                 break;
        case SYSFUN_TT:          printf("built-in function name '%s'", text);
                                 break;
        case LOCAL_VARIABLE_TT:  printf("local variable name '%s'", text);
                                 break;
        case MISC_KEYWORD_TT:    printf("statement keyword '%s'", text);
                                 break;
        case DIR_KEYWORD_TT:     printf("directive keyword '%s'", text);
                                 break;
        case TRACE_KEYWORD_TT:   printf("'trace' keyword '%s'", text);
                                 break;
        case SYSTEM_CONSTANT_TT: printf("system constant name '%s'", text);
                                 break;

        /*  The remaining are etoken types, not set by the lexer             */

        case OP_TT:              printf("operator '%s'",
                                     operators[value].description);
                                 break;
        case ENDEXP_TT:          printf("end of expression");
                                 break;
        case SUBOPEN_TT:         printf("open bracket");
                                 break;
        case SUBCLOSE_TT:        printf("close bracket");
                                 break;
        case LARGE_NUMBER_TT:    printf("large number: '%s'=%d",text,value);
                                 break;
        case SMALL_NUMBER_TT:    printf("small number: '%s'=%d",text,value);
                                 break;
        case VARIABLE_TT:        printf("variable '%s'=%d", text, value);
                                 break;
        case DICTWORD_TT:        printf("dictionary word '%s'", text);
                                 break;
        case ACTION_TT:          printf("action name '%s'", text);
                                 break;

        /*  An unknown type indicates an internal inconsistency somewhere    */
        default:
            printf("** unknown token type %d, text='%s', value=%d **",
            type, text, value);
    }
    printf(" }");
}
403
404 /* ------------------------------------------------------------------------- */
405 /*   All but one of the Inform keywords (most of them opcode names used      */
406 /*   only by the assembler).  (The one left over is "sp", a keyword used in  */
407 /*   assembly language only.)                                                */
408 /*                                                                           */
409 /*   A "keyword group" is a set of keywords to be searched for.  If a match  */
410 /*   is made on an identifier, the token type becomes that given in the KG   */
411 /*   and the token value is its index in the KG.                             */
412 /*                                                                           */
413 /*   The keyword ordering must correspond with the appropriate #define's in  */
414 /*   "header.h" but is otherwise not significant.                            */
415 /* ------------------------------------------------------------------------- */
416
/* This must exceed the total number of keywords across all groups, 
   including opcodes. */
#define MAX_KEYWORDS (500)

/* NOTE(review): in each keyword_group initializer below, the two trailing
   values after the token type are flags whose meaning is defined by the
   keyword_group struct in "header.h" (one of them is the "enabled" state
   read by lexical_context()) -- confirm their order there. */

/* The values will be filled in at compile time, when we know
   which opcode set to use. */
keyword_group opcode_names =
{ { "" },
    OPCODE_NAME_TT, FALSE, TRUE
};
427
/* Z-machine opcode names; the empty string marks the end of the list.       */
static char *opcode_list_z[] = {
    "je", "jl", "jg", "dec_chk", "inc_chk", "jin", "test", "or", "and",
    "test_attr", "set_attr", "clear_attr", "store", "insert_obj", "loadw",
    "loadb", "get_prop", "get_prop_addr", "get_next_prop", "add", "sub",
    "mul", "div", "mod", "call", "storew", "storeb", "put_prop", "sread",
    "print_char", "print_num", "random", "push", "pull", "split_window",
    "set_window", "output_stream", "input_stream", "sound_effect", "jz",
    "get_sibling", "get_child", "get_parent", "get_prop_len", "inc", "dec",
    "print_addr", "remove_obj", "print_obj", "ret", "jump", "print_paddr",
    "load", "not", "rtrue", "rfalse", "print", "print_ret", "nop", "save",
    "restore", "restart", "ret_popped", "pop", "quit", "new_line",
    "show_status", "verify", "call_2s", "call_vs", "aread", "call_vs2",
    "erase_window", "erase_line", "set_cursor", "get_cursor",
    "set_text_style", "buffer_mode", "read_char", "scan_table", "call_1s",
    "call_2n", "set_colour", "throw", "call_vn", "call_vn2", "tokenise",
    "encode_text", "copy_table", "print_table", "check_arg_count", "call_1n",
    "catch", "piracy", "log_shift", "art_shift", "set_font", "save_undo",
    "restore_undo", "draw_picture", "picture_data", "erase_picture",
    "set_margins", "move_window", "window_size", "window_style",
    "get_wind_prop", "scroll_window", "pop_stack", "read_mouse",
    "mouse_window", "push_stack", "put_wind_prop", "print_form",
    "make_menu", "picture_table", "print_unicode", "check_unicode",
    "set_true_colour", "buffer_screen",
    ""
};
453
/* Glulx opcode names; the empty string marks the end of the list.           */
static char *opcode_list_g[] = {
    "nop", "add", "sub", "mul", "div", "mod", "neg", "bitand", "bitor",
    "bitxor", "bitnot", "shiftl", "sshiftr", "ushiftr", "jump", "jz",
    "jnz", "jeq", "jne", "jlt", "jge", "jgt", "jle", 
    "jltu", "jgeu", "jgtu", "jleu", 
    "call", "return",
    "catch", "throw", "tailcall", 
    "copy", "copys", "copyb", "sexs", "sexb", "aload",
    "aloads", "aloadb", "aloadbit", "astore", "astores", "astoreb",
    "astorebit", "stkcount", "stkpeek", "stkswap", "stkroll", "stkcopy",
    "streamchar", "streamnum", "streamstr", 
    "gestalt", "debugtrap", "getmemsize", "setmemsize", "jumpabs",
    "random", "setrandom", "quit", "verify",
    "restart", "save", "restore", "saveundo", "restoreundo", "protect",
    "glk", "getstringtbl", "setstringtbl", "getiosys", "setiosys",
    "linearsearch", "binarysearch", "linkedsearch",
    "callf", "callfi", "callfii", "callfiii", 
    "streamunichar",
    "mzero", "mcopy", "malloc", "mfree",
    "accelfunc", "accelparam",
    "hasundo", "discardundo",
    "numtof", "ftonumz", "ftonumn", "ceil", "floor",
    "fadd", "fsub", "fmul", "fdiv", "fmod",
    "sqrt", "exp", "log", "pow",
    "sin", "cos", "tan", "asin", "acos", "atan", "atan2",
    "jfeq", "jfne", "jflt", "jfle", "jfgt", "jfge", "jisnan", "jisinf",
    "numtod", "dtonumz", "dtonumn", "ftod", "dtof", "dceil", "dfloor",
    "dadd", "dsub", "dmul", "ddiv", "dmodr", "dmodq",
    "dsqrt", "dexp", "dlog", "dpow",
    "dsin", "dcos", "dtan", "dasin", "dacos", "datan", "datan2",
    "jdeq", "jdne", "jdlt", "jdle", "jdgt", "jdge", "jdisnan", "jdisinf",
    ""
};
487
/* Opcode macros: assembly mnemonics expanded by the assembler rather than
   mapped to a single opcode.  As with opcode_names, the keyword list is
   filled in at compile time from one of the two lists below.                */
keyword_group opcode_macros =
{ { "" },
  OPCODE_MACRO_TT, FALSE, TRUE
};

/* Z-code has no opcode macros. */
static char *opmacro_list_z[] = { "" };

static char *opmacro_list_g[] = {
    "pull", "push", "dload", "dstore",
    ""
};
499
/* Directive names.  Ordering must match the DIRECTIVE_* #define's in
   "header.h" (see the comment block above).                                 */
keyword_group directives =
{ { "abbreviate", "array", "attribute", "class", "constant",
    "default", "dictionary", "end", "endif", "extend", "fake_action",
    "global", "ifdef", "ifndef", "ifnot", "ifv3", "ifv5", "iftrue",
    "iffalse", "import", "include", "link", "lowstring", "message",
    "nearby", "object", "origsource", "property", "release", "replace",
    "serial", "switches", "statusline", "stub", "system_file", "trace",
    "undef", "verb", "version", "zcharacter",
    "" },
    DIRECTIVE_TT, FALSE, FALSE
};
511
/* Keywords recognised after the "Trace" directive.                          */
keyword_group trace_keywords =
{ { "dictionary", "symbols", "objects", "verbs",
    "assembly", "expressions", "lines", "tokens", "linker",
    "on", "off", "" },
    TRACE_KEYWORD_TT, FALSE, TRUE
};

/* Markers dividing the segments of an object/class definition.              */
keyword_group segment_markers =
{ { "class", "has", "private", "with", "" },
    SEGMENT_MARKER_TT, FALSE, TRUE
};

/* Keywords recognised inside various directives.                            */
keyword_group directive_keywords =
{ { "alias", "long", "additive",
    "score", "time",
    "noun", "held", "multi", "multiheld", "multiexcept",
    "multiinside", "creature", "special", "number", "scope", "topic",
    "reverse", "meta", "only", "replace", "first", "last",
    "string", "table", "buffer", "data", "initial", "initstr",
    "with", "private", "has", "class",
    "error", "fatalerror", "warning",
    "terminating", "static", "individual",
    "" },
    DIR_KEYWORD_TT, FALSE, TRUE
};

/* Keywords recognised inside statements.                                    */
keyword_group misc_keywords =
{ { "char", "name", "the", "a", "an", "The", "number",
    "roman", "reverse", "bold", "underline", "fixed", "on", "off",
    "to", "address", "string", "object", "near", "from", "property", "A", "" },
    MISC_KEYWORD_TT, FALSE, TRUE
};
544
/* Statement names.  Ordering must match the statement #define's in
   "header.h" (see the comment block above).                                 */
keyword_group statements =
{ { "box", "break", "continue", "default", "do", "else", "font", "for",
    "give", "if", "inversion", "jump", "move", "new_line", "objectloop",
    "print", "print_ret", "quit", "read", "remove", "restore", "return",
    "rfalse", "rtrue", "save", "spaces", "string", "style", "switch",
    "until", "while", "" },
    STATEMENT_TT, FALSE, TRUE
};

/* Textual condition operators.                                              */
keyword_group conditions =
{ { "has", "hasnt", "in", "notin", "ofclass", "or", "provides", "" },
    CND_TT, FALSE, TRUE
};

/* Built-in system functions.                                                */
keyword_group system_functions =
{ { "child", "children", "elder", "eldest", "indirect", "parent", "random",
    "sibling", "younger", "youngest", "metaclass", "glk", "" },
    SYSFUN_TT, FALSE, TRUE
};
564
/* Names of the #-prefixed system constants.                                 */
keyword_group system_constants =
{ { "adjectives_table", "actions_table", "classes_table",
    "identifiers_table", "preactions_table", "version_number",
    "largest_object", "strings_offset", "code_offset",
    "dict_par1", "dict_par2", "dict_par3", "actual_largest_object",
    "static_memory_offset", "array_names_offset", "readable_memory_offset",
    "cpv__start", "cpv__end", "ipv__start", "ipv__end",
    "array__start", "array__end",
    "lowest_attribute_number", "highest_attribute_number",
    "attribute_names_array",
    "lowest_property_number", "highest_property_number",
    "property_names_array",
    "lowest_action_number", "highest_action_number",
    "action_names_array",
    "lowest_fake_action_number", "highest_fake_action_number",
    "fake_action_names_array",
    "lowest_routine_number", "highest_routine_number", "routines_array",
    "routine_names_array", "routine_flags_array",
    "lowest_global_number", "highest_global_number", "globals_array",
    "global_names_array", "global_flags_array",
    "lowest_array_number", "highest_array_number", "arrays_array",
    "array_names_array", "array_flags_array",
    "lowest_constant_number", "highest_constant_number", "constants_array",
    "constant_names_array",
    "lowest_class_number", "highest_class_number", "class_objects_array",
    "lowest_object_number", "highest_object_number",
    "oddeven_packing",
    "grammar_table", "dictionary_table", "dynam_string_table",
    "" },
    SYSTEM_CONSTANT_TT, FALSE, TRUE
};
596
/* All keyword groups in one table.  Index 0 is unused; presumably the
   ordering corresponds to the *_TT token-type numbering in "header.h" --
   TODO confirm there.                                                       */
keyword_group *keyword_groups[12]
= { NULL, &opcode_names, &directives, &trace_keywords, &segment_markers,
    &directive_keywords, &misc_keywords, &statements, &conditions,
    &system_functions, &system_constants, &opcode_macros};

/* These keywords are set to point to local_variable_names entries when
   a routine header is parsed. See construct_local_variable_tables().        */
keyword_group local_variables =
{ { "" },
    LOCAL_VARIABLE_TT, FALSE, FALSE
};
608
609 static int lexical_context(void)
610 {
611     /*  The lexical context is a number representing all of the context
612         information in the lexical analyser: the same input text will
613         always translate to the same output tokens whenever the context
614         is the same.
615
616         (For many years, the "dont_enter_into_symbol_table" variable
617         was omitted from this number. But now we can include it.)            */
618
619     int c = 0;
620     if (opcode_names.enabled)         c |= 1;
621     if (directives.enabled)           c |= 2;
622     if (trace_keywords.enabled)       c |= 4;
623     if (segment_markers.enabled)      c |= 8;
624     if (directive_keywords.enabled)   c |= 16;
625     if (misc_keywords.enabled)        c |= 32;
626     if (statements.enabled)           c |= 64;
627     if (conditions.enabled)           c |= 128;
628     if (system_functions.enabled)     c |= 256;
629     if (system_constants.enabled)     c |= 512;
630     if (local_variables.enabled)      c |= 1024;
631
632     if (return_sp_as_variable)        c |= 2048;
633     if (dont_enter_into_symbol_table) c |= 4096;
634     
635     return(c);
636 }
637
static void print_context(int c)
{
    /*  Print the lexical-context bits of c as short mnemonic tags, in
        the same bit order used by lexical_context().                        */

    static const char *tags[13] = {
        "OPC", "DIR", "TK", "SEG", "DK", "MK", "STA",
        "CND", "SFUN", "SCON", "LV", "sp", "dontent"
    };
    int ix;

    if (c < 0) {
        printf("??? ");
        return;
    }
    for (ix = 0; ix < 13; ix++) {
        if (c & (1 << ix)) printf("%s ", tags[ix]);
    }
}
658
/*  Chained hash table over every keyword in keyword_groups[1..11]:
    keywords_hash_table[h] is the first entry with hash code h (or -1),
    keywords_hash_ends_table[h] the last; keywords_data_table holds three
    ints per entry (group number, index within the group, next entry in
    the chain or -1). Built by make_keywords_tables().                       */
static int *keywords_hash_table;
static int *keywords_hash_ends_table;
static int *keywords_data_table;

/*  Equivalent lookup tables for the current routine's local variables;
    built by construct_local_variable_tables().                              */
static int *local_variable_hash_table;
static int *local_variable_hash_codes;

/* Note that MAX_LOCAL_VARIABLES is the maximum number of local variables
   for this VM, *including* "sp" (the stack pointer "local").
   This used to be a memory setting. Now it is a constant: 16 for Z-code,
   119 for Glulx.
*/

/* The number of local variables in the current routine. */
int no_locals;

/* Names of local variables in the current routine.
   The values are positions in local_variable_names_memlist.
   This is allocated to MAX_LOCAL_VARIABLES-1. (The stack pointer "local"
   is not included in this array.)

   (This could be a memlist, growing as needed up to MAX_LOCAL_VARIABLES-1.
   But right now we just allocate the max.)
 */
int *local_variable_name_offsets;

static memory_list local_variable_names_memlist;
/* How much of local_variable_names_memlist is used by the no_local locals. */
static int local_variable_names_usage;

/*  Fast lookup for single-character local names: maps the character to
    the local's index, or MAX_LOCAL_VARIABLES if there is none.              */
static char one_letter_locals[128];
690
691 static void make_keywords_tables(void)
692 {   int i, j, h, tp=0;
693     char **oplist, **maclist;
694
695     if (!glulx_mode) {
696         oplist = opcode_list_z;
697         maclist = opmacro_list_z;
698     }
699     else {
700         oplist = opcode_list_g;
701         maclist = opmacro_list_g;
702     }
703
704     for (j=0; *(oplist[j]); j++) {
705         if (j >= MAX_KEYWORD_GROUP_SIZE) {
706             /* Gotta increase MAX_KEYWORD_GROUP_SIZE */
707             compiler_error("opcode_list has overflowed opcode_names.keywords");
708             break;
709         }
710         opcode_names.keywords[j] = oplist[j];
711     }
712     opcode_names.keywords[j] = "";
713     
714     for (j=0; *(maclist[j]); j++) {
715         if (j >= MAX_KEYWORD_GROUP_SIZE) {
716             /* Gotta increase MAX_KEYWORD_GROUP_SIZE */
717             compiler_error("opmacro_list has overflowed opcode_macros.keywords");
718             break;
719         }
720         opcode_macros.keywords[j] = maclist[j];
721     }
722     opcode_macros.keywords[j] = "";
723
724     for (i=0; i<HASH_TAB_SIZE; i++)
725     {   keywords_hash_table[i] = -1;
726         keywords_hash_ends_table[i] = -1;
727     }
728
729     for (i=1; i<=11; i++)
730     {   keyword_group *kg = keyword_groups[i];
731         for (j=0; *(kg->keywords[j]) != 0; j++)
732         {
733             if (tp >= MAX_KEYWORDS) {
734                 /* Gotta increase MAX_KEYWORDS */
735                 compiler_error("keywords_data_table has overflowed MAX_KEYWORDS");
736                 break;
737             }
738             h = hash_code_from_string(kg->keywords[j]);
739             if (keywords_hash_table[h] == -1)
740                 keywords_hash_table[h] = tp;
741             else
742               *(keywords_data_table + 3*(keywords_hash_ends_table[h]) + 2) = tp;
743             keywords_hash_ends_table[h] = tp;
744             *(keywords_data_table + 3*tp) = i;
745             *(keywords_data_table + 3*tp+1) = j;
746             *(keywords_data_table + 3*tp+2) = -1;
747             tp++;
748         }
749     }
750 }
751
752 extern void clear_local_variables(void)
753 {
754     no_locals = 0;
755     local_variable_names_usage = 0;
756 }
757
758 extern void add_local_variable(char *name)
759 {
760     int len;
761
762     if (no_locals >= MAX_LOCAL_VARIABLES-1) {
763         /* This should have been caught before we got here */
764         error("too many local variables");
765         return;
766     }
767     
768     len = strlen(name)+1;
769     ensure_memory_list_available(&local_variable_names_memlist, local_variable_names_usage + len);
770     local_variable_name_offsets[no_locals++] = local_variable_names_usage;
771     strcpy((char *)local_variable_names_memlist.data+local_variable_names_usage, name);
772     local_variable_names_usage += len;
773 }
774
775 extern char *get_local_variable_name(int index)
776 {
777     if (index < 0 || index >= no_locals)
778         return "???";   /* shouldn't happen */
779
780     return (char *)local_variable_names_memlist.data + local_variable_name_offsets[index];
781 }
782
783 /* Look at the strings stored in local_variable_names (from 0 to no_locals).
784    Set local_variables.keywords to point to these, and also prepare the
785    hash tables.
786    This must be called after add_local_variable(), but before we start
787    compiling function code. */
788 extern void construct_local_variable_tables(void)
789 {   int i, h;
790     for (i=0; i<HASH_TAB_SIZE; i++) local_variable_hash_table[i] = -1;
791     for (i=0; i<128; i++) one_letter_locals[i] = MAX_LOCAL_VARIABLES;
792
793     for (i=0; i<no_locals; i++)
794     {
795         char *p = (char *)local_variable_names_memlist.data + local_variable_name_offsets[i];
796         local_variables.keywords[i] = p;
797         if (p[1] == 0)
798         {   one_letter_locals[(uchar)p[0]] = i;
799             if (isupper(p[0])) one_letter_locals[tolower(p[0])] = i;
800             if (islower(p[0])) one_letter_locals[toupper(p[0])] = i;
801         }
802         h = hash_code_from_string(p);
803         if (local_variable_hash_table[h] == -1)
804             local_variable_hash_table[h] = i;
805         local_variable_hash_codes[i] = h;
806     }
807     /* Clear the rest. */
808     for (;i<MAX_LOCAL_VARIABLES-1;i++) {
809         local_variables.keywords[i] = "";
810         local_variable_hash_codes[i] = 0;
811     }
812 }
813
/*  Classify the identifier p and write the resulting token type and value
    into circle[pos]: a local variable, a keyword from one of the enabled
    keyword groups, or a symbol (entered into the symbol table).             */
static void interpret_identifier(char *p, int pos)
{   int index, hashcode;

    /*  An identifier is either a keyword or a "symbol", a name which the
        lexical analyser leaves to higher levels of Inform to understand.    */

    circle[pos].newsymbol = FALSE;
    
    hashcode = hash_code_from_string(p);

    /*  If dont_enter_into_symbol_table is true, we skip all keywords
        (and variables) and just mark the name as an unquoted string.
        Except that if dont_enter_into_symbol_table is -2, we recognize
        directive keywords (only).
    */

    if (dont_enter_into_symbol_table) {

        if (dont_enter_into_symbol_table == -2) {
            /* This is a simplified version of the keyword-checking loop
               below. */
            index = keywords_hash_table[hashcode];
            while (index >= 0)
            {   /* Each entry is three ints: group number, index within
                   the group, next entry in the hash chain (or -1). */
                int *i = keywords_data_table + 3*index;
                keyword_group *kg = keyword_groups[*i];
                if (kg == &directives)
                {   char *q = kg->keywords[*(i+1)];
                    if (((kg->case_sensitive) && (strcmp(p, q)==0))
                        || ((!(kg->case_sensitive)) && (strcmpcis(p, q)==0)))
                    {   circle[pos].type = kg->change_token_type;
                        circle[pos].value = *(i+1);
                        return;
                    }
                }
                index = *(i+2);
            }
        }
        
        circle[pos].type = UQ_TT;
        circle[pos].value = 0;
        return;
    }
    
    /*  If this is assembly language, perhaps it is "sp"?                    */

    if (return_sp_as_variable && (p[0]=='s') && (p[1]=='p') && (p[2]==0))
    {   circle[pos].value = 0; circle[pos].type = LOCAL_VARIABLE_TT;
        return;
    }

    /*  Test for local variables first, quite quickly.                       */

    if (local_variables.enabled)
    {   /* Single-character names use the direct lookup table. */
        if (p[1]==0)
        {   index = one_letter_locals[(uchar)p[0]];
            if (index<MAX_LOCAL_VARIABLES)
            {   circle[pos].type = LOCAL_VARIABLE_TT;
                circle[pos].value = index+1;
                return;
            }
        }
        /* Otherwise scan forward from the first local with this hash code,
           comparing names case-insensitively. */
        index = local_variable_hash_table[hashcode];
        if (index >= 0)
        {   for (;index<no_locals;index++)
            {   if (hashcode == local_variable_hash_codes[index])
                {
                    char *locname = (char *)local_variable_names_memlist.data + local_variable_name_offsets[index];
                    if (strcmpcis(p, locname)==0)
                    {   circle[pos].type = LOCAL_VARIABLE_TT;
                        circle[pos].value = index+1;
                        return;
                    }
                }
            }
        }
    }

    /*  Now the bulk of the keywords.  Note that the lexer doesn't recognise
        the name of a system function which has been Replaced.               */

    index = keywords_hash_table[hashcode];
    while (index >= 0)
    {   int *i = keywords_data_table + 3*index;
        keyword_group *kg = keyword_groups[*i];
        if (kg->enabled)
        {   char *q = kg->keywords[*(i+1)];
            if (((kg->case_sensitive) && (strcmp(p, q)==0))
                || ((!(kg->case_sensitive)) && (strcmpcis(p, q)==0)))
            {   if ((kg != &system_functions)
                    || (system_function_usage[*(i+1)]!=2))
                {   circle[pos].type = kg->change_token_type;
                    circle[pos].value = *(i+1);
                    return;
                }
            }
        }
        index = *(i+2);
    }

    /*  Search for the name; create it if necessary.                         */

    circle[pos].value = symbol_index(p, hashcode, &circle[pos].newsymbol);
    circle[pos].type = SYMBOL_TT;
}
918
919
920 /* ------------------------------------------------------------------------- */
921 /*   The tokeniser grid aids a rapid decision about the consequences of a    */
922 /*   character reached in the buffer.  In effect it is an efficiently stored */
923 /*   transition table using an algorithm similar to that of S. C. Johnson's  */
924 /*   "yacc" lexical analyser (see Aho, Sethi and Ullman, section 3.9).       */
925 /*   My thanks to Dilip Sequeira for suggesting this.                        */
926 /*                                                                           */
927 /*       tokeniser_grid[c]   is (16*n + m) if c is the first character of    */
928 /*                               separator numbers n, n+1, ..., n+m-1        */
929 /*                           or certain special values (QUOTE_CODE, etc)     */
930 /*                           or 0 otherwise                                  */
931 /*                                                                           */
932 /*   Since 1000/16 = 62, the code numbers below will need increasing if the  */
933 /*   number of separators supported exceeds 61.                              */
934 /* ------------------------------------------------------------------------- */
935
936 static int tokeniser_grid[256];
937
938 #define QUOTE_CODE      1000
939 #define DQUOTE_CODE     1001
940 #define NULL_CODE       1002
941 #define SPACE_CODE      1003
942 #define NEGATIVE_CODE   1004
943 #define DIGIT_CODE      1005
944 #define RADIX_CODE      1006
945 #define KEYWORD_CODE    1007
946 #define EOF_CODE        1008
947 #define WHITESPACE_CODE 1009
948 #define COMMENT_CODE    1010
949 #define IDENTIFIER_CODE 1011
950
951 /*  This list cannot safely be changed without also changing the header
952     separator #defines.  The ordering is significant in that (i) all entries
953     beginning with the same character must be adjacent and (ii) that if
    X is an initial substring of Y then X must come before Y.
955
956     E.g. --> must occur before -- to prevent "-->0" being tokenised
957     wrongly as "--", ">", "0" rather than "-->", "0".                        */
958
/*  The separator tokens, up to three characters each.  NUMBER_SEPARATORS
    must equal the number of entries here, and the ordering must match the
    header's separator #defines.                                             */
static const char separators[NUMBER_SEPARATORS][4] =
{   "->", "-->", "--", "-", "++", "+", "*", "/", "%",
    "||", "|", "&&", "&", "~~",
    "~=", "~", "==", "=", ">=", ">",
    "<=", "<", "(", ")", ",",
    ".&", ".#", "..&", "..#", "..", ".",
    "::", ":", "@", ";", "[", "]", "{", "}",
    "$", "?~", "?",
    "#a$", "#g$", "#n$", "#r$", "#w$", "##", "#"
};
969
970 static void make_tokeniser_grid(void)
971 {
972     /*  Construct the grid to the specification above.                       */
973
974     int i, j;
975
976     for (i=0; i<256; i++) tokeniser_grid[i]=0;
977
978     for (i=0; i<NUMBER_SEPARATORS; i++)
979     {   j=separators[i][0];
980         if (tokeniser_grid[j]==0)
981             tokeniser_grid[j]=i*16+1; else tokeniser_grid[j]++;
982     }
983     tokeniser_grid['\''] = QUOTE_CODE;
984     tokeniser_grid['\"'] = DQUOTE_CODE;
985     tokeniser_grid[0]    = EOF_CODE;
986     tokeniser_grid[' ']  = WHITESPACE_CODE;
987     tokeniser_grid['\n'] = WHITESPACE_CODE;
988     tokeniser_grid['\r'] = WHITESPACE_CODE;
989     tokeniser_grid['$']  = RADIX_CODE;
990     tokeniser_grid['!']  = COMMENT_CODE;
991
992     tokeniser_grid['0']  = DIGIT_CODE;
993     tokeniser_grid['1']  = DIGIT_CODE;
994     tokeniser_grid['2']  = DIGIT_CODE;
995     tokeniser_grid['3']  = DIGIT_CODE;
996     tokeniser_grid['4']  = DIGIT_CODE;
997     tokeniser_grid['5']  = DIGIT_CODE;
998     tokeniser_grid['6']  = DIGIT_CODE;
999     tokeniser_grid['7']  = DIGIT_CODE;
1000     tokeniser_grid['8']  = DIGIT_CODE;
1001     tokeniser_grid['9']  = DIGIT_CODE;
1002
1003     tokeniser_grid['a']  = IDENTIFIER_CODE;
1004     tokeniser_grid['b']  = IDENTIFIER_CODE;
1005     tokeniser_grid['c']  = IDENTIFIER_CODE;
1006     tokeniser_grid['d']  = IDENTIFIER_CODE;
1007     tokeniser_grid['e']  = IDENTIFIER_CODE;
1008     tokeniser_grid['f']  = IDENTIFIER_CODE;
1009     tokeniser_grid['g']  = IDENTIFIER_CODE;
1010     tokeniser_grid['h']  = IDENTIFIER_CODE;
1011     tokeniser_grid['i']  = IDENTIFIER_CODE;
1012     tokeniser_grid['j']  = IDENTIFIER_CODE;
1013     tokeniser_grid['k']  = IDENTIFIER_CODE;
1014     tokeniser_grid['l']  = IDENTIFIER_CODE;
1015     tokeniser_grid['m']  = IDENTIFIER_CODE;
1016     tokeniser_grid['n']  = IDENTIFIER_CODE;
1017     tokeniser_grid['o']  = IDENTIFIER_CODE;
1018     tokeniser_grid['p']  = IDENTIFIER_CODE;
1019     tokeniser_grid['q']  = IDENTIFIER_CODE;
1020     tokeniser_grid['r']  = IDENTIFIER_CODE;
1021     tokeniser_grid['s']  = IDENTIFIER_CODE;
1022     tokeniser_grid['t']  = IDENTIFIER_CODE;
1023     tokeniser_grid['u']  = IDENTIFIER_CODE;
1024     tokeniser_grid['v']  = IDENTIFIER_CODE;
1025     tokeniser_grid['w']  = IDENTIFIER_CODE;
1026     tokeniser_grid['x']  = IDENTIFIER_CODE;
1027     tokeniser_grid['y']  = IDENTIFIER_CODE;
1028     tokeniser_grid['z']  = IDENTIFIER_CODE;
1029
1030     tokeniser_grid['A']  = IDENTIFIER_CODE;
1031     tokeniser_grid['B']  = IDENTIFIER_CODE;
1032     tokeniser_grid['C']  = IDENTIFIER_CODE;
1033     tokeniser_grid['D']  = IDENTIFIER_CODE;
1034     tokeniser_grid['E']  = IDENTIFIER_CODE;
1035     tokeniser_grid['F']  = IDENTIFIER_CODE;
1036     tokeniser_grid['G']  = IDENTIFIER_CODE;
1037     tokeniser_grid['H']  = IDENTIFIER_CODE;
1038     tokeniser_grid['I']  = IDENTIFIER_CODE;
1039     tokeniser_grid['J']  = IDENTIFIER_CODE;
1040     tokeniser_grid['K']  = IDENTIFIER_CODE;
1041     tokeniser_grid['L']  = IDENTIFIER_CODE;
1042     tokeniser_grid['M']  = IDENTIFIER_CODE;
1043     tokeniser_grid['N']  = IDENTIFIER_CODE;
1044     tokeniser_grid['O']  = IDENTIFIER_CODE;
1045     tokeniser_grid['P']  = IDENTIFIER_CODE;
1046     tokeniser_grid['Q']  = IDENTIFIER_CODE;
1047     tokeniser_grid['R']  = IDENTIFIER_CODE;
1048     tokeniser_grid['S']  = IDENTIFIER_CODE;
1049     tokeniser_grid['T']  = IDENTIFIER_CODE;
1050     tokeniser_grid['U']  = IDENTIFIER_CODE;
1051     tokeniser_grid['V']  = IDENTIFIER_CODE;
1052     tokeniser_grid['W']  = IDENTIFIER_CODE;
1053     tokeniser_grid['X']  = IDENTIFIER_CODE;
1054     tokeniser_grid['Y']  = IDENTIFIER_CODE;
1055     tokeniser_grid['Z']  = IDENTIFIER_CODE;
1056
1057     tokeniser_grid['_']  = IDENTIFIER_CODE;
1058 }
1059
1060 /* ------------------------------------------------------------------------- */
1061 /*   Definition of a lexical block: a source file or a string containing     */
1062 /*   text for lexical analysis; an independent source from the point of      */
1063 /*   view of issuing error reports.                                          */
1064 /* ------------------------------------------------------------------------- */
1065
typedef struct LexicalBlock_s
{   char *filename;                              /*  Full translated name    */
    int   main_flag;                             /*  TRUE if the main file
                                                     (the first one opened)  */
    int   sys_flag;                              /*  TRUE if a System_File   */
    int   source_line;                           /*  Line number count       */
    int   line_start;                            /*  Char number within file
                                                     where the current line
                                                     starts                  */
    int   chars_read;                            /*  Char number of read pos */
    int   file_no;                               /*  Or 255 if not from a
                                                     file; used for debug
                                                     information             */
    char *orig_source;                           /* From #origsource direct  */
    int orig_file;                               /* InputFiles entry for the
                                                    original source, or 0    */
    int32 orig_line;                             /* Line and character in the
                                                    original source, as set  */
    int32 orig_char;                             /*   by #origsource         */
} LexicalBlock;
1084
/*  Fixed lexical blocks for text which does not come from a source file
    (all using the reserved file number 255).                                */
static LexicalBlock NoFileOpen =
    {   "<before compilation>",  FALSE, FALSE, 0, 0, 0, 255, NULL, 0, 0, 0 };

static LexicalBlock MakingOutput =
    {   "<constructing output>", FALSE, FALSE, 0, 0, 0, 255, NULL, 0, 0, 0 };

static LexicalBlock StringLB =
    {   "<veneer routine>",      FALSE, TRUE,  0, 0, 0, 255, NULL, 0, 0, 0 };

static LexicalBlock *CurrentLB;                  /*  The current lexical
                                                     block of input text     */
1096
1097 extern void declare_systemfile(void)
1098 {   CurrentLB->sys_flag = TRUE;
1099 }
1100
1101 extern int is_systemfile(void)
1102 {   return ((CurrentLB->sys_flag)?1:0);
1103 }
1104
1105 extern void set_origsource_location(char *source, int32 line, int32 charnum)
1106 {
1107     int file_no;
1108
1109     if (!source) {
1110         /* Clear the Origsource declaration. */
1111         CurrentLB->orig_file = 0;
1112         CurrentLB->orig_source = NULL;
1113         CurrentLB->orig_line = 0;
1114         CurrentLB->orig_char = 0;
1115         return;
1116     }
1117
1118     /* Get the file number for a new or existing InputFiles entry. */
1119     file_no = register_orig_sourcefile(source);
1120
1121     CurrentLB->orig_file = file_no;
1122     CurrentLB->orig_source = InputFiles[file_no-1].filename;
1123     CurrentLB->orig_line = line;
1124     CurrentLB->orig_char = charnum;
1125 }
1126
1127 /* Error locations. */
1128
1129 extern debug_location get_current_debug_location(void)
1130 {   debug_location result;
1131     /* Assume that all input characters are one byte. */
1132     result.file_index = CurrentLB->file_no;
1133     result.beginning_byte_index = CurrentLB->chars_read - LOOKAHEAD_SIZE;
1134     result.end_byte_index = result.beginning_byte_index;
1135     result.beginning_line_number = CurrentLB->source_line;
1136     result.end_line_number = result.beginning_line_number;
1137     result.beginning_character_number =
1138         CurrentLB->chars_read - CurrentLB->line_start;
1139     result.end_character_number = result.beginning_character_number;
1140     result.orig_file_index = CurrentLB->orig_file;
1141     result.orig_beg_line_number = CurrentLB->orig_line;
1142     result.orig_beg_char_number = CurrentLB->orig_char;
1143     return result;
1144 }
1145
1146 static debug_location ErrorReport_debug_location;
1147
1148 extern void report_errors_at_current_line(void)
1149 {   ErrorReport.line_number = CurrentLB->source_line;
1150     ErrorReport.file_number = CurrentLB->file_no;
1151     if (ErrorReport.file_number == 255)
1152         ErrorReport.file_number = -1;
1153     ErrorReport.source      = CurrentLB->filename;
1154     ErrorReport.main_flag   = CurrentLB->main_flag;
1155     if (debugfile_switch)
1156         ErrorReport_debug_location = get_current_debug_location();
1157     ErrorReport.orig_file = CurrentLB->orig_file;
1158     ErrorReport.orig_source = CurrentLB->orig_source;
1159     ErrorReport.orig_line = CurrentLB->orig_line;
1160     ErrorReport.orig_char = CurrentLB->orig_char;
1161 }
1162
1163 extern debug_location get_error_report_debug_location(void)
1164 {   return ErrorReport_debug_location;
1165 }
1166
1167 extern int32 get_current_line_start(void)
1168 {   return CurrentLB->line_start;
1169 }
1170
1171 brief_location blank_brief_location;
1172
1173 extern brief_location get_brief_location(ErrorPosition *errpos)
1174 {
1175     brief_location loc;
1176     loc.file_index = errpos->file_number;
1177     loc.line_number = errpos->line_number;
1178     loc.orig_file_index = errpos->orig_file;
1179     loc.orig_line_number = errpos->orig_line;
1180     return loc;
1181 }
1182
1183 extern void export_brief_location(brief_location loc, ErrorPosition *errpos)
1184 {
1185     if (loc.file_index != -1)
1186     {   errpos->file_number = loc.file_index;
1187         errpos->line_number = loc.line_number;
1188         errpos->main_flag = (errpos->file_number == 1);
1189         errpos->orig_source = NULL;
1190         errpos->orig_file = loc.orig_file_index;
1191         errpos->orig_line = loc.orig_line_number;
1192         errpos->orig_char = 0;
1193     }
1194 }
1195
1196 /* ------------------------------------------------------------------------- */
1197 /*   Hash printing and line counting                                         */
1198 /* ------------------------------------------------------------------------- */
1199
1200 static void print_hash(void)
1201 {
1202     /*  Hash-printing is the practice of printing a # character every 100
1203         lines of source code (the -x switch), reassuring the user that
1204         progress is being made                                               */
1205
1206     if (no_hash_printed_yet)
1207     {   printf("::"); no_hash_printed_yet = FALSE;
1208     }
1209     printf("#"); hash_printed_since_newline = TRUE;
1210
1211 #ifndef MAC_FACE
1212     /*  On some systems, text output is buffered to a line at a time, and
1213         this would frustrate the point of hash-printing, so:                 */
1214
1215     fflush(stdout);
1216 #endif
1217 }
1218
static void reached_new_line(void)
{
    /*  Called to signal that a new line has been reached in the source code */

    /*  Restart the buffer of current-line text kept for error reports.      */
    forerrors_pointer = 0;

    CurrentLB->source_line++;
    CurrentLB->line_start = CurrentLB->chars_read;

    total_source_line_count++;

    /*  Every 100 lines: print a progress "#" if the -x switch is on.        */
    if (total_source_line_count%100==0)
    {   if (hash_switch) print_hash();
#ifdef MAC_MPW
        SpinCursor(32);                    /* I.e., allow other tasks to run */
#endif
    }

#ifdef MAC_FACE
    /*  Periodically yield to the Mac event loop; abandon compilation
        cleanly if the user has cancelled.                                   */
    if (total_source_line_count%((**g_pm_hndl).linespercheck) == 0)
    {   ProcessEvents (&g_proc);
        if (g_proc != true)
        {   free_arrays();
            close_all_source();
            abort_transcript_file();
            longjmp (g_fallback, 1);
        }
    }
#endif
}
1249
1250 static void new_syntax_line(void)
1251 {   if (source_to_analyse != NULL) forerrors_pointer = 0;
1252     report_errors_at_current_line();
1253 }
1254
1255 /* Return 10 raised to the expo power.
1256  *
1257  * I'm avoiding the standard pow() function for a rather lame reason:
1258  * it's in the libmath (-lm) library, and I don't want to change the
1259  * build model for the compiler. So, this is implemented with a stupid
1260  * lookup table. It's faster than pow() for small values of expo.
1261  * Probably not as fast if expo is 200, but "$+1e200" is an overflow
1262  * anyway, so I don't expect that to be a problem.
1263  *
1264  * (For some reason, frexp() and ldexp(), which are used later on, do
1265  * not require libmath to be linked in.)
1266  */
static double pow10_cheap(int expo)
{
    /*  Return 10 raised to the expo power, via the lookup table (see the
        comment above for why pow() is avoided).  Values of expo outside
        +/-POW10_RANGE are handled by repeated multiplication, so very
        large magnitudes simply saturate to infinity or zero.                */

    #define POW10_RANGE (8)
    static double powers[POW10_RANGE*2+1] = {
        0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1,
        1.0,
        10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, 10000000.0, 100000000.0
    };

    double result = 1.0;

    if (expo >= 0) {
        while (expo > POW10_RANGE) {
            result *= powers[POW10_RANGE*2];
            expo -= POW10_RANGE;
        }
    }
    else {
        while (expo < -POW10_RANGE) {
            result *= powers[0];
            expo += POW10_RANGE;
        }
    }
    /*  Here -POW10_RANGE <= expo <= POW10_RANGE, so the index is valid.     */
    return result * powers[POW10_RANGE+expo];
}
1291
1292 /* Return the IEEE-754 single-precision encoding of a floating-point
1293  * number.
1294  *
1295  * The number is provided in the pieces it was parsed in:
1296  *    [+|-] intv "." fracv "e" [+|-]expo
1297  *
1298  * If the magnitude is too large (beyond about 3.4e+38), this returns
1299  * an infinite value (0x7f800000 or 0xff800000). If the magnitude is too
1300  * small (below about 1e-45), this returns a zero value (0x00000000 or 
1301  * 0x80000000). If any of the inputs are NaN, this returns NaN (but the
1302  * lexer should never do that).
1303  *
1304  * Note that using a float constant does *not* set the uses_float_features
1305  * flag (which would cause the game file to be labelled 3.1.2). Same with 
1306  * double constants and the uses_double_features flag. There's no VM
1307  * feature here, just an integer. Of course, any use of the float *opcodes*
1308  * will set the flag.
1309  *
1310  * The math functions in this routine require #including <math.h>, but
1311  * they should not require linking the math library (-lm). At least,
1312  * they do not on OSX and Linux.
1313  */
static int32 construct_float(int signbit, double intv, double fracv, int expo)
{
    double absval = (intv + fracv) * pow10_cheap(expo);
    int32 sign = (signbit ? 0x80000000 : 0x0);
    double mant;
    int32 fbits;
 
    if (isinf(absval)) {
        return sign | 0x7f800000; /* infinity */
    }
    if (isnan(absval)) {
        return sign | 0x7fc00000; /* quiet NaN */
    }

    /*  Split absval into mantissa and binary exponent: absval = mant*2^expo */
    mant = frexp(absval, &expo);

    /* Normalize mantissa to be in the range [1.0, 2.0) */
    if (0.5 <= mant && mant < 1.0) {
        mant *= 2.0;
        expo--;
    }
    else if (mant == 0.0) {
        expo = 0;
    }
    else {
        return sign | 0x7f800000; /* infinity */
    }

    if (expo >= 128) {
        return sign | 0x7f800000; /* infinity */
    }
    else if (expo < -126) {
        /* Denormalized (very small) number */
        mant = ldexp(mant, 126 + expo);
        expo = 0;
    }
    else if (!(expo == 0 && mant == 0.0)) {
        expo += 127; /* bias the exponent */
        mant -= 1.0; /* Get rid of leading 1 */
    }

    mant *= 8388608.0; /* 2^23 */
    fbits = (int32)(mant + 0.5); /* round mant to nearest int */
    if (fbits >> 23) {
        /* The carry propagated out of a string of 23 1 bits. */
        fbits = 0;
        expo++;
        if (expo >= 255) {
            return sign | 0x7f800000; /* infinity */
        }
    }

    /* At this point, expo is less than 2^8; fbits is less than 2^23; neither is negative. */
    return (sign) | ((int32)(expo << 23)) | (fbits);
}
1369
1370 /* Same as the above, but we return *half* of a 64-bit double, depending on whether wanthigh is true (high half) or false (low half).
1371  */
static int32 construct_double(int wanthigh, int signbit, double intv, double fracv, int expo)
{
    double absval = (intv + fracv) * pow10_cheap(expo);
    int32 sign = (signbit ? 0x80000000 : 0x0);
    double mant;
    uint32 fhi, flo;
 
    if (isinf(absval)) {
        goto Infinity;
    }
    if (isnan(absval)) {
        goto NotANumber;
    }

    /*  Split absval into mantissa and binary exponent: absval = mant*2^expo */
    mant = frexp(absval, &expo);

    /* Normalize mantissa to be in the range [1.0, 2.0) */
    if (0.5 <= mant && mant < 1.0) {
        mant *= 2.0;
        expo--;
    }
    else if (mant == 0.0) {
        expo = 0;
    }
    else {
        goto Infinity;
    }

    if (expo >= 1024) {
        goto Infinity;
    }
    else if (expo < -1022) {
        /* Denormalized (very small) number */
        mant = ldexp(mant, 1022 + expo);
        expo = 0;
    }
    else if (!(expo == 0 && mant == 0.0)) {
        expo += 1023; /* bias the exponent */
        mant -= 1.0; /* Get rid of leading 1 */
    }

    /* fhi receives the high 28 bits; flo the low 24 bits (total 52 bits) */
    mant *= 268435456.0;          /* 2^28 */
    fhi = (uint32)mant;           /* Truncate */
    mant -= (double)fhi;
    mant *= 16777216.0;           /* 2^24 */
    flo = (uint32)(mant+0.5);     /* Round */
    
    if (flo >> 24) {
        /* The carry propagated out of a string of 24 1 bits. */
        flo = 0;
        fhi++;
        if (fhi >> 28) {
            /* And it also propagated out of the next 28 bits. */
            fhi = 0;
            expo++;
            if (expo >= 2047) {
                goto Infinity;
            }
        }
    }

    /* At this point, expo is less than 2^11; fhi is less than 2^28; flo is less than 2^24; none are negative. */
    if (wanthigh)
        return (sign) | ((int32)(expo << 20)) | ((int32)(fhi >> 8));
    else
        return (int32)((fhi & 0xFF) << 24) | (int32)(flo);

 Infinity:
    if (wanthigh)
        return sign | 0x7FF00000;
    else
        return 0x00000000;
    
 NotANumber:
    /* Quiet NaN: all-ones exponent, top mantissa bit set, low bit 1. */
    if (wanthigh)
        return sign | 0x7FF80000;
    else
        return 0x00000001;
}
1452
1453 /* ------------------------------------------------------------------------- */
1454 /*   Characters are read via a "pipeline" of variables, allowing us to look  */
1455 /*       up to three characters ahead of the current position.               */
1456 /*                                                                           */
1457 /*   There are two possible sources: from the source files being loaded in,  */
1458 /*   and from a string inside Inform (which is where the code for veneer     */
1459 /*   routines comes from).  Each source has its own get-next-character       */
1460 /*   routine.                                                                */
1461 /* ------------------------------------------------------------------------- */
1462 /*   Source 1: from files                                                    */
1463 /*                                                                           */
1464 /*   Note that file_load_chars(p, size) loads "size" bytes into buffer "p"   */
1465 /*   from the current input file.  If the file runs out, then if it was      */
1466 /*   the last source file 4 null characters are placed in the buffer: if it  */
1467 /*   was only an Include file ending, then a '\n' character is placed there  */
1468 /*   (essentially to force termination of any comment line) followed by      */
1469 /*   three harmless spaces.                                                  */
1470 /*                                                                           */
1471 /*   The routine returns the number of characters it has written, and note   */
1472 /*   that this conveniently ensures that all characters in the buffer come   */
1473 /*   from the same file.                                                     */
1474 /* ------------------------------------------------------------------------- */
1475
#define SOURCE_BUFFER_SIZE 4096                  /*  Typical disc block size */

/* One entry in the stack of open source files (the main file plus any
   Include files currently being read). */
typedef struct Sourcefile_s
{   char *buffer;                                /*  Input buffer            */
    int   read_pos;                              /*  Read position in buffer */
    int   size;                                  /*  Number of meaningful
                                                     characters in buffer    */
    int   la, la2, la3;                          /*  Three characters of
                                                     lookahead pipeline
                                                     (saved while an Include
                                                     file is being read)     */
    int   file_no;                               /*  Internal file number
                                                     (1, 2, 3, ...)          */
    LexicalBlock LB;                             /*  Position/filename info
                                                     for error reporting     */
} Sourcefile;

static Sourcefile *FileStack;     /*  Allocated to FileStack_max */
static memory_list FileStack_memlist;
static int FileStack_max;         /*  The highest value that File_sp has
                                      reached
                                      (Filestack entries to this depth have
                                      a buffer allocated)                    */

static int File_sp;               /*  Current stack pointer                  */
static Sourcefile *CF;            /*  Top entry on stack (always equal to
                                      FileStack[File_sp-1])                  */

static int last_input_file;       /*  File number most recently pushed onto
                                      the stack; compared with
                                      current_input_file to detect newly
                                      opened Include files                   */
1502
/* Set CF and CurrentLB.
   This does not increment File_sp; the caller must do that.

   Installs FileStack[i] as the current input source for internal file
   number file_no: allocates its buffer if this stack depth has never
   been used, saves the outer file's lookahead pipeline, loads the
   first buffer-full of the new file, primes the three lookahead
   characters, and initializes the entry's LexicalBlock. Also warns
   if the same filename already appears lower on the stack. */
static void begin_buffering_file(int i, int file_no)
{   int j, cnt; uchar *p;

    /* Cleared first so an error during setup can't leave them pointing
       at a half-initialized entry. */
    CF = NULL;
    CurrentLB = NULL;
    
    ensure_memory_list_available(&FileStack_memlist, i+1);
    /* Buffers are allocated lazily, once per stack depth, and reused.
       (+4 leaves room for the terminator characters file_load_chars
       writes at end of file.) */
    while (i >= FileStack_max) {
        FileStack[FileStack_max++].buffer = my_malloc(SOURCE_BUFFER_SIZE+4, "source file buffer");
    }

    p = (uchar *) FileStack[i].buffer;

    /* Save the including file's lookahead pipeline so it can be
       restored when this Include file is popped. */
    if (i>0)
    {   FileStack[i-1].la  = lookahead;
        FileStack[i-1].la2 = lookahead2;
        FileStack[i-1].la3 = lookahead3;
    }

    FileStack[i].file_no = file_no;
    FileStack[i].size = file_load_chars(file_no,
        (char *) p, SOURCE_BUFFER_SIZE);
    /* If the file is shorter than SOURCE_BUFFER_SIZE, it's now closed already. We still need to set up the file entry though. */
    
    /* Prime the pipeline with the first three characters of the file. */
    lookahead  = source_to_iso_grid[p[0]];
    lookahead2 = source_to_iso_grid[p[1]];
    lookahead3 = source_to_iso_grid[p[2]];
    if (LOOKAHEAD_SIZE != 3)
        compiler_error
            ("Lexer lookahead size does not match hard-coded lookahead code");
    FileStack[i].read_pos = LOOKAHEAD_SIZE;

    /* File number 1 is always the main source file. */
    if (file_no==1) FileStack[i].LB.main_flag = TRUE;
               else FileStack[i].LB.main_flag = FALSE;
    FileStack[i].LB.sys_flag = FALSE;
    FileStack[i].LB.source_line = 1;
    FileStack[i].LB.line_start = LOOKAHEAD_SIZE;
    FileStack[i].LB.chars_read = LOOKAHEAD_SIZE;
    FileStack[i].LB.filename = InputFiles[file_no-1].filename;
    FileStack[i].LB.file_no = file_no;
    FileStack[i].LB.orig_source = NULL; FileStack[i].LB.orig_file = 0; 
    FileStack[i].LB.orig_line = 0; FileStack[i].LB.orig_char = 0;

    InputFiles[file_no-1].initial_buffering = FALSE;
    
    CurrentLB = &(FileStack[i].LB);
    CF = &(FileStack[i]);

    /* Check for recursive inclusion */
    cnt = 0;
    for (j=0; j<i; j++)
    {   if (!strcmp(FileStack[i].LB.filename, FileStack[j].LB.filename))
            cnt++;
    }
    /* Warn only on the first repeat (cnt==1), so a file included N
       times produces one warning per new inclusion, not a cascade. */
    if (cnt==1)
        warning_named("File included more than once",
            FileStack[j].LB.filename);
}
1563
1564 static void create_char_pipeline(void)
1565 {
1566     File_sp = 0;
1567     begin_buffering_file(File_sp++, 1);
1568     pipeline_made = TRUE;
1569     last_input_file = current_input_file;
1570 }
1571
/* Fetch the next character when reading from source files. Handles
   newly opened Include files (pushing them on the stack), exhausted
   files (popping back to the including file), buffer refills, and
   CR/LF newline normalization. Returns the character read, or 0 when
   all input is exhausted. */
static int get_next_char_from_pipeline(void)
{   uchar *p;

    while (last_input_file < current_input_file)
    {
        /*  An "Include" file must have opened since the last character
            was read. Perhaps more than one. We run forward through the
            list and add them to the include stack. But we ignore
            "Origsource" files (which were never actually opened for
            reading). */

        last_input_file++;
        if (!InputFiles[last_input_file-1].is_input)
            continue;

        begin_buffering_file(File_sp++, last_input_file);
    }
    if (last_input_file != current_input_file)
        compiler_error("last_input_file did not match after Include");

    /* No files on the stack at all: report end of input. */
    if (File_sp == 0)
    {   lookahead  = 0; lookahead2 = 0; lookahead3 = 0; return 0;
    }

    if (CF->read_pos == CF->size)
    {   /* Buffer used up; refill it from the current file. */
        CF->size =
            file_load_chars(CF->file_no, CF->buffer, SOURCE_BUFFER_SIZE);
        CF->read_pos = 0;
    }
    else
    if (CF->read_pos == -(CF->size))
    {   /* CF->size is negative here — presumably file_load_chars's way
           of flagging end-of-file (TODO confirm against its definition).
           The current file is finished: pop back to the including file
           and restore its saved lookahead pipeline. */
        set_token_location(get_current_debug_location());
        File_sp--;
        if (File_sp == 0)
        {   lookahead  = 0; lookahead2 = 0; lookahead3 = 0; return 0;
        }
        CF = &(FileStack[File_sp-1]);
        CurrentLB = &(FileStack[File_sp-1].LB);
        lookahead  = CF->la; lookahead2 = CF->la2; lookahead3 = CF->la3;
        if (CF->read_pos == CF->size)
        {   CF->size =
                file_load_chars(CF->file_no, CF->buffer, SOURCE_BUFFER_SIZE);
            CF->read_pos = 0;
        }
        set_token_location(get_current_debug_location());
    }

    p = (uchar *) (CF->buffer);

    /* Shift the three-character lookahead pipeline along by one. */
    current = lookahead;
    lookahead = lookahead2;
    lookahead2 = lookahead3;
    lookahead3 = source_to_iso_grid[p[CF->read_pos++]];

    CurrentLB->chars_read++;
    if (forerrors_pointer < FORERRORS_SIZE-1)
        forerrors_buff[forerrors_pointer++] = current;

    /* The file is open in binary mode, so we have to do our own newline
       conversion. (We want to do it consistently across all platforms.)

       The strategy is to convert all \r (CR) characters to \n (LF), but
       *don't* advance the line counter for \r if it's followed by \n.
       The rest of the lexer treats multiple \n characters the same as
       one, so the simple conversion will work out okay.

       (Note that, for historical reasons, a ctrl-L (formfeed) is also
       treated as \r. This conversion has already been handled by
       source_to_iso_grid[].)
    */
    if (current == '\n') {
        reached_new_line();
    }
    else if (current == '\r') {
        current = '\n';
        if (lookahead != '\n')
            reached_new_line();
    }
    
    return(current);
}
1653
1654 /* ------------------------------------------------------------------------- */
1655 /*   Source 2: from a (null-terminated) string                               */
1656 /* ------------------------------------------------------------------------- */
1657
1658 static int source_to_analyse_pointer;            /*  Current read position   */
1659
1660 static int get_next_char_from_string(void)
1661 {   uchar *p = (uchar *) source_to_analyse + source_to_analyse_pointer++;
1662     current = source_to_iso_grid[p[0]];
1663
1664     if (current == 0)    lookahead  = 0;
1665                     else lookahead  = source_to_iso_grid[p[1]];
1666     if (lookahead == 0)  lookahead2 = 0;
1667                     else lookahead2 = source_to_iso_grid[p[2]];
1668     if (lookahead2 == 0) lookahead3 = 0;
1669                     else lookahead3 = source_to_iso_grid[p[3]];
1670
1671     CurrentLB->chars_read++;
1672     if (forerrors_pointer < FORERRORS_SIZE-1)
1673         forerrors_buff[forerrors_pointer++] = current;
1674
1675     /* We shouldn't have \r when compiling from string (veneer function).
1676        If we do, just shove it under the carpet. */
1677     if (current == '\r') current = '\n';
1678     if (current == '\n') reached_new_line();
1679     
1680     return(current);
1681 }
1682
1683 /* ========================================================================= */
1684 /*   The interface between the lexer and Inform's higher levels:             */
1685 /*                                                                           */
1686 /*       put_token_back()            (effectively) move the read position    */
1687 /*                                       back by one token                   */
1688 /*                                                                           */
1689 /*       get_next_token()            copy the token at the current read      */
1690 /*                                       position into the triple            */
1691 /*                                   (token_type, token_value, token_text)   */
1692 /*                                       and move the read position forward  */
1693 /*                                       by one                              */
1694 /*                                                                           */
1695 /*       release_token_texts()       discard all the tokens that have been   */
1696 /*                                       read in, except for put-back ones   */
1697 /*                                                                           */
1698 /*       restart_lexer(source, name) if source is NULL, initialise the lexer */
1699 /*                                       to read from source files;          */
1700 /*                                       otherwise, to read from this null-  */
1701 /*                                       terminated string.                  */
1702 /* ------------------------------------------------------------------------- */
1703
extern void release_token_texts(void)
{
    /* This is called at the beginning of every (top-level) directive and
       every statement. It drops all token usage so that the lextexts
       array can be reused.

       Call this immediately before a get_next_token() call.

       This should *not* be called within parse_expression(). Expression
       code generation relies on token data being retained across the whole
       expression.
    */
    int ix;

    token_text = NULL;
    
    /* Common case: nothing put back, so every lextext slot is free. */
    if (tokens_put_back == 0) {
        cur_lextexts = 0;
        return;
    }

    /* If any tokens have been put back, we have to preserve their text.
       Move their lextext usage to the head of the lextexts array. */
    
    for (ix=0; ix<tokens_put_back; ix++) {
        int oldpos;
        lextext temp;
        /* pos walks the put-back tokens in the circle buffer, oldest
           first, wrapping around if necessary. */
        int pos = circle_position - tokens_put_back + 1 + ix;
        if (pos < 0) pos += CIRCLE_SIZE;

        oldpos = circle[pos].lextext;
        circle[pos].lextext = ix;
        /* Swap the entire lextext structure (including size) */
        temp = lextexts[ix];
        lextexts[ix] = lextexts[oldpos];
        lextexts[oldpos] = temp;
    }
    cur_lextexts = tokens_put_back;
}
1743
/* Push the most recently read token back onto the input, so the next
   get_next_token() call returns it again. If the token was a freshly
   created symbol, its symbol-table entry is retired so that it can be
   reinterpreted in its eventual context. */
extern void put_token_back(void)
{   tokens_put_back++;

    /* Locate the token being put back in the circle buffer. */
    int pos = circle_position - tokens_put_back + 1;
    if (pos<0) pos += CIRCLE_SIZE;

    if (tokens_trace_level > 0)
    {
        printf("<- ");
        if (tokens_trace_level > 1) {
            describe_token(&circle[pos]);
            printf("\n");
        }
    }

    if (circle[pos].type == SYMBOL_TT && circle[pos].newsymbol) {
        /* Remove the symbol from the symbol table. (Or mark it as unreachable
           anyhow.) */
        end_symbol_scope(circle[pos].value, TRUE);
        /* Remove new-symbol flag, and force reinterpretation next time
           we see the symbol. */
        circle[pos].newsymbol = FALSE;
        circle[pos].context = -1;
    }
    
    /*  The following error, of course, should never happen!                 */

    if (tokens_put_back == CIRCLE_SIZE)
    {   compiler_error("The lexical analyser has collapsed because of a wrong \
assumption inside Inform");
        tokens_put_back--;
        return;
    }
}
1778
1779 /* The get_next_token() code reads characters into the current lextext,
1780    which is lextexts[lex_index]. It uses these routines to add and remove
1781    characters, reallocing when necessary.
1782
1783    lex_pos is the current number of characters in the lextext. It is
1784    not necessarily null-terminated until get_next_token() completes.
1785  */
1786
1787 /* Add one character */
1788 static void lexaddc(char ch)
1789 {
1790     if ((size_t)lex_pos >= lextexts[lex_index].size) {
1791         size_t newsize = lextexts[lex_index].size * 2;
1792         my_realloc(&lextexts[lex_index].text, lextexts[lex_index].size, newsize, "one lexeme text");
1793         lextexts[lex_index].size = newsize;
1794     }
1795     lextexts[lex_index].text[lex_pos++] = ch;
1796 }
1797
1798 /* Remove the last character and ensure it's null-terminated */
1799 static void lexdelc(void)
1800 {
1801     if (lex_pos > 0) {
1802         lex_pos--;
1803     }
1804     lextexts[lex_index].text[lex_pos] = 0;
1805 }
1806
1807 /* Return the last character */
1808 static char lexlastc(void)
1809 {
1810     if (lex_pos == 0) {
1811         return 0;
1812     }
1813     return lextexts[lex_index].text[lex_pos-1];
1814 }
1815
/* Append a null-terminated string to the current lextext, including
   the terminating null itself.
   The parameter is now const-qualified: the function never writes
   through str, and callers pass string literals (e.g. "<end of file>"),
   so this is the correct contract and is backward-compatible — a
   char * argument converts implicitly to const char *. */
static void lexadds(const char *str)
{
    while (*str) {
        lexaddc(*str);
        str++;
    }
    lexaddc(0);
}
1825
/* Read the next token from the input (or re-issue a put-back token)
   and export it in the triple (token_type, token_value, token_text).
   The token is also recorded in the circle buffer so it can later be
   returned to the stream by put_token_back(). */
extern void get_next_token(void)
{   int d, i, j, k, quoted_size, e, radix, context;
    uint32 n;
    char *r;
    int floatend;
    int returning_a_put_back_token = TRUE;
    
    context = lexical_context();

    /* If tokens have been put back, re-issue the oldest one rather than
       reading new characters. If the lexical context has changed since
       it was first read, identifiers must be reinterpreted. */
    if (tokens_put_back > 0)
    {   i = circle_position - tokens_put_back + 1;
        if (i<0) i += CIRCLE_SIZE;
        tokens_put_back--;
        if (context != circle[i].context)
        {   j = circle[i].type;
            /* Type 0 or 100..199: token types which are (or derive from)
               identifiers and so are context-sensitive — TODO confirm
               exact ranges against the token-type definitions. */
            if ((j==0) || ((j>=100) && (j<200)))
                interpret_identifier(circle[i].text, i);
            circle[i].context = context;
        }
        goto ReturnBack;
    }
    returning_a_put_back_token = FALSE;

    /* Advance to the next slot in the circular token buffer. */
    if (circle_position == CIRCLE_SIZE-1) circle_position = 0;
    else circle_position++;

    lex_index = cur_lextexts++;
    if (lex_index >= no_lextexts) {
        /* fresh lextext block; must init it */
        no_lextexts = lex_index+1;
        ensure_memory_list_available(&lextexts_memlist, no_lextexts);
        lextexts[lex_index].size = 64;   /* this can grow */
        lextexts[lex_index].text = my_malloc(lextexts[lex_index].size, "one lexeme text");
    }
    lex_pos = 0;
    lextexts[lex_index].text[0] = 0; /* start with an empty string */
    
    circle[circle_position].lextext = lex_index;
    circle[circle_position].text = NULL; /* will fill in later */
    circle[circle_position].value = 0;
    circle[circle_position].type = 0;
    circle[circle_position].newsymbol = FALSE;
    circle[circle_position].context = context;

    /* Jump back here after consuming anything that isn't a token
       (whitespace, comments, illegal characters). */
    StartTokenAgain:
    d = (*get_next_char)();
    e = tokeniser_grid[d];

    if (next_token_begins_syntax_line)
    {   if ((e != WHITESPACE_CODE) && (e != COMMENT_CODE))
        {   new_syntax_line();
            next_token_begins_syntax_line = FALSE;
        }
    }

    circle[circle_position].location = get_current_debug_location();

    /* Dispatch on the character class of the first character. */
    switch(e)
    {   case 0: char_error("Illegal character found in source:", d);
            goto StartTokenAgain;

        case WHITESPACE_CODE:
            while (tokeniser_grid[lookahead] == WHITESPACE_CODE)
                (*get_next_char)();
            goto StartTokenAgain;

        case COMMENT_CODE:
            /* Comments run to the end of the line (or of the input). */
            while ((lookahead != '\n') && (lookahead != '\r') && (lookahead != 0))
                (*get_next_char)();
            goto StartTokenAgain;

        case EOF_CODE:
            circle[circle_position].type = EOF_TT;
            lexadds("<end of file>");
            break;

        case DIGIT_CODE:
            radix = 10;
            /* Also entered from RADIX_CODE below with radix 16 or 2. */
            ReturnNumber:
            n=0;
            do
            {   n = n*radix + character_digit_value[d];
                lexaddc(d);
            } while ((character_digit_value[lookahead] < radix)
                     && (d = (*get_next_char)(), TRUE));

            lexaddc(0);
            circle[circle_position].type = NUMBER_TT;
            circle[circle_position].value = (int32)n;
            break;

            FloatNumber:
            /* When we reach here, d is the sign bit ('+' or '-').
               If we're constructing a 32-bit float, floatend is 0;
               for a 64-bit double, floatend is '>' for high, '<' for low. */
            {   int expo=0; double intv=0, fracv=0;
                int expocount=0, intcount=0, fraccount=0;
                int signbit = (d == '-');
                lexaddc(d);
                /* Integer part. */
                while (character_digit_value[lookahead] < 10) {
                    intv = 10.0*intv + character_digit_value[lookahead];
                    intcount++;
                    lexaddc(lookahead);
                    (*get_next_char)();
                }
                /* Optional fractional part after '.'. */
                if (lookahead == '.') {
                    double fracpow = 1.0;
                    lexaddc(lookahead);
                    (*get_next_char)();
                    while (character_digit_value[lookahead] < 10) {
                        fracpow *= 0.1;
                        fracv = fracv + fracpow*character_digit_value[lookahead];
                        fraccount++;
                        lexaddc(lookahead);
                        (*get_next_char)();
                    }
                }
                /* Optional decimal exponent: e/E, optional sign, digits. */
                if (lookahead == 'e' || lookahead == 'E') {
                    int exposign = 0;
                    lexaddc(lookahead);
                    (*get_next_char)();
                    if (lookahead == '+' || lookahead == '-') {
                        exposign = (lookahead == '-');
                        lexaddc(lookahead);
                        (*get_next_char)();
                    }
                    while (character_digit_value[lookahead] < 10) {
                        expo = 10*expo + character_digit_value[lookahead];
                        expocount++;
                        lexaddc(lookahead);
                        (*get_next_char)();
                    }
                    if (expocount == 0)
                        error("Floating-point literal must have digits after the 'e'");
                    if (exposign) { expo = -expo; }
                }
                if (intcount + fraccount == 0)
                    error("Floating-point literal must have digits");
                if (floatend == '>')
                    n = construct_double(TRUE, signbit, intv, fracv, expo);
                else if (floatend == '<')
                    n = construct_double(FALSE, signbit, intv, fracv, expo);
                else
                    n = construct_float(signbit, intv, fracv, expo);
            }
            lexaddc(0);
            circle[circle_position].type = NUMBER_TT;
            circle[circle_position].value = n;
            if (!glulx_mode && dont_enter_into_symbol_table != -2) error("Floating-point literals are not available in Z-code");
            break;

        case RADIX_CODE:
            /* '$': hex number, '$$': binary, '$+'/'$-': float literal,
               '$<'/'$>': low/high half of a double literal. */
            radix = 16; d = (*get_next_char)();
            if (d == '-' || d == '+') {
                floatend = 0;
                goto FloatNumber;
            }
            if (d == '<' || d == '>') {
                floatend = d;
                d = (*get_next_char)();
                if (d == '-' || d == '+') {
                    goto FloatNumber;
                }
                error("Signed number expected after '$<' or '$>'");
            }
            if (d == '$') { d = (*get_next_char)(); radix = 2; }
            if (character_digit_value[d] >= radix)
            {   if (radix == 2)
                    error("Binary number expected after '$$'");
                else
                    error("Hexadecimal number expected after '$'");
            }
            goto ReturnNumber;

        case QUOTE_CODE:     /* Single-quotes: scan a literal string */
            quoted_size=0;
            do
            {   e = d; d = (*get_next_char)(); lexaddc(d);
                quoted_size++;
                /* A closing quote not preceded by '@' ends the literal
                   (so '@' escapes like '@'@'' stay intact). */
                if ((d == '\'') && (e != '@'))
                {   if (quoted_size == 1)
                    {   d = (*get_next_char)(); lexaddc(d);
                        if (d != '\'')
                            error("No text between quotation marks ''");
                    }
                    break;
                }
            } while (d != 0);
            if (d==0) ebf_error("'\''", "end of file");
            lexdelc();
            circle[circle_position].type = SQ_TT;
            break;

        case DQUOTE_CODE:    /* Double-quotes: scan a literal string */
            do
            {   d = (*get_next_char)(); lexaddc(d);
                if (d == '\n')
                {   /* Newline in a string: collapse it (and surrounding
                       spaces) into a single space, unless it follows '^'. */
                    lex_pos--;
                    while (lexlastc() == ' ') lex_pos--;
                    if (lexlastc() != '^') lexaddc(' ');
                    while ((lookahead != 0) &&
                          (tokeniser_grid[lookahead] == WHITESPACE_CODE))
                    (*get_next_char)();
                }
                else if (d == '\\')
                {   /* Backslash: join lines, discarding all whitespace up
                       to and including the next newline. */
                    int newline_passed = FALSE;
                    lex_pos--;
                    while ((lookahead != 0) &&
                          (tokeniser_grid[lookahead] == WHITESPACE_CODE))
                        if ((d = (*get_next_char)()) == '\n')
                            newline_passed = TRUE;
                    if (!newline_passed)
                    {   char chb[4];
                        chb[0] = '\"'; chb[1] = lookahead;
                        chb[2] = '\"'; chb[3] = 0;
                        ebf_error("empty rest of line after '\\' in string",
                            chb);
                    }
                }
            }   while ((d != 0) && (d!='\"'));
            if (d==0) ebf_error("'\"'", "end of file");
            lexdelc();
            circle[circle_position].type = DQ_TT;
            break;

        case IDENTIFIER_CODE:    /* Letter or underscore: an identifier */

            lexaddc(d); n=1;
            while (((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
                   || (tokeniser_grid[lookahead] == DIGIT_CODE)))
                n++, lexaddc((*get_next_char)());

            lexaddc(0);

            interpret_identifier(lextexts[lex_index].text, circle_position);
            break;

        default:

            /*  The character is initial to at least one of the separators.
                The grid entry packs an index (high nibble) and a count
                (low nibble) into the separators[] table; candidates are
                tried longest-prefix first within that range.  */

            for (j=e>>4, k=j+(e&0x0f); j<k; j++)
            {   r = (char *) separators[j];
                if (r[1]==0)
                {   lexaddc(d);
                    lexaddc(0);
                    goto SeparatorMatched;
                }
                else
                if (r[2]==0)
                {   if (*(r+1) == lookahead)
                    {   lexaddc(d);
                        lexaddc((*get_next_char)());
                        lexaddc(0);
                        goto SeparatorMatched;
                    }
                }
                else
                {   if ((*(r+1) == lookahead) && (*(r+2) == lookahead2))
                    {   lexaddc(d);
                        lexaddc((*get_next_char)());
                        lexaddc((*get_next_char)());
                        lexaddc(0);
                        goto SeparatorMatched;
                    }
                }
            }

            /*  The following contingency never in fact arises with the
                current set of separators, but might in future  */

            lexaddc(d); lexaddc(lookahead); lexaddc(lookahead2);
            lexaddc(0);
            error_named("Unrecognised combination in source:",
                lextexts[lex_index].text);
            goto StartTokenAgain;

            SeparatorMatched:

            circle[circle_position].type = SEP_TT;
            circle[circle_position].value = j;
            /* Some separators introduce a following name which becomes
               part of the token text (e.g. #n$..., ##...). */
            switch(j)
            {   case SEMICOLON_SEP: break;
                case HASHNDOLLAR_SEP:
                case HASHWDOLLAR_SEP:
                    if (tokeniser_grid[lookahead] == WHITESPACE_CODE)
                    {   error_named("Character expected after",
                            lextexts[lex_index].text);
                        break;
                    }
                    lex_pos--;
                    lexaddc((*get_next_char)());
                    while ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
                           || (tokeniser_grid[lookahead] == DIGIT_CODE))
                        lexaddc((*get_next_char)());
                    lexaddc(0);
                    break;
                case HASHADOLLAR_SEP:
                case HASHGDOLLAR_SEP:
                case HASHRDOLLAR_SEP:
                case HASHHASH_SEP:
                    if (tokeniser_grid[lookahead] != IDENTIFIER_CODE)
                    {   error_named("Alphabetic character expected after",
                            lextexts[lex_index].text);
                        break;
                    }
                    lex_pos--;
                    while ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
                           || (tokeniser_grid[lookahead] == DIGIT_CODE))
                        lexaddc((*get_next_char)());
                    lexaddc(0);
                    break;
            }
            break;
    }

    /* We can now assign the text pointer, since the lextext has finished reallocing. */
    circle[circle_position].text = lextexts[lex_index].text;
    i = circle_position;

    ReturnBack:
    /* We've either parsed a new token or selected a put-back one.
       i is the circle-position of the token in question. Time to
       export the token data where higher-level code can find it. */
    token_value = circle[i].value;
    token_type = circle[i].type;
    token_text = circle[i].text;
    if (!returning_a_put_back_token)
    {   set_token_location(circle[i].location);
    }

    if (tokens_trace_level > 0)
    {   if (tokens_trace_level == 1) {
            printf("'%s' ", circle[i].text);
            if (circle[i].type == EOF_TT) printf("\n");
        }
        else
        {   printf("-> "); describe_token(&circle[i]);
            printf(" ");
            if (tokens_trace_level > 2) {
                if (circle[i].newsymbol) printf("newsym ");
                print_context(circle[i].context);
            }
            printf("\n");
        }
    }
}
2173
/* Backing storage for StringLB.filename when compiling a veneer routine. */
static char veneer_error_title[64];

/* Reset the lexer to a clean state. If lexical_source is NULL, tokens
   will be read from the source files (creating the character pipeline
   if it does not yet exist); otherwise they are read from the given
   null-terminated string, with "name" used to label error messages
   (veneer routines). */
extern void restart_lexer(char *lexical_source, char *name)
{   int i;
    circle_position = 0;
    /* Wipe the token circle so stale tokens can never be re-issued. */
    for (i=0; i<CIRCLE_SIZE; i++)
    {   circle[i].type = 0;
        circle[i].value = 0;
        circle[i].newsymbol = FALSE;
        circle[i].text = "(if this is ever visible, there is a bug)";
        circle[i].lextext = -1;
        circle[i].context = 0;
    }

    cur_lextexts = 0;
    /* But we don't touch no_lextexts; those allocated blocks can be reused */
    lex_index = -1;
    lex_pos = -1;
    
    tokens_put_back = 0;
    forerrors_pointer = 0;
    dont_enter_into_symbol_table = FALSE;
    return_sp_as_variable = FALSE;
    next_token_begins_syntax_line = TRUE;

    source_to_analyse = lexical_source;

    if (source_to_analyse == NULL)
    {   get_next_char = get_next_char_from_pipeline;
        if (!pipeline_made) create_char_pipeline();
        forerrors_buff[0] = 0; forerrors_pointer = 0;
    }
    else
    {   get_next_char = get_next_char_from_string;
        source_to_analyse_pointer = 0;
        /* Use a synthetic LexicalBlock so error messages name the
           veneer routine rather than a source file. */
        CurrentLB = &StringLB;
        sprintf(veneer_error_title, "<veneer routine '%s'>", name);
        StringLB.filename = veneer_error_title;

        CurrentLB->source_line = 1;
        CurrentLB->line_start  = 0;
        CurrentLB->chars_read  = 0;
    }
}
2218
2219 /* ========================================================================= */
2220 /*   Data structure management routines                                      */
2221 /* ------------------------------------------------------------------------- */
2222
2223 extern void init_lexer_vars(void)
2224 {
2225     FileStack = NULL;
2226     FileStack_max = 0;
2227     CF = NULL;
2228     CurrentLB = NULL;
2229
2230     lextexts = NULL;
2231     no_lextexts = 0;
2232     cur_lextexts = 0;
2233     lex_index = -1;
2234     lex_pos = -1;
2235
2236     no_locals = 0;
2237     local_variable_names_usage = 0;
2238     
2239     blank_brief_location.file_index = -1;
2240     blank_brief_location.line_number = 0;
2241     blank_brief_location.orig_file_index = 0;
2242     blank_brief_location.orig_line_number = 0;
2243 }
2244
2245 extern void lexer_begin_prepass(void)
2246 {   total_source_line_count = 0;
2247     CurrentLB = &NoFileOpen;
2248     report_errors_at_current_line();
2249 }
2250
2251 extern void lexer_begin_pass(void)
2252 {   no_hash_printed_yet = TRUE;
2253     hash_printed_since_newline = FALSE;
2254
2255     pipeline_made = FALSE;
2256
2257     no_locals = 0;
2258
2259     restart_lexer(NULL, NULL);
2260 }
2261
2262 extern void lexer_endpass(void)
2263 {   CurrentLB = &MakingOutput;
2264     report_errors_at_current_line();
2265 }
2266
/* Allocate every array and table the lexer needs, and build the tokeniser
   grid and keyword tables.  Called once at start-up; lexer_free_arrays()
   releases everything allocated here. */
extern void lexer_allocate_arrays(void)
{
    /* Growable stack of open source files (for Include nesting). */
    initialise_memory_list(&FileStack_memlist,
        sizeof(Sourcefile), 4, (void**)&FileStack,
        "source file stack");
    FileStack_max = 0;

    /* Growable list of lexeme text blocks; cur_lextexts counts those in
       use this pass (no_lextexts, the allocation high-water mark, is
       managed elsewhere so blocks can be reused across passes). */
    initialise_memory_list(&lextexts_memlist,
        sizeof(lextext), 200, (void**)&lextexts,
        "lexeme texts list");
    cur_lextexts = 0;

    /* Keyword lookup: open hash table with chains stored in a flat
       data table (three ints per keyword entry). */
    keywords_hash_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
        "keyword hash table");
    keywords_hash_ends_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
        "keyword hash end table");
    keywords_data_table = my_calloc(sizeof(int), 3*MAX_KEYWORDS,
        "keyword hashing linked list");
    
    /* Storage for the names of routine-local variables, plus the hash
       structures used to look them up.  NOTE(review): offsets table is
       sized MAX_LOCAL_VARIABLES-1 but hash codes MAX_LOCAL_VARIABLES --
       presumably slot 0 ("sp") needs no stored name; confirm against the
       local-variable code before changing. */
    initialise_memory_list(&local_variable_names_memlist,
        sizeof(char), MAX_LOCAL_VARIABLES*32, NULL,
        "text of local variable names");
    local_variable_name_offsets = my_calloc(sizeof(int), MAX_LOCAL_VARIABLES-1,
        "offsets of local variable names");
    local_variable_hash_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
        "local variable hash table");
    local_variable_hash_codes = my_calloc(sizeof(int), MAX_LOCAL_VARIABLES,
        "local variable hash codes");

    make_tokeniser_grid();
    make_keywords_tables();

    /* Head node of the linked list of debug token locations, zeroed
       field by field. */
    first_token_locations =
        my_malloc(sizeof(debug_locations), "debug locations of recent tokens");
    first_token_locations->location.file_index = 0;
    first_token_locations->location.beginning_byte_index = 0;
    first_token_locations->location.end_byte_index = 0;
    first_token_locations->location.beginning_line_number = 0;
    first_token_locations->location.end_line_number = 0;
    first_token_locations->location.beginning_character_number = 0;
    first_token_locations->location.end_character_number = 0;
    first_token_locations->location.orig_file_index = 0;
    first_token_locations->location.orig_beg_line_number = 0;
    first_token_locations->location.orig_beg_char_number = 0;
    first_token_locations->next = NULL;
    first_token_locations->reference_count = 0;
    last_token_location = first_token_locations;
}
2315
2316 extern void lexer_free_arrays(void)
2317 {   int ix;
2318     CF = NULL;
2319     CurrentLB = NULL;
2320
2321     for (ix=0; ix<FileStack_max; ix++) {
2322         my_free(&FileStack[ix].buffer, "source file buffer");
2323     }
2324     deallocate_memory_list(&FileStack_memlist);
2325
2326     for (ix=0; ix<no_lextexts; ix++) {
2327         my_free(&lextexts[ix].text, "one lexeme text");
2328     }
2329     deallocate_memory_list(&lextexts_memlist);
2330
2331     my_free(&keywords_hash_table, "keyword hash table");
2332     my_free(&keywords_hash_ends_table, "keyword hash end table");
2333     my_free(&keywords_data_table, "keyword hashing linked list");
2334
2335     deallocate_memory_list(&local_variable_names_memlist);
2336     my_free(&local_variable_name_offsets, "offsets of local variable names");
2337     my_free(&local_variable_hash_table, "local variable hash table");
2338     my_free(&local_variable_hash_codes, "local variable hash codes");
2339
2340     cleanup_token_locations(NULL);
2341 }
2342
2343 /* ========================================================================= */