63cafbcd4a41747c9197193b8b84ab5c38d3ab04
[inform.git] / src / lexer.c
1 /* ------------------------------------------------------------------------- */
2 /*   "lexer" : Lexical analyser                                              */
3 /*                                                                           */
4 /*   Part of Inform 6.41                                                     */
5 /*   copyright (c) Graham Nelson 1993 - 2022                                 */
6 /*                                                                           */
7 /* Inform is free software: you can redistribute it and/or modify            */
8 /* it under the terms of the GNU General Public License as published by      */
9 /* the Free Software Foundation, either version 3 of the License, or         */
10 /* (at your option) any later version.                                       */
11 /*                                                                           */
12 /* Inform is distributed in the hope that it will be useful,                 */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of            */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the              */
15 /* GNU General Public License for more details.                              */
16 /*                                                                           */
17 /* You should have received a copy of the GNU General Public License         */
18 /* along with Inform. If not, see https://gnu.org/licenses/                  */
19 /*                                                                           */
20 /* ------------------------------------------------------------------------- */
21
22 #include "header.h"
23
/* Global lexer state flags shared with the rest of the compiler. */
int total_source_line_count,            /* Number of source lines so far     */

    no_hash_printed_yet,                /* Have not yet printed the first #  */
    hash_printed_since_newline,         /* A hash has been printed since the
                                           most recent new-line was printed
                                           (generally as a result of an error
                                           message or the start of pass)     */
    dont_enter_into_symbol_table,       /* Return names as text (with
                                           token type DQ_TT, i.e., as if
                                           they had double-quotes around)
                                           and not as entries in the symbol
                                           table, when TRUE. If -2, only the
                                           keyword table is searched.        */
    return_sp_as_variable;              /* When TRUE, the word "sp" denotes
                                           the stack pointer variable
                                           (used in assembly language only)  */
int next_token_begins_syntax_line;      /* When TRUE, start a new syntax
                                           line (for error reporting, etc.)
                                           on the source code line where
                                           the next token appears            */

int32 last_mapped_line;  /* Last syntax line reported to debugging file      */
46
47 /* ------------------------------------------------------------------------- */
48 /*   The lexer's output is a sequence of structs, each called a "token",     */
49 /*   representing one lexical unit (or "lexeme") each.  Instead of providing */
50 /*   "lookahead" (that is, always having available the next token after the  */
51 /*   current one, so that syntax analysers higher up in Inform can have      */
52 /*   advance knowledge of what is coming), the lexer instead has a system    */
53 /*   where tokens can be read in and then "put back again".                  */
54 /*   The meaning of the number (and to some extent the text) supplied with   */
55 /*   a token depends on its type: see "header.h" for the list of types.      */
56 /*   For example, the lexeme "$1e3" is understood by Inform as a hexadecimal */
57 /*   number, and translated to the token:                                    */
58 /*     type NUMBER_TT, value 483, text "$1e3"                                */
59 /* ------------------------------------------------------------------------- */
60 /*   These three variables are set to the current token on a call to         */
61 /*   get_next_token() (but are not changed by a call to put_token_back()).   */
62 /*   (It would be tidier to use a token_data structure, rather than having   */
63 /*   get_next_token() unpack three values. But this is the way it is.)       */
64 /* ------------------------------------------------------------------------- */
65
int token_type;    /* One of the token type constants from "header.h"        */
int32 token_value; /* Numeric value whose meaning depends on token_type      */
char *token_text;  /* Text of the token (points into the lextexts pool)      */
69
70 /* ------------------------------------------------------------------------- */
71 /*   The next two variables are the head and tail of a singly linked list.   */
72 /*   The tail stores the portion most recently read from the current         */
73 /*   lexical block; its end values therefore describe the location of the    */
74 /*   current token, and are updated whenever the three variables above are   */
75 /*   via set_token_location(...).  Earlier vertices, if any, represent the   */
76 /*   regions of lexical blocks read beforehand, where new vertices are       */
77 /*   only introduced by interruptions like a file inclusion or an EOF.       */
78 /*   Vertices are deleted off of the front of the list once they are no      */
79 /*   longer referenced by pending debug information records.                 */
80 /* ------------------------------------------------------------------------- */
81
static debug_locations *first_token_locations; /* Head of the location list  */
static debug_locations *last_token_location;   /* Tail: current lexical block */
84
85 extern debug_location get_token_location(void)
86 {   debug_location result;
87     debug_location *location = &(last_token_location->location);
88     result.file_index = location->file_index;
89     result.beginning_byte_index = location->end_byte_index;
90     result.end_byte_index = location->end_byte_index;
91     result.beginning_line_number = location->end_line_number;
92     result.end_line_number = location->end_line_number;
93     result.beginning_character_number = location->end_character_number;
94     result.end_character_number = location->end_character_number;
95     result.orig_file_index = location->orig_file_index;
96     result.orig_beg_line_number = location->orig_beg_line_number;
97     result.orig_beg_char_number = location->orig_beg_char_number;
98     return result;
99 }
100
101 extern debug_locations get_token_locations(void)
102 {   debug_locations result;
103     result.location = get_token_location();
104     result.next = NULL;
105     result.reference_count = 0;
106     return result;
107 }
108
109 static void set_token_location(debug_location location)
110 {   if (location.file_index == last_token_location->location.file_index)
111     {   last_token_location->location.end_byte_index =
112             location.end_byte_index;
113         last_token_location->location.end_line_number =
114             location.end_line_number;
115         last_token_location->location.end_character_number =
116             location.end_character_number;
117         last_token_location->location.orig_file_index =
118             location.orig_file_index;
119         last_token_location->location.orig_beg_line_number =
120             location.orig_beg_line_number;
121         last_token_location->location.orig_beg_char_number =
122             location.orig_beg_char_number;
123     } else
124     {   debug_locations*successor =
125             my_malloc
126                 (sizeof(debug_locations),
127                  "debug locations of recent tokens");
128         successor->location = location;
129         successor->next = NULL;
130         successor->reference_count = 0;
131         last_token_location->next = successor;
132         last_token_location = successor;
133     }
134 }
135
136 extern debug_location_beginning get_token_location_beginning(void)
137 {   debug_location_beginning result;
138     ++(last_token_location->reference_count);
139     result.head = last_token_location;
140     result.beginning_byte_index =
141         last_token_location->location.end_byte_index;
142     result.beginning_line_number =
143         last_token_location->location.end_line_number;
144     result.beginning_character_number =
145         last_token_location->location.end_character_number;
146     result.orig_file_index = last_token_location->location.orig_file_index;
147     result.orig_beg_line_number = last_token_location->location.orig_beg_line_number;
148     result.orig_beg_char_number = last_token_location->location.orig_beg_char_number;
149
150     return result;
151 }
152
153 static void cleanup_token_locations(debug_location_beginning*beginning)
154 {   if (first_token_locations)
155     {   while (first_token_locations &&
156                !first_token_locations->reference_count)
157         {   debug_locations*moribund = first_token_locations;
158             first_token_locations = moribund->next;
159             my_free(&moribund, "debug locations of recent tokens");
160             if (beginning &&
161                 (beginning->head == moribund || !first_token_locations))
162             {   compiler_error
163                     ("Records needed by a debug_location_beginning are no "
164                      "longer allocated, perhaps because of an invalid reuse "
165                      "of this or an earlier beginning");
166             }
167         }
168     } else
169     {   if (beginning)
170         {   compiler_error
171                 ("Attempt to use a debug_location_beginning when no token "
172                  "locations are defined");
173         } else
174         {   compiler_error
175                 ("Attempt to clean up token locations when no token locations "
176                  "are defined");
177         }
178     }
179 }
180
181 extern void discard_token_location(debug_location_beginning beginning)
182 {   --(beginning.head->reference_count);
183 }
184
/* Close off the range opened by get_token_location_beginning(): release the
   pinned reference and return a location spanning from the recorded
   beginning to the end of the most recently read token. */
extern debug_locations get_token_location_end
    (debug_location_beginning beginning)
{   debug_locations result;
    cleanup_token_locations(&beginning);
    --(beginning.head->reference_count);
    /* Sometimes we know what we'll read before we switch to the lexical block
       where we'll read it.  In such cases the beginning will be placed in the
       prior block and last exactly zero bytes there.  It's misleading to
       include such ranges, so we gobble them. */
    if (beginning.head->location.end_byte_index ==
          beginning.beginning_byte_index &&
        beginning.head->next)
    {   /* Zero-width beginning: skip ahead to the next block and report the
           range as starting at that block's first byte/line/character. */
        beginning.head = beginning.head->next;
        result.location = beginning.head->location;
        result.location.beginning_byte_index = 0;
        result.location.beginning_line_number = 1;
        result.location.beginning_character_number = 1;
    } else
    {   /* Normal case: range starts exactly where the beginning recorded. */
        result.location = beginning.head->location;
        result.location.beginning_byte_index =
            beginning.beginning_byte_index;
        result.location.beginning_line_number =
            beginning.beginning_line_number;
        result.location.beginning_character_number =
            beginning.beginning_character_number;
    }

    /* The "orig" (source-mapping) fields always come from the beginning. */
    result.location.orig_file_index =
        beginning.orig_file_index;
    result.location.orig_beg_line_number =
        beginning.orig_beg_line_number;
    result.location.orig_beg_char_number =
        beginning.orig_beg_char_number;
    
    result.next = beginning.head->next;
    result.reference_count = 0;
    return result;
}
223
224 /* ------------------------------------------------------------------------- */
225 /*   In order to be able to put tokens back efficiently, the lexer stores    */
226 /*   tokens in a "circle": the variable circle_position ranges between       */
227 /*   0 and CIRCLE_SIZE-1.  We only need a circle size as large as the        */
228 /*   maximum number of tokens ever put back at once, plus 1 (in effect, the  */
229 /*   maximum token lookahead ever needed in syntax analysis, plus 1).        */
230 /*                                                                           */
231 /*   Note that the circle struct type is lexeme_data, whereas the expression */
232 /*   code all works in token_data. They have slightly different needs. The   */
233 /*   data is exported through the token_text, token_value, token_type        */
234 /*   globals, so there's no need to use the same struct at both levels.      */
235 /*                                                                           */
236 /*   Unlike some compilers, Inform does not have a context-free lexer: in    */
237 /*   fact it has 12288 different possible states.  However, the context only */
238 /*   affects the interpretation of "identifiers": lexemes beginning with a   */
239 /*   letter and containing up to 32 chars of alphanumeric and underscore     */
240 /*   chars.  (For example, "default" may refer to the directive or statement */
241 /*   of that name, and which token values are returned depends on the        */
242 /*   current lexical context.)                                               */
243 /*                                                                           */
244 /*   Along with each token, we also store the lexical context it was         */
245 /*   translated under; because if it is called for again, there may need     */
246 /*   to be a fresh interpretation of it if the context has changed.          */
247 /* ------------------------------------------------------------------------- */
248
/* Size of the token circle: the maximum token lookahead ever needed in
   syntax analysis, plus 1. */
#define CIRCLE_SIZE 6

/*   (The worst case for token lookahead is distinguishing between an
     old-style "objectloop (a in b)" and a new "objectloop (a in b ...)".)   */

static int circle_position;             /* Current index into the circle     */
static lexeme_data circle[CIRCLE_SIZE]; /* Ring buffer of recent lexemes     */
256
257 /* ------------------------------------------------------------------------- */
258 /*   A complication, however, is that the text of some lexemes needs to be   */
259 /*   held in Inform's memory for much longer periods: for example, a         */
260 /*   dictionary word lexeme (like "'south'") must have its text preserved    */
261 /*   until the code generation time for the expression it occurs in, when    */
262 /*   the dictionary reference is actually made. We handle this by keeping    */
263 /*   all lexeme text until the end of the statement (or, for top-level       */
264 /*   directives, until the end of the directive). Then we call               */
265 /*   release_token_texts() to start over. The lextexts array will therefore  */
266 /*   grow to the largest number of lexemes in a single statement or          */
267 /*   directive.                                                              */
268 /* ------------------------------------------------------------------------- */
269
/* One growable text buffer holding the text of a single lexeme. */
typedef struct lextext_s {
    char *text;
    size_t size; /* Allocated size (including terminal null)
                    This is always at least MAX_IDENTIFIER_LENGTH+1         */
} lextext;

static lextext *lextexts; /* Allocated to no_lextexts */
static memory_list lextexts_memlist;
static int no_lextexts;   /* Number of lextext entries ever allocated        */

static int cur_lextexts;    /* Number of lextexts in current use
                               (cur_lextexts <= no_lextexts)                 */

static int lex_index;       /* Index of lextext being written to             */
static int lex_pos;         /* Current write position in that lextext        */
285
286 /* ------------------------------------------------------------------------- */
287 /*   The lexer itself needs up to 3 characters of lookahead (it uses an      */
288 /*   LR(3) grammar to translate characters into tokens).                     */
289 /* ------------------------------------------------------------------------- */
290
/* Number of characters of lookahead kept by the lexer itself. */
#define LOOKAHEAD_SIZE 3

static int current, lookahead,          /* The latest character read, and    */
    lookahead2, lookahead3;             /* the three characters following it */

static int pipeline_made;               /* Whether or not the pipeline of
                                           characters has been constructed
                                           yet (this pass)                   */

static int (* get_next_char)(void);     /* Routine for reading the stream of
                                           characters: the lexer does not
                                           need any "ungetc" routine for
                                           putting them back again.  End of
                                           stream is signalled by returning
                                           zero.                             */

static char *source_to_analyse;         /* The current lexical source:
                                           NULL for "load from source files",
                                           otherwise this points to a string
                                           containing Inform code            */

static int tokens_put_back;             /* Count of the number of backward
                                           moves made from the last-read
                                           token                             */
315
316 /* This gets called for both token_data and lexeme_data structs. It prints
317    a description of the common part (the text, value, type fields). 
318 */
/* Print a human-readable description of a (text, value, type) token triple
   to stdout, wrapped in "{ ... }".  Called for both token_data and
   lexeme_data structs, whose common part these three fields are. */
extern void describe_token_triple(const char *text, int32 value, int type)
{
    /*  Many of the token types are not set in this file, but later on in
        Inform's higher stages (for example, in the expression evaluator);
        but this routine describes them all.                                 */

    printf("{ ");

    switch(type)
    {
        /*  The following token types occur in lexer output:                 */

        case SYMBOL_TT:          printf("symbol ");
                                 describe_symbol(value);
                                 break;
        case NUMBER_TT:          printf("literal number %d", value);
                                 break;
        case DQ_TT:              printf("string \"%s\"", text);
                                 break;
        case SQ_TT:              printf("string '%s'", text);
                                 break;
        case SEP_TT:             printf("separator '%s'", text);
                                 break;
        case EOF_TT:             printf("end of file");
                                 break;

        case STATEMENT_TT:       printf("statement name '%s'", text);
                                 break;
        case SEGMENT_MARKER_TT:  printf("object segment marker '%s'", text);
                                 break;
        case DIRECTIVE_TT:       printf("directive name '%s'", text);
                                 break;
        case CND_TT:             printf("textual conditional '%s'", text);
                                 break;
        case OPCODE_NAME_TT:     printf("opcode name '%s'", text);
                                 break;
        case SYSFUN_TT:          printf("built-in function name '%s'", text);
                                 break;
        case LOCAL_VARIABLE_TT:  printf("local variable name '%s'", text);
                                 break;
        case MISC_KEYWORD_TT:    printf("statement keyword '%s'", text);
                                 break;
        case DIR_KEYWORD_TT:     printf("directive keyword '%s'", text);
                                 break;
        case TRACE_KEYWORD_TT:   printf("'trace' keyword '%s'", text);
                                 break;
        case SYSTEM_CONSTANT_TT: printf("system constant name '%s'", text);
                                 break;

        /*  The remaining are etoken types, not set by the lexer             */

        case OP_TT:              printf("operator '%s'",
                                     operators[value].description);
                                 break;
        case ENDEXP_TT:          printf("end of expression");
                                 break;
        case SUBOPEN_TT:         printf("open bracket");
                                 break;
        case SUBCLOSE_TT:        printf("close bracket");
                                 break;
        case LARGE_NUMBER_TT:    printf("large number: '%s'=%d",text,value);
                                 break;
        case SMALL_NUMBER_TT:    printf("small number: '%s'=%d",text,value);
                                 break;
        case VARIABLE_TT:        printf("variable '%s'=%d", text, value);
                                 break;
        case DICTWORD_TT:        printf("dictionary word '%s'", text);
                                 break;
        case ACTION_TT:          printf("action name '%s'", text);
                                 break;

        default:
            printf("** unknown token type %d, text='%s', value=%d **",
            type, text, value);
    }
    printf(" }");
}
396
397 /* ------------------------------------------------------------------------- */
398 /*   All but one of the Inform keywords (most of them opcode names used      */
399 /*   only by the assembler).  (The one left over is "sp", a keyword used in  */
400 /*   assembly language only.)                                                */
401 /*                                                                           */
402 /*   A "keyword group" is a set of keywords to be searched for.  If a match  */
403 /*   is made on an identifier, the token type becomes that given in the KG   */
404 /*   and the token value is its index in the KG.                             */
405 /*                                                                           */
406 /*   The keyword ordering must correspond with the appropriate #define's in  */
407 /*   "header.h" but is otherwise not significant.                            */
408 /* ------------------------------------------------------------------------- */
409
/* This must exceed the total number of keywords across all groups, 
   including opcodes. */
#define MAX_KEYWORDS (500)

/* The values will be filled in at compile time, when we know
   which opcode set to use. */
keyword_group opcode_names =
{ { "" },
    OPCODE_NAME_TT, FALSE, TRUE
};
420
/* Z-machine opcode names; the list is terminated by an empty string.
   Ordering is significant (it determines the keyword-group index). */
static char *opcode_list_z[] = {
    "je", "jl", "jg", "dec_chk", "inc_chk", "jin", "test", "or", "and",
    "test_attr", "set_attr", "clear_attr", "store", "insert_obj", "loadw",
    "loadb", "get_prop", "get_prop_addr", "get_next_prop", "add", "sub",
    "mul", "div", "mod", "call", "storew", "storeb", "put_prop", "sread",
    "print_char", "print_num", "random", "push", "pull", "split_window",
    "set_window", "output_stream", "input_stream", "sound_effect", "jz",
    "get_sibling", "get_child", "get_parent", "get_prop_len", "inc", "dec",
    "print_addr", "remove_obj", "print_obj", "ret", "jump", "print_paddr",
    "load", "not", "rtrue", "rfalse", "print", "print_ret", "nop", "save",
    "restore", "restart", "ret_popped", "pop", "quit", "new_line",
    "show_status", "verify", "call_2s", "call_vs", "aread", "call_vs2",
    "erase_window", "erase_line", "set_cursor", "get_cursor",
    "set_text_style", "buffer_mode", "read_char", "scan_table", "call_1s",
    "call_2n", "set_colour", "throw", "call_vn", "call_vn2", "tokenise",
    "encode_text", "copy_table", "print_table", "check_arg_count", "call_1n",
    "catch", "piracy", "log_shift", "art_shift", "set_font", "save_undo",
    "restore_undo", "draw_picture", "picture_data", "erase_picture",
    "set_margins", "move_window", "window_size", "window_style",
    "get_wind_prop", "scroll_window", "pop_stack", "read_mouse",
    "mouse_window", "push_stack", "put_wind_prop", "print_form",
    "make_menu", "picture_table", "print_unicode", "check_unicode",
    ""
};
445
/* Glulx opcode names; the list is terminated by an empty string.
   Ordering is significant (it determines the keyword-group index). */
static char *opcode_list_g[] = {
    "nop", "add", "sub", "mul", "div", "mod", "neg", "bitand", "bitor",
    "bitxor", "bitnot", "shiftl", "sshiftr", "ushiftr", "jump", "jz",
    "jnz", "jeq", "jne", "jlt", "jge", "jgt", "jle", 
    "jltu", "jgeu", "jgtu", "jleu", 
    "call", "return",
    "catch", "throw", "tailcall", 
    "copy", "copys", "copyb", "sexs", "sexb", "aload",
    "aloads", "aloadb", "aloadbit", "astore", "astores", "astoreb",
    "astorebit", "stkcount", "stkpeek", "stkswap", "stkroll", "stkcopy",
    "streamchar", "streamnum", "streamstr", 
    "gestalt", "debugtrap", "getmemsize", "setmemsize", "jumpabs",
    "random", "setrandom", "quit", "verify",
    "restart", "save", "restore", "saveundo", "restoreundo", "protect",
    "glk", "getstringtbl", "setstringtbl", "getiosys", "setiosys",
    "linearsearch", "binarysearch", "linkedsearch",
    "callf", "callfi", "callfii", "callfiii", 
    "streamunichar",
    "mzero", "mcopy", "malloc", "mfree",
    "accelfunc", "accelparam",
    "hasundo", "discardundo",
    "numtof", "ftonumz", "ftonumn", "ceil", "floor",
    "fadd", "fsub", "fmul", "fdiv", "fmod",
    "sqrt", "exp", "log", "pow",
    "sin", "cos", "tan", "asin", "acos", "atan", "atan2",
    "jfeq", "jfne", "jflt", "jfle", "jfgt", "jfge", "jisnan", "jisinf",
    "numtod", "dtonumz", "dtonumn", "ftod", "dtof", "dceil", "dfloor",
    "dadd", "dsub", "dmul", "ddiv", "dmodr", "dmodq",
    "dsqrt", "dexp", "dlog", "dpow",
    "dsin", "dcos", "dtan", "dasin", "dacos", "datan", "datan2",
    "jdeq", "jdne", "jdlt", "jdle", "jdgt", "jdge", "jdisnan", "jdisinf",
    ""
};
479
/* Opcode macros: assembly-language names that expand to opcode sequences.
   Like opcode_names, the name list is filled in at compile time. */
keyword_group opcode_macros =
{ { "" },
  OPCODE_MACRO_TT, FALSE, TRUE
};

/* Z-code has no opcode macros, so its list is just the terminator. */
static char *opmacro_list_z[] = { "" };

static char *opmacro_list_g[] = {
    "pull", "push", "dload", "dstore",
    ""
};
491
/* Each keyword group below is a set of identifiers searched for in a given
   lexical context.  The ordering of names within each group must correspond
   with the matching #define's in "header.h" and must not be changed. */
keyword_group directives =
{ { "abbreviate", "array", "attribute", "class", "constant",
    "default", "dictionary", "end", "endif", "extend", "fake_action",
    "global", "ifdef", "ifndef", "ifnot", "ifv3", "ifv5", "iftrue",
    "iffalse", "import", "include", "link", "lowstring", "message",
    "nearby", "object", "origsource", "property", "release", "replace",
    "serial", "switches", "statusline", "stub", "system_file", "trace",
    "undef", "verb", "version", "zcharacter",
    "" },
    DIRECTIVE_TT, FALSE, FALSE
};

/* Keywords accepted after the "Trace" directive. */
keyword_group trace_keywords =
{ { "dictionary", "symbols", "objects", "verbs",
    "assembly", "expressions", "lines", "tokens", "linker",
    "on", "off", "" },
    TRACE_KEYWORD_TT, FALSE, TRUE
};

/* Segment markers inside an Object/Class definition. */
keyword_group segment_markers =
{ { "class", "has", "private", "with", "" },
    SEGMENT_MARKER_TT, FALSE, TRUE
};

/* Keywords appearing inside various directives. */
keyword_group directive_keywords =
{ { "alias", "long", "additive",
    "score", "time",
    "noun", "held", "multi", "multiheld", "multiexcept",
    "multiinside", "creature", "special", "number", "scope", "topic",
    "reverse", "meta", "only", "replace", "first", "last",
    "string", "table", "buffer", "data", "initial", "initstr",
    "with", "private", "has", "class",
    "error", "fatalerror", "warning",
    "terminating", "static", "individual",
    "" },
    DIR_KEYWORD_TT, FALSE, TRUE
};

/* Keywords appearing inside statements (e.g. print specifications). */
keyword_group misc_keywords =
{ { "char", "name", "the", "a", "an", "The", "number",
    "roman", "reverse", "bold", "underline", "fixed", "on", "off",
    "to", "address", "string", "object", "near", "from", "property", "A", "" },
    MISC_KEYWORD_TT, FALSE, TRUE
};

/* Statement names. */
keyword_group statements =
{ { "box", "break", "continue", "default", "do", "else", "font", "for",
    "give", "if", "inversion", "jump", "move", "new_line", "objectloop",
    "print", "print_ret", "quit", "read", "remove", "restore", "return",
    "rfalse", "rtrue", "save", "spaces", "string", "style", "switch",
    "until", "while", "" },
    STATEMENT_TT, FALSE, TRUE
};

/* Textual condition operators. */
keyword_group conditions =
{ { "has", "hasnt", "in", "notin", "ofclass", "or", "provides", "" },
    CND_TT, FALSE, TRUE
};

/* Built-in function names. */
keyword_group system_functions =
{ { "child", "children", "elder", "eldest", "indirect", "parent", "random",
    "sibling", "younger", "youngest", "metaclass", "glk", "" },
    SYSFUN_TT, FALSE, TRUE
};

/* System constants (resolved at code-generation time). */
keyword_group system_constants =
{ { "adjectives_table", "actions_table", "classes_table",
    "identifiers_table", "preactions_table", "version_number",
    "largest_object", "strings_offset", "code_offset",
    "dict_par1", "dict_par2", "dict_par3", "actual_largest_object",
    "static_memory_offset", "array_names_offset", "readable_memory_offset",
    "cpv__start", "cpv__end", "ipv__start", "ipv__end",
    "array__start", "array__end",
    "lowest_attribute_number", "highest_attribute_number",
    "attribute_names_array",
    "lowest_property_number", "highest_property_number",
    "property_names_array",
    "lowest_action_number", "highest_action_number",
    "action_names_array",
    "lowest_fake_action_number", "highest_fake_action_number",
    "fake_action_names_array",
    "lowest_routine_number", "highest_routine_number", "routines_array",
    "routine_names_array", "routine_flags_array",
    "lowest_global_number", "highest_global_number", "globals_array",
    "global_names_array", "global_flags_array",
    "lowest_array_number", "highest_array_number", "arrays_array",
    "array_names_array", "array_flags_array",
    "lowest_constant_number", "highest_constant_number", "constants_array",
    "constant_names_array",
    "lowest_class_number", "highest_class_number", "class_objects_array",
    "lowest_object_number", "highest_object_number",
    "oddeven_packing",
    "grammar_table", "dictionary_table", "dynam_string_table",
    "" },
    SYSTEM_CONSTANT_TT, FALSE, TRUE
};

/* All keyword groups, indexed for lookup; slot 0 is unused. */
keyword_group *keyword_groups[12]
= { NULL, &opcode_names, &directives, &trace_keywords, &segment_markers,
    &directive_keywords, &misc_keywords, &statements, &conditions,
    &system_functions, &system_constants, &opcode_macros};

/* These keywords are set to point to local_variable_names entries when
   a routine header is parsed. See construct_local_variable_tables().        */
keyword_group local_variables =
{ { "" },
    LOCAL_VARIABLE_TT, FALSE, FALSE
};
600
601 static int lexical_context(void)
602 {
603     /*  The lexical context is a number representing all of the context
604         information in the lexical analyser: the same input text will
605         always translate to the same output tokens whenever the context
606         is the same.
607
608         In fact, for efficiency reasons this number omits the bit of
609         information held in the variable "dont_enter_into_symbol_table".
610         Inform never needs to backtrack through tokens parsed in that
611         way (thankfully, as it would be expensive indeed to check
612         the tokens).                                                         */
613
614     int c = 0;
615     if (opcode_names.enabled)         c |= 1;
616     if (directives.enabled)           c |= 2;
617     if (trace_keywords.enabled)       c |= 4;
618     if (segment_markers.enabled)      c |= 8;
619     if (directive_keywords.enabled)   c |= 16;
620     if (misc_keywords.enabled)        c |= 32;
621     if (statements.enabled)           c |= 64;
622     if (conditions.enabled)           c |= 128;
623     if (system_functions.enabled)     c |= 256;
624     if (system_constants.enabled)     c |= 512;
625     if (local_variables.enabled)      c |= 1024;
626
627     if (return_sp_as_variable)        c |= 2048;
628     return(c);
629 }
630
static void print_context(int c)
{
    /*  Print an abbreviation for each context bit set in c, using the
        same bit assignments as lexical_context().                           */
    static const struct { int bit; const char *label; } flag_names[] = {
        {    1, "OPC "  }, {    2, "DIR "  }, {    4, "TK "   },
        {    8, "SEG "  }, {   16, "DK "   }, {   32, "MK "   },
        {   64, "STA "  }, {  128, "CND "  }, {  256, "SFUN " },
        {  512, "SCON " }, { 1024, "LV "   }, { 2048, "sp "   }
    };
    int i;

    for (i = 0; i < 12; i++) {
        if (c & flag_names[i].bit)
            printf("%s", flag_names[i].label);
    }
}
646
/* Chained-hash lookup structures for keywords. For each hash code h,
   keywords_hash_table[h] is the index of the first entry with that code
   (or -1) and keywords_hash_ends_table[h] the last; keywords_data_table
   holds triples (group number, index within group, next entry or -1).
   See make_keywords_tables(). */
static int *keywords_hash_table;
static int *keywords_hash_ends_table;
static int *keywords_data_table;

/* Hash lookup for the current routine's local variable names.
   See construct_local_variable_tables(). */
static int *local_variable_hash_table;
static int *local_variable_hash_codes;

/* Note that MAX_LOCAL_VARIABLES is the maximum number of local variables
   for this VM, *including* "sp" (the stack pointer "local").
   This used to be a memory setting. Now it is a constant: 16 for Z-code,
   119 for Glulx.
*/

/* Names of local variables in the current routine.
   This is allocated to MAX_LOCAL_VARIABLES-1. (The stack pointer "local"
   is not included in this array.)

   (This could be a memlist, growing as needed up to MAX_LOCAL_VARIABLES-1.
   But right now we just allocate the max.)
 */
identstruct *local_variable_names;

/* Fast lookup for one-character local variable names: maps an ASCII
   character to its local variable index, or MAX_LOCAL_VARIABLES if that
   character is not a local's name. */
static char one_letter_locals[128];
670
/* Fill in opcode_names/opcode_macros for the target VM, then build the
   chained-hash lookup tables covering keyword groups 1..11. */
static void make_keywords_tables(void)
{   int i, j, h, tp=0;
    char **oplist, **maclist;

    /* Select the opcode and opcode-macro name lists for the target VM. */
    if (!glulx_mode) {
        oplist = opcode_list_z;
        maclist = opmacro_list_z;
    }
    else {
        oplist = opcode_list_g;
        maclist = opmacro_list_g;
    }

    /* Copy opcode names into the keyword group; "" terminates the list. */
    for (j=0; *(oplist[j]); j++) {
        if (j >= MAX_KEYWORD_GROUP_SIZE) {
            /* Gotta increase MAX_KEYWORD_GROUP_SIZE */
            compiler_error("opcode_list has overflowed opcode_names.keywords");
            break;
        }
        opcode_names.keywords[j] = oplist[j];
    }
    opcode_names.keywords[j] = "";
    
    for (j=0; *(maclist[j]); j++) {
        if (j >= MAX_KEYWORD_GROUP_SIZE) {
            /* Gotta increase MAX_KEYWORD_GROUP_SIZE */
            compiler_error("opmacro_list has overflowed opcode_macros.keywords");
            break;
        }
        opcode_macros.keywords[j] = maclist[j];
    }
    opcode_macros.keywords[j] = "";

    /* Empty every hash chain. */
    for (i=0; i<HASH_TAB_SIZE; i++)
    {   keywords_hash_table[i] = -1;
        keywords_hash_ends_table[i] = -1;
    }

    /* Append each keyword of groups 1..11 to its hash chain. Each
       keywords_data_table entry is a triple: group number i, index j
       within that group, and the index of the next entry in the same
       chain (-1 at the end of a chain). */
    for (i=1; i<=11; i++)
    {   keyword_group *kg = keyword_groups[i];
        for (j=0; *(kg->keywords[j]) != 0; j++)
        {
            if (tp >= MAX_KEYWORDS) {
                /* Gotta increase MAX_KEYWORDS */
                compiler_error("keywords_data_table has overflowed MAX_KEYWORDS");
                break;
            }
            h = hash_code_from_string(kg->keywords[j]);
            if (keywords_hash_table[h] == -1)
                keywords_hash_table[h] = tp;
            else
              *(keywords_data_table + 3*(keywords_hash_ends_table[h]) + 2) = tp;
            keywords_hash_ends_table[h] = tp;
            *(keywords_data_table + 3*tp) = i;
            *(keywords_data_table + 3*tp+1) = j;
            *(keywords_data_table + 3*tp+2) = -1;
            tp++;
        }
    }
}
731
732 /* Look at the strings stored in local_variable_names (from 0 to no_locals).
733    Set local_variables.keywords to point to these, and also prepare the
734    hash tables. */
735 extern void construct_local_variable_tables(void)
736 {   int i, h;
737     for (i=0; i<HASH_TAB_SIZE; i++) local_variable_hash_table[i] = -1;
738     for (i=0; i<128; i++) one_letter_locals[i] = MAX_LOCAL_VARIABLES;
739
740     for (i=0; i<no_locals; i++)
741     {
742         char *p = local_variable_names[i].text;
743         local_variables.keywords[i] = p;
744         if (p[1] == 0)
745         {   one_letter_locals[(uchar)p[0]] = i;
746             if (isupper(p[0])) one_letter_locals[tolower(p[0])] = i;
747             if (islower(p[0])) one_letter_locals[toupper(p[0])] = i;
748         }
749         h = hash_code_from_string(p);
750         if (local_variable_hash_table[h] == -1)
751             local_variable_hash_table[h] = i;
752         local_variable_hash_codes[i] = h;
753     }
754     /* Clear the rest. */
755     for (;i<MAX_LOCAL_VARIABLES-1;i++) {
756         local_variables.keywords[i] = "";
757         local_variable_hash_codes[i] = 0;
758     }
759 }
760
761 static void interpret_identifier(char *p, int pos, int dirs_only_flag)
762 {   int index, hashcode;
763
764     /*  An identifier is either a keyword or a "symbol", a name which the
765         lexical analyser leaves to higher levels of Inform to understand.    */
766
767     hashcode = hash_code_from_string(p);
768
769     if (dirs_only_flag) goto KeywordSearch;
770
771     /*  If this is assembly language, perhaps it is "sp"?                    */
772
773     if (return_sp_as_variable && (p[0]=='s') && (p[1]=='p') && (p[2]==0))
774     {   circle[pos].value = 0; circle[pos].type = LOCAL_VARIABLE_TT;
775         return;
776     }
777
778     /*  Test for local variables first, quite quickly.                       */
779
780     if (local_variables.enabled)
781     {   if (p[1]==0)
782         {   index = one_letter_locals[(uchar)p[0]];
783             if (index<MAX_LOCAL_VARIABLES)
784             {   circle[pos].type = LOCAL_VARIABLE_TT;
785                 circle[pos].value = index+1;
786                 return;
787             }
788         }
789         index = local_variable_hash_table[hashcode];
790         if (index >= 0)
791         {   for (;index<no_locals;index++)
792             {   if (hashcode == local_variable_hash_codes[index])
793                 {   if (strcmpcis(p, local_variable_names[index].text)==0)
794                     {   circle[pos].type = LOCAL_VARIABLE_TT;
795                         circle[pos].value = index+1;
796                         return;
797                     }
798                 }
799             }
800         }
801     }
802
803     /*  Now the bulk of the keywords.  Note that the lexer doesn't recognise
804         the name of a system function which has been Replaced.               */
805
806     KeywordSearch:
807     index = keywords_hash_table[hashcode];
808     while (index >= 0)
809     {   int *i = keywords_data_table + 3*index;
810         keyword_group *kg = keyword_groups[*i];
811         if (((!dirs_only_flag) && (kg->enabled))
812             || (dirs_only_flag && (kg == &directives)))
813         {   char *q = kg->keywords[*(i+1)];
814             if (((kg->case_sensitive) && (strcmp(p, q)==0))
815                 || ((!(kg->case_sensitive)) && (strcmpcis(p, q)==0)))
816             {   if ((kg != &system_functions)
817                     || (system_function_usage[*(i+1)]!=2))
818                 {   circle[pos].type = kg->change_token_type;
819                     circle[pos].value = *(i+1);
820                     return;
821                 }
822             }
823         }
824         index = *(i+2);
825     }
826
827     if (dirs_only_flag) return;
828
829     /*  Search for the name; create it if necessary.                         */
830
831     circle[pos].value = symbol_index(p, hashcode);
832     circle[pos].type = SYMBOL_TT;
833 }
834
835
836 /* ------------------------------------------------------------------------- */
837 /*   The tokeniser grid aids a rapid decision about the consequences of a    */
838 /*   character reached in the buffer.  In effect it is an efficiently stored */
839 /*   transition table using an algorithm similar to that of S. C. Johnson's  */
840 /*   "yacc" lexical analyser (see Aho, Sethi and Ullman, section 3.9).       */
841 /*   My thanks to Dilip Sequeira for suggesting this.                        */
842 /*                                                                           */
843 /*       tokeniser_grid[c]   is (16*n + m) if c is the first character of    */
844 /*                               separator numbers n, n+1, ..., n+m-1        */
845 /*                           or certain special values (QUOTE_CODE, etc)     */
846 /*                           or 0 otherwise                                  */
847 /*                                                                           */
848 /*   Since 1000/16 = 62, the code numbers below will need increasing if the  */
849 /*   number of separators supported exceeds 61.                              */
850 /* ------------------------------------------------------------------------- */
851
static int tokeniser_grid[256];

/* Special grid values; all are >= 1000, safely above the (16*n + m)
   separator codes described in the comment above. */

#define QUOTE_CODE      1000
#define DQUOTE_CODE     1001
#define NULL_CODE       1002
#define SPACE_CODE      1003
#define NEGATIVE_CODE   1004
#define DIGIT_CODE      1005
#define RADIX_CODE      1006
#define KEYWORD_CODE    1007
#define EOF_CODE        1008
#define WHITESPACE_CODE 1009
#define COMMENT_CODE    1010
#define IDENTIFIER_CODE 1011

/*  This list cannot safely be changed without also changing the header
    separator #defines.  The ordering is significant in that (i) all entries
    beginning with the same character must be adjacent and (ii) that if
    X is a an initial substring of Y then X must come before Y.

    E.g. --> must occur before -- to prevent "-->0" being tokenised
    wrongly as "--", ">", "0" rather than "-->", "0".                        */

static const char separators[NUMBER_SEPARATORS][4] =
{   "->", "-->", "--", "-", "++", "+", "*", "/", "%",
    "||", "|", "&&", "&", "~~",
    "~=", "~", "==", "=", ">=", ">",
    "<=", "<", "(", ")", ",",
    ".&", ".#", "..&", "..#", "..", ".",
    "::", ":", "@", ";", "[", "]", "{", "}",
    "$", "?~", "?",
    "#a$", "#g$", "#n$", "#r$", "#w$", "##", "#"
};
885
886 static void make_tokeniser_grid(void)
887 {
888     /*  Construct the grid to the specification above.                       */
889
890     int i, j;
891
892     for (i=0; i<256; i++) tokeniser_grid[i]=0;
893
894     for (i=0; i<NUMBER_SEPARATORS; i++)
895     {   j=separators[i][0];
896         if (tokeniser_grid[j]==0)
897             tokeniser_grid[j]=i*16+1; else tokeniser_grid[j]++;
898     }
899     tokeniser_grid['\''] = QUOTE_CODE;
900     tokeniser_grid['\"'] = DQUOTE_CODE;
901     tokeniser_grid[0]    = EOF_CODE;
902     tokeniser_grid[' ']  = WHITESPACE_CODE;
903     tokeniser_grid['\n'] = WHITESPACE_CODE;
904     tokeniser_grid['$']  = RADIX_CODE;
905     tokeniser_grid['!']  = COMMENT_CODE;
906
907     tokeniser_grid['0']  = DIGIT_CODE;
908     tokeniser_grid['1']  = DIGIT_CODE;
909     tokeniser_grid['2']  = DIGIT_CODE;
910     tokeniser_grid['3']  = DIGIT_CODE;
911     tokeniser_grid['4']  = DIGIT_CODE;
912     tokeniser_grid['5']  = DIGIT_CODE;
913     tokeniser_grid['6']  = DIGIT_CODE;
914     tokeniser_grid['7']  = DIGIT_CODE;
915     tokeniser_grid['8']  = DIGIT_CODE;
916     tokeniser_grid['9']  = DIGIT_CODE;
917
918     tokeniser_grid['a']  = IDENTIFIER_CODE;
919     tokeniser_grid['b']  = IDENTIFIER_CODE;
920     tokeniser_grid['c']  = IDENTIFIER_CODE;
921     tokeniser_grid['d']  = IDENTIFIER_CODE;
922     tokeniser_grid['e']  = IDENTIFIER_CODE;
923     tokeniser_grid['f']  = IDENTIFIER_CODE;
924     tokeniser_grid['g']  = IDENTIFIER_CODE;
925     tokeniser_grid['h']  = IDENTIFIER_CODE;
926     tokeniser_grid['i']  = IDENTIFIER_CODE;
927     tokeniser_grid['j']  = IDENTIFIER_CODE;
928     tokeniser_grid['k']  = IDENTIFIER_CODE;
929     tokeniser_grid['l']  = IDENTIFIER_CODE;
930     tokeniser_grid['m']  = IDENTIFIER_CODE;
931     tokeniser_grid['n']  = IDENTIFIER_CODE;
932     tokeniser_grid['o']  = IDENTIFIER_CODE;
933     tokeniser_grid['p']  = IDENTIFIER_CODE;
934     tokeniser_grid['q']  = IDENTIFIER_CODE;
935     tokeniser_grid['r']  = IDENTIFIER_CODE;
936     tokeniser_grid['s']  = IDENTIFIER_CODE;
937     tokeniser_grid['t']  = IDENTIFIER_CODE;
938     tokeniser_grid['u']  = IDENTIFIER_CODE;
939     tokeniser_grid['v']  = IDENTIFIER_CODE;
940     tokeniser_grid['w']  = IDENTIFIER_CODE;
941     tokeniser_grid['x']  = IDENTIFIER_CODE;
942     tokeniser_grid['y']  = IDENTIFIER_CODE;
943     tokeniser_grid['z']  = IDENTIFIER_CODE;
944
945     tokeniser_grid['A']  = IDENTIFIER_CODE;
946     tokeniser_grid['B']  = IDENTIFIER_CODE;
947     tokeniser_grid['C']  = IDENTIFIER_CODE;
948     tokeniser_grid['D']  = IDENTIFIER_CODE;
949     tokeniser_grid['E']  = IDENTIFIER_CODE;
950     tokeniser_grid['F']  = IDENTIFIER_CODE;
951     tokeniser_grid['G']  = IDENTIFIER_CODE;
952     tokeniser_grid['H']  = IDENTIFIER_CODE;
953     tokeniser_grid['I']  = IDENTIFIER_CODE;
954     tokeniser_grid['J']  = IDENTIFIER_CODE;
955     tokeniser_grid['K']  = IDENTIFIER_CODE;
956     tokeniser_grid['L']  = IDENTIFIER_CODE;
957     tokeniser_grid['M']  = IDENTIFIER_CODE;
958     tokeniser_grid['N']  = IDENTIFIER_CODE;
959     tokeniser_grid['O']  = IDENTIFIER_CODE;
960     tokeniser_grid['P']  = IDENTIFIER_CODE;
961     tokeniser_grid['Q']  = IDENTIFIER_CODE;
962     tokeniser_grid['R']  = IDENTIFIER_CODE;
963     tokeniser_grid['S']  = IDENTIFIER_CODE;
964     tokeniser_grid['T']  = IDENTIFIER_CODE;
965     tokeniser_grid['U']  = IDENTIFIER_CODE;
966     tokeniser_grid['V']  = IDENTIFIER_CODE;
967     tokeniser_grid['W']  = IDENTIFIER_CODE;
968     tokeniser_grid['X']  = IDENTIFIER_CODE;
969     tokeniser_grid['Y']  = IDENTIFIER_CODE;
970     tokeniser_grid['Z']  = IDENTIFIER_CODE;
971
972     tokeniser_grid['_']  = IDENTIFIER_CODE;
973 }
974
975 /* ------------------------------------------------------------------------- */
976 /*   Definition of a lexical block: a source file or a string containing     */
977 /*   text for lexical analysis; an independent source from the point of      */
978 /*   view of issuing error reports.                                          */
979 /* ------------------------------------------------------------------------- */
980
typedef struct LexicalBlock_s
{   char *filename;                              /*  Full translated name    */
    int   main_flag;                             /*  TRUE if the main file
                                                     (the first one opened)  */
    int   sys_flag;                              /*  TRUE if a System_File   */
    int   source_line;                           /*  Line number count       */
    int   line_start;                            /*  Char number within file
                                                     where the current line
                                                     starts                  */
    int   chars_read;                            /*  Char number of read pos */
    int   file_no;                               /*  Or 255 if not from a
                                                     file; used for debug
                                                     information             */
    char *orig_source;                           /* From #origsource direct  */
    int orig_file;                               /* InputFiles entry, or 0   */
    int32 orig_line;
    int32 orig_char;
} LexicalBlock;

/*  Placeholder lexical blocks used when text is not being read from a
    source file (all have file_no 255):                                      */

static LexicalBlock NoFileOpen =
    {   "<before compilation>",  FALSE, FALSE, 0, 0, 0, 255, NULL, 0, 0, 0 };

static LexicalBlock MakingOutput =
    {   "<constructing output>", FALSE, FALSE, 0, 0, 0, 255, NULL, 0, 0, 0 };

static LexicalBlock StringLB =
    {   "<veneer routine>",      FALSE, TRUE,  0, 0, 0, 255, NULL, 0, 0, 0 };
1008
1009 static LexicalBlock *CurrentLB;                  /*  The current lexical
1010                                                      block of input text     */
1011
1012 extern void declare_systemfile(void)
1013 {   CurrentLB->sys_flag = TRUE;
1014 }
1015
1016 extern int is_systemfile(void)
1017 {   return ((CurrentLB->sys_flag)?1:0);
1018 }
1019
1020 extern void set_origsource_location(char *source, int32 line, int32 charnum)
1021 {
1022     int file_no;
1023
1024     if (!source) {
1025         /* Clear the Origsource declaration. */
1026         CurrentLB->orig_file = 0;
1027         CurrentLB->orig_source = NULL;
1028         CurrentLB->orig_line = 0;
1029         CurrentLB->orig_char = 0;
1030         return;
1031     }
1032
1033     /* Get the file number for a new or existing InputFiles entry. */
1034     file_no = register_orig_sourcefile(source);
1035
1036     CurrentLB->orig_file = file_no;
1037     CurrentLB->orig_source = InputFiles[file_no-1].filename;
1038     CurrentLB->orig_line = line;
1039     CurrentLB->orig_char = charnum;
1040 }
1041
1042 /* Error locations. */
1043
1044 extern debug_location get_current_debug_location(void)
1045 {   debug_location result;
1046     /* Assume that all input characters are one byte. */
1047     result.file_index = CurrentLB->file_no;
1048     result.beginning_byte_index = CurrentLB->chars_read - LOOKAHEAD_SIZE;
1049     result.end_byte_index = result.beginning_byte_index;
1050     result.beginning_line_number = CurrentLB->source_line;
1051     result.end_line_number = result.beginning_line_number;
1052     result.beginning_character_number =
1053         CurrentLB->chars_read - CurrentLB->line_start;
1054     result.end_character_number = result.beginning_character_number;
1055     result.orig_file_index = CurrentLB->orig_file;
1056     result.orig_beg_line_number = CurrentLB->orig_line;
1057     result.orig_beg_char_number = CurrentLB->orig_char;
1058     return result;
1059 }
1060
1061 static debug_location ErrorReport_debug_location;
1062
1063 extern void report_errors_at_current_line(void)
1064 {   ErrorReport.line_number = CurrentLB->source_line;
1065     ErrorReport.file_number = CurrentLB->file_no;
1066     if (ErrorReport.file_number == 255)
1067         ErrorReport.file_number = -1;
1068     ErrorReport.source      = CurrentLB->filename;
1069     ErrorReport.main_flag   = CurrentLB->main_flag;
1070     if (debugfile_switch)
1071         ErrorReport_debug_location = get_current_debug_location();
1072     ErrorReport.orig_file = CurrentLB->orig_file;
1073     ErrorReport.orig_source = CurrentLB->orig_source;
1074     ErrorReport.orig_line = CurrentLB->orig_line;
1075     ErrorReport.orig_char = CurrentLB->orig_char;
1076 }
1077
1078 extern debug_location get_error_report_debug_location(void)
1079 {   return ErrorReport_debug_location;
1080 }
1081
1082 extern int32 get_current_line_start(void)
1083 {   return CurrentLB->line_start;
1084 }
1085
1086 brief_location blank_brief_location;
1087
1088 extern brief_location get_brief_location(ErrorPosition *errpos)
1089 {
1090     brief_location loc;
1091     loc.file_index = errpos->file_number;
1092     loc.line_number = errpos->line_number;
1093     loc.orig_file_index = errpos->orig_file;
1094     loc.orig_line_number = errpos->orig_line;
1095     return loc;
1096 }
1097
1098 extern void export_brief_location(brief_location loc, ErrorPosition *errpos)
1099 {
1100     if (loc.file_index != -1)
1101     {   errpos->file_number = loc.file_index;
1102         errpos->line_number = loc.line_number;
1103         errpos->main_flag = (errpos->file_number == 1);
1104         errpos->orig_source = NULL;
1105         errpos->orig_file = loc.orig_file_index;
1106         errpos->orig_line = loc.orig_line_number;
1107         errpos->orig_char = 0;
1108     }
1109 }
1110
1111 /* ------------------------------------------------------------------------- */
1112 /*   Hash printing and line counting                                         */
1113 /* ------------------------------------------------------------------------- */
1114
static void print_hash(void)
{
    /*  Hash-printing is the practice of printing a # character every 100
        lines of source code (the -x switch), reassuring the user that
        progress is being made                                               */

    /*  The very first hash is preceded by "::" to mark the start of the
        row of hashes.                                                       */
    if (no_hash_printed_yet)
    {   printf("::"); no_hash_printed_yet = FALSE;
    }
    printf("#"); hash_printed_since_newline = TRUE;

#ifndef MAC_FACE
    /*  On some systems, text output is buffered to a line at a time, and
        this would frustrate the point of hash-printing, so:                 */

    fflush(stdout);
#endif
}
1133
static void reached_new_line(void)
{
    /*  Called to signal that a new line has been reached in the source code */

    /*  Reset the errors-context buffer for the new line.                    */
    forerrors_pointer = 0;

    CurrentLB->source_line++;
    CurrentLB->line_start = CurrentLB->chars_read;

    total_source_line_count++;

    /*  Every 100 lines: print a progress hash (if -x is set) and, on MPW,
        yield so other tasks can run.                                        */
    if (total_source_line_count%100==0)
    {   if (hash_switch) print_hash();
#ifdef MAC_MPW
        SpinCursor(32);                    /* I.e., allow other tasks to run */
#endif
    }

#ifdef MAC_FACE
    /*  Periodically process GUI events; if compilation has been cancelled,
        tidy up and escape via longjmp.                                      */
    if (total_source_line_count%((**g_pm_hndl).linespercheck) == 0)
    {   ProcessEvents (&g_proc);
        if (g_proc != true)
        {   free_arrays();
            close_all_source();
            abort_transcript_file();
            longjmp (g_fallback, 1);
        }
    }
#endif
}
1164
1165 static void new_syntax_line(void)
1166 {   if (source_to_analyse != NULL) forerrors_pointer = 0;
1167     report_errors_at_current_line();
1168 }
1169
1170 /* Return 10 raised to the expo power.
1171  *
1172  * I'm avoiding the standard pow() function for a rather lame reason:
1173  * it's in the libmath (-lm) library, and I don't want to change the
1174  * build model for the compiler. So, this is implemented with a stupid
1175  * lookup table. It's faster than pow() for small values of expo.
1176  * Probably not as fast if expo is 200, but "$+1e200" is an overflow
1177  * anyway, so I don't expect that to be a problem.
1178  *
1179  * (For some reason, frexp() and ldexp(), which are used later on, do
1180  * not require libmath to be linked in.)
1181  */
static double pow10_cheap(int expo)
{
    /*  Compute 10^expo from a small lookup table, repeatedly multiplying
        by 10^(+/-POW10_RANGE) until the remaining exponent fits within
        the table. (Deliberately avoids pow(), so libmath need not be
        linked; see the comment above.)                                      */
    #define POW10_RANGE (8)
    static double powers[POW10_RANGE*2+1] = {
        0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1,
        1.0,
        10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, 10000000.0, 100000000.0
    };

    double result = 1.0;

    if (expo < 0) {
        while (expo < -POW10_RANGE) {
            result *= powers[0];             /* multiply by 10^-8 */
            expo += POW10_RANGE;
        }
    }
    else {
        while (expo > POW10_RANGE) {
            result *= powers[POW10_RANGE*2]; /* multiply by 10^+8 */
            expo -= POW10_RANGE;
        }
    }
    return result * powers[POW10_RANGE+expo];
}
1206
1207 /* Return the IEEE-754 single-precision encoding of a floating-point
1208  * number.
1209  *
1210  * The number is provided in the pieces it was parsed in:
1211  *    [+|-] intv "." fracv "e" [+|-]expo
1212  *
1213  * If the magnitude is too large (beyond about 3.4e+38), this returns
1214  * an infinite value (0x7f800000 or 0xff800000). If the magnitude is too
1215  * small (below about 1e-45), this returns a zero value (0x00000000 or 
1216  * 0x80000000). If any of the inputs are NaN, this returns NaN (but the
1217  * lexer should never do that).
1218  *
1219  * Note that using a float constant does *not* set the uses_float_features
1220  * flag (which would cause the game file to be labelled 3.1.2). Same with 
1221  * double constants and the uses_double_features flag. There's no VM
1222  * feature here, just an integer. Of course, any use of the float *opcodes*
1223  * will set the flag.
1224  *
1225  * The math functions in this routine require #including <math.h>, but
1226  * they should not require linking the math library (-lm). At least,
1227  * they do not on OSX and Linux.
1228  */
static int32 construct_float(int signbit, double intv, double fracv, int expo)
{
    double absval = (intv + fracv) * pow10_cheap(expo);
    int32 sign = (signbit ? 0x80000000 : 0x0);
    double mant;
    int32 fbits;

    if (isinf(absval)) {
        return sign | 0x7f800000; /* infinity */
    }
    if (isnan(absval)) {
        return sign | 0x7fc00000; /* quiet NaN */
    }

    /* Split absval into mant * 2^expo. Note that expo is reused here:
       its decimal value has already been folded into absval above. */
    mant = frexp(absval, &expo);

    /* Normalize mantissa to be in the range [1.0, 2.0) */
    if (0.5 <= mant && mant < 1.0) {
        mant *= 2.0;
        expo--;
    }
    else if (mant == 0.0) {
        expo = 0;
    }
    else {
        return sign | 0x7f800000; /* infinity */
    }

    if (expo >= 128) {
        return sign | 0x7f800000; /* infinity */
    }
    else if (expo < -126) {
        /* Denormalized (very small) number */
        mant = ldexp(mant, 126 + expo);
        expo = 0;
    }
    else if (!(expo == 0 && mant == 0.0)) {
        expo += 127; /* apply the single-precision exponent bias */
        mant -= 1.0; /* Get rid of leading 1 */
    }

    mant *= 8388608.0; /* 2^23 */
    fbits = (int32)(mant + 0.5); /* round mant to nearest int */
    if (fbits >> 23) {
        /* The carry propagated out of a string of 23 1 bits. */
        fbits = 0;
        expo++;
        if (expo >= 255) {
            return sign | 0x7f800000; /* infinity */
        }
    }

    /* At this point, expo is less than 2^8; fbits is less than 2^23; neither is negative. */
    return (sign) | ((int32)(expo << 23)) | (fbits);
}
1284
1285 /* Same as the above, but we return *half* of a 64-bit double, depending on whether wanthigh is true (high half) or false (low half).
1286  */
static int32 construct_double(int wanthigh, int signbit, double intv, double fracv, int expo)
{
    double absval = (intv + fracv) * pow10_cheap(expo);
    int32 sign = (signbit ? 0x80000000 : 0x0);
    double mant;
    uint32 fhi, flo;

    if (isinf(absval)) {
        goto Infinity;
    }
    if (isnan(absval)) {
        goto NotANumber;
    }

    /* Split absval into mant * 2^expo. Note that expo is reused here:
       its decimal value has already been folded into absval above. */
    mant = frexp(absval, &expo);

    /* Normalize mantissa to be in the range [1.0, 2.0) */
    if (0.5 <= mant && mant < 1.0) {
        mant *= 2.0;
        expo--;
    }
    else if (mant == 0.0) {
        expo = 0;
    }
    else {
        goto Infinity;
    }

    if (expo >= 1024) {
        goto Infinity;
    }
    else if (expo < -1022) {
        /* Denormalized (very small) number */
        mant = ldexp(mant, 1022 + expo);
        expo = 0;
    }
    else if (!(expo == 0 && mant == 0.0)) {
        expo += 1023; /* apply the double-precision exponent bias */
        mant -= 1.0; /* Get rid of leading 1 */
    }

    /* fhi receives the high 28 bits; flo the low 24 bits (total 52 bits) */
    mant *= 268435456.0;          /* 2^28 */
    fhi = (uint32)mant;           /* Truncate */
    mant -= (double)fhi;
    mant *= 16777216.0;           /* 2^24 */
    flo = (uint32)(mant+0.5);     /* Round */
    
    if (flo >> 24) {
        /* The carry propagated out of a string of 24 1 bits. */
        flo = 0;
        fhi++;
        if (fhi >> 28) {
            /* And it also propagated out of the next 28 bits. */
            fhi = 0;
            expo++;
            if (expo >= 2047) {
                goto Infinity;
            }
        }
    }

    /* At this point, expo is less than 2^11; fhi is less than 2^28; flo is less than 2^24; none are negative. */
    if (wanthigh)
        return (sign) | ((int32)(expo << 20)) | ((int32)(fhi >> 8));
    else
        return (int32)((fhi & 0xFF) << 24) | (int32)(flo);

 Infinity:
    if (wanthigh)
        return sign | 0x7FF00000;
    else
        return 0x00000000;
    
 NotANumber:
    if (wanthigh)
        return sign | 0x7FF80000;
    else
        return 0x00000001;
}
1367
1368 /* ------------------------------------------------------------------------- */
1369 /*   Characters are read via a "pipeline" of variables, allowing us to look  */
1370 /*       up to three characters ahead of the current position.               */
1371 /*                                                                           */
1372 /*   There are two possible sources: from the source files being loaded in,  */
1373 /*   and from a string inside Inform (which is where the code for veneer     */
1374 /*   routines comes from).  Each source has its own get-next-character       */
1375 /*   routine.                                                                */
1376 /* ------------------------------------------------------------------------- */
1377 /*   Source 1: from files                                                    */
1378 /*                                                                           */
1379 /*   Note that file_load_chars(p, size) loads "size" bytes into buffer "p"   */
1380 /*   from the current input file.  If the file runs out, then if it was      */
1381 /*   the last source file 4 EOF characters are placed in the buffer: if it   */
1382 /*   was only an Include file ending, then a '\n' character is placed there  */
1383 /*   (essentially to force termination of any comment line) followed by      */
1384 /*   three harmless spaces.                                                  */
1385 /*                                                                           */
1386 /*   The routine returns the number of characters it has written, and note   */
1387 /*   that this conveniently ensures that all characters in the buffer come   */
1388 /*   from the same file.                                                     */
1389 /* ------------------------------------------------------------------------- */
1390
#define SOURCE_BUFFER_SIZE 4096                  /*  Typical disc block size */

/*  One entry on the stack of open source files (the main file plus any
    Include files currently being read).                                     */
typedef struct Sourcefile_s
{   char *buffer;                                /*  Input buffer            */
    int   read_pos;                              /*  Read position in buffer */
    int   size;                                  /*  Number of meaningful
                                                     characters in buffer    */
    int   la, la2, la3;                          /*  Three characters of
                                                     lookahead pipeline      */
    int   file_no;                               /*  Internal file number
                                                     (1, 2, 3, ...)          */
    LexicalBlock LB;
} Sourcefile;

static Sourcefile *FileStack;     /*  Allocated to FileStack_max */
static memory_list FileStack_memlist;
static int FileStack_max;         /*  The highest value that File_sp has
                                      reached
                                      (Filestack entries to this depth have
                                      a buffer allocated)                    */

static int File_sp;               /*  Current stack pointer                  */
static Sourcefile *CF;            /*  Top entry on stack (always equal to
                                      FileStack[File_sp-1])                  */

static int last_input_file;       /*  NOTE(review): presumably the file_no
                                      most recently buffered -- confirm
                                      against begin_buffering_file()         */
1417
/* Set CF and CurrentLB so that FileStack[i] (reading from input file
   file_no) becomes the active source.
   This does not increment File_sp; the caller must do that. */
static void begin_buffering_file(int i, int file_no)
{   int j, cnt; uchar *p;

    CF = NULL;
    CurrentLB = NULL;
    
    /* Ensure stack entry i exists and owns a raw input buffer.
       Entries below FileStack_max already have buffers, which are
       reused across inclusions; the extra 4 bytes leave room for
       the EOF/newline padding written by file_load_chars. */
    ensure_memory_list_available(&FileStack_memlist, i+1);
    while (i >= FileStack_max) {
        FileStack[FileStack_max++].buffer = my_malloc(SOURCE_BUFFER_SIZE+4, "source file buffer");
    }

    p = (uchar *) FileStack[i].buffer;

    /* Suspend the including file: save its lookahead pipeline so it
       can be restored when this Include finishes. */
    if (i>0)
    {   FileStack[i-1].la  = lookahead;
        FileStack[i-1].la2 = lookahead2;
        FileStack[i-1].la3 = lookahead3;
    }

    FileStack[i].file_no = file_no;
    FileStack[i].size = file_load_chars(file_no,
        (char *) p, SOURCE_BUFFER_SIZE);
    /* If the file is shorter than SOURCE_BUFFER_SIZE, it's now closed already. We still need to set up the file entry though. */
    
    /* Prime the three-character lookahead from the start of the buffer. */
    lookahead  = source_to_iso_grid[p[0]];
    lookahead2 = source_to_iso_grid[p[1]];
    lookahead3 = source_to_iso_grid[p[2]];
    if (LOOKAHEAD_SIZE != 3)
        compiler_error
            ("Lexer lookahead size does not match hard-coded lookahead code");
    FileStack[i].read_pos = LOOKAHEAD_SIZE;

    /* Fill in the LexicalBlock: position starts at line 1, and the
       first LOOKAHEAD_SIZE characters have already been consumed. */
    if (file_no==1) FileStack[i].LB.main_flag = TRUE;
               else FileStack[i].LB.main_flag = FALSE;
    FileStack[i].LB.sys_flag = FALSE;
    FileStack[i].LB.source_line = 1;
    FileStack[i].LB.line_start = LOOKAHEAD_SIZE;
    FileStack[i].LB.chars_read = LOOKAHEAD_SIZE;
    FileStack[i].LB.filename = InputFiles[file_no-1].filename;
    FileStack[i].LB.file_no = file_no;
    FileStack[i].LB.orig_source = NULL; FileStack[i].LB.orig_file = 0; 
    FileStack[i].LB.orig_line = 0; FileStack[i].LB.orig_char = 0;

    InputFiles[file_no-1].initial_buffering = FALSE;
    
    CurrentLB = &(FileStack[i].LB);
    CF = &(FileStack[i]);

    /* Check for recursive inclusion: count earlier stack entries with
       the same filename, warning only when the first duplicate appears
       (cnt==1) so the warning fires once. Note that after the loop
       j==i, so FileStack[j] below names the current file — which is
       the same string that matched. */
    cnt = 0;
    for (j=0; j<i; j++)
    {   if (!strcmp(FileStack[i].LB.filename, FileStack[j].LB.filename))
            cnt++;
    }
    if (cnt==1)
        warning_named("File included more than once",
            FileStack[j].LB.filename);
}
1478
1479 static void create_char_pipeline(void)
1480 {
1481     File_sp = 0;
1482     begin_buffering_file(File_sp++, 1);
1483     pipeline_made = TRUE;
1484     last_input_file = current_input_file;
1485 }
1486
/* Return the next character from the file pipeline, advancing the
   three-character lookahead and maintaining the Include-file stack. */
static int get_next_char_from_pipeline(void)
{   uchar *p;

    while (last_input_file < current_input_file)
    {
        /*  An "Include" file must have opened since the last character
            was read. Perhaps more than one. We run forward through the
            list and add them to the include stack. But we ignore
            "Origsource" files (which were never actually opened for
            reading). */

        last_input_file++;
        if (!InputFiles[last_input_file-1].is_input)
            continue;

        begin_buffering_file(File_sp++, last_input_file);
    }
    if (last_input_file != current_input_file)
        compiler_error("last_input_file did not match after Include");

    /* Empty stack: the whole pipeline is exhausted. */
    if (File_sp == 0)
    {   lookahead  = 0; lookahead2 = 0; lookahead3 = 0; return 0;
    }

    /* Buffer used up: refill it from the current file. */
    if (CF->read_pos == CF->size)
    {   CF->size =
            file_load_chars(CF->file_no, CF->buffer, SOURCE_BUFFER_SIZE);
        CF->read_pos = 0;
    }
    else
    /* NOTE(review): a negative size from file_load_chars appears to mark
       a buffer whose file has ended; read_pos reaching -size means the
       file is exhausted — confirm against file_load_chars. */
    if (CF->read_pos == -(CF->size))
    {   set_token_location(get_current_debug_location());
        /* Pop the finished Include file and resume its parent,
           restoring the parent's saved lookahead pipeline. */
        File_sp--;
        if (File_sp == 0)
        {   lookahead  = 0; lookahead2 = 0; lookahead3 = 0; return 0;
        }
        CF = &(FileStack[File_sp-1]);
        CurrentLB = &(FileStack[File_sp-1].LB);
        lookahead  = CF->la; lookahead2 = CF->la2; lookahead3 = CF->la3;
        if (CF->read_pos == CF->size)
        {   CF->size =
                file_load_chars(CF->file_no, CF->buffer, SOURCE_BUFFER_SIZE);
            CF->read_pos = 0;
        }
        set_token_location(get_current_debug_location());
    }

    p = (uchar *) (CF->buffer);

    /* Shift the pipeline: the old lookahead becomes the current
       character, and a fresh character enters at lookahead3. */
    current = lookahead;
    lookahead = lookahead2;
    lookahead2 = lookahead3;
    lookahead3 = source_to_iso_grid[p[CF->read_pos++]];

    CurrentLB->chars_read++;
    /* Keep a copy of recent text for error-message context. */
    if (forerrors_pointer < FORERRORS_SIZE-1)
        forerrors_buff[forerrors_pointer++] = current;
    if (current == '\n') reached_new_line();
    return(current);
}
1547
1548 /* ------------------------------------------------------------------------- */
1549 /*   Source 2: from a string                                                 */
1550 /* ------------------------------------------------------------------------- */
1551
1552 static int source_to_analyse_pointer;            /*  Current read position   */
1553
1554 static int get_next_char_from_string(void)
1555 {   uchar *p = (uchar *) source_to_analyse + source_to_analyse_pointer++;
1556     current = source_to_iso_grid[p[0]];
1557
1558     if (current == 0)    lookahead  = 0;
1559                     else lookahead  = source_to_iso_grid[p[1]];
1560     if (lookahead == 0)  lookahead2 = 0;
1561                     else lookahead2 = source_to_iso_grid[p[2]];
1562     if (lookahead2 == 0) lookahead3 = 0;
1563                     else lookahead3 = source_to_iso_grid[p[3]];
1564
1565     CurrentLB->chars_read++;
1566     if (forerrors_pointer < FORERRORS_SIZE-1)
1567         forerrors_buff[forerrors_pointer++] = current;
1568     if (current == '\n') reached_new_line();
1569     return(current);
1570 }
1571
1572 /* ========================================================================= */
1573 /*   The interface between the lexer and Inform's higher levels:             */
1574 /*                                                                           */
1575 /*       put_token_back()            (effectively) move the read position    */
1576 /*                                       back by one token                   */
1577 /*                                                                           */
1578 /*       get_next_token()            copy the token at the current read      */
1579 /*                                       position into the triple            */
1580 /*                                   (token_type, token_value, token_text)   */
1581 /*                                       and move the read position forward  */
1582 /*                                       by one                              */
1583 /*                                                                           */
1584 /*       release_token_texts()       discard all the tokens that have been   */
1585 /*                                       read in, except for put-back ones   */
1586 /*                                                                           */
1587 /*       restart_lexer(source, name) if source is NULL, initialise the lexer */
1588 /*                                       to read from source files;          */
1589 /*                                   otherwise, to read from this string.    */
1590 /* ------------------------------------------------------------------------- */
1591
extern void release_token_texts(void)
{
    /* This is called at the beginning of every (top-level) directive and
       every statement. It drops all token usage so that the lextexts
       array can be reused.

       Call this immediately before a get_next_token() call.

       This should *not* be called within parse_expression(). Expression
       code generation relies on token data being retained across the whole
       expression.
    */
    int ix;

    token_text = NULL;
    
    /* Common case: nothing put back, so every lextext slot is free. */
    if (tokens_put_back == 0) {
        cur_lextexts = 0;
        return;
    }

    /* If any tokens have been put back, we have to preserve their text.
       Move their lextext usage to the head of the lextexts array. */
    
    for (ix=0; ix<tokens_put_back; ix++) {
        int oldpos;
        lextext temp;
        /* pos walks the last tokens_put_back entries of the circle,
           oldest first, wrapping around if necessary. */
        int pos = circle_position - tokens_put_back + 1 + ix;
        if (pos < 0) pos += CIRCLE_SIZE;

        oldpos = circle[pos].lextext;
        circle[pos].lextext = ix;
        /* Swap the entire lextext structure (including size) */
        temp = lextexts[ix];
        lextexts[ix] = lextexts[oldpos];
        lextexts[oldpos] = temp;
    }
    cur_lextexts = tokens_put_back;
}
1631
1632 extern void put_token_back(void)
1633 {   tokens_put_back++;
1634
1635     if (tokens_trace_level > 0)
1636     {   if (tokens_trace_level == 1) printf("<- ");
1637         else printf("<-\n");
1638     }
1639
1640     /*  The following error, of course, should never happen!                 */
1641
1642     if (tokens_put_back == CIRCLE_SIZE)
1643     {   compiler_error("The lexical analyser has collapsed because of a wrong \
1644 assumption inside Inform");
1645         tokens_put_back--;
1646         return;
1647     }
1648 }
1649
1650 /* The get_next_token() code reads characters into the current lextext,
1651    which is lextexts[lex_index]. It uses these routines to add and remove
1652    characters, reallocing when necessary.
1653
1654    lex_pos is the current number of characters in the lextext. It is
1655    not necessarily null-terminated until get_next_token() completes.
1656  */
1657
1658 /* Add one character */
1659 static void lexaddc(char ch)
1660 {
1661     if ((size_t)lex_pos >= lextexts[lex_index].size) {
1662         size_t newsize = lextexts[lex_index].size * 2;
1663         my_realloc(&lextexts[lex_index].text, lextexts[lex_index].size, newsize, "one lexeme text");
1664         lextexts[lex_index].size = newsize;
1665     }
1666     lextexts[lex_index].text[lex_pos++] = ch;
1667 }
1668
1669 /* Remove the last character and ensure it's null-terminated */
1670 static void lexdelc(void)
1671 {
1672     if (lex_pos > 0) {
1673         lex_pos--;
1674     }
1675     lextexts[lex_index].text[lex_pos] = 0;
1676 }
1677
1678 /* Return the last character */
1679 static char lexlastc(void)
1680 {
1681     if (lex_pos == 0) {
1682         return 0;
1683     }
1684     return lextexts[lex_index].text[lex_pos-1];
1685 }
1686
/* Append a whole string to the current lextext, including the
   terminating null. */
static void lexadds(char *str)
{
    char *p;

    for (p = str; *p; p++)
        lexaddc(*p);
    lexaddc(0);
}
1696
/* Read one token from the input and export it via the globals
   (token_type, token_value, token_text). Put-back tokens are re-issued
   from the circle; otherwise a fresh token is scanned into a new
   lextext slot. */
extern void get_next_token(void)
{   int d, i, j, k, quoted_size, e, radix, context; int32 n; char *r;
    int floatend;
    int returning_a_put_back_token = TRUE;
    
    context = lexical_context();

    /* If tokens have been put back, re-deliver the oldest of them.
       If the lexical context has changed since it was first read, an
       identifier token must be re-interpreted in the new context. */
    if (tokens_put_back > 0)
    {   i = circle_position - tokens_put_back + 1;
        if (i<0) i += CIRCLE_SIZE;
        tokens_put_back--;
        if (context != circle[i].context)
        {   j = circle[i].type;
            if ((j==0) || ((j>=100) && (j<200)))
                interpret_identifier(circle[i].text, i, FALSE);
            circle[i].context = context;
        }
        goto ReturnBack;
    }
    returning_a_put_back_token = FALSE;

    /* Advance to the next circle slot (wrapping around). */
    if (circle_position == CIRCLE_SIZE-1) circle_position = 0;
    else circle_position++;

    /* Claim a lextext slot to accumulate this token's text. */
    lex_index = cur_lextexts++;
    if (lex_index >= no_lextexts) {
        /* fresh lextext block; must init it */
        no_lextexts = lex_index+1;
        ensure_memory_list_available(&lextexts_memlist, no_lextexts);
        lextexts[lex_index].size = MAX_IDENTIFIER_LENGTH + 1;
        lextexts[lex_index].text = my_malloc(lextexts[lex_index].size, "one lexeme text");
    }
    lex_pos = 0;
    lextexts[lex_index].text[0] = 0; /* start with an empty string */
    
    circle[circle_position].lextext = lex_index;
    circle[circle_position].text = NULL; /* will fill in later */
    circle[circle_position].value = 0;
    circle[circle_position].type = 0;
    circle[circle_position].context = context;

    StartTokenAgain:
    d = (*get_next_char)();
    e = tokeniser_grid[d];

    if (next_token_begins_syntax_line)
    {   if ((e != WHITESPACE_CODE) && (e != COMMENT_CODE))
        {   new_syntax_line();
            next_token_begins_syntax_line = FALSE;
        }
    }

    circle[circle_position].location = get_current_debug_location();

    /* Dispatch on the character class of d. */
    switch(e)
    {   case 0: char_error("Illegal character found in source:", d);
            goto StartTokenAgain;

        case WHITESPACE_CODE:
            while (tokeniser_grid[lookahead] == WHITESPACE_CODE)
                (*get_next_char)();
            goto StartTokenAgain;

        case COMMENT_CODE:
            /* '!' comments run to end of line (or end of input). */
            while ((lookahead != '\n') && (lookahead != 0))
                (*get_next_char)();
            goto StartTokenAgain;

        case EOF_CODE:
            circle[circle_position].type = EOF_TT;
            lexadds("<end of file>");
            break;

        case DIGIT_CODE:
            radix = 10;
            /* Also reached from RADIX_CODE with radix 16 or 2. */
            ReturnNumber:
            n=0;
            do
            {   n = n*radix + character_digit_value[d];
                lexaddc(d);
            } while ((character_digit_value[lookahead] < radix)
                     && (d = (*get_next_char)(), TRUE));

            lexaddc(0);
            circle[circle_position].type = NUMBER_TT;
            circle[circle_position].value = n;
            break;

            FloatNumber:
            /* When we reach here, d is the sign bit ('+' or '-').
               If we're constructing a 32-bit float, floatend is 0;
               for a 64-bit double, floatend is '>' for high, '<' for low. */
            {   int expo=0; double intv=0, fracv=0;
                int expocount=0, intcount=0, fraccount=0;
                int signbit = (d == '-');
                lexaddc(d);
                /* Integer part. */
                while (character_digit_value[lookahead] < 10) {
                    intv = 10.0*intv + character_digit_value[lookahead];
                    intcount++;
                    lexaddc(lookahead);
                    (*get_next_char)();
                }
                /* Fractional part, if any. */
                if (lookahead == '.') {
                    double fracpow = 1.0;
                    lexaddc(lookahead);
                    (*get_next_char)();
                    while (character_digit_value[lookahead] < 10) {
                        fracpow *= 0.1;
                        fracv = fracv + fracpow*character_digit_value[lookahead];
                        fraccount++;
                        lexaddc(lookahead);
                        (*get_next_char)();
                    }
                }
                /* Exponent part, if any. */
                if (lookahead == 'e' || lookahead == 'E') {
                    int exposign = 0;
                    lexaddc(lookahead);
                    (*get_next_char)();
                    if (lookahead == '+' || lookahead == '-') {
                        exposign = (lookahead == '-');
                        lexaddc(lookahead);
                        (*get_next_char)();
                    }
                    while (character_digit_value[lookahead] < 10) {
                        expo = 10*expo + character_digit_value[lookahead];
                        expocount++;
                        lexaddc(lookahead);
                        (*get_next_char)();
                    }
                    if (expocount == 0)
                        error("Floating-point literal must have digits after the 'e'");
                    if (exposign) { expo = -expo; }
                }
                if (intcount + fraccount == 0)
                    error("Floating-point literal must have digits");
                if (floatend == '>')
                    n = construct_double(TRUE, signbit, intv, fracv, expo);
                else if (floatend == '<')
                    n = construct_double(FALSE, signbit, intv, fracv, expo);
                else                    
                    n = construct_float(signbit, intv, fracv, expo);
            }
            lexaddc(0);
            circle[circle_position].type = NUMBER_TT;
            circle[circle_position].value = n;
            if (!glulx_mode && dont_enter_into_symbol_table != -2) error("Floating-point literals are not available in Z-code");
            break;

        case RADIX_CODE:
            /* '$': hex number, '$$': binary, '$+'/'$-': float,
               '$<'/'$>': low/high word of a double. */
            radix = 16; d = (*get_next_char)();
            if (d == '-' || d == '+') {
                floatend = 0;
                goto FloatNumber;
            }
            if (d == '<' || d == '>') {
                floatend = d;
                d = (*get_next_char)();
                if (d == '-' || d == '+') {
                    goto FloatNumber;
                }
                error("Signed number expected after '$<' or '$>'");
            }
            if (d == '$') { d = (*get_next_char)(); radix = 2; }
            if (character_digit_value[d] >= radix)
            {   if (radix == 2)
                    error("Binary number expected after '$$'");
                else
                    error("Hexadecimal number expected after '$'");
            }
            goto ReturnNumber;

        case QUOTE_CODE:     /* Single-quotes: scan a literal string */
            quoted_size=0;
            do
            {   e = d; d = (*get_next_char)(); lexaddc(d);
                if (quoted_size++==64)
                {   error(
                    "Too much text for one pair of quotations '...' to hold");
                    lexaddc('\''); break;
                }
                /* A closing quote, unless escaped as @' by the previous
                   character. '' with nothing between is an error. */
                if ((d == '\'') && (e != '@'))
                {   if (quoted_size == 1)
                    {   d = (*get_next_char)(); lexaddc(d);
                        if (d != '\'')
                            error("No text between quotation marks ''");
                    }
                    break;
                }
            } while (d != EOF);
            if (d==EOF) ebf_error("'\''", "end of file");
            lexdelc();
            circle[circle_position].type = SQ_TT;
            break;

        case DQUOTE_CODE:    /* Double-quotes: scan a literal string */
            quoted_size=0;
            do
            {   d = (*get_next_char)(); lexaddc(d);
                if (d == '\n')
                {   /* Collapse a newline plus leading whitespace into a
                       single space (none after '^'). */
                    lex_pos--;
                    while (lexlastc() == ' ') lex_pos--;
                    if (lexlastc() != '^') lexaddc(' ');
                    while ((lookahead != EOF) &&
                          (tokeniser_grid[lookahead] == WHITESPACE_CODE))
                    (*get_next_char)();
                }
                else if (d == '\\')
                {   /* '\' at end of line: join lines with no space. */
                    int newline_passed = FALSE;
                    lex_pos--;
                    while ((lookahead != EOF) &&
                          (tokeniser_grid[lookahead] == WHITESPACE_CODE))
                        if ((d = (*get_next_char)()) == '\n')
                            newline_passed = TRUE;
                    if (!newline_passed)
                    {   char chb[4];
                        chb[0] = '\"'; chb[1] = lookahead;
                        chb[2] = '\"'; chb[3] = 0;
                        ebf_error("empty rest of line after '\\' in string",
                            chb);
                    }
                }
            }   while ((d != EOF) && (d!='\"'));
            if (d==EOF) ebf_error("'\"'", "end of file");
            lexdelc();
            circle[circle_position].type = DQ_TT;
            break;

        case IDENTIFIER_CODE:    /* Letter or underscore: an identifier */

            lexaddc(d); n=1;
            while ((n<=MAX_IDENTIFIER_LENGTH)
                   && ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
                   || (tokeniser_grid[lookahead] == DIGIT_CODE)))
                n++, lexaddc((*get_next_char)());

            lexaddc(0);

            if (n > MAX_IDENTIFIER_LENGTH)
            {   char bad_length[100];
                sprintf(bad_length,
                    "Name exceeds the maximum length of %d characters:",
                         MAX_IDENTIFIER_LENGTH);
                error_named(bad_length, lextexts[lex_index].text);
                /* Eat any further extra characters in the identifier */
                while (((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
                        || (tokeniser_grid[lookahead] == DIGIT_CODE)))
                    (*get_next_char)();
                /* Trim token so that it doesn't violate
                   MAX_IDENTIFIER_LENGTH during error recovery */
                lextexts[lex_index].text[MAX_IDENTIFIER_LENGTH] = 0;
            }

            /* In dictionary-word context the identifier is treated as
               text (DQ_TT) rather than entered into the symbol table. */
            if (dont_enter_into_symbol_table)
            {   circle[circle_position].type = DQ_TT;
                circle[circle_position].value = 0;
                if (dont_enter_into_symbol_table == -2)
                    interpret_identifier(lextexts[lex_index].text, circle_position, TRUE);
                break;
            }

            interpret_identifier(lextexts[lex_index].text, circle_position, FALSE);
            break;

        default:

            /*  The character is initial to at least one of the separators  */

            /* The grid entry packs a start index (high nibble) and a
               count (low nibble) into the separators table; try each
               candidate, longest-prefix entries first in the table. */
            for (j=e>>4, k=j+(e&0x0f); j<k; j++)
            {   r = (char *) separators[j];
                if (r[1]==0)
                {   lexaddc(d);
                    lexaddc(0);
                    goto SeparatorMatched;
                }
                else
                if (r[2]==0)
                {   if (*(r+1) == lookahead)
                    {   lexaddc(d);
                        lexaddc((*get_next_char)());
                        lexaddc(0);
                        goto SeparatorMatched;
                    }
                }
                else
                {   if ((*(r+1) == lookahead) && (*(r+2) == lookahead2))
                    {   lexaddc(d);
                        lexaddc((*get_next_char)());
                        lexaddc((*get_next_char)());
                        lexaddc(0);
                        goto SeparatorMatched;
                    }
                }
            }

            /*  The following contingency never in fact arises with the
                current set of separators, but might in future  */

            lexaddc(d); lexaddc(lookahead); lexaddc(lookahead2);
            lexaddc(0);
            error_named("Unrecognised combination in source:",
                lextexts[lex_index].text);
            goto StartTokenAgain;

            SeparatorMatched:

            circle[circle_position].type = SEP_TT;
            circle[circle_position].value = j;
            /* A few separators ('#n$', '#w$', '#a$', etc.) absorb the
               word which follows them into the token text. */
            switch(j)
            {   case SEMICOLON_SEP: break;
                case HASHNDOLLAR_SEP:
                case HASHWDOLLAR_SEP:
                    if (tokeniser_grid[lookahead] == WHITESPACE_CODE)
                    {   error_named("Character expected after",
                            lextexts[lex_index].text);
                        break;
                    }
                    lex_pos--;
                    lexaddc((*get_next_char)());
                    while ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
                           || (tokeniser_grid[lookahead] == DIGIT_CODE))
                        lexaddc((*get_next_char)());
                    lexaddc(0);
                    break;
                case HASHADOLLAR_SEP:
                case HASHGDOLLAR_SEP:
                case HASHRDOLLAR_SEP:
                case HASHHASH_SEP:
                    if (tokeniser_grid[lookahead] != IDENTIFIER_CODE)
                    {   error_named("Alphabetic character expected after",
                            lextexts[lex_index].text);
                        break;
                    }
                    lex_pos--;
                    while ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
                           || (tokeniser_grid[lookahead] == DIGIT_CODE))
                        lexaddc((*get_next_char)());
                    lexaddc(0);
                    break;
            }
            break;
    }

    /* We can now assign the text pointer, since the lextext has finished reallocing. */
    circle[circle_position].text = lextexts[lex_index].text;
    i = circle_position;

    ReturnBack:
    /* We've either parsed a new token or selected a put-back one.
       i is the circle-position of the token in question. Time to
       export the token data where higher-level code can find it. */
    token_value = circle[i].value;
    token_type = circle[i].type;
    token_text = circle[i].text;
    if (!returning_a_put_back_token)
    {   set_token_location(circle[i].location);
    }

    if (tokens_trace_level > 0)
    {   if (tokens_trace_level == 1) {
            printf("'%s' ", circle[i].text);
            if (circle[i].type == EOF_TT) printf("\n");
        }
        else
        {   printf("-> "); describe_token(&circle[i]);
            printf(" ");
            if (tokens_trace_level > 2) print_context(circle[i].context);
            printf("\n");
        }
    }
}
2067
2068 static char veneer_error_title[64];
2069
2070 extern void restart_lexer(char *lexical_source, char *name)
2071 {   int i;
2072     circle_position = 0;
2073     for (i=0; i<CIRCLE_SIZE; i++)
2074     {   circle[i].type = 0;
2075         circle[i].value = 0;
2076         circle[i].text = "(if this is ever visible, there is a bug)";
2077         circle[i].lextext = -1;
2078         circle[i].context = 0;
2079     }
2080
2081     cur_lextexts = 0;
2082     /* But we don't touch no_lextexts; those allocated blocks can be reused */
2083     lex_index = -1;
2084     lex_pos = -1;
2085     
2086     tokens_put_back = 0;
2087     forerrors_pointer = 0;
2088     dont_enter_into_symbol_table = FALSE;
2089     return_sp_as_variable = FALSE;
2090     next_token_begins_syntax_line = TRUE;
2091
2092     source_to_analyse = lexical_source;
2093
2094     if (source_to_analyse == NULL)
2095     {   get_next_char = get_next_char_from_pipeline;
2096         if (!pipeline_made) create_char_pipeline();
2097         forerrors_buff[0] = 0; forerrors_pointer = 0;
2098     }
2099     else
2100     {   get_next_char = get_next_char_from_string;
2101         source_to_analyse_pointer = 0;
2102         CurrentLB = &StringLB;
2103         sprintf(veneer_error_title, "<veneer routine '%s'>", name);
2104         StringLB.filename = veneer_error_title;
2105
2106         CurrentLB->source_line = 1;
2107         CurrentLB->line_start  = 0;
2108         CurrentLB->chars_read  = 0;
2109     }
2110 }
2111
2112 /* ========================================================================= */
2113 /*   Data structure management routines                                      */
2114 /* ------------------------------------------------------------------------- */
2115
2116 extern void init_lexer_vars(void)
2117 {
2118     FileStack = NULL;
2119     FileStack_max = 0;
2120     CF = NULL;
2121     CurrentLB = NULL;
2122
2123     lextexts = NULL;
2124     no_lextexts = 0;
2125     cur_lextexts = 0;
2126     lex_index = -1;
2127     lex_pos = -1;
2128     
2129     blank_brief_location.file_index = -1;
2130     blank_brief_location.line_number = 0;
2131     blank_brief_location.orig_file_index = 0;
2132     blank_brief_location.orig_line_number = 0;
2133 }
2134
2135 extern void lexer_begin_prepass(void)
2136 {   total_source_line_count = 0;
2137     CurrentLB = &NoFileOpen;
2138     report_errors_at_current_line();
2139 }
2140
2141 extern void lexer_begin_pass(void)
2142 {   no_hash_printed_yet = TRUE;
2143     hash_printed_since_newline = FALSE;
2144
2145     pipeline_made = FALSE;
2146
2147     restart_lexer(NULL, NULL);
2148 }
2149
2150 extern void lexer_endpass(void)
2151 {   CurrentLB = &MakingOutput;
2152     report_errors_at_current_line();
2153 }
2154
/* Allocate all the lexer's working storage: the include-file stack,
   the lexeme texts, the keyword and local-variable hash tables, the
   tokeniser grid, and the debug-location list. */
extern void lexer_allocate_arrays(void)
{
    initialise_memory_list(&FileStack_memlist,
        sizeof(Sourcefile), 4, (void**)&FileStack,
        "source file stack");
    FileStack_max = 0;

    initialise_memory_list(&lextexts_memlist,
        sizeof(lextext), 200, (void**)&lextexts,
        "lexeme texts list");
    cur_lextexts = 0;

    /* Keyword lookup: hash heads, tails, and chained entries. */
    keywords_hash_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
        "keyword hash table");
    keywords_hash_ends_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
        "keyword hash end table");
    keywords_data_table = my_calloc(sizeof(int), 3*MAX_KEYWORDS,
        "keyword hashing linked list");
    
    local_variable_names = my_calloc(sizeof(identstruct), MAX_LOCAL_VARIABLES-1,
        "text of local variable names");
    local_variable_hash_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
        "local variable hash table");
    local_variable_hash_codes = my_calloc(sizeof(int), MAX_LOCAL_VARIABLES,
        "local variable hash codes");

    make_tokeniser_grid();
    make_keywords_tables();

    /* Debug-location list starts with a single blank entry. */
    first_token_locations =
        my_malloc(sizeof(debug_locations), "debug locations of recent tokens");
    first_token_locations->location.file_index = 0;
    first_token_locations->location.beginning_byte_index = 0;
    first_token_locations->location.end_byte_index = 0;
    first_token_locations->location.beginning_line_number = 0;
    first_token_locations->location.end_line_number = 0;
    first_token_locations->location.beginning_character_number = 0;
    first_token_locations->location.end_character_number = 0;
    first_token_locations->location.orig_file_index = 0;
    first_token_locations->location.orig_beg_line_number = 0;
    first_token_locations->location.orig_beg_char_number = 0;
    first_token_locations->next = NULL;
    first_token_locations->reference_count = 0;
    last_token_location = first_token_locations;
}
2200
2201 extern void lexer_free_arrays(void)
2202 {   int ix;
2203     CF = NULL;
2204     CurrentLB = NULL;
2205
2206     for (ix=0; ix<FileStack_max; ix++) {
2207         my_free(&FileStack[ix].buffer, "source file buffer");
2208     }
2209     deallocate_memory_list(&FileStack_memlist);
2210
2211     for (ix=0; ix<no_lextexts; ix++) {
2212         my_free(&lextexts[ix].text, "one lexeme text");
2213     }
2214     deallocate_memory_list(&lextexts_memlist);
2215
2216     my_free(&keywords_hash_table, "keyword hash table");
2217     my_free(&keywords_hash_ends_table, "keyword hash end table");
2218     my_free(&keywords_data_table, "keyword hashing linked list");
2219
2220     my_free(&local_variable_names, "text of local variable names");
2221     my_free(&local_variable_hash_table, "local variable hash table");
2222     my_free(&local_variable_hash_codes, "local variable hash codes");
2223
2224     cleanup_token_locations(NULL);
2225 }
2226
2227 /* ========================================================================= */