1 /* ------------------------------------------------------------------------- */
2 /* "lexer" : Lexical analyser */
4 /* Part of Inform 6.40 */
5 /* copyright (c) Graham Nelson 1993 - 2022 */
7 /* Inform is free software: you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation, either version 3 of the License, or */
10 /* (at your option) any later version. */
12 /* Inform is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with Inform. If not, see https://gnu.org/licenses/ */
20 /* ------------------------------------------------------------------------- */
24 int total_source_line_count, /* Number of source lines so far */
26 no_hash_printed_yet, /* Have not yet printed the first # */
27 hash_printed_since_newline, /* A hash has been printed since the
28 most recent new-line was printed
29 (generally as a result of an error
30 message or the start of pass) */
31 dont_enter_into_symbol_table, /* Return names as text (with
32 token type DQ_TT, i.e., as if
33 they had double-quotes around)
34 and not as entries in the symbol
35 table, when TRUE. If -2, only the
36 keyword table is searched. */
37 return_sp_as_variable; /* When TRUE, the word "sp" denotes
38 the stack pointer variable
39 (used in assembly language only) */
40 int next_token_begins_syntax_line; /* When TRUE, start a new syntax
41 line (for error reporting, etc.)
42 on the source code line where
43 the next token appears */
45 int32 last_mapped_line; /* Last syntax line reported to debugging file */
47 /* ------------------------------------------------------------------------- */
48 /* The lexer's output is a sequence of structs, each called a "token", */
49 /* representing one lexical unit (or "lexeme") each. Instead of providing */
50 /* "lookahead" (that is, always having available the next token after the */
51 /* current one, so that syntax analysers higher up in Inform can have */
52 /* advance knowledge of what is coming), the lexer instead has a system */
53 /* where tokens can be read in and then "put back again". */
54 /* The meaning of the number (and to some extent the text) supplied with */
55 /* a token depends on its type: see "header.h" for the list of types. */
56 /* For example, the lexeme "$1e3" is understood by Inform as a hexadecimal */
57 /* number, and translated to the token: */
58 /* type NUMBER_TT, value 483, text "$1e3" */
59 /* ------------------------------------------------------------------------- */
60 /* These three variables are set to the current token on a call to */
61 /* get_next_token() (but are not changed by a call to put_token_back()). */
62 /* (It would be tidier to use a token_data structure, rather than having */
63 /* get_next_token() unpack three values. But this is the way it is.) */
64 /* ------------------------------------------------------------------------- */
70 /* ------------------------------------------------------------------------- */
71 /* The next two variables are the head and tail of a singly linked list. */
72 /* The tail stores the portion most recently read from the current */
73 /* lexical block; its end values therefore describe the location of the */
74 /* current token, and are updated whenever the three variables above are */
75 /* via set_token_location(...). Earlier vertices, if any, represent the */
76 /* regions of lexical blocks read beforehand, where new vertices are */
77 /* only introduced by interruptions like a file inclusion or an EOF. */
78 /* Vertices are deleted off of the front of the list once they are no */
79 /* longer referenced by pending debug information records. */
80 /* ------------------------------------------------------------------------- */
82 static debug_locations *first_token_locations;
83 static debug_locations *last_token_location;
85 extern debug_location get_token_location(void)
86 { debug_location result;
87 debug_location *location = &(last_token_location->location);
88 result.file_index = location->file_index;
89 result.beginning_byte_index = location->end_byte_index;
90 result.end_byte_index = location->end_byte_index;
91 result.beginning_line_number = location->end_line_number;
92 result.end_line_number = location->end_line_number;
93 result.beginning_character_number = location->end_character_number;
94 result.end_character_number = location->end_character_number;
95 result.orig_file_index = location->orig_file_index;
96 result.orig_beg_line_number = location->orig_beg_line_number;
97 result.orig_beg_char_number = location->orig_beg_char_number;
101 extern debug_locations get_token_locations(void)
102 { debug_locations result;
103 result.location = get_token_location();
105 result.reference_count = 0;
109 static void set_token_location(debug_location location)
110 { if (location.file_index == last_token_location->location.file_index)
111 { last_token_location->location.end_byte_index =
112 location.end_byte_index;
113 last_token_location->location.end_line_number =
114 location.end_line_number;
115 last_token_location->location.end_character_number =
116 location.end_character_number;
117 last_token_location->location.orig_file_index =
118 location.orig_file_index;
119 last_token_location->location.orig_beg_line_number =
120 location.orig_beg_line_number;
121 last_token_location->location.orig_beg_char_number =
122 location.orig_beg_char_number;
124 { debug_locations*successor =
126 (sizeof(debug_locations),
127 "debug locations of recent tokens");
128 successor->location = location;
129 successor->next = NULL;
130 successor->reference_count = 0;
131 last_token_location->next = successor;
132 last_token_location = successor;
136 extern debug_location_beginning get_token_location_beginning(void)
137 { debug_location_beginning result;
138 ++(last_token_location->reference_count);
139 result.head = last_token_location;
140 result.beginning_byte_index =
141 last_token_location->location.end_byte_index;
142 result.beginning_line_number =
143 last_token_location->location.end_line_number;
144 result.beginning_character_number =
145 last_token_location->location.end_character_number;
146 result.orig_file_index = last_token_location->location.orig_file_index;
147 result.orig_beg_line_number = last_token_location->location.orig_beg_line_number;
148 result.orig_beg_char_number = last_token_location->location.orig_beg_char_number;
153 static void cleanup_token_locations(debug_location_beginning*beginning)
154 { if (first_token_locations)
155 { while (first_token_locations &&
156 !first_token_locations->reference_count)
157 { debug_locations*moribund = first_token_locations;
158 first_token_locations = moribund->next;
159 my_free(&moribund, "debug locations of recent tokens");
161 (beginning->head == moribund || !first_token_locations))
163 ("Records needed by a debug_location_beginning are no "
164 "longer allocated, perhaps because of an invalid reuse "
165 "of this or an earlier beginning");
171 ("Attempt to use a debug_location_beginning when no token "
172 "locations are defined");
175 ("Attempt to clean up token locations when no token locations "
181 extern void discard_token_location(debug_location_beginning beginning)
182 { --(beginning.head->reference_count);
185 extern debug_locations get_token_location_end
186 (debug_location_beginning beginning)
187 { debug_locations result;
188 cleanup_token_locations(&beginning);
189 --(beginning.head->reference_count);
190 /* Sometimes we know what we'll read before we switch to the lexical block
191 where we'll read it. In such cases the beginning will be placed in the
192 prior block and last exactly zero bytes there. It's misleading to
193 include such ranges, so we gobble them. */
194 if (beginning.head->location.end_byte_index ==
195 beginning.beginning_byte_index &&
196 beginning.head->next)
197 { beginning.head = beginning.head->next;
198 result.location = beginning.head->location;
199 result.location.beginning_byte_index = 0;
200 result.location.beginning_line_number = 1;
201 result.location.beginning_character_number = 1;
203 { result.location = beginning.head->location;
204 result.location.beginning_byte_index =
205 beginning.beginning_byte_index;
206 result.location.beginning_line_number =
207 beginning.beginning_line_number;
208 result.location.beginning_character_number =
209 beginning.beginning_character_number;
212 result.location.orig_file_index =
213 beginning.orig_file_index;
214 result.location.orig_beg_line_number =
215 beginning.orig_beg_line_number;
216 result.location.orig_beg_char_number =
217 beginning.orig_beg_char_number;
219 result.next = beginning.head->next;
220 result.reference_count = 0;
224 /* ------------------------------------------------------------------------- */
225 /* In order to be able to put tokens back efficiently, the lexer stores */
226 /* tokens in a "circle": the variable circle_position ranges between */
227 /* 0 and CIRCLE_SIZE-1. We only need a circle size as large as the */
228 /* maximum number of tokens ever put back at once, plus 1 (in effect, the */
229 /* maximum token lookahead ever needed in syntax analysis, plus 1). */
231 /* Note that the circle struct type is lexeme_data, whereas the expression */
232 /* code all works in token_data. They have slightly different needs. The */
233 /* data is exported through the token_text, token_value, token_type */
234 /* globals, so there's no need to use the same struct at both levels. */
236 /* Unlike some compilers, Inform does not have a context-free lexer: in */
237 /* fact it has 12288 different possible states. However, the context only */
238 /* affects the interpretation of "identifiers": lexemes beginning with a */
239 /* letter and containing up to 32 chars of alphanumeric and underscore */
240 /* chars. (For example, "default" may refer to the directive or statement */
241 /* of that name, and which token values are returned depends on the */
242 /* current lexical context.) */
244 /* Along with each token, we also store the lexical context it was */
245 /* translated under; because if it is called for again, there may need */
246 /* to be a fresh interpretation of it if the context has changed. */
247 /* ------------------------------------------------------------------------- */
249 #define CIRCLE_SIZE 6
251 /* (The worst case for token lookahead is distinguishing between an
252 old-style "objectloop (a in b)" and a new "objectloop (a in b ...)".) */
254 static int circle_position;
255 static lexeme_data circle[CIRCLE_SIZE];
257 /* ------------------------------------------------------------------------- */
258 /* A complication, however, is that the text of some lexemes needs to be */
259 /* held in Inform's memory for much longer periods: for example, a */
260 /* dictionary word lexeme (like "'south'") must have its text preserved */
261 /* until the code generation time for the expression it occurs in, when */
262 /* the dictionary reference is actually made. We handle this by keeping */
263 /* all lexeme text until the end of the statement (or, for top-level */
264 /* directives, until the end of the directive). Then we call */
265 /* release_token_texts() to start over. The lextexts array will therefore */
266 /* grow to the largest number of lexemes in a single statement or */
268 /* ------------------------------------------------------------------------- */
270 typedef struct lextext_s {
272 size_t size; /* Allocated size (including terminal null)
273 This is always at least MAX_IDENTIFIER_LENGTH+1 */
276 static lextext *lextexts; /* Allocated to no_lextexts */
277 static memory_list lextexts_memlist;
278 static int no_lextexts;
280 static int cur_lextexts; /* Number of lextexts in current use
281 (cur_lextexts <= no_lextexts) */
283 static int lex_index; /* Index of lextext being written to */
284 static int lex_pos; /* Current write position in that lextext */
/* ------------------------------------------------------------------------- */
/*   The lexer itself needs up to 3 characters of lookahead (it uses an      */
/*   LR(3) grammar to translate characters into tokens).                     */
/* ------------------------------------------------------------------------- */

#define LOOKAHEAD_SIZE 3

static int current, lookahead,          /* The latest character read, and    */
    lookahead2, lookahead3;             /* the three characters following it */

static int pipeline_made;               /* Whether or not the pipeline of
                                           characters has been constructed
                                           yet (this pass)                   */

static int (* get_next_char)(void);     /* Routine for reading the stream of
                                           characters: the lexer does not
                                           need any "ungetc" routine for
                                           putting them back again.  End of
                                           stream is signalled by returning
                                           zero.                             */

static char *source_to_analyse;         /* The current lexical source:
                                           NULL for "load from source files",
                                           otherwise this points to a string
                                           containing Inform code            */

static int tokens_put_back;             /* Count of the number of backward
                                           moves made from the last-read
                                           token                             */
316 /* This gets called for both token_data and lexeme_data structs. It prints
317 a description of the common part (the text, value, type fields).
319 extern void describe_token_triple(const char *text, int32 value, int type)
321 /* Many of the token types are not set in this file, but later on in
322 Inform's higher stages (for example, in the expression evaluator);
323 but this routine describes them all. */
329 /* The following token types occur in lexer output: */
331 case SYMBOL_TT: printf("symbol ");
332 describe_symbol(value);
334 case NUMBER_TT: printf("literal number %d", value);
336 case DQ_TT: printf("string \"%s\"", text);
338 case SQ_TT: printf("string '%s'", text);
340 case SEP_TT: printf("separator '%s'", text);
342 case EOF_TT: printf("end of file");
345 case STATEMENT_TT: printf("statement name '%s'", text);
347 case SEGMENT_MARKER_TT: printf("object segment marker '%s'", text);
349 case DIRECTIVE_TT: printf("directive name '%s'", text);
351 case CND_TT: printf("textual conditional '%s'", text);
353 case OPCODE_NAME_TT: printf("opcode name '%s'", text);
355 case SYSFUN_TT: printf("built-in function name '%s'", text);
357 case LOCAL_VARIABLE_TT: printf("local variable name '%s'", text);
359 case MISC_KEYWORD_TT: printf("statement keyword '%s'", text);
361 case DIR_KEYWORD_TT: printf("directive keyword '%s'", text);
363 case TRACE_KEYWORD_TT: printf("'trace' keyword '%s'", text);
365 case SYSTEM_CONSTANT_TT: printf("system constant name '%s'", text);
368 /* The remaining are etoken types, not set by the lexer */
370 case OP_TT: printf("operator '%s'",
371 operators[value].description);
373 case ENDEXP_TT: printf("end of expression");
375 case SUBOPEN_TT: printf("open bracket");
377 case SUBCLOSE_TT: printf("close bracket");
379 case LARGE_NUMBER_TT: printf("large number: '%s'=%d",text,value);
381 case SMALL_NUMBER_TT: printf("small number: '%s'=%d",text,value);
383 case VARIABLE_TT: printf("variable '%s'=%d", text, value);
385 case DICTWORD_TT: printf("dictionary word '%s'", text);
387 case ACTION_TT: printf("action name '%s'", text);
391 printf("** unknown token type %d, text='%s', value=%d **",
397 /* ------------------------------------------------------------------------- */
398 /* All but one of the Inform keywords (most of them opcode names used */
399 /* only by the assembler). (The one left over is "sp", a keyword used in */
400 /* assembly language only.) */
402 /* A "keyword group" is a set of keywords to be searched for. If a match */
403 /* is made on an identifier, the token type becomes that given in the KG */
404 /* and the token value is its index in the KG. */
406 /* The keyword ordering must correspond with the appropriate #define's in */
407 /* "header.h" but is otherwise not significant. */
408 /* ------------------------------------------------------------------------- */
/* This must exceed the total number of keywords across all groups,
   including opcodes. */
#define MAX_KEYWORDS (350)
414 /* The values will be filled in at compile time, when we know
415 which opcode set to use. */
416 keyword_group opcode_names =
418 OPCODE_NAME_TT, FALSE, TRUE
/* The Z-machine opcode names, terminated by an empty string. */
static char *opcode_list_z[] = {
    "je", "jl", "jg", "dec_chk", "inc_chk", "jin", "test", "or", "and",
    "test_attr", "set_attr", "clear_attr", "store", "insert_obj", "loadw",
    "loadb", "get_prop", "get_prop_addr", "get_next_prop", "add", "sub",
    "mul", "div", "mod", "call", "storew", "storeb", "put_prop", "sread",
    "print_char", "print_num", "random", "push", "pull", "split_window",
    "set_window", "output_stream", "input_stream", "sound_effect", "jz",
    "get_sibling", "get_child", "get_parent", "get_prop_len", "inc", "dec",
    "print_addr", "remove_obj", "print_obj", "ret", "jump", "print_paddr",
    "load", "not", "rtrue", "rfalse", "print", "print_ret", "nop", "save",
    "restore", "restart", "ret_popped", "pop", "quit", "new_line",
    "show_status", "verify", "call_2s", "call_vs", "aread", "call_vs2",
    "erase_window", "erase_line", "set_cursor", "get_cursor",
    "set_text_style", "buffer_mode", "read_char", "scan_table", "call_1s",
    "call_2n", "set_colour", "throw", "call_vn", "call_vn2", "tokenise",
    "encode_text", "copy_table", "print_table", "check_arg_count", "call_1n",
    "catch", "piracy", "log_shift", "art_shift", "set_font", "save_undo",
    "restore_undo", "draw_picture", "picture_data", "erase_picture",
    "set_margins", "move_window", "window_size", "window_style",
    "get_wind_prop", "scroll_window", "pop_stack", "read_mouse",
    "mouse_window", "push_stack", "put_wind_prop", "print_form",
    "make_menu", "picture_table", "print_unicode", "check_unicode",
    "set_true_colour", "buffer_screen", ""
};
/* The Glulx opcode names, terminated by an empty string. */
static char *opcode_list_g[] = {
    "nop", "add", "sub", "mul", "div", "mod", "neg", "bitand", "bitor",
    "bitxor", "bitnot", "shiftl", "sshiftr", "ushiftr", "jump", "jz",
    "jnz", "jeq", "jne", "jlt", "jge", "jgt", "jle",
    "jltu", "jgeu", "jgtu", "jleu",
    "call", "return",
    "catch", "throw", "tailcall",
    "copy", "copys", "copyb", "sexs", "sexb", "aload",
    "aloads", "aloadb", "aloadbit", "astore", "astores", "astoreb",
    "astorebit", "stkcount", "stkpeek", "stkswap", "stkroll", "stkcopy",
    "streamchar", "streamnum", "streamstr",
    "gestalt", "debugtrap", "getmemsize", "setmemsize", "jumpabs",
    "random", "setrandom", "quit", "verify",
    "restart", "save", "restore", "saveundo", "restoreundo", "protect",
    "glk", "getstringtbl", "setstringtbl", "getiosys", "setiosys",
    "linearsearch", "binarysearch", "linkedsearch",
    "callf", "callfi", "callfii", "callfiii",
    "streamunichar",
    "mzero", "mcopy", "malloc", "mfree",
    "accelfunc", "accelparam",
    "numtof", "ftonumz", "ftonumn", "ceil", "floor",
    "fadd", "fsub", "fmul", "fdiv", "fmod",
    "sqrt", "exp", "log", "pow",
    "sin", "cos", "tan", "asin", "acos", "atan", "atan2",
    "jfeq", "jfne", "jflt", "jfle", "jfgt", "jfge", "jisnan", "jisinf",
    "hasundo", "discardundo",
    ""
};
475 keyword_group opcode_macros =
477 OPCODE_MACRO_TT, FALSE, TRUE
480 static char *opmacro_list_z[] = { "" };
482 static char *opmacro_list_g[] = {
487 keyword_group directives =
488 { { "abbreviate", "array", "attribute", "class", "constant",
489 "default", "dictionary", "end", "endif", "extend", "fake_action",
490 "global", "ifdef", "ifndef", "ifnot", "ifv3", "ifv5", "iftrue",
491 "iffalse", "import", "include", "link", "lowstring", "message",
492 "nearby", "object", "origsource", "property", "release", "replace",
493 "serial", "switches", "statusline", "stub", "system_file", "trace",
494 "undef", "verb", "version", "zcharacter",
496 DIRECTIVE_TT, FALSE, FALSE
499 keyword_group trace_keywords =
500 { { "dictionary", "symbols", "objects", "verbs",
501 "assembly", "expressions", "lines", "tokens", "linker",
503 TRACE_KEYWORD_TT, FALSE, TRUE
506 keyword_group segment_markers =
507 { { "class", "has", "private", "with", "" },
508 SEGMENT_MARKER_TT, FALSE, TRUE
511 keyword_group directive_keywords =
512 { { "alias", "long", "additive",
514 "noun", "held", "multi", "multiheld", "multiexcept",
515 "multiinside", "creature", "special", "number", "scope", "topic",
516 "reverse", "meta", "only", "replace", "first", "last",
517 "string", "table", "buffer", "data", "initial", "initstr",
518 "with", "private", "has", "class",
519 "error", "fatalerror", "warning",
520 "terminating", "static", "individual",
522 DIR_KEYWORD_TT, FALSE, TRUE
525 keyword_group misc_keywords =
526 { { "char", "name", "the", "a", "an", "The", "number",
527 "roman", "reverse", "bold", "underline", "fixed", "on", "off",
528 "to", "address", "string", "object", "near", "from", "property", "A", "" },
529 MISC_KEYWORD_TT, FALSE, TRUE
532 keyword_group statements =
533 { { "box", "break", "continue", "default", "do", "else", "font", "for",
534 "give", "if", "inversion", "jump", "move", "new_line", "objectloop",
535 "print", "print_ret", "quit", "read", "remove", "restore", "return",
536 "rfalse", "rtrue", "save", "spaces", "string", "style", "switch",
537 "until", "while", "" },
538 STATEMENT_TT, FALSE, TRUE
541 keyword_group conditions =
542 { { "has", "hasnt", "in", "notin", "ofclass", "or", "provides", "" },
546 keyword_group system_functions =
547 { { "child", "children", "elder", "eldest", "indirect", "parent", "random",
548 "sibling", "younger", "youngest", "metaclass", "glk", "" },
549 SYSFUN_TT, FALSE, TRUE
552 keyword_group system_constants =
553 { { "adjectives_table", "actions_table", "classes_table",
554 "identifiers_table", "preactions_table", "version_number",
555 "largest_object", "strings_offset", "code_offset",
556 "dict_par1", "dict_par2", "dict_par3", "actual_largest_object",
557 "static_memory_offset", "array_names_offset", "readable_memory_offset",
558 "cpv__start", "cpv__end", "ipv__start", "ipv__end",
559 "array__start", "array__end",
560 "lowest_attribute_number", "highest_attribute_number",
561 "attribute_names_array",
562 "lowest_property_number", "highest_property_number",
563 "property_names_array",
564 "lowest_action_number", "highest_action_number",
565 "action_names_array",
566 "lowest_fake_action_number", "highest_fake_action_number",
567 "fake_action_names_array",
568 "lowest_routine_number", "highest_routine_number", "routines_array",
569 "routine_names_array", "routine_flags_array",
570 "lowest_global_number", "highest_global_number", "globals_array",
571 "global_names_array", "global_flags_array",
572 "lowest_array_number", "highest_array_number", "arrays_array",
573 "array_names_array", "array_flags_array",
574 "lowest_constant_number", "highest_constant_number", "constants_array",
575 "constant_names_array",
576 "lowest_class_number", "highest_class_number", "class_objects_array",
577 "lowest_object_number", "highest_object_number",
579 "grammar_table", "dictionary_table", "dynam_string_table",
581 SYSTEM_CONSTANT_TT, FALSE, TRUE
584 keyword_group *keyword_groups[12]
585 = { NULL, &opcode_names, &directives, &trace_keywords, &segment_markers,
586 &directive_keywords, &misc_keywords, &statements, &conditions,
587 &system_functions, &system_constants, &opcode_macros};
589 /* These keywords are set to point to local_variable_names entries when
590 a routine header is parsed. See construct_local_variable_tables(). */
591 keyword_group local_variables =
593 LOCAL_VARIABLE_TT, FALSE, FALSE
596 static int lexical_context(void)
598 /* The lexical context is a number representing all of the context
599 information in the lexical analyser: the same input text will
600 always translate to the same output tokens whenever the context
603 In fact, for efficiency reasons this number omits the bit of
604 information held in the variable "dont_enter_into_symbol_table".
605 Inform never needs to backtrack through tokens parsed in that
606 way (thankfully, as it would be expensive indeed to check
610 if (opcode_names.enabled) c |= 1;
611 if (directives.enabled) c |= 2;
612 if (trace_keywords.enabled) c |= 4;
613 if (segment_markers.enabled) c |= 8;
614 if (directive_keywords.enabled) c |= 16;
615 if (misc_keywords.enabled) c |= 32;
616 if (statements.enabled) c |= 64;
617 if (conditions.enabled) c |= 128;
618 if (system_functions.enabled) c |= 256;
619 if (system_constants.enabled) c |= 512;
620 if (local_variables.enabled) c |= 1024;
622 if (return_sp_as_variable) c |= 2048;
/* Print a human-readable decoding of a lexical-context bitmap (for
   tracing purposes). */
static void print_context(int c)
{
    if ((c & 1) != 0) printf("OPC ");
    if ((c & 2) != 0) printf("DIR ");
    if ((c & 4) != 0) printf("TK ");
    if ((c & 8) != 0) printf("SEG ");
    if ((c & 16) != 0) printf("DK ");
    if ((c & 32) != 0) printf("MK ");
    if ((c & 64) != 0) printf("STA ");
    if ((c & 128) != 0) printf("CND ");
    if ((c & 256) != 0) printf("SFUN ");
    if ((c & 512) != 0) printf("SCON ");
    if ((c & 1024) != 0) printf("LV ");
    if ((c & 2048) != 0) printf("sp ");
}
/* Hash chains over the keyword groups (built by make_keywords_tables). */
static int *keywords_hash_table;
static int *keywords_hash_ends_table;
static int *keywords_data_table;

/* Hash data for the current routine's local variables. */
static int *local_variable_hash_table;
static int *local_variable_hash_codes;
649 /* Note that MAX_LOCAL_VARIABLES is the maximum number of local variables
650 for this VM, *including* "sp" (the stack pointer "local").
651 This used to be a memory setting. Now it is a constant: 16 for Z-code,
655 /* Names of local variables in the current routine.
656 This is allocated to MAX_LOCAL_VARIABLES-1. (The stack pointer "local"
657 is not included in this array.)
659 (This could be a memlist, growing as needed up to MAX_LOCAL_VARIABLES-1.
660 But right now we just allocate the max.)
662 identstruct *local_variable_names;
664 static char one_letter_locals[128];
666 static void make_keywords_tables(void)
668 char **oplist, **maclist;
671 oplist = opcode_list_z;
672 maclist = opmacro_list_z;
675 oplist = opcode_list_g;
676 maclist = opmacro_list_g;
679 for (j=0; *(oplist[j]); j++) {
680 if (j >= MAX_KEYWORD_GROUP_SIZE) {
681 /* Gotta increase MAX_KEYWORD_GROUP_SIZE */
682 compiler_error("opcode_list has overflowed opcode_names.keywords");
685 opcode_names.keywords[j] = oplist[j];
687 opcode_names.keywords[j] = "";
689 for (j=0; *(maclist[j]); j++) {
690 if (j >= MAX_KEYWORD_GROUP_SIZE) {
691 /* Gotta increase MAX_KEYWORD_GROUP_SIZE */
692 compiler_error("opmacro_list has overflowed opcode_macros.keywords");
695 opcode_macros.keywords[j] = maclist[j];
697 opcode_macros.keywords[j] = "";
699 for (i=0; i<HASH_TAB_SIZE; i++)
700 { keywords_hash_table[i] = -1;
701 keywords_hash_ends_table[i] = -1;
704 for (i=1; i<=11; i++)
705 { keyword_group *kg = keyword_groups[i];
706 for (j=0; *(kg->keywords[j]) != 0; j++)
708 if (tp >= MAX_KEYWORDS) {
709 /* Gotta increase MAX_KEYWORDS */
710 compiler_error("keywords_data_table has overflowed MAX_KEYWORDS");
713 h = hash_code_from_string(kg->keywords[j]);
714 if (keywords_hash_table[h] == -1)
715 keywords_hash_table[h] = tp;
717 *(keywords_data_table + 3*(keywords_hash_ends_table[h]) + 2) = tp;
718 keywords_hash_ends_table[h] = tp;
719 *(keywords_data_table + 3*tp) = i;
720 *(keywords_data_table + 3*tp+1) = j;
721 *(keywords_data_table + 3*tp+2) = -1;
727 /* Look at the strings stored in local_variable_names (from 0 to no_locals).
728 Set local_variables.keywords to point to these, and also prepare the
730 extern void construct_local_variable_tables(void)
732 for (i=0; i<HASH_TAB_SIZE; i++) local_variable_hash_table[i] = -1;
733 for (i=0; i<128; i++) one_letter_locals[i] = MAX_LOCAL_VARIABLES;
735 for (i=0; i<no_locals; i++)
737 char *p = local_variable_names[i].text;
738 local_variables.keywords[i] = p;
740 { one_letter_locals[(uchar)p[0]] = i;
741 if (isupper(p[0])) one_letter_locals[tolower(p[0])] = i;
742 if (islower(p[0])) one_letter_locals[toupper(p[0])] = i;
744 h = hash_code_from_string(p);
745 if (local_variable_hash_table[h] == -1)
746 local_variable_hash_table[h] = i;
747 local_variable_hash_codes[i] = h;
749 /* Clear the rest. */
750 for (;i<MAX_LOCAL_VARIABLES-1;i++) {
751 local_variables.keywords[i] = "";
752 local_variable_hash_codes[i] = 0;
756 static void interpret_identifier(char *p, int pos, int dirs_only_flag)
757 { int index, hashcode;
759 /* An identifier is either a keyword or a "symbol", a name which the
760 lexical analyser leaves to higher levels of Inform to understand. */
762 hashcode = hash_code_from_string(p);
764 if (dirs_only_flag) goto KeywordSearch;
766 /* If this is assembly language, perhaps it is "sp"? */
768 if (return_sp_as_variable && (p[0]=='s') && (p[1]=='p') && (p[2]==0))
769 { circle[pos].value = 0; circle[pos].type = LOCAL_VARIABLE_TT;
773 /* Test for local variables first, quite quickly. */
775 if (local_variables.enabled)
777 { index = one_letter_locals[(uchar)p[0]];
778 if (index<MAX_LOCAL_VARIABLES)
779 { circle[pos].type = LOCAL_VARIABLE_TT;
780 circle[pos].value = index+1;
784 index = local_variable_hash_table[hashcode];
786 { for (;index<no_locals;index++)
787 { if (hashcode == local_variable_hash_codes[index])
788 { if (strcmpcis(p, local_variable_names[index].text)==0)
789 { circle[pos].type = LOCAL_VARIABLE_TT;
790 circle[pos].value = index+1;
798 /* Now the bulk of the keywords. Note that the lexer doesn't recognise
799 the name of a system function which has been Replaced. */
802 index = keywords_hash_table[hashcode];
804 { int *i = keywords_data_table + 3*index;
805 keyword_group *kg = keyword_groups[*i];
806 if (((!dirs_only_flag) && (kg->enabled))
807 || (dirs_only_flag && (kg == &directives)))
808 { char *q = kg->keywords[*(i+1)];
809 if (((kg->case_sensitive) && (strcmp(p, q)==0))
810 || ((!(kg->case_sensitive)) && (strcmpcis(p, q)==0)))
811 { if ((kg != &system_functions)
812 || (system_function_usage[*(i+1)]!=2))
813 { circle[pos].type = kg->change_token_type;
814 circle[pos].value = *(i+1);
822 if (dirs_only_flag) return;
824 /* Search for the name; create it if necessary. */
826 circle[pos].value = symbol_index(p, hashcode);
827 circle[pos].type = SYMBOL_TT;
831 /* ------------------------------------------------------------------------- */
832 /* The tokeniser grid aids a rapid decision about the consequences of a */
833 /* character reached in the buffer. In effect it is an efficiently stored */
834 /* transition table using an algorithm similar to that of S. C. Johnson's */
835 /* "yacc" lexical analyser (see Aho, Sethi and Ullman, section 3.9). */
836 /* My thanks to Dilip Sequeira for suggesting this. */
838 /* tokeniser_grid[c] is (16*n + m) if c is the first character of */
839 /* separator numbers n, n+1, ..., n+m-1 */
840 /* or certain special values (QUOTE_CODE, etc) */
843 /* Since 1000/16 = 62, the code numbers below will need increasing if the */
844 /* number of separators supported exceeds 61. */
845 /* ------------------------------------------------------------------------- */
/* Per-character dispatch table (see the comment block above). */
static int tokeniser_grid[256];

#define QUOTE_CODE      1000
#define DQUOTE_CODE     1001
#define NULL_CODE       1002
#define SPACE_CODE      1003
#define NEGATIVE_CODE   1004
#define DIGIT_CODE      1005
#define RADIX_CODE      1006
#define KEYWORD_CODE    1007
#define EOF_CODE        1008
#define WHITESPACE_CODE 1009
#define COMMENT_CODE    1010
#define IDENTIFIER_CODE 1011
862 /* This list cannot safely be changed without also changing the header
863 separator #defines. The ordering is significant in that (i) all entries
864 beginning with the same character must be adjacent and (ii) that if
865 X is a an initial substring of Y then X must come before Y.
867 E.g. --> must occur before -- to prevent "-->0" being tokenised
868 wrongly as "--", ">", "0" rather than "-->", "0". */
870 static const char separators[NUMBER_SEPARATORS][4] =
871 { "->", "-->", "--", "-", "++", "+", "*", "/", "%",
872 "||", "|", "&&", "&", "~~",
873 "~=", "~", "==", "=", ">=", ">",
874 "<=", "<", "(", ")", ",",
875 ".&", ".#", "..&", "..#", "..", ".",
876 "::", ":", "@", ";", "[", "]", "{", "}",
878 "#a$", "#g$", "#n$", "#r$", "#w$", "##", "#"
881 static void make_tokeniser_grid(void)
883 /* Construct the grid to the specification above. */
887 for (i=0; i<256; i++) tokeniser_grid[i]=0;
889 for (i=0; i<NUMBER_SEPARATORS; i++)
890 { j=separators[i][0];
891 if (tokeniser_grid[j]==0)
892 tokeniser_grid[j]=i*16+1; else tokeniser_grid[j]++;
894 tokeniser_grid['\''] = QUOTE_CODE;
895 tokeniser_grid['\"'] = DQUOTE_CODE;
896 tokeniser_grid[0] = EOF_CODE;
897 tokeniser_grid[' '] = WHITESPACE_CODE;
898 tokeniser_grid['\n'] = WHITESPACE_CODE;
899 tokeniser_grid['$'] = RADIX_CODE;
900 tokeniser_grid['!'] = COMMENT_CODE;
902 tokeniser_grid['0'] = DIGIT_CODE;
903 tokeniser_grid['1'] = DIGIT_CODE;
904 tokeniser_grid['2'] = DIGIT_CODE;
905 tokeniser_grid['3'] = DIGIT_CODE;
906 tokeniser_grid['4'] = DIGIT_CODE;
907 tokeniser_grid['5'] = DIGIT_CODE;
908 tokeniser_grid['6'] = DIGIT_CODE;
909 tokeniser_grid['7'] = DIGIT_CODE;
910 tokeniser_grid['8'] = DIGIT_CODE;
911 tokeniser_grid['9'] = DIGIT_CODE;
913 tokeniser_grid['a'] = IDENTIFIER_CODE;
914 tokeniser_grid['b'] = IDENTIFIER_CODE;
915 tokeniser_grid['c'] = IDENTIFIER_CODE;
916 tokeniser_grid['d'] = IDENTIFIER_CODE;
917 tokeniser_grid['e'] = IDENTIFIER_CODE;
918 tokeniser_grid['f'] = IDENTIFIER_CODE;
919 tokeniser_grid['g'] = IDENTIFIER_CODE;
920 tokeniser_grid['h'] = IDENTIFIER_CODE;
921 tokeniser_grid['i'] = IDENTIFIER_CODE;
922 tokeniser_grid['j'] = IDENTIFIER_CODE;
923 tokeniser_grid['k'] = IDENTIFIER_CODE;
924 tokeniser_grid['l'] = IDENTIFIER_CODE;
925 tokeniser_grid['m'] = IDENTIFIER_CODE;
926 tokeniser_grid['n'] = IDENTIFIER_CODE;
927 tokeniser_grid['o'] = IDENTIFIER_CODE;
928 tokeniser_grid['p'] = IDENTIFIER_CODE;
929 tokeniser_grid['q'] = IDENTIFIER_CODE;
930 tokeniser_grid['r'] = IDENTIFIER_CODE;
931 tokeniser_grid['s'] = IDENTIFIER_CODE;
932 tokeniser_grid['t'] = IDENTIFIER_CODE;
933 tokeniser_grid['u'] = IDENTIFIER_CODE;
934 tokeniser_grid['v'] = IDENTIFIER_CODE;
935 tokeniser_grid['w'] = IDENTIFIER_CODE;
936 tokeniser_grid['x'] = IDENTIFIER_CODE;
937 tokeniser_grid['y'] = IDENTIFIER_CODE;
938 tokeniser_grid['z'] = IDENTIFIER_CODE;
940 tokeniser_grid['A'] = IDENTIFIER_CODE;
941 tokeniser_grid['B'] = IDENTIFIER_CODE;
942 tokeniser_grid['C'] = IDENTIFIER_CODE;
943 tokeniser_grid['D'] = IDENTIFIER_CODE;
944 tokeniser_grid['E'] = IDENTIFIER_CODE;
945 tokeniser_grid['F'] = IDENTIFIER_CODE;
946 tokeniser_grid['G'] = IDENTIFIER_CODE;
947 tokeniser_grid['H'] = IDENTIFIER_CODE;
948 tokeniser_grid['I'] = IDENTIFIER_CODE;
949 tokeniser_grid['J'] = IDENTIFIER_CODE;
950 tokeniser_grid['K'] = IDENTIFIER_CODE;
951 tokeniser_grid['L'] = IDENTIFIER_CODE;
952 tokeniser_grid['M'] = IDENTIFIER_CODE;
953 tokeniser_grid['N'] = IDENTIFIER_CODE;
954 tokeniser_grid['O'] = IDENTIFIER_CODE;
955 tokeniser_grid['P'] = IDENTIFIER_CODE;
956 tokeniser_grid['Q'] = IDENTIFIER_CODE;
957 tokeniser_grid['R'] = IDENTIFIER_CODE;
958 tokeniser_grid['S'] = IDENTIFIER_CODE;
959 tokeniser_grid['T'] = IDENTIFIER_CODE;
960 tokeniser_grid['U'] = IDENTIFIER_CODE;
961 tokeniser_grid['V'] = IDENTIFIER_CODE;
962 tokeniser_grid['W'] = IDENTIFIER_CODE;
963 tokeniser_grid['X'] = IDENTIFIER_CODE;
964 tokeniser_grid['Y'] = IDENTIFIER_CODE;
965 tokeniser_grid['Z'] = IDENTIFIER_CODE;
967 tokeniser_grid['_'] = IDENTIFIER_CODE;
970 /* ------------------------------------------------------------------------- */
971 /* Definition of a lexical block: a source file or a string containing */
972 /* text for lexical analysis; an independent source from the point of */
973 /* view of issuing error reports. */
974 /* ------------------------------------------------------------------------- */
976 typedef struct LexicalBlock_s
977 { char *filename; /* Full translated name */
978 int main_flag; /* TRUE if the main file
979 (the first one opened) */
980 int sys_flag; /* TRUE if a System_File */
981 int source_line; /* Line number count */
982 int line_start; /* Char number within file
983 where the current line
985 int chars_read; /* Char number of read pos */
986 int file_no; /* Or 255 if not from a
989 char *orig_source; /* From #origsource direct */
995 static LexicalBlock NoFileOpen =
996 { "<before compilation>", FALSE, FALSE, 0, 0, 0, 255, NULL, 0, 0, 0 };
998 static LexicalBlock MakingOutput =
999 { "<constructing output>", FALSE, FALSE, 0, 0, 0, 255, NULL, 0, 0, 0 };
1001 static LexicalBlock StringLB =
1002 { "<veneer routine>", FALSE, TRUE, 0, 0, 0, 255, NULL, 0, 0, 0 };
1004 static LexicalBlock *CurrentLB; /* The current lexical
1005 block of input text */
1007 extern void declare_systemfile(void)
1008 { CurrentLB->sys_flag = TRUE;
1011 extern int is_systemfile(void)
1012 { return ((CurrentLB->sys_flag)?1:0);
1015 extern void set_origsource_location(char *source, int32 line, int32 charnum)
1020 /* Clear the Origsource declaration. */
1021 CurrentLB->orig_file = 0;
1022 CurrentLB->orig_source = NULL;
1023 CurrentLB->orig_line = 0;
1024 CurrentLB->orig_char = 0;
1028 /* Get the file number for a new or existing InputFiles entry. */
1029 file_no = register_orig_sourcefile(source);
1031 CurrentLB->orig_file = file_no;
1032 CurrentLB->orig_source = InputFiles[file_no-1].filename;
1033 CurrentLB->orig_line = line;
1034 CurrentLB->orig_char = charnum;
1037 /* Error locations. */
1039 extern debug_location get_current_debug_location(void)
1040 { debug_location result;
1041 /* Assume that all input characters are one byte. */
1042 result.file_index = CurrentLB->file_no;
1043 result.beginning_byte_index = CurrentLB->chars_read - LOOKAHEAD_SIZE;
1044 result.end_byte_index = result.beginning_byte_index;
1045 result.beginning_line_number = CurrentLB->source_line;
1046 result.end_line_number = result.beginning_line_number;
1047 result.beginning_character_number =
1048 CurrentLB->chars_read - CurrentLB->line_start;
1049 result.end_character_number = result.beginning_character_number;
1050 result.orig_file_index = CurrentLB->orig_file;
1051 result.orig_beg_line_number = CurrentLB->orig_line;
1052 result.orig_beg_char_number = CurrentLB->orig_char;
1056 static debug_location ErrorReport_debug_location;
1058 extern void report_errors_at_current_line(void)
1059 { ErrorReport.line_number = CurrentLB->source_line;
1060 ErrorReport.file_number = CurrentLB->file_no;
1061 if (ErrorReport.file_number == 255)
1062 ErrorReport.file_number = -1;
1063 ErrorReport.source = CurrentLB->filename;
1064 ErrorReport.main_flag = CurrentLB->main_flag;
1065 if (debugfile_switch)
1066 ErrorReport_debug_location = get_current_debug_location();
1067 ErrorReport.orig_file = CurrentLB->orig_file;
1068 ErrorReport.orig_source = CurrentLB->orig_source;
1069 ErrorReport.orig_line = CurrentLB->orig_line;
1070 ErrorReport.orig_char = CurrentLB->orig_char;
1073 extern debug_location get_error_report_debug_location(void)
1074 { return ErrorReport_debug_location;
1077 extern int32 get_current_line_start(void)
1078 { return CurrentLB->line_start;
1081 brief_location blank_brief_location;
1083 extern brief_location get_brief_location(ErrorPosition *errpos)
1086 loc.file_index = errpos->file_number;
1087 loc.line_number = errpos->line_number;
1088 loc.orig_file_index = errpos->orig_file;
1089 loc.orig_line_number = errpos->orig_line;
1093 extern void export_brief_location(brief_location loc, ErrorPosition *errpos)
1095 if (loc.file_index != -1)
1096 { errpos->file_number = loc.file_index;
1097 errpos->line_number = loc.line_number;
1098 errpos->main_flag = (errpos->file_number == 1);
1099 errpos->orig_source = NULL;
1100 errpos->orig_file = loc.orig_file_index;
1101 errpos->orig_line = loc.orig_line_number;
1102 errpos->orig_char = 0;
1106 /* ------------------------------------------------------------------------- */
1107 /* Hash printing and line counting */
1108 /* ------------------------------------------------------------------------- */
1110 static void print_hash(void)
1112 /* Hash-printing is the practice of printing a # character every 100
1113 lines of source code (the -x switch), reassuring the user that
1114 progress is being made */
1116 if (no_hash_printed_yet)
1117 { printf("::"); no_hash_printed_yet = FALSE;
1119 printf("#"); hash_printed_since_newline = TRUE;
1122 /* On some systems, text output is buffered to a line at a time, and
1123 this would frustrate the point of hash-printing, so: */
1129 static void reached_new_line(void)
1131 /* Called to signal that a new line has been reached in the source code */
1133 forerrors_pointer = 0;
1135 CurrentLB->source_line++;
1136 CurrentLB->line_start = CurrentLB->chars_read;
1138 total_source_line_count++;
1140 if (total_source_line_count%100==0)
1141 { if (hash_switch) print_hash();
1143 SpinCursor(32); /* I.e., allow other tasks to run */
1148 if (total_source_line_count%((**g_pm_hndl).linespercheck) == 0)
1149 { ProcessEvents (&g_proc);
1153 abort_transcript_file();
1154 longjmp (g_fallback, 1);
1160 static void new_syntax_line(void)
1161 { if (source_to_analyse != NULL) forerrors_pointer = 0;
1162 report_errors_at_current_line();
1165 /* Return 10 raised to the expo power.
1167 * I'm avoiding the standard pow() function for a rather lame reason:
1168 * it's in the libmath (-lm) library, and I don't want to change the
1169 * build model for the compiler. So, this is implemented with a stupid
1170 * lookup table. It's faster than pow() for small values of expo.
1171 * Probably not as fast if expo is 200, but "$+1e200" is an overflow
1172 * anyway, so I don't expect that to be a problem.
1174 * (For some reason, frexp() and ldexp(), which are used later on, do
1175 * not require libmath to be linked in.)
1177 static double pow10_cheap(int expo)
1179 #define POW10_RANGE (8)
1180 static double powers[POW10_RANGE*2+1] = {
1181 0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1,
1183 10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, 10000000.0, 100000000.0
1189 for (; expo < -POW10_RANGE; expo += POW10_RANGE) {
1192 return res * powers[POW10_RANGE+expo];
1195 for (; expo > POW10_RANGE; expo -= POW10_RANGE) {
1196 res *= powers[POW10_RANGE*2];
1198 return res * powers[POW10_RANGE+expo];
1202 /* Return the IEEE-754 single-precision encoding of a floating-point
1205 * The number is provided in the pieces it was parsed in:
1206 * [+|-] intv "." fracv "e" [+|-]expo
1208 * If the magnitude is too large (beyond about 3.4e+38), this returns
1209 * an infinite value (0x7f800000 or 0xff800000). If the magnitude is too
1210 * small (below about 1e-45), this returns a zero value (0x00000000 or
1211 * 0x80000000). If any of the inputs are NaN, this returns NaN (but the
1212 * lexer should never do that).
1214 * Note that using a float constant does *not* set the uses_float_features
1215 * flag (which would cause the game file to be labelled 3.1.2). There's
1216 * no VM feature here, just an integer. Of course, any use of the float
1217 * *opcodes* will set the flag.
1219 * The math functions in this routine require #including <math.h>, but
1220 * they should not require linking the math library (-lm). At least,
1221 * they do not on OSX and Linux.
1223 static int32 construct_float(int signbit, double intv, double fracv, int expo)
1225 double absval = (intv + fracv) * pow10_cheap(expo);
1226 int32 sign = (signbit ? 0x80000000 : 0x0);
1230 if (isinf(absval)) {
1231 return sign | 0x7f800000; /* infinity */
1233 if (isnan(absval)) {
1234 return sign | 0x7fc00000;
1237 mant = frexp(absval, &expo);
1239 /* Normalize mantissa to be in the range [1.0, 2.0) */
1240 if (0.5 <= mant && mant < 1.0) {
1244 else if (mant == 0.0) {
1248 return sign | 0x7f800000; /* infinity */
1252 return sign | 0x7f800000; /* infinity */
1254 else if (expo < -126) {
1255 /* Denormalized (very small) number */
1256 mant = ldexp(mant, 126 + expo);
1259 else if (!(expo == 0 && mant == 0.0)) {
1261 mant -= 1.0; /* Get rid of leading 1 */
1264 mant *= 8388608.0; /* 2^23 */
1265 fbits = (int32)(mant + 0.5); /* round mant to nearest int */
1267 /* The carry propagated out of a string of 23 1 bits. */
1271 return sign | 0x7f800000; /* infinity */
1275 return (sign) | ((int32)(expo << 23)) | (fbits);
1278 /* ------------------------------------------------------------------------- */
1279 /* Characters are read via a "pipeline" of variables, allowing us to look */
1280 /* up to three characters ahead of the current position. */
1282 /* There are two possible sources: from the source files being loaded in, */
1283 /* and from a string inside Inform (which is where the code for veneer */
1284 /* routines comes from). Each source has its own get-next-character */
1286 /* ------------------------------------------------------------------------- */
1287 /* Source 1: from files */
1289 /* Note that file_load_chars(p, size) loads "size" bytes into buffer "p" */
1290 /* from the current input file. If the file runs out, then if it was */
1291 /* the last source file 4 EOF characters are placed in the buffer: if it */
1292 /* was only an Include file ending, then a '\n' character is placed there */
1293 /* (essentially to force termination of any comment line) followed by */
1294 /* three harmless spaces. */
1296 /* The routine returns the number of characters it has written, and note */
1297 /* that this conveniently ensures that all characters in the buffer come */
1298 /* from the same file. */
1299 /* ------------------------------------------------------------------------- */
1301 #define SOURCE_BUFFER_SIZE 4096 /* Typical disc block size */
1303 typedef struct Sourcefile_s
1304 { char *buffer; /* Input buffer */
1305 int read_pos; /* Read position in buffer */
1306 int size; /* Number of meaningful
1307 characters in buffer */
1308 int la, la2, la3; /* Three characters of
1309 lookahead pipeline */
1310 int file_no; /* Internal file number
1315 static Sourcefile *FileStack; /* Allocated to FileStack_max */
1316 static memory_list FileStack_memlist;
1317 static int FileStack_max; /* The highest value that File_sp has
1319 (Filestack entries to this depth have
1320 a buffer allocated) */
1322 static int File_sp; /* Current stack pointer */
1323 static Sourcefile *CF; /* Top entry on stack (always equal to
1324 FileStack[File_sp-1]) */
1326 static int last_input_file;
1328 /* Set CF and CurrentLB.
1329 This does not increment File_sp; the caller must do that. */
1330 static void begin_buffering_file(int i, int file_no)
1331 { int j, cnt; uchar *p;
1336 ensure_memory_list_available(&FileStack_memlist, i+1);
1337 while (i >= FileStack_max) {
1338 FileStack[FileStack_max++].buffer = my_malloc(SOURCE_BUFFER_SIZE+4, "source file buffer");
1341 p = (uchar *) FileStack[i].buffer;
1344 { FileStack[i-1].la = lookahead;
1345 FileStack[i-1].la2 = lookahead2;
1346 FileStack[i-1].la3 = lookahead3;
1349 FileStack[i].file_no = file_no;
1350 FileStack[i].size = file_load_chars(file_no,
1351 (char *) p, SOURCE_BUFFER_SIZE);
1352 /* If the file is shorter than SOURCE_BUFFER_SIZE, it's now closed already. We still need to set up the file entry though. */
1354 lookahead = source_to_iso_grid[p[0]];
1355 lookahead2 = source_to_iso_grid[p[1]];
1356 lookahead3 = source_to_iso_grid[p[2]];
1357 if (LOOKAHEAD_SIZE != 3)
1359 ("Lexer lookahead size does not match hard-coded lookahead code");
1360 FileStack[i].read_pos = LOOKAHEAD_SIZE;
1362 if (file_no==1) FileStack[i].LB.main_flag = TRUE;
1363 else FileStack[i].LB.main_flag = FALSE;
1364 FileStack[i].LB.sys_flag = FALSE;
1365 FileStack[i].LB.source_line = 1;
1366 FileStack[i].LB.line_start = LOOKAHEAD_SIZE;
1367 FileStack[i].LB.chars_read = LOOKAHEAD_SIZE;
1368 FileStack[i].LB.filename = InputFiles[file_no-1].filename;
1369 FileStack[i].LB.file_no = file_no;
1370 FileStack[i].LB.orig_source = NULL; FileStack[i].LB.orig_file = 0;
1371 FileStack[i].LB.orig_line = 0; FileStack[i].LB.orig_char = 0;
1373 InputFiles[file_no-1].initial_buffering = FALSE;
1375 CurrentLB = &(FileStack[i].LB);
1376 CF = &(FileStack[i]);
1378 /* Check for recursive inclusion */
1381 { if (!strcmp(FileStack[i].LB.filename, FileStack[j].LB.filename))
1385 warning_named("File included more than once",
1386 FileStack[j].LB.filename);
1389 static void create_char_pipeline(void)
1392 begin_buffering_file(File_sp++, 1);
1393 pipeline_made = TRUE;
1394 last_input_file = current_input_file;
1397 static int get_next_char_from_pipeline(void)
1400 while (last_input_file < current_input_file)
1402 /* An "Include" file must have opened since the last character
1403 was read. Perhaps more than one. We run forward through the
1404 list and add them to the include stack. But we ignore
1405 "Origsource" files (which were never actually opened for
1409 if (!InputFiles[last_input_file-1].is_input)
1412 begin_buffering_file(File_sp++, last_input_file);
1414 if (last_input_file != current_input_file)
1415 compiler_error("last_input_file did not match after Include");
1418 { lookahead = 0; lookahead2 = 0; lookahead3 = 0; return 0;
1421 if (CF->read_pos == CF->size)
1423 file_load_chars(CF->file_no, CF->buffer, SOURCE_BUFFER_SIZE);
1427 if (CF->read_pos == -(CF->size))
1428 { set_token_location(get_current_debug_location());
1431 { lookahead = 0; lookahead2 = 0; lookahead3 = 0; return 0;
1433 CF = &(FileStack[File_sp-1]);
1434 CurrentLB = &(FileStack[File_sp-1].LB);
1435 lookahead = CF->la; lookahead2 = CF->la2; lookahead3 = CF->la3;
1436 if (CF->read_pos == CF->size)
1438 file_load_chars(CF->file_no, CF->buffer, SOURCE_BUFFER_SIZE);
1441 set_token_location(get_current_debug_location());
1444 p = (uchar *) (CF->buffer);
1446 current = lookahead;
1447 lookahead = lookahead2;
1448 lookahead2 = lookahead3;
1449 lookahead3 = source_to_iso_grid[p[CF->read_pos++]];
1451 CurrentLB->chars_read++;
1452 if (forerrors_pointer < FORERRORS_SIZE-1)
1453 forerrors_buff[forerrors_pointer++] = current;
1454 if (current == '\n') reached_new_line();
1458 /* ------------------------------------------------------------------------- */
1459 /* Source 2: from a string */
1460 /* ------------------------------------------------------------------------- */
1462 static int source_to_analyse_pointer; /* Current read position */
1464 static int get_next_char_from_string(void)
1465 { uchar *p = (uchar *) source_to_analyse + source_to_analyse_pointer++;
1466 current = source_to_iso_grid[p[0]];
1468 if (current == 0) lookahead = 0;
1469 else lookahead = source_to_iso_grid[p[1]];
1470 if (lookahead == 0) lookahead2 = 0;
1471 else lookahead2 = source_to_iso_grid[p[2]];
1472 if (lookahead2 == 0) lookahead3 = 0;
1473 else lookahead3 = source_to_iso_grid[p[3]];
1475 CurrentLB->chars_read++;
1476 if (forerrors_pointer < FORERRORS_SIZE-1)
1477 forerrors_buff[forerrors_pointer++] = current;
1478 if (current == '\n') reached_new_line();
1482 /* ========================================================================= */
1483 /* The interface between the lexer and Inform's higher levels: */
1485 /* put_token_back() (effectively) move the read position */
1486 /* back by one token */
1488 /* get_next_token() copy the token at the current read */
1489 /* position into the triple */
1490 /* (token_type, token_value, token_text) */
1491 /* and move the read position forward */
1494 /* release_token_texts() discard all the tokens that have been */
1495 /* read in, except for put-back ones */
1497 /* restart_lexer(source, name) if source is NULL, initialise the lexer */
1498 /* to read from source files; */
1499 /* otherwise, to read from this string. */
1500 /* ------------------------------------------------------------------------- */
1502 extern void release_token_texts(void)
1504 /* This is called at the beginning of every (top-level) directive and
1505 every statement. It drops all token usage so that the lextexts
1506 array can be reused.
1508 Call this immediately before a get_next_token() call.
1510 This should *not* be called within parse_expression(). Expression
1511 code generation relies on token data being retained across the whole
1518 if (tokens_put_back == 0) {
1523 /* If any tokens have been put back, we have to preserve their text.
1524 Move their lextext usage to the head of the lextexts array. */
1526 for (ix=0; ix<tokens_put_back; ix++) {
1529 int pos = circle_position - tokens_put_back + 1 + ix;
1530 if (pos < 0) pos += CIRCLE_SIZE;
1532 oldpos = circle[pos].lextext;
1533 circle[pos].lextext = ix;
1534 /* Swap the entire lextext structure (including size) */
1535 temp = lextexts[ix];
1536 lextexts[ix] = lextexts[oldpos];
1537 lextexts[oldpos] = temp;
1539 cur_lextexts = tokens_put_back;
1542 extern void put_token_back(void)
1543 { tokens_put_back++;
1545 if (tokens_trace_level > 0)
1546 { if (tokens_trace_level == 1) printf("<- ");
1547 else printf("<-\n");
1550 /* The following error, of course, should never happen! */
1552 if (tokens_put_back == CIRCLE_SIZE)
1553 { compiler_error("The lexical analyser has collapsed because of a wrong \
1554 assumption inside Inform");
1560 /* The get_next_token() code reads characters into the current lextext,
1561 which is lextexts[lex_index]. It uses these routines to add and remove
1562 characters, reallocing when necessary.
1564 lex_pos is the current number of characters in the lextext. It is
1565 not necessarily null-terminated until get_next_token() completes.
1568 /* Add one character */
1569 static void lexaddc(char ch)
1571 if ((size_t)lex_pos >= lextexts[lex_index].size) {
1572 size_t newsize = lextexts[lex_index].size * 2;
1573 my_realloc(&lextexts[lex_index].text, lextexts[lex_index].size, newsize, "one lexeme text");
1574 lextexts[lex_index].size = newsize;
1576 lextexts[lex_index].text[lex_pos++] = ch;
1579 /* Remove the last character and ensure it's null-terminated */
1580 static void lexdelc(void)
1585 lextexts[lex_index].text[lex_pos] = 0;
1588 /* Return the last character */
1589 static char lexlastc(void)
1594 return lextexts[lex_index].text[lex_pos-1];
1597 /* Add a string of characters (including the null) */
1598 static void lexadds(char *str)
1607 extern void get_next_token(void)
1608 { int d, i, j, k, quoted_size, e, radix, context; int32 n; char *r;
1609 int returning_a_put_back_token = TRUE;
1611 context = lexical_context();
1613 if (tokens_put_back > 0)
1614 { i = circle_position - tokens_put_back + 1;
1615 if (i<0) i += CIRCLE_SIZE;
1617 if (context != circle[i].context)
1618 { j = circle[i].type;
1619 if ((j==0) || ((j>=100) && (j<200)))
1620 interpret_identifier(circle[i].text, i, FALSE);
1621 circle[i].context = context;
1625 returning_a_put_back_token = FALSE;
1627 if (circle_position == CIRCLE_SIZE-1) circle_position = 0;
1628 else circle_position++;
1630 lex_index = cur_lextexts++;
1631 if (lex_index >= no_lextexts) {
1632 /* fresh lextext block; must init it */
1633 no_lextexts = lex_index+1;
1634 ensure_memory_list_available(&lextexts_memlist, no_lextexts);
1635 lextexts[lex_index].size = MAX_IDENTIFIER_LENGTH + 1;
1636 lextexts[lex_index].text = my_malloc(lextexts[lex_index].size, "one lexeme text");
1639 lextexts[lex_index].text[0] = 0; /* start with an empty string */
1641 circle[circle_position].lextext = lex_index;
1642 circle[circle_position].text = NULL; /* will fill in later */
1643 circle[circle_position].value = 0;
1644 circle[circle_position].type = 0;
1645 circle[circle_position].context = context;
1648 d = (*get_next_char)();
1649 e = tokeniser_grid[d];
1651 if (next_token_begins_syntax_line)
1652 { if ((e != WHITESPACE_CODE) && (e != COMMENT_CODE))
1653 { new_syntax_line();
1654 next_token_begins_syntax_line = FALSE;
1658 circle[circle_position].location = get_current_debug_location();
1661 { case 0: char_error("Illegal character found in source:", d);
1662 goto StartTokenAgain;
1664 case WHITESPACE_CODE:
1665 while (tokeniser_grid[lookahead] == WHITESPACE_CODE)
1667 goto StartTokenAgain;
1670 while ((lookahead != '\n') && (lookahead != 0))
1672 goto StartTokenAgain;
1675 circle[circle_position].type = EOF_TT;
1676 lexadds("<end of file>");
1684 { n = n*radix + character_digit_value[d];
1686 } while ((character_digit_value[lookahead] < radix)
1687 && (d = (*get_next_char)(), TRUE));
1690 circle[circle_position].type = NUMBER_TT;
1691 circle[circle_position].value = n;
1695 { int expo=0; double intv=0, fracv=0;
1696 int expocount=0, intcount=0, fraccount=0;
1697 int signbit = (d == '-');
1699 while (character_digit_value[lookahead] < 10) {
1700 intv = 10.0*intv + character_digit_value[lookahead];
1705 if (lookahead == '.') {
1706 double fracpow = 1.0;
1709 while (character_digit_value[lookahead] < 10) {
1711 fracv = fracv + fracpow*character_digit_value[lookahead];
1717 if (lookahead == 'e' || lookahead == 'E') {
1721 if (lookahead == '+' || lookahead == '-') {
1722 exposign = (lookahead == '-');
1726 while (character_digit_value[lookahead] < 10) {
1727 expo = 10*expo + character_digit_value[lookahead];
1733 error("Floating-point literal must have digits after the 'e'");
1734 if (exposign) { expo = -expo; }
1736 if (intcount + fraccount == 0)
1737 error("Floating-point literal must have digits");
1738 n = construct_float(signbit, intv, fracv, expo);
1741 circle[circle_position].type = NUMBER_TT;
1742 circle[circle_position].value = n;
1743 if (!glulx_mode && dont_enter_into_symbol_table != -2) error("Floating-point literals are not available in Z-code");
1747 radix = 16; d = (*get_next_char)();
1748 if (d == '-' || d == '+') { goto FloatNumber; }
1749 if (d == '$') { d = (*get_next_char)(); radix = 2; }
1750 if (character_digit_value[d] >= radix)
1752 error("Binary number expected after '$$'");
1754 error("Hexadecimal number expected after '$'");
1758 case QUOTE_CODE: /* Single-quotes: scan a literal string */
1761 { e = d; d = (*get_next_char)(); lexaddc(d);
1762 if (quoted_size++==64)
1764 "Too much text for one pair of quotations '...' to hold");
1765 lexaddc('\''); break;
1767 if ((d == '\'') && (e != '@'))
1768 { if (quoted_size == 1)
1769 { d = (*get_next_char)(); lexaddc(d);
1771 error("No text between quotation marks ''");
1776 if (d==EOF) ebf_error("'\''", "end of file");
1778 circle[circle_position].type = SQ_TT;
1781 case DQUOTE_CODE: /* Double-quotes: scan a literal string */
1784 { d = (*get_next_char)(); lexaddc(d);
1787 while (lexlastc() == ' ') lex_pos--;
1788 if (lexlastc() != '^') lexaddc(' ');
1789 while ((lookahead != EOF) &&
1790 (tokeniser_grid[lookahead] == WHITESPACE_CODE))
1794 { int newline_passed = FALSE;
1796 while ((lookahead != EOF) &&
1797 (tokeniser_grid[lookahead] == WHITESPACE_CODE))
1798 if ((d = (*get_next_char)()) == '\n')
1799 newline_passed = TRUE;
1800 if (!newline_passed)
1802 chb[0] = '\"'; chb[1] = lookahead;
1803 chb[2] = '\"'; chb[3] = 0;
1804 ebf_error("empty rest of line after '\\' in string",
1808 } while ((d != EOF) && (d!='\"'));
1809 if (d==EOF) ebf_error("'\"'", "end of file");
1811 circle[circle_position].type = DQ_TT;
1814 case IDENTIFIER_CODE: /* Letter or underscore: an identifier */
1817 while ((n<=MAX_IDENTIFIER_LENGTH)
1818 && ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
1819 || (tokeniser_grid[lookahead] == DIGIT_CODE)))
1820 n++, lexaddc((*get_next_char)());
1824 if (n > MAX_IDENTIFIER_LENGTH)
1825 { char bad_length[100];
1827 "Name exceeds the maximum length of %d characters:",
1828 MAX_IDENTIFIER_LENGTH);
1829 error_named(bad_length, lextexts[lex_index].text);
1830 /* Eat any further extra characters in the identifier */
1831 while (((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
1832 || (tokeniser_grid[lookahead] == DIGIT_CODE)))
1834 /* Trim token so that it doesn't violate
1835 MAX_IDENTIFIER_LENGTH during error recovery */
1836 lextexts[lex_index].text[MAX_IDENTIFIER_LENGTH] = 0;
1839 if (dont_enter_into_symbol_table)
1840 { circle[circle_position].type = DQ_TT;
1841 circle[circle_position].value = 0;
1842 if (dont_enter_into_symbol_table == -2)
1843 interpret_identifier(lextexts[lex_index].text, circle_position, TRUE);
1847 interpret_identifier(lextexts[lex_index].text, circle_position, FALSE);
1852 /* The character is initial to at least one of the separators */
1854 for (j=e>>4, k=j+(e&0x0f); j<k; j++)
1855 { r = (char *) separators[j];
1859 goto SeparatorMatched;
1863 { if (*(r+1) == lookahead)
1865 lexaddc((*get_next_char)());
1867 goto SeparatorMatched;
1871 { if ((*(r+1) == lookahead) && (*(r+2) == lookahead2))
1873 lexaddc((*get_next_char)());
1874 lexaddc((*get_next_char)());
1876 goto SeparatorMatched;
1881 /* The following contingency never in fact arises with the
1882 current set of separators, but might in future */
1884 lexaddc(d); lexaddc(lookahead); lexaddc(lookahead2);
1886 error_named("Unrecognised combination in source:",
1887 lextexts[lex_index].text);
1888 goto StartTokenAgain;
1892 circle[circle_position].type = SEP_TT;
1893 circle[circle_position].value = j;
1895 { case SEMICOLON_SEP: break;
1896 case HASHNDOLLAR_SEP:
1897 case HASHWDOLLAR_SEP:
1898 if (tokeniser_grid[lookahead] == WHITESPACE_CODE)
1899 { error_named("Character expected after",
1900 lextexts[lex_index].text);
1904 lexaddc((*get_next_char)());
1905 while ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
1906 || (tokeniser_grid[lookahead] == DIGIT_CODE))
1907 lexaddc((*get_next_char)());
1910 case HASHADOLLAR_SEP:
1911 case HASHGDOLLAR_SEP:
1912 case HASHRDOLLAR_SEP:
1914 if (tokeniser_grid[lookahead] != IDENTIFIER_CODE)
1915 { error_named("Alphabetic character expected after",
1916 lextexts[lex_index].text);
1920 while ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
1921 || (tokeniser_grid[lookahead] == DIGIT_CODE))
1922 lexaddc((*get_next_char)());
1929 /* We can now assign the text pointer, since the lextext has finished reallocing. */
1930 circle[circle_position].text = lextexts[lex_index].text;
1931 i = circle_position;
1934 /* We've either parsed a new token or selected a put-back one.
1935 i is the circle-position of the token in question. Time to
1936 export the token data where higher-level code can find it. */
1937 token_value = circle[i].value;
1938 token_type = circle[i].type;
1939 token_text = circle[i].text;
1940 if (!returning_a_put_back_token)
1941 { set_token_location(circle[i].location);
1944 if (tokens_trace_level > 0)
1945 { if (tokens_trace_level == 1) {
1946 printf("'%s' ", circle[i].text);
1947 if (circle[i].type == EOF_TT) printf("\n");
1950 { printf("-> "); describe_token(&circle[i]);
1952 if (tokens_trace_level > 2) print_context(circle[i].context);
1958 static char veneer_error_title[64];
1960 extern void restart_lexer(char *lexical_source, char *name)
1962 circle_position = 0;
1963 for (i=0; i<CIRCLE_SIZE; i++)
1964 { circle[i].type = 0;
1965 circle[i].value = 0;
1966 circle[i].text = "(if this is ever visible, there is a bug)";
1967 circle[i].lextext = -1;
1968 circle[i].context = 0;
1972 /* But we don't touch no_lextexts; those allocated blocks can be reused */
1976 tokens_put_back = 0;
1977 forerrors_pointer = 0;
1978 dont_enter_into_symbol_table = FALSE;
1979 return_sp_as_variable = FALSE;
1980 next_token_begins_syntax_line = TRUE;
1982 source_to_analyse = lexical_source;
1984 if (source_to_analyse == NULL)
1985 { get_next_char = get_next_char_from_pipeline;
1986 if (!pipeline_made) create_char_pipeline();
1987 forerrors_buff[0] = 0; forerrors_pointer = 0;
1990 { get_next_char = get_next_char_from_string;
1991 source_to_analyse_pointer = 0;
1992 CurrentLB = &StringLB;
1993 sprintf(veneer_error_title, "<veneer routine '%s'>", name);
1994 StringLB.filename = veneer_error_title;
1996 CurrentLB->source_line = 1;
1997 CurrentLB->line_start = 0;
1998 CurrentLB->chars_read = 0;
2002 /* ========================================================================= */
2003 /* Data structure management routines */
2004 /* ------------------------------------------------------------------------- */
2006 extern void init_lexer_vars(void)
2019 blank_brief_location.file_index = -1;
2020 blank_brief_location.line_number = 0;
2021 blank_brief_location.orig_file_index = 0;
2022 blank_brief_location.orig_line_number = 0;
/* Reset lexer state before the prepass: zero the running source-line
   count, mark that no source file is currently open, and flush any
   error reports pending against the previous current line. */
2025 extern void lexer_begin_prepass(void)
2026 { total_source_line_count = 0;
2027 CurrentLB = &NoFileOpen;
2028 report_errors_at_current_line();
/* Begin a compilation pass: reset the progress-hash bookkeeping (see
   the flag comments at the top of the file), clear pipeline_made so
   the character pipeline is rebuilt, and restart the lexer reading
   from the real source files (NULL source). */
2031 extern void lexer_begin_pass(void)
2032 { no_hash_printed_yet = TRUE;
2033 hash_printed_since_newline = FALSE;
2035 pipeline_made = FALSE;
2037 restart_lexer(NULL, NULL);
/* End the pass: switch the current location to the "making output"
   pseudo-file and report any errors still pending against it. */
2040 extern void lexer_endpass(void)
2041 { CurrentLB = &MakingOutput;
2042 report_errors_at_current_line();
/* Allocate the lexer's dynamic data structures: the source-file stack,
   the lexeme-text list, the keyword and local-variable hash tables,
   the tokeniser grid and keyword tables, and the head of the linked
   list of token locations kept for the debugging-information file. */
2045 extern void lexer_allocate_arrays(void)
2047 initialise_memory_list(&FileStack_memlist,
2048 sizeof(Sourcefile), 4, (void**)&FileStack,
2049 "source file stack");
2052 initialise_memory_list(&lextexts_memlist,
2053 sizeof(lextext), 200, (void**)&lextexts,
2054 "lexeme texts list");
/* Keyword hashing: head and tail tables per hash bucket, plus three
   ints of linked-list data per keyword */
2057 keywords_hash_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
2058 "keyword hash table");
2059 keywords_hash_ends_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
2060 "keyword hash end table");
2061 keywords_data_table = my_calloc(sizeof(int), 3*MAX_KEYWORDS,
2062 "keyword hashing linked list");
/* NOTE(review): the names table is sized MAX_LOCAL_VARIABLES-1 while
   the hash-codes table is MAX_LOCAL_VARIABLES; presumably one "local"
   (the stack pointer?) needs no stored name -- confirm before changing */
2064 local_variable_names = my_calloc(sizeof(identstruct), MAX_LOCAL_VARIABLES-1,
2065 "text of local variable names");
2066 local_variable_hash_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
2067 "local variable hash table");
2068 local_variable_hash_codes = my_calloc(sizeof(int), MAX_LOCAL_VARIABLES,
2069 "local variable hash codes");
2071 make_tokeniser_grid();
2072 make_keywords_tables();
/* Start the debug-location list as a single zeroed, unreferenced entry
   which is both head and tail */
2074 first_token_locations =
2075 my_malloc(sizeof(debug_locations), "debug locations of recent tokens");
2076 first_token_locations->location.file_index = 0;
2077 first_token_locations->location.beginning_byte_index = 0;
2078 first_token_locations->location.end_byte_index = 0;
2079 first_token_locations->location.beginning_line_number = 0;
2080 first_token_locations->location.end_line_number = 0;
2081 first_token_locations->location.beginning_character_number = 0;
2082 first_token_locations->location.end_character_number = 0;
2083 first_token_locations->location.orig_file_index = 0;
2084 first_token_locations->location.orig_beg_line_number = 0;
2085 first_token_locations->location.orig_beg_char_number = 0;
2086 first_token_locations->next = NULL;
2087 first_token_locations->reference_count = 0;
2088 last_token_location = first_token_locations;
/* Free everything lexer_allocate_arrays() created.  The per-entry
   buffers inside the two memory lists (file buffers, lexeme texts)
   must be freed individually before the lists themselves are
   deallocated; finally all recorded token locations are released. */
2091 extern void lexer_free_arrays(void)
2096 for (ix=0; ix<FileStack_max; ix++) {
2097 my_free(&FileStack[ix].buffer, "source file buffer");
2099 deallocate_memory_list(&FileStack_memlist);
2101 for (ix=0; ix<no_lextexts; ix++) {
2102 my_free(&lextexts[ix].text, "one lexeme text");
2104 deallocate_memory_list(&lextexts_memlist);
2106 my_free(&keywords_hash_table, "keyword hash table");
2107 my_free(&keywords_hash_ends_table, "keyword hash end table");
2108 my_free(&keywords_data_table, "keyword hashing linked list");
2110 my_free(&local_variable_names, "text of local variable names");
2111 my_free(&local_variable_hash_table, "local variable hash table");
2112 my_free(&local_variable_hash_codes, "local variable hash codes");
/* NULL argument: release the entire token-location list, not just
   locations up to a given point -- TODO(review): confirm semantics */
2114 cleanup_token_locations(NULL);
2117 /* ========================================================================= */