--- /dev/null
+/* ------------------------------------------------------------------------- */
+/* "lexer" : Lexical analyser */
+/* */
+/* Copyright (c) Graham Nelson 1993 - 2018 */
+/* */
+/* This file is part of Inform. */
+/* */
+/* Inform is free software: you can redistribute it and/or modify */
+/* it under the terms of the GNU General Public License as published by */
+/* the Free Software Foundation, either version 3 of the License, or */
+/* (at your option) any later version. */
+/* */
+/* Inform is distributed in the hope that it will be useful, */
+/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
+/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
+/* GNU General Public License for more details. */
+/* */
+/* You should have received a copy of the GNU General Public License */
+/* along with Inform. If not, see https://gnu.org/licenses/ */
+/* */
+/* ------------------------------------------------------------------------- */
+
+#include "header.h"
+
+/* Lexer state counters and mode flags, shared with the rest of Inform. */
+int total_source_line_count,            /* Number of source lines so far */
+
+    no_hash_printed_yet,                /* Have not yet printed the first # */
+    hash_printed_since_newline,         /* A hash has been printed since the
+                                           most recent new-line was printed
+                                           (generally as a result of an error
+                                           message or the start of pass) */
+    dont_enter_into_symbol_table,       /* Return names as text (with
+                                           token type DQ_TT, i.e., as if
+                                           they had double-quotes around)
+                                           and not as entries in the symbol
+                                           table, when TRUE. If -2, only the
+                                           keyword table is searched. */
+    return_sp_as_variable;              /* When TRUE, the word "sp" denotes
+                                           the stack pointer variable
+                                           (used in assembly language only) */
+int next_token_begins_syntax_line;      /* When TRUE, start a new syntax
+                                           line (for error reporting, etc.)
+                                           on the source code line where
+                                           the next token appears */
+
+int32 last_mapped_line;  /* Last syntax line reported to debugging file */
+
+/* ------------------------------------------------------------------------- */
+/* The lexer's output is a sequence of triples, each called a "token", */
+/* representing one lexical unit (or "lexeme") each. Instead of providing */
+/* "lookahead" (that is, always having available the next token after the */
+/* current one, so that syntax analysers higher up in Inform can have */
+/* advance knowledge of what is coming), the lexer instead has a system */
+/* where tokens can be read in and then "put back again". */
+/* The meaning of the number (and to some extent the text) supplied with */
+/* a token depends on its type: see "header.h" for the list of types. */
+/* For example, the lexeme "$1e3" is understood by Inform as a hexadecimal */
+/* number, and translated to the token: */
+/* type NUMBER_TT, value 483, text "$1e3" */
+/* ------------------------------------------------------------------------- */
+/* These three variables are set to the current token on a call to */
+/* get_next_token() (but are not changed by a call to put_token_back()). */
+/* ------------------------------------------------------------------------- */
+
+int token_type;     /* One of the *_TT token type codes (see "header.h") */
+int32 token_value;  /* Numeric value; its meaning depends on token_type */
+char *token_text;   /* The text of the lexeme itself */
+
+/* ------------------------------------------------------------------------- */
+/* The next two variables are the head and tail of a singly linked list. */
+/* The tail stores the portion most recently read from the current */
+/* lexical block; its end values therefore describe the location of the */
+/* current token, and are updated whenever the three variables above are */
+/* via set_token_location(...). Earlier vertices, if any, represent the */
+/* regions of lexical blocks read beforehand, where new vertices are */
+/* only introduced by interruptions like a file inclusion or an EOF. */
+/* Vertices are deleted off of the front of the list once they are no */
+/* longer referenced by pending debug information records. */
+/* ------------------------------------------------------------------------- */
+
+static debug_locations *first_token_locations;  /* Head of the list */
+static debug_locations *last_token_location;    /* Tail: its end values give
+                                                   the current token's place */
+
+/* Return a zero-width location at the end point of the tail record:
+   every "beginning" field is copied from the corresponding "end" field,
+   so the result marks where the next token will start. */
+extern debug_location get_token_location(void)
+{   debug_location result;
+    debug_location *location = &(last_token_location->location);
+    result.file_index = location->file_index;
+    result.beginning_byte_index = location->end_byte_index;
+    result.end_byte_index = location->end_byte_index;
+    result.beginning_line_number = location->end_line_number;
+    result.end_line_number = location->end_line_number;
+    result.beginning_character_number = location->end_character_number;
+    result.end_character_number = location->end_character_number;
+    result.orig_file_index = location->orig_file_index;
+    result.orig_beg_line_number = location->orig_beg_line_number;
+    result.orig_beg_char_number = location->orig_beg_char_number;
+    return result;
+}
+
+/* Wrap the current token's location (see get_token_location) in a
+   free-standing, single-element debug_locations node with no references. */
+extern debug_locations get_token_locations(void)
+{   debug_locations result;
+    result.location = get_token_location();
+    result.next = NULL;
+    result.reference_count = 0;
+    return result;
+}
+
+/* Record the location of the token just read.  If it lies in the same
+   file as the tail record, simply advance that record's end values;
+   otherwise (a new lexical block has begun) append a fresh record. */
+static void set_token_location(debug_location location)
+{   if (location.file_index == last_token_location->location.file_index)
+    {   last_token_location->location.end_byte_index =
+            location.end_byte_index;
+        last_token_location->location.end_line_number =
+            location.end_line_number;
+        last_token_location->location.end_character_number =
+            location.end_character_number;
+        last_token_location->location.orig_file_index =
+            location.orig_file_index;
+        last_token_location->location.orig_beg_line_number =
+            location.orig_beg_line_number;
+        last_token_location->location.orig_beg_char_number =
+            location.orig_beg_char_number;
+    } else
+    {   debug_locations *successor =
+            my_malloc
+                (sizeof(debug_locations),
+                 "debug locations of recent tokens");
+        successor->location = location;
+        successor->next = NULL;
+        successor->reference_count = 0;
+        last_token_location->next = successor;
+        last_token_location = successor;
+    }
+}
+
+/* Open a location range starting at the current end-of-token position.
+   Takes a reference on the tail record so cleanup_token_locations()
+   will not delete it before get_token_location_end() is called. */
+extern debug_location_beginning get_token_location_beginning(void)
+{   debug_location_beginning result;
+    ++(last_token_location->reference_count);
+    result.head = last_token_location;
+    result.beginning_byte_index =
+        last_token_location->location.end_byte_index;
+    result.beginning_line_number =
+        last_token_location->location.end_line_number;
+    result.beginning_character_number =
+        last_token_location->location.end_character_number;
+    result.orig_file_index = last_token_location->location.orig_file_index;
+    result.orig_beg_line_number = last_token_location->location.orig_beg_line_number;
+    result.orig_beg_char_number = last_token_location->location.orig_beg_char_number;
+
+    return result;
+}
+
+/* Delete unreferenced records from the front of the token-location list.
+   If "beginning" is non-null it must survive the sweep: freeing its head
+   record, or emptying the list entirely, indicates an invalid reuse of
+   this (or an earlier) beginning, and is reported as a compiler error. */
+static void cleanup_token_locations(debug_location_beginning *beginning)
+{   if (first_token_locations)
+    {   while (first_token_locations &&
+               !first_token_locations->reference_count)
+        {   debug_locations *moribund = first_token_locations;
+            /* Decide whether this is the beginning's head record *before*
+               freeing it: my_free() releases the record (and resets the
+               pointer passed to it), so comparing against "moribund"
+               afterwards would test a stale pointer value and the check
+               could never fire as intended. */
+            int beginning_freed =
+                (beginning != NULL) && (beginning->head == moribund);
+            first_token_locations = moribund->next;
+            my_free(&moribund, "debug locations of recent tokens");
+            if (beginning_freed ||
+                (beginning && !first_token_locations))
+            {   compiler_error
+                    ("Records needed by a debug_location_beginning are no "
+                     "longer allocated, perhaps because of an invalid reuse "
+                     "of this or an earlier beginning");
+            }
+        }
+    } else
+    {   if (beginning)
+        {   compiler_error
+                ("Attempt to use a debug_location_beginning when no token "
+                 "locations are defined");
+        } else
+        {   compiler_error
+                ("Attempt to clean up token locations when no token locations "
+                 "are defined");
+        }
+    }
+}
+
+/* Drop the reference taken by get_token_location_beginning() without
+   producing a completed range. */
+extern void discard_token_location(debug_location_beginning beginning)
+{
+    beginning.head->reference_count--;
+}
+
+/* Close the range opened by get_token_location_beginning(): returns the
+   locations running from that beginning to the end of the current token,
+   and releases the reference taken when the beginning was made. */
+extern debug_locations get_token_location_end
+    (debug_location_beginning beginning)
+{   debug_locations result;
+    cleanup_token_locations(&beginning);
+    --(beginning.head->reference_count);
+    /* Sometimes we know what we'll read before we switch to the lexical block
+       where we'll read it.  In such cases the beginning will be placed in the
+       prior block and last exactly zero bytes there.  It's misleading to
+       include such ranges, so we gobble them. */
+    if (beginning.head->location.end_byte_index ==
+          beginning.beginning_byte_index &&
+        beginning.head->next)
+    {   /* Skip the zero-length head and start at the top of the next block. */
+        beginning.head = beginning.head->next;
+        result.location = beginning.head->location;
+        result.location.beginning_byte_index = 0;
+        result.location.beginning_line_number = 1;
+        result.location.beginning_character_number = 1;
+    } else
+    {   result.location = beginning.head->location;
+        result.location.beginning_byte_index =
+            beginning.beginning_byte_index;
+        result.location.beginning_line_number =
+            beginning.beginning_line_number;
+        result.location.beginning_character_number =
+            beginning.beginning_character_number;
+    }
+
+    result.location.orig_file_index =
+        beginning.orig_file_index;
+    result.location.orig_beg_line_number =
+        beginning.orig_beg_line_number;
+    result.location.orig_beg_char_number =
+        beginning.orig_beg_char_number;
+
+    result.next = beginning.head->next;
+    result.reference_count = 0;
+    return result;
+}
+
+/* ------------------------------------------------------------------------- */
+/* In order to be able to put tokens back efficiently, the lexer stores */
+/* tokens in a "circle": the variable circle_position ranges between */
+/* 0 and CIRCLE_SIZE-1. We only need a circle size as large as the */
+/* maximum number of tokens ever put back at once, plus 1 (in effect, the */
+/* maximum token lookahead ever needed in syntax analysis, plus 1). */
+/* */
+/* Unlike some compilers, Inform does not have a context-free lexer: in */
+/* fact it has 12288 different possible states. However, the context only */
+/* affects the interpretation of "identifiers": lexemes beginning with a */
+/* letter and containing up to 32 chars of alphanumeric and underscore */
+/* chars. (For example, "default" may refer to the directive or statement */
+/* of that name, and which token values are returned depends on the */
+/* current lexical context.) */
+/* */
+/* Along with each token, we also store the lexical context it was */
+/* translated under; because if it is called for again, there may need */
+/* to be a fresh interpretation of it if the context has changed. */
+/* ------------------------------------------------------------------------- */
+
+#define CIRCLE_SIZE 6   /* Maximum token lookahead ever needed, plus 1 */
+
+/* (The worst case for token lookahead is distinguishing between an
+   old-style "objectloop (a in b)" and a new "objectloop (a in b ...)".) */
+
+static int circle_position;             /* Current index into the circle:
+                                           ranges over 0..CIRCLE_SIZE-1 */
+static token_data circle[CIRCLE_SIZE];  /* The tokens themselves */
+
+static int token_contexts[CIRCLE_SIZE]; /* Lexical context under which each
+                                           circle entry was translated */
+
+/* ------------------------------------------------------------------------- */
+/* A complication, however, is that the text of some lexemes needs to be */
+/* held in Inform's memory for much longer periods: for example, a */
+/* dictionary word lexeme (like "'south'") must have its text preserved */
+/* until the code generation time for the expression it occurs in, when */
+/* the dictionary reference is actually made. Code generation in general */
+/* occurs as early as possible in Inform: pending some better method of */
+/* garbage collection, we simply use a buffer so large that unless */
+/* expressions spread across 10K of source code are found, there can be */
+/* no problem. */
+/* ------------------------------------------------------------------------- */
+
+static char *lexeme_memory;  /* Long-lived buffer holding lexeme text
+                                (see the note above on its sizing) */
+static char *lex_p;          /* Current write position */
+
+/* ------------------------------------------------------------------------- */
+/* The lexer itself needs up to 3 characters of lookahead (it uses an */
+/* LR(3) grammar to translate characters into tokens). */
+/* ------------------------------------------------------------------------- */
+
+#define LOOKAHEAD_SIZE 3            /* Characters of lookahead maintained */
+
+static int current, lookahead,      /* The latest character read, and */
+           lookahead2, lookahead3;  /* the three characters following it */
+
+static int pipeline_made;           /* Whether or not the pipeline of
+                                       characters has been constructed
+                                       yet (this pass) */
+
+static int (* get_next_char)(void); /* Routine for reading the stream of
+                                       characters: the lexer does not
+                                       need any "ungetc" routine for
+                                       putting them back again.  End of
+                                       stream is signalled by returning
+                                       zero. */
+
+static char *source_to_analyse;     /* The current lexical source:
+                                       NULL for "load from source files",
+                                       otherwise this points to a string
+                                       containing Inform code */
+
+static int tokens_put_back;         /* Count of the number of backward
+                                       moves made from the last-read
+                                       token */
+
+/* Print a diagnostic description of token t to stdout, in the form
+   "{ ... }", covering every token type (not only those the lexer sets). */
+extern void describe_token(token_data t)
+{
+    /* Many of the token types are not set in this file, but later on in
+       Inform's higher stages (for example, in the expression evaluator);
+       but this routine describes them all. */
+
+    /* NOTE(review): t.value is int32 but is printed with "%d"; confirm
+       this is safe on platforms where int32 is not plain int. */
+
+    printf("{ ");
+
+    switch(t.type)
+    {
+        /* The following token types occur in lexer output: */
+
+        case SYMBOL_TT:          printf("symbol ");
+                                 describe_symbol(t.value);
+                                 break;
+        case NUMBER_TT:          printf("literal number %d", t.value);
+                                 break;
+        case DQ_TT:              printf("string \"%s\"", t.text);
+                                 break;
+        case SQ_TT:              printf("string '%s'", t.text);
+                                 break;
+        case SEP_TT:             printf("separator '%s'", t.text);
+                                 break;
+        case EOF_TT:             printf("end of file");
+                                 break;
+
+        case STATEMENT_TT:       printf("statement name '%s'", t.text);
+                                 break;
+        case SEGMENT_MARKER_TT:  printf("object segment marker '%s'", t.text);
+                                 break;
+        case DIRECTIVE_TT:       printf("directive name '%s'", t.text);
+                                 break;
+        case CND_TT:             printf("textual conditional '%s'", t.text);
+                                 break;
+        case OPCODE_NAME_TT:     printf("opcode name '%s'", t.text);
+                                 break;
+        case SYSFUN_TT:          printf("built-in function name '%s'", t.text);
+                                 break;
+        case LOCAL_VARIABLE_TT:  printf("local variable name '%s'", t.text);
+                                 break;
+        case MISC_KEYWORD_TT:    printf("statement keyword '%s'", t.text);
+                                 break;
+        case DIR_KEYWORD_TT:     printf("directive keyword '%s'", t.text);
+                                 break;
+        case TRACE_KEYWORD_TT:   printf("'trace' keyword '%s'", t.text);
+                                 break;
+        case SYSTEM_CONSTANT_TT: printf("system constant name '%s'", t.text);
+                                 break;
+
+        /* The remaining are etoken types, not set by the lexer */
+
+        case OP_TT:              printf("operator '%s'",
+                                     operators[t.value].description);
+                                 break;
+        case ENDEXP_TT:          printf("end of expression");
+                                 break;
+        case SUBOPEN_TT:         printf("open bracket");
+                                 break;
+        case SUBCLOSE_TT:        printf("close bracket");
+                                 break;
+        case LARGE_NUMBER_TT:    printf("large number: '%s'=%d",t.text,t.value);
+                                 break;
+        case SMALL_NUMBER_TT:    printf("small number: '%s'=%d",t.text,t.value);
+                                 break;
+        case VARIABLE_TT:        printf("variable '%s'=%d", t.text, t.value);
+                                 break;
+        case DICTWORD_TT:        printf("dictionary word '%s'", t.text);
+                                 break;
+        case ACTION_TT:          printf("action name '%s'", t.text);
+                                 break;
+
+        default:
+            printf("** unknown token type %d, text='%s', value=%d **",
+                t.type, t.text, t.value);
+    }
+    printf(" }");
+}
+
+/* ------------------------------------------------------------------------- */
+/* All but one of the 280 Inform keywords (118 of them opcode names used */
+/* only by the assembler). (The one left over is "sp", a keyword used in */
+/* assembly language only.) */
+/* */
+/* A "keyword group" is a set of keywords to be searched for. If a match */
+/* is made on an identifier, the token type becomes that given in the KG */
+/* and the token value is its index in the KG. */
+/* */
+/* The keyword ordering must correspond with the appropriate #define's in */
+/* "header.h" but is otherwise not significant. */
+/* ------------------------------------------------------------------------- */
+
+#define MAX_KEYWORDS 350
+
+/* The values will be filled in at compile time, when we know
+   which opcode set to use. */
+keyword_group opcode_names =
+{ { "" },
+    OPCODE_NAME_TT, FALSE, TRUE
+};
+
+/* Z-machine opcode names; the empty string terminates the list. */
+static char *opcode_list_z[] = {
+    "je", "jl", "jg", "dec_chk", "inc_chk", "jin", "test", "or", "and",
+    "test_attr", "set_attr", "clear_attr", "store", "insert_obj", "loadw",
+    "loadb", "get_prop", "get_prop_addr", "get_next_prop", "add", "sub",
+    "mul", "div", "mod", "call", "storew", "storeb", "put_prop", "sread",
+    "print_char", "print_num", "random", "push", "pull", "split_window",
+    "set_window", "output_stream", "input_stream", "sound_effect", "jz",
+    "get_sibling", "get_child", "get_parent", "get_prop_len", "inc", "dec",
+    "print_addr", "remove_obj", "print_obj", "ret", "jump", "print_paddr",
+    "load", "not", "rtrue", "rfalse", "print", "print_ret", "nop", "save",
+    "restore", "restart", "ret_popped", "pop", "quit", "new_line",
+    "show_status", "verify", "call_2s", "call_vs", "aread", "call_vs2",
+    "erase_window", "erase_line", "set_cursor", "get_cursor",
+    "set_text_style", "buffer_mode", "read_char", "scan_table", "call_1s",
+    "call_2n", "set_colour", "throw", "call_vn", "call_vn2", "tokenise",
+    "encode_text", "copy_table", "print_table", "check_arg_count", "call_1n",
+    "catch", "piracy", "log_shift", "art_shift", "set_font", "save_undo",
+    "restore_undo", "draw_picture", "picture_data", "erase_picture",
+    "set_margins", "move_window", "window_size", "window_style",
+    "get_wind_prop", "scroll_window", "pop_stack", "read_mouse",
+    "mouse_window", "push_stack", "put_wind_prop", "print_form",
+    "make_menu", "picture_table", "print_unicode", "check_unicode",
+    ""
+};
+
+/* Glulx opcode names; the empty string terminates the list. */
+static char *opcode_list_g[] = {
+    "nop", "add", "sub", "mul", "div", "mod", "neg", "bitand", "bitor",
+    "bitxor", "bitnot", "shiftl", "sshiftr", "ushiftr", "jump", "jz",
+    "jnz", "jeq", "jne", "jlt", "jge", "jgt", "jle",
+    "jltu", "jgeu", "jgtu", "jleu",
+    "call", "return",
+    "catch", "throw", "tailcall",
+    "copy", "copys", "copyb", "sexs", "sexb", "aload",
+    "aloads", "aloadb", "aloadbit", "astore", "astores", "astoreb",
+    "astorebit", "stkcount", "stkpeek", "stkswap", "stkroll", "stkcopy",
+    "streamchar", "streamnum", "streamstr",
+    "gestalt", "debugtrap", "getmemsize", "setmemsize", "jumpabs",
+    "random", "setrandom", "quit", "verify",
+    "restart", "save", "restore", "saveundo", "restoreundo", "protect",
+    "glk", "getstringtbl", "setstringtbl", "getiosys", "setiosys",
+    "linearsearch", "binarysearch", "linkedsearch",
+    "callf", "callfi", "callfii", "callfiii",
+    "streamunichar",
+    "mzero", "mcopy", "malloc", "mfree",
+    "accelfunc", "accelparam",
+    "numtof", "ftonumz", "ftonumn", "ceil", "floor",
+    "fadd", "fsub", "fmul", "fdiv", "fmod",
+    "sqrt", "exp", "log", "pow",
+    "sin", "cos", "tan", "asin", "acos", "atan", "atan2",
+    "jfeq", "jfne", "jflt", "jfle", "jfgt", "jfge", "jisnan", "jisinf",
+    ""
+};
+
+/* Like opcode_names, filled in at compile time with one of the lists below. */
+keyword_group opcode_macros =
+{ { "" },
+    OPCODE_MACRO_TT, FALSE, TRUE
+};
+
+static char *opmacro_list_z[] = { "" };  /* No opcode macros in Z-code */
+
+static char *opmacro_list_g[] = {
+    "pull", "push",
+    ""
+};
+
+keyword_group directives =
+{ { "abbreviate", "array", "attribute", "class", "constant",
+    "default", "dictionary", "end", "endif", "extend", "fake_action",
+    "global", "ifdef", "ifndef", "ifnot", "ifv3", "ifv5", "iftrue",
+    "iffalse", "import", "include", "link", "lowstring", "message",
+    "nearby", "object", "origsource", "property", "release", "replace",
+    "serial", "switches", "statusline", "stub", "system_file", "trace",
+    "undef", "verb", "version", "zcharacter",
+    "" },
+    DIRECTIVE_TT, FALSE, FALSE
+};
+
+keyword_group trace_keywords =
+{ { "dictionary", "symbols", "objects", "verbs",
+    "assembly", "expressions", "lines", "tokens", "linker",
+    "on", "off", "" },
+    TRACE_KEYWORD_TT, FALSE, TRUE
+};
+
+keyword_group segment_markers =
+{ { "class", "has", "private", "with", "" },
+    SEGMENT_MARKER_TT, FALSE, TRUE
+};
+
+keyword_group directive_keywords =
+{ { "alias", "long", "additive",
+    "score", "time",
+    "noun", "held", "multi", "multiheld", "multiexcept",
+    "multiinside", "creature", "special", "number", "scope", "topic",
+    "reverse", "meta", "only", "replace", "first", "last",
+    "string", "table", "buffer", "data", "initial", "initstr",
+    "with", "private", "has", "class",
+    "error", "fatalerror", "warning",
+    "terminating",
+    "" },
+    DIR_KEYWORD_TT, FALSE, TRUE
+};
+
+keyword_group misc_keywords =
+{ { "char", "name", "the", "a", "an", "The", "number",
+    "roman", "reverse", "bold", "underline", "fixed", "on", "off",
+    "to", "address", "string", "object", "near", "from", "property", "A", "" },
+    MISC_KEYWORD_TT, FALSE, TRUE
+};
+
+keyword_group statements =
+{ { "box", "break", "continue", "default", "do", "else", "font", "for",
+    "give", "if", "inversion", "jump", "move", "new_line", "objectloop",
+    "print", "print_ret", "quit", "read", "remove", "restore", "return",
+    "rfalse", "rtrue", "save", "spaces", "string", "style", "switch",
+    "until", "while", "" },
+    STATEMENT_TT, FALSE, TRUE
+};
+
+keyword_group conditions =
+{ { "has", "hasnt", "in", "notin", "ofclass", "or", "provides", "" },
+    CND_TT, FALSE, TRUE
+};
+
+keyword_group system_functions =
+{ { "child", "children", "elder", "eldest", "indirect", "parent", "random",
+    "sibling", "younger", "youngest", "metaclass", "glk", "" },
+    SYSFUN_TT, FALSE, TRUE
+};
+
+keyword_group system_constants =
+{ { "adjectives_table", "actions_table", "classes_table",
+    "identifiers_table", "preactions_table", "version_number",
+    "largest_object", "strings_offset", "code_offset",
+    "dict_par1", "dict_par2", "dict_par3", "actual_largest_object",
+    "static_memory_offset", "array_names_offset", "readable_memory_offset",
+    "cpv__start", "cpv__end", "ipv__start", "ipv__end",
+    "array__start", "array__end",
+    "lowest_attribute_number", "highest_attribute_number",
+    "attribute_names_array",
+    "lowest_property_number", "highest_property_number",
+    "property_names_array",
+    "lowest_action_number", "highest_action_number",
+    "action_names_array",
+    "lowest_fake_action_number", "highest_fake_action_number",
+    "fake_action_names_array",
+    "lowest_routine_number", "highest_routine_number", "routines_array",
+    "routine_names_array", "routine_flags_array",
+    "lowest_global_number", "highest_global_number", "globals_array",
+    "global_names_array", "global_flags_array",
+    "lowest_array_number", "highest_array_number", "arrays_array",
+    "array_names_array", "array_flags_array",
+    "lowest_constant_number", "highest_constant_number", "constants_array",
+    "constant_names_array",
+    "lowest_class_number", "highest_class_number", "class_objects_array",
+    "lowest_object_number", "highest_object_number",
+    "oddeven_packing",
+    "grammar_table", "dictionary_table", "dynam_string_table",
+    "" },
+    SYSTEM_CONSTANT_TT, FALSE, TRUE
+};
+
+/* Index 0 is unused; indices 1 to 11 are searched by the keyword lookup
+   in interpret_identifier(). */
+keyword_group *keyword_groups[12]
+= { NULL, &opcode_names, &directives, &trace_keywords, &segment_markers,
+    &directive_keywords, &misc_keywords, &statements, &conditions,
+    &system_functions, &system_constants, &opcode_macros};
+
+keyword_group local_variables =
+{ { "" },                                 /* Filled in when routine declared */
+    LOCAL_VARIABLE_TT, FALSE, FALSE
+};
+
+static int lexical_context(void)
+{
+    /* The lexical context is a number representing all of the context
+       information in the lexical analyser: the same input text will
+       always translate to the same output tokens whenever the context
+       is the same.
+
+       In fact, for efficiency reasons this number omits the bit of
+       information held in the variable "dont_enter_into_symbol_table".
+       Inform never needs to backtrack through tokens parsed in that
+       way (thankfully, as it would be expensive indeed to check
+       the tokens). */
+
+    int i, c = 0;
+
+    /* keyword_groups[1] to [10] contribute bits 1, 2, 4, ..., 512 in
+       array order.  (keyword_groups[11], opcode_macros, has no bit.) */
+    for (i = 1; i <= 10; i++)
+        if (keyword_groups[i]->enabled)
+            c |= (1 << (i - 1));
+
+    if (local_variables.enabled)  c |= 1024;
+    if (return_sp_as_variable)    c |= 2048;
+
+    return c;
+}
+
+/* Print a short mnemonic for each context bit set in c (see
+   lexical_context() for the meaning of the bits). */
+static void print_context(int c)
+{
+    static const char *bit_names[] =
+    {   "OPC", "DIR", "TK",  "SEG", "DK",  "MK",
+        "STA", "CND", "SFUN", "SCON", "LV", "sp"
+    };
+    int i;
+
+    for (i = 0; i < 12; i++)
+        if ((c & (1 << i)) != 0)
+            printf("%s ", bit_names[i]);
+}
+
+static int *keywords_hash_table;      /* First data-table entry for each
+                                         hash code, or -1 */
+static int *keywords_hash_ends_table; /* Last entry in each hash chain
+                                         (for appending new entries) */
+static int *keywords_data_table;      /* Triples: (group number, index
+                                         within group, next entry in the
+                                         hash chain or -1) */
+
+static int *local_variable_hash_table; /* First local with each hash code,
+                                          or -1 */
+static int *local_variable_hash_codes; /* Hash code of each local's name */
+char **local_variable_texts;           /* Pointer to each local's name text */
+static char *local_variable_text_table; /* Backing storage for those names */
+
+static char one_letter_locals[128];    /* Single-character name -> local
+                                          index; MAX_LOCAL_VARIABLES if no
+                                          such local exists */
+
+/* Build the keyword hash tables: copy the opcode/opcode-macro name lists
+   appropriate to the target (Z-machine or Glulx) into their keyword
+   groups, then chain every keyword of groups 1 to 11 into
+   keywords_data_table, bucketed by hash code. */
+static void make_keywords_tables(void)
+{   int i, j, h, tp = 0;
+    char **oplist, **maclist;
+
+    if (!glulx_mode) {
+        oplist = opcode_list_z;
+        maclist = opmacro_list_z;
+    }
+    else {
+        oplist = opcode_list_g;
+        maclist = opmacro_list_g;
+    }
+
+    /* The lists are terminated by an empty string, which is copied over
+       as the groups' own terminator. */
+    for (j=0; *(oplist[j]); j++) {
+        opcode_names.keywords[j] = oplist[j];
+    }
+    opcode_names.keywords[j] = "";
+
+    for (j=0; *(maclist[j]); j++) {
+        opcode_macros.keywords[j] = maclist[j];
+    }
+    opcode_macros.keywords[j] = "";
+
+    for (i=0; i<HASH_TAB_SIZE; i++)
+    {   keywords_hash_table[i] = -1;
+        keywords_hash_ends_table[i] = -1;
+    }
+
+    for (i=1; i<=11; i++)
+    {   keyword_group *kg = keyword_groups[i];
+        for (j=0; *(kg->keywords[j]) != 0; j++)
+        {   h = hash_code_from_string(kg->keywords[j]);
+            /* Append entry tp to the chain for hash code h. */
+            if (keywords_hash_table[h] == -1)
+                keywords_hash_table[h] = tp;
+            else
+                *(keywords_data_table + 3*(keywords_hash_ends_table[h]) + 2) = tp;
+            keywords_hash_ends_table[h] = tp;
+            /* Entry tp is the triple (group, index within group, next). */
+            *(keywords_data_table + 3*tp) = i;
+            *(keywords_data_table + 3*tp+1) = j;
+            *(keywords_data_table + 3*tp+2) = -1;
+            tp++;
+        }
+    }
+}
+
+/* Rebuild the local-variable lookup tables from the names currently in
+   local_variables.keywords (called when a routine is declared): the
+   one-letter shortcut table, the hash table, and the text copies. */
+extern void construct_local_variable_tables(void)
+{   int i, h; char *p = local_variable_text_table;
+    for (i=0; i<HASH_TAB_SIZE; i++) local_variable_hash_table[i] = -1;
+    /* MAX_LOCAL_VARIABLES acts as the "no such local" sentinel. */
+    for (i=0; i<128; i++) one_letter_locals[i] = MAX_LOCAL_VARIABLES;
+
+    for (i=0; i<no_locals; i++)
+    {   char *q = local_variables.keywords[i];
+        if (q[1] == 0)
+        {   /* Single-character name: register it under both cases. */
+            one_letter_locals[(uchar)q[0]] = i;
+            if (isupper(q[0])) one_letter_locals[tolower(q[0])] = i;
+            if (islower(q[0])) one_letter_locals[toupper(q[0])] = i;
+        }
+        h = hash_code_from_string(q);
+        if (local_variable_hash_table[h] == -1)
+            local_variable_hash_table[h] = i;
+        local_variable_hash_codes[i] = h;
+        /* Keep a private copy of the name text. */
+        local_variable_texts[i] = p;
+        strcpy(p, q);
+        p += strlen(p)+1;
+    }
+    for (;i<MAX_LOCAL_VARIABLES-1;i++)
+        local_variable_texts[i] = "<no such local variable>";
+}
+
+/* Classify the identifier text of circle[pos], setting its type and
+   value: "sp", a local variable, a keyword from one of the enabled
+   keyword groups, or failing those a symbol (created if necessary).
+   If dirs_only_flag is set, only the directives group is searched and
+   no symbol is created. */
+static void interpret_identifier(int pos, int dirs_only_flag)
+{   int index, hashcode; char *p = circle[pos].text;
+
+    /* An identifier is either a keyword or a "symbol", a name which the
+       lexical analyser leaves to higher levels of Inform to understand. */
+
+    hashcode = hash_code_from_string(p);
+
+    if (dirs_only_flag) goto KeywordSearch;
+
+    /* If this is assembly language, perhaps it is "sp"? */
+
+    if (return_sp_as_variable && (p[0]=='s') && (p[1]=='p') && (p[2]==0))
+    {   circle[pos].value = 0; circle[pos].type = LOCAL_VARIABLE_TT;
+        return;
+    }
+
+    /* Test for local variables first, quite quickly. */
+
+    if (local_variables.enabled)
+    {   if (p[1]==0)
+        {   /* Single-character names go through the shortcut table. */
+            index = one_letter_locals[(uchar)p[0]];
+            if (index<MAX_LOCAL_VARIABLES)
+            {   circle[pos].type = LOCAL_VARIABLE_TT;
+                /* Local variable token values are 1-based. */
+                circle[pos].value = index+1;
+                return;
+            }
+        }
+        index = local_variable_hash_table[hashcode];
+        if (index >= 0)
+        {   for (;index<no_locals;index++)
+            {   if (hashcode == local_variable_hash_codes[index])
+                {   if (strcmpcis(p, local_variable_texts[index])==0)
+                    {   circle[pos].type = LOCAL_VARIABLE_TT;
+                        circle[pos].value = index+1;
+                        return;
+                    }
+                }
+            }
+        }
+    }
+
+    /* Now the bulk of the keywords.  Note that the lexer doesn't recognise
+       the name of a system function which has been Replaced. */
+
+    KeywordSearch:
+    index = keywords_hash_table[hashcode];
+    while (index >= 0)
+    {   /* Each data-table entry is the triple (group, index, next). */
+        int *i = keywords_data_table + 3*index;
+        keyword_group *kg = keyword_groups[*i];
+        if (((!dirs_only_flag) && (kg->enabled))
+            || (dirs_only_flag && (kg == &directives)))
+        {   char *q = kg->keywords[*(i+1)];
+            if (((kg->case_sensitive) && (strcmp(p, q)==0))
+                || ((!(kg->case_sensitive)) && (strcmpcis(p, q)==0)))
+            {   if ((kg != &system_functions)
+                    || (system_function_usage[*(i+1)]!=2))
+                {   circle[pos].type = kg->change_token_type;
+                    circle[pos].value = *(i+1);
+                    return;
+                }
+            }
+        }
+        index = *(i+2);
+    }
+
+    if (dirs_only_flag) return;
+
+    /* Search for the name; create it if necessary. */
+
+    circle[pos].value = symbol_index(p, hashcode);
+    circle[pos].type = SYMBOL_TT;
+}
+
+
+/* ------------------------------------------------------------------------- */
+/* The tokeniser grid aids a rapid decision about the consequences of a */
+/* character reached in the buffer. In effect it is an efficiently stored */
+/* transition table using an algorithm similar to that of S. C. Johnson's */
+/* "yacc" lexical analyser (see Aho, Sethi and Ullman, section 3.9). */
+/* My thanks to Dilip Sequeira for suggesting this. */
+/* */
+/* tokeniser_grid[c] is (16*n + m) if c is the first character of */
+/* separator numbers n, n+1, ..., n+m-1 */
+/* or certain special values (QUOTE_CODE, etc) */
+/* or 0 otherwise */
+/* */
+/* Since 1000/16 = 62, the code numbers below will need increasing if the */
+/* number of separators supported exceeds 61. */
+/* ------------------------------------------------------------------------- */
+
+static int tokeniser_grid[256];  /* Indexed by the first character read */
+
+/* Special (non-separator) grid codes, chosen above 16*62 so they cannot
+   collide with separator entries (see the comment above): */
+#define QUOTE_CODE      1000
+#define DQUOTE_CODE     1001
+#define NULL_CODE       1002
+#define SPACE_CODE      1003
+#define NEGATIVE_CODE   1004
+#define DIGIT_CODE      1005
+#define RADIX_CODE      1006
+#define KEYWORD_CODE    1007
+#define EOF_CODE        1008
+#define WHITESPACE_CODE 1009
+#define COMMENT_CODE    1010
+#define IDENTIFIER_CODE 1011
+
+/* This list cannot safely be changed without also changing the header
+ separator #defines. The ordering is significant in that (i) all entries
+ beginning with the same character must be adjacent and (ii) that if
+   X is an initial substring of Y then X must come after Y.
+
+ E.g. --> must occur before -- to prevent "-->0" being tokenised
+ wrongly as "--", ">", "0" rather than "-->", "0". */
+
+/* This table must stay in step with the separator numbers in "header.h". */
+static const char separators[NUMBER_SEPARATORS][4] =
+{   "->", "-->", "--", "-", "++", "+", "*", "/", "%",
+    "||", "|", "&&", "&", "~~",
+    "~=", "~", "==", "=", ">=", ">",
+    "<=", "<", "(", ")", ",",
+    ".&", ".#", "..&", "..#", "..", ".",
+    "::", ":", "@", ";", "[", "]", "{", "}",
+    "$", "?~", "?",
+    "#a$", "#g$", "#n$", "#r$", "#w$", "##", "#"
+};
+
+static void make_tokeniser_grid(void)
+{
+    /* Construct the grid to the specification above. */
+
+    /* Enumerating the characters explicitly (rather than using ranges
+       like 'a'..'z') keeps the grid correct in any character set. */
+    const char *digit_chars = "0123456789";
+    const char *identifier_chars =
+        "abcdefghijklmnopqrstuvwxyz"
+        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+        "_";
+    const char *p;
+    int i, j;
+
+    for (i=0; i<256; i++) tokeniser_grid[i] = 0;
+
+    /* Each separator's first character records 16*(first separator
+       number with that character) + (how many share it). */
+    for (i=0; i<NUMBER_SEPARATORS; i++)
+    {   j = separators[i][0];
+        if (tokeniser_grid[j] == 0)
+            tokeniser_grid[j] = i*16 + 1;
+        else
+            tokeniser_grid[j]++;
+    }
+
+    /* Special codes; note that '$' overrides its separator entry. */
+    tokeniser_grid['\''] = QUOTE_CODE;
+    tokeniser_grid['\"'] = DQUOTE_CODE;
+    tokeniser_grid[0]    = EOF_CODE;
+    tokeniser_grid[' ']  = WHITESPACE_CODE;
+    tokeniser_grid['\n'] = WHITESPACE_CODE;
+    tokeniser_grid['$']  = RADIX_CODE;
+    tokeniser_grid['!']  = COMMENT_CODE;
+
+    for (p = digit_chars; *p; p++)
+        tokeniser_grid[(int) *p] = DIGIT_CODE;
+
+    for (p = identifier_chars; *p; p++)
+        tokeniser_grid[(int) *p] = IDENTIFIER_CODE;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Definition of a lexical block: a source file or a string containing */
+/* text for lexical analysis; an independent source from the point of */
+/* view of issuing error reports. */
+/* ------------------------------------------------------------------------- */
+
/* One independent source of text for lexical analysis: a source file, or
   an internal string (veneer routine code).  This is the unit against
   which errors are reported. */
typedef struct LexicalBlock_s
{   char *filename;              /* Full translated name */
    int main_flag;               /* TRUE if the main file
                                    (the first one opened) */
    int sys_flag;                /* TRUE if a System_File */
    int source_line;             /* Line number count */
    int line_start;              /* Char number within file
                                    where the current line
                                    starts */
    int chars_read;              /* Char number of read pos */
    int file_no;                 /* Or 255 if not from a
                                    file; used for debug
                                    information */
    char *orig_source;           /* From #origsource direct-
                                    ive: original filename,
                                    or NULL if none */
    int orig_file;               /* InputFiles index of the
                                    original source (0 if
                                    none declared) */
    int32 orig_line;             /* Line number in the
                                    original source */
    int32 orig_char;             /* Char number in the
                                    original source */
} LexicalBlock;
+
/* Placeholder lexical blocks used when no real source is being read:
   before compilation starts, and while output is being constructed.
   A file_no of 255 marks "not from a file". */
static LexicalBlock NoFileOpen =
    { "<before compilation>", FALSE, FALSE, 0, 0, 0, 255, NULL, 0, 0, 0 };

static LexicalBlock MakingOutput =
    { "<constructing output>", FALSE, FALSE, 0, 0, 0, 255, NULL, 0, 0, 0 };

/* Block used when lexing an internal string (a veneer routine); note it
   is flagged as a System_File. */
static LexicalBlock StringLB =
    { "<veneer routine>", FALSE, TRUE, 0, 0, 0, 255, NULL, 0, 0, 0 };

static LexicalBlock *CurrentLB;              /* The current lexical
                                                block of input text */
+
+extern void declare_systemfile(void)
+{ CurrentLB->sys_flag = TRUE;
+}
+
+extern int is_systemfile(void)
+{ return ((CurrentLB->sys_flag)?1:0);
+}
+
+extern void set_origsource_location(char *source, int32 line, int32 charnum)
+{
+ if (!source) {
+ /* Clear the Origsource declaration. */
+ CurrentLB->orig_file = 0;
+ CurrentLB->orig_source = NULL;
+ CurrentLB->orig_line = 0;
+ CurrentLB->orig_char = 0;
+ return;
+ }
+
+ /* Get the file number for a new or existing InputFiles entry. */
+ int file_no = register_orig_sourcefile(source);
+
+ CurrentLB->orig_file = file_no;
+ CurrentLB->orig_source = InputFiles[file_no-1].filename;
+ CurrentLB->orig_line = line;
+ CurrentLB->orig_char = charnum;
+}
+
+/* Error locations. */
+
+extern debug_location get_current_debug_location(void)
+{ debug_location result;
+ /* Assume that all input characters are one byte. */
+ result.file_index = CurrentLB->file_no;
+ result.beginning_byte_index = CurrentLB->chars_read - LOOKAHEAD_SIZE;
+ result.end_byte_index = result.beginning_byte_index;
+ result.beginning_line_number = CurrentLB->source_line;
+ result.end_line_number = result.beginning_line_number;
+ result.beginning_character_number =
+ CurrentLB->chars_read - CurrentLB->line_start;
+ result.end_character_number = result.beginning_character_number;
+ result.orig_file_index = CurrentLB->orig_file;
+ result.orig_beg_line_number = CurrentLB->orig_line;
+ result.orig_beg_char_number = CurrentLB->orig_char;
+ return result;
+}
+
/* Debug-file location captured at the most recent call of
   report_errors_at_current_line(). */
static debug_location ErrorReport_debug_location;

/* Copy the current lexical position into the global ErrorReport, so that
   any error issued from now on is attributed to the right file/line. */
extern void report_errors_at_current_line(void)
{   ErrorReport.line_number = CurrentLB->source_line;
    ErrorReport.file_number = CurrentLB->file_no;
    if (ErrorReport.file_number == 255)   /* 255 means "not from a file" */
        ErrorReport.file_number = -1;
    ErrorReport.source = CurrentLB->filename;
    ErrorReport.main_flag = CurrentLB->main_flag;
    if (debugfile_switch)
        ErrorReport_debug_location = get_current_debug_location();
    ErrorReport.orig_file = CurrentLB->orig_file;
    ErrorReport.orig_source = CurrentLB->orig_source;
    ErrorReport.orig_line = CurrentLB->orig_line;
    ErrorReport.orig_char = CurrentLB->orig_char;
}

/* The debug location captured by the last report_errors_at_current_line(). */
extern debug_location get_error_report_debug_location(void)
{   return ErrorReport_debug_location;
}
+
+extern int32 get_current_line_start(void)
+{ return CurrentLB->line_start;
+}
+
+brief_location blank_brief_location;
+
+extern brief_location get_brief_location(ErrorPosition *errpos)
+{
+ brief_location loc;
+ loc.file_index = errpos->file_number;
+ loc.line_number = errpos->line_number;
+ loc.orig_file_index = errpos->orig_file;
+ loc.orig_line_number = errpos->orig_line;
+ return loc;
+}
+
+extern void export_brief_location(brief_location loc, ErrorPosition *errpos)
+{
+ if (loc.file_index != -1)
+ { errpos->file_number = loc.file_index;
+ errpos->line_number = loc.line_number;
+ errpos->main_flag = (errpos->file_number == 1);
+ errpos->orig_source = NULL;
+ errpos->orig_file = loc.orig_file_index;
+ errpos->orig_line = loc.orig_line_number;
+ errpos->orig_char = 0;
+ }
+}
+
+/* ------------------------------------------------------------------------- */
+/* Hash printing and line counting */
+/* ------------------------------------------------------------------------- */
+
+static void print_hash(void)
+{
+ /* Hash-printing is the practice of printing a # character every 100
+ lines of source code (the -x switch), reassuring the user that
+ progress is being made */
+
+ if (no_hash_printed_yet)
+ { printf("::"); no_hash_printed_yet = FALSE;
+ }
+ printf("#"); hash_printed_since_newline = TRUE;
+
+#ifndef MAC_FACE
+ /* On some systems, text output is buffered to a line at a time, and
+ this would frustrate the point of hash-printing, so: */
+
+ fflush(stdout);
+#endif
+}
+
static void reached_new_line(void)
{
    /* Called to signal that a new line has been reached in the source code */

    /* Restart the "text of the current line" buffer used in errors. */
    forerrors_pointer = 0;

    CurrentLB->source_line++;
    CurrentLB->line_start = CurrentLB->chars_read;

    total_source_line_count++;

    /* Every 100 lines: print a '#' if -x is on. */
    if (total_source_line_count%100==0)
    {   if (hash_switch) print_hash();
#ifdef MAC_MPW
        SpinCursor(32);                    /* I.e., allow other tasks to run */
#endif
    }

#ifdef MAC_FACE
    /* Periodically yield to the Mac interface; if the user has cancelled
       compilation, release everything and escape via longjmp. */
    if (total_source_line_count%((**g_pm_hndl).linespercheck) == 0)
    {   ProcessEvents (&g_proc);
        if (g_proc != true)
        {   free_arrays();
            close_all_source();
            if (temporary_files_switch)
                remove_temp_files();
            if (store_the_text)
                my_free(&all_text,"transcription text");
            abort_transcript_file();
            longjmp (g_fallback, 1);
        }
    }
#endif
}
+
+static void new_syntax_line(void)
+{ if (source_to_analyse != NULL) forerrors_pointer = 0;
+ report_errors_at_current_line();
+}
+
/* Return 10 raised to the expo power.
 *
 * The standard pow() function is avoided for a rather lame reason:
 * it's in the libmath (-lm) library, and we don't want to change the
 * build model for the compiler. So this is implemented with a stupid
 * lookup table. It's faster than pow() for small values of expo.
 * Probably not as fast if expo is 200, but "$+1e200" is an overflow
 * anyway, so that is not expected to be a problem.
 *
 * (For some reason, frexp() and ldexp(), which are used later on, do
 * not require libmath to be linked in.)
 */
static double pow10_cheap(int expo)
{
    #define POW10_RANGE (8)
    /* powers[POW10_RANGE + i] == 10^i for -POW10_RANGE <= i <= POW10_RANGE.
       Declared const: the table is read-only and must never be modified. */
    static const double powers[POW10_RANGE*2+1] = {
        0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1,
        1.0,
        10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, 10000000.0, 100000000.0
    };

    double res = 1.0;

    if (expo < 0) {
        /* Multiply down by 10^-8 until the remainder is in table range. */
        for (; expo < -POW10_RANGE; expo += POW10_RANGE) {
            res *= powers[0];
        }
        return res * powers[POW10_RANGE+expo];
    }
    else {
        /* Multiply up by 10^8 until the remainder is in table range. */
        for (; expo > POW10_RANGE; expo -= POW10_RANGE) {
            res *= powers[POW10_RANGE*2];
        }
        return res * powers[POW10_RANGE+expo];
    }
}
+
/* Return the IEEE-754 single-precision encoding of a floating-point
 * number. See http://www.psc.edu/general/software/packages/ieee/ieee.php
 * for an explanation.
 *
 * The number is provided in the pieces it was parsed in:
 *    [+|-] intv "." fracv "e" [+|-]expo
 *
 * If the magnitude is too large (beyond about 3.4e+38), this returns
 * an infinite value (0x7f800000 or 0xff800000). If the magnitude is too
 * small (below about 1e-45), this returns a zero value (0x00000000 or
 * 0x80000000). If any of the inputs are NaN, this returns NaN (but the
 * lexer should never do that).
 *
 * Note that using a float constant does *not* set the uses_float_features
 * flag (which would cause the game file to be labelled 3.1.2). There's
 * no VM feature here, just an integer. Of course, any use of the float
 * *opcodes* will set the flag.
 *
 * The math functions in this routine require #including <math.h>, but
 * they should not require linking the math library (-lm). At least,
 * they do not on OSX and Linux.
 */
static int32 construct_float(int signbit, double intv, double fracv, int expo)
{
    double absval = (intv + fracv) * pow10_cheap(expo);
    int32 sign = (signbit ? 0x80000000 : 0x0);
    double mant;
    int32 fbits;

    if (isinf(absval)) {
        return sign | 0x7f800000; /* infinity */
    }
    if (isnan(absval)) {
        return sign | 0x7fc00000; /* quiet NaN */
    }

    /* Split into mantissa and binary exponent: absval == mant * 2^expo. */
    mant = frexp(absval, &expo);

    /* Normalize mantissa to be in the range [1.0, 2.0) */
    if (0.5 <= mant && mant < 1.0) {
        mant *= 2.0;
        expo--;
    }
    else if (mant == 0.0) {
        expo = 0;
    }
    else {
        return sign | 0x7f800000; /* infinity */
    }

    if (expo >= 128) {
        return sign | 0x7f800000; /* infinity */
    }
    else if (expo < -126) {
        /* Denormalized (very small) number */
        mant = ldexp(mant, 126 + expo);
        expo = 0;
    }
    else if (!(expo == 0 && mant == 0.0)) {
        expo += 127;  /* apply the IEEE-754 single-precision exponent bias */
        mant -= 1.0;  /* Get rid of leading 1 */
    }

    mant *= 8388608.0;            /* 2^23: scale fraction to mantissa bits */
    fbits = (int32)(mant + 0.5);  /* round mant to nearest int */
    if (fbits >> 23) {
        /* The carry propagated out of a string of 23 1 bits. */
        fbits = 0;
        expo++;
        if (expo >= 255) {
            return sign | 0x7f800000; /* infinity */
        }
    }

    /* Assemble sign (bit 31), exponent (bits 30-23), mantissa (bits 22-0). */
    return (sign) | ((int32)(expo << 23)) | (fbits);
}
+
+/* ------------------------------------------------------------------------- */
+/* Characters are read via a "pipeline" of variables, allowing us to look */
+/* up to three characters ahead of the current position. */
+/* */
+/* There are two possible sources: from the source files being loaded in, */
+/* and from a string inside Inform (which is where the code for veneer */
+/* routines comes from). Each source has its own get-next-character */
+/* routine. */
+/* ------------------------------------------------------------------------- */
+/* Source 1: from files */
+/* */
+/* Note that file_load_chars(p, size) loads "size" bytes into buffer "p" */
+/* from the current input file. If the file runs out, then if it was */
+/* the last source file 4 EOF characters are placed in the buffer: if it */
+/* was only an Include file ending, then a '\n' character is placed there */
+/* (essentially to force termination of any comment line) followed by */
+/* three harmless spaces. */
+/* */
+/* The routine returns the number of characters it has written, and note */
+/* that this conveniently ensures that all characters in the buffer come */
+/* from the same file. */
+/* ------------------------------------------------------------------------- */
+
#define SOURCE_BUFFER_SIZE 4096        /* Typical disc block size */

/* One open file on the inclusion stack, together with its lexical block
   and the saved lookahead pipeline (restored when an Include ends). */
typedef struct Sourcefile_s
{   char *buffer;                      /* Input buffer */
    int read_pos;                      /* Read position in buffer */
    int size;                          /* Number of meaningful
                                          characters in buffer */
    int la, la2, la3;                  /* Three characters of
                                          lookahead pipeline */
    int file_no;                       /* Internal file number
                                          (1, 2, 3, ...) */
    LexicalBlock LB;                   /* Error-reporting context
                                          for this file */
} Sourcefile;

static Sourcefile *FileStack;          /* The stack of open files */
static int File_sp;                    /* Stack pointer */

static Sourcefile *CF;                 /* Top entry on stack */

static int last_input_file;            /* Most recently processed entry
                                          of InputFiles; used to detect
                                          newly opened Include files */
+
/* Push input file "file_no" onto the stack at depth i: load its first
   buffer-full, prime the lookahead pipeline from it (saving the including
   file's lookahead, if any), and set up its LexicalBlock. */
static void begin_buffering_file(int i, int file_no)
{   int j, cnt; uchar *p;

    if (i >= MAX_INCLUSION_DEPTH)
        memoryerror("MAX_INCLUSION_DEPTH",MAX_INCLUSION_DEPTH);

    p = (uchar *) FileStack[i].buffer;

    /* Save the including file's lookahead, to restore when this one ends. */
    if (i>0)
    {   FileStack[i-1].la = lookahead;
        FileStack[i-1].la2 = lookahead2;
        FileStack[i-1].la3 = lookahead3;
    }

    FileStack[i].file_no = file_no;
    FileStack[i].size = file_load_chars(file_no,
        (char *) p, SOURCE_BUFFER_SIZE);
    /* Prime the three-character lookahead from the start of the buffer. */
    lookahead  = source_to_iso_grid[p[0]];
    lookahead2 = source_to_iso_grid[p[1]];
    lookahead3 = source_to_iso_grid[p[2]];
    if (LOOKAHEAD_SIZE != 3)
        compiler_error
            ("Lexer lookahead size does not match hard-coded lookahead code");
    FileStack[i].read_pos = LOOKAHEAD_SIZE;

    if (file_no==1) FileStack[i].LB.main_flag = TRUE;
               else FileStack[i].LB.main_flag = FALSE;
    FileStack[i].LB.sys_flag = FALSE;
    FileStack[i].LB.source_line = 1;
    FileStack[i].LB.line_start = LOOKAHEAD_SIZE;
    FileStack[i].LB.chars_read = LOOKAHEAD_SIZE;
    FileStack[i].LB.filename = InputFiles[file_no-1].filename;
    FileStack[i].LB.file_no = file_no;
    FileStack[i].LB.orig_source = NULL; FileStack[i].LB.orig_file = 0;
    FileStack[i].LB.orig_line = 0; FileStack[i].LB.orig_char = 0;

    CurrentLB = &(FileStack[i].LB);
    CF = &(FileStack[i]);

    /* Check for recursive inclusion.  The warning fires only when cnt==1,
       i.e. on the *second* inclusion, so each file is warned about once.
       (After the loop j==i, so the name printed is this file's own --
       which by the strcmp is the same text as the duplicate's.) */
    cnt = 0;
    for (j=0; j<i; j++)
    {   if (!strcmp(FileStack[i].LB.filename, FileStack[j].LB.filename))
            cnt++;
    }
    if (cnt==1)
        warning_named("File included more than once",
            FileStack[j].LB.filename);
}
+
+static void create_char_pipeline(void)
+{
+ File_sp = 0;
+ begin_buffering_file(File_sp++, 1);
+ pipeline_made = TRUE;
+ last_input_file = current_input_file;
+}
+
/* Deliver the next source character from the file pipeline, handling
   newly opened Include files, buffer refills, and the return to an
   including file when an Include ends.  Returns 0 when input runs out
   entirely (0 maps to EOF_CODE in the tokeniser grid). */
static int get_next_char_from_pipeline(void)
{   uchar *p;

    while (last_input_file < current_input_file)
    {
        /* An "Include" file must have opened since the last character
           was read. Perhaps more than one. We run forward through the
           list and add them to the include stack. But we ignore
           "Origsource" files (which were never actually opened for
           reading). */

        last_input_file++;
        if (!InputFiles[last_input_file-1].is_input)
            continue;

        begin_buffering_file(File_sp++, last_input_file);
    }
    if (last_input_file != current_input_file)
        compiler_error("last_input_file did not match after Include");

    if (File_sp == 0)
    {   lookahead = 0; lookahead2 = 0; lookahead3 = 0; return 0;
    }

    if (CF->read_pos == CF->size)
    {   /* Buffer exhausted: load the next block of the current file. */
        CF->size =
            file_load_chars(CF->file_no, CF->buffer, SOURCE_BUFFER_SIZE);
        CF->read_pos = 0;
    }
    else
    if (CF->read_pos == -(CF->size))
    {   /* NOTE(review): a negated size appears to be how file_load_chars
           signals that the file has ended -- confirm against files.c.
           Pop the finished file and restore the including file's
           lookahead pipeline. */
        set_token_location(get_current_debug_location());
        File_sp--;
        if (File_sp == 0)
        {   lookahead = 0; lookahead2 = 0; lookahead3 = 0; return 0;
        }
        CF = &(FileStack[File_sp-1]);
        CurrentLB = &(FileStack[File_sp-1].LB);
        lookahead = CF->la; lookahead2 = CF->la2; lookahead3 = CF->la3;
        if (CF->read_pos == CF->size)
        {   CF->size =
                file_load_chars(CF->file_no, CF->buffer, SOURCE_BUFFER_SIZE);
            CF->read_pos = 0;
        }
        set_token_location(get_current_debug_location());
    }

    p = (uchar *) (CF->buffer);

    /* Advance the pipeline by one character. */
    current = lookahead;
    lookahead = lookahead2;
    lookahead2 = lookahead3;
    lookahead3 = source_to_iso_grid[p[CF->read_pos++]];

    CurrentLB->chars_read++;
    /* Accumulate the text of the current line for error messages. */
    if (forerrors_pointer < 511)
        forerrors_buff[forerrors_pointer++] = current;
    if (current == '\n') reached_new_line();
    return(current);
}
+
+/* ------------------------------------------------------------------------- */
+/* Source 2: from a string */
+/* ------------------------------------------------------------------------- */
+
static int source_to_analyse_pointer;            /*  Current read position  */

/* Deliver the next character when lexing from an internal string
   (source_to_analyse), refreshing the whole lookahead pipeline each call.
   A zero byte terminates the string; lookahead beyond it is forced to 0
   so the pipeline never reads past the terminator. */
static int get_next_char_from_string(void)
{   uchar *p = (uchar *) source_to_analyse + source_to_analyse_pointer++;
    current = source_to_iso_grid[p[0]];

    if (current == 0)    lookahead  = 0;
                    else lookahead  = source_to_iso_grid[p[1]];
    if (lookahead == 0)  lookahead2 = 0;
                    else lookahead2 = source_to_iso_grid[p[2]];
    if (lookahead2 == 0) lookahead3 = 0;
                    else lookahead3 = source_to_iso_grid[p[3]];

    CurrentLB->chars_read++;
    /* Accumulate the text of the current line for error messages. */
    if (forerrors_pointer < 511)
        forerrors_buff[forerrors_pointer++] = current;
    if (current == '\n') reached_new_line();
    return(current);
}
+
+/* ========================================================================= */
+/* The interface between the lexer and Inform's higher levels: */
+/* */
+/* put_token_back() (effectively) move the read position */
+/* back by one token */
+/* */
+/* get_next_token() copy the token at the current read */
+/* position into the triple */
+/* (token_type, token_value, token_text) */
+/* and move the read position forward */
+/* by one */
+/* */
+/* restart_lexer(source, name) if source is NULL, initialise the lexer */
+/* to read from source files; */
+/* otherwise, to read from this string. */
+/* ------------------------------------------------------------------------- */
+
+extern void put_token_back(void)
+{ tokens_put_back++;
+
+ if (tokens_trace_level > 0)
+ { if (tokens_trace_level == 1) printf("<- ");
+ else printf("<-\n");
+ }
+
+ /* The following error, of course, should never happen! */
+
+ if (tokens_put_back == CIRCLE_SIZE)
+ { compiler_error("The lexical analyser has collapsed because of a wrong \
+assumption inside Inform");
+ tokens_put_back--;
+ return;
+ }
+}
+
/* Read the next token from the input, leaving it in the globals
   (token_type, token_value, token_text).  Tokens are stored in the
   "circle" ring buffer so that up to CIRCLE_SIZE-1 of them can be put
   back; their text lives in lexeme_memory, to which lex_p points. */
extern void get_next_token(void)
{   int d, i, j, k, quoted_size, e, radix, context; int32 n; char *r;
    int returning_a_put_back_token = TRUE;

    context = lexical_context();

    /* If tokens have been put back, re-issue the oldest one.  If the
       lexical context has changed since it was first read, identifiers
       are re-interpreted against the new context. */
    if (tokens_put_back > 0)
    {   i = circle_position - tokens_put_back + 1;
        if (i<0) i += CIRCLE_SIZE;
        tokens_put_back--;
        if (context != token_contexts[i])
        {   j = circle[i].type;
            if ((j==0) || ((j>=100) && (j<200)))
                interpret_identifier(i, FALSE);
        }
        goto ReturnBack;
    }
    returning_a_put_back_token = FALSE;

    /* Advance around the token circle, wrapping at the end. */
    if (circle_position == CIRCLE_SIZE-1) circle_position = 0;
    else circle_position++;

    /* Recycle lexeme memory once near its end (the 5*MAX_QTEXT_SIZE
       allocation leaves room for the largest token past this mark). */
    if (lex_p > lexeme_memory + 4*MAX_QTEXT_SIZE)
        lex_p = lexeme_memory;

    circle[circle_position].text = lex_p;
    circle[circle_position].value = 0;
    *lex_p = 0;

    StartTokenAgain:
    d = (*get_next_char)();
    e = tokeniser_grid[d];

    /* The first substantive token on a line begins a new syntax line. */
    if (next_token_begins_syntax_line)
    {   if ((e != WHITESPACE_CODE) && (e != COMMENT_CODE))
        {   new_syntax_line();
            next_token_begins_syntax_line = FALSE;
        }
    }

    circle[circle_position].location = get_current_debug_location();

    /* Dispatch on the tokeniser grid entry for the first character. */
    switch(e)
    {   case 0: char_error("Illegal character found in source:", d);
        goto StartTokenAgain;

        case WHITESPACE_CODE:         /* Skip a run of whitespace */
        while (tokeniser_grid[lookahead] == WHITESPACE_CODE)
            (*get_next_char)();
        goto StartTokenAgain;

        case COMMENT_CODE:            /* Skip a '!' comment to end of line */
        while ((lookahead != '\n') && (lookahead != 0))
            (*get_next_char)();
        goto StartTokenAgain;

        case EOF_CODE:
        circle[circle_position].type = EOF_TT;
        strcpy(lex_p, "<end of file>");
        lex_p += strlen(lex_p) + 1;
        break;

        case DIGIT_CODE:              /* Decimal number */
        radix = 10;
        ReturnNumber:                 /* Entered with radix 2/16 from '$' */
        n=0;
        do
        {   n = n*radix + character_digit_value[d];
            *lex_p++ = d;
        } while ((character_digit_value[lookahead] < radix)
                 && (d = (*get_next_char)(), TRUE));

        *lex_p++ = 0;
        circle[circle_position].type = NUMBER_TT;
        circle[circle_position].value = n;
        break;

        FloatNumber:                  /* Entered from '$+'/'$-' below:
                                         [+|-] digits [. digits] [e[+|-]digits] */
        {   int expo=0; double intv=0, fracv=0;
            int expocount=0, intcount=0, fraccount=0;
            int signbit = (d == '-');
            *lex_p++ = d;
            /* Integer part */
            while (character_digit_value[lookahead] < 10) {
                intv = 10.0*intv + character_digit_value[lookahead];
                intcount++;
                *lex_p++ = lookahead;
                (*get_next_char)();
            }
            /* Fractional part */
            if (lookahead == '.') {
                double fracpow = 1.0;
                *lex_p++ = lookahead;
                (*get_next_char)();
                while (character_digit_value[lookahead] < 10) {
                    fracpow *= 0.1;
                    fracv = fracv + fracpow*character_digit_value[lookahead];
                    fraccount++;
                    *lex_p++ = lookahead;
                    (*get_next_char)();
                }
            }
            /* Exponent part */
            if (lookahead == 'e' || lookahead == 'E') {
                int exposign = 0;
                *lex_p++ = lookahead;
                (*get_next_char)();
                if (lookahead == '+' || lookahead == '-') {
                    exposign = (lookahead == '-');
                    *lex_p++ = lookahead;
                    (*get_next_char)();
                }
                while (character_digit_value[lookahead] < 10) {
                    expo = 10*expo + character_digit_value[lookahead];
                    expocount++;
                    *lex_p++ = lookahead;
                    (*get_next_char)();
                }
                if (expocount == 0)
                    error("Floating-point literal must have digits after the 'e'");
                if (exposign) { expo = -expo; }
            }
            if (intcount + fraccount == 0)
                error("Floating-point literal must have digits");
            /* Pack the pieces into an IEEE-754 single-precision value. */
            n = construct_float(signbit, intv, fracv, expo);
        }
        *lex_p++ = 0;
        circle[circle_position].type = NUMBER_TT;
        circle[circle_position].value = n;
        if (!glulx_mode && dont_enter_into_symbol_table != -2) error("Floating-point literals are not available in Z-code");
        break;

        case RADIX_CODE:              /* '$' hex, '$$' binary, '$+/-' float */
        radix = 16; d = (*get_next_char)();
        if (d == '-' || d == '+') { goto FloatNumber; }
        if (d == '$') { d = (*get_next_char)(); radix = 2; }
        if (character_digit_value[d] >= radix)
        {   if (radix == 2)
                error("Binary number expected after '$$'");
            else
                error("Hexadecimal number expected after '$'");
        }
        goto ReturnNumber;

        case QUOTE_CODE:     /* Single-quotes: scan a literal string */
        quoted_size=0;
        do
        {   e = d; d = (*get_next_char)(); *lex_p++ = d;
            if (quoted_size++==64)
            {   error(
                "Too much text for one pair of quotations '...' to hold");
                *lex_p='\''; break;
            }
            /* A closing quote, unless preceded by '@' (an escape). */
            if ((d == '\'') && (e != '@'))
            {   if (quoted_size == 1)
                {   d = (*get_next_char)(); *lex_p++ = d;
                    if (d != '\'')
                        error("No text between quotation marks ''");
                }
                break;
            }
        } while (d != EOF);
        if (d==EOF) ebf_error("'\''", "end of file");
        *(lex_p-1) = 0;
        circle[circle_position].type = SQ_TT;
        break;

        case DQUOTE_CODE:    /* Double-quotes: scan a literal string */
        quoted_size=0;
        do
        {   d = (*get_next_char)(); *lex_p++ = d;
            if (quoted_size++==MAX_QTEXT_SIZE)
            {   memoryerror("MAX_QTEXT_SIZE", MAX_QTEXT_SIZE);
                break;
            }
            if (d == '\n')
            {   /* Fold the newline and following indentation into a
                   single space (none after a '^' line break). */
                lex_p--;
                while (*(lex_p-1) == ' ') lex_p--;
                if (*(lex_p-1) != '^') *lex_p++ = ' ';
                while ((lookahead != EOF) &&
                       (tokeniser_grid[lookahead] == WHITESPACE_CODE))
                    (*get_next_char)();
            }
            else if (d == '\\')
            {   /* '\' discards the rest of the line and the next line's
                   indentation, joining the text with no space. */
                int newline_passed = FALSE;
                lex_p--;
                while ((lookahead != EOF) &&
                       (tokeniser_grid[lookahead] == WHITESPACE_CODE))
                    if ((d = (*get_next_char)()) == '\n')
                        newline_passed = TRUE;
                if (!newline_passed)
                {   char chb[4];
                    chb[0] = '\"'; chb[1] = lookahead;
                    chb[2] = '\"'; chb[3] = 0;
                    ebf_error("empty rest of line after '\\' in string",
                        chb);
                }
            }
        } while ((d != EOF) && (d!='\"'));
        if (d==EOF) ebf_error("'\"'", "end of file");
        *(lex_p-1) = 0;
        circle[circle_position].type = DQ_TT;
        break;

        case IDENTIFIER_CODE:    /* Letter or underscore: an identifier */

        *lex_p++ = d; n=1;
        while ((n<=MAX_IDENTIFIER_LENGTH)
               && ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
                   || (tokeniser_grid[lookahead] == DIGIT_CODE)))
            n++, *lex_p++ = (*get_next_char)();

        *lex_p++ = 0;

        if (n > MAX_IDENTIFIER_LENGTH)
        {   char bad_length[100];
            sprintf(bad_length,
                "Name exceeds the maximum length of %d characters:",
                MAX_IDENTIFIER_LENGTH);
            error_named(bad_length, circle[circle_position].text);
            /* Trim token so that it doesn't violate
               MAX_IDENTIFIER_LENGTH during error recovery */
            circle[circle_position].text[MAX_IDENTIFIER_LENGTH] = 0;
        }

        /* When dont_enter_into_symbol_table is set, names come back as
           if double-quoted; -2 additionally checks the keyword table. */
        if (dont_enter_into_symbol_table)
        {   circle[circle_position].type = DQ_TT;
            circle[circle_position].value = 0;
            if (dont_enter_into_symbol_table == -2)
                interpret_identifier(circle_position, TRUE);
            break;
        }

        interpret_identifier(circle_position, FALSE);
        break;

        default:

        /* The character is initial to at least one of the separators.
           e encodes (first index)*16 + (count) into separators[]; try
           each candidate, matching 1-, 2- and 3-character texts against
           the current character and the lookahead pipeline. */

        for (j=e>>4, k=j+(e&0x0f); j<k; j++)
        {   r = (char *) separators[j];
            if (r[1]==0)
            {   *lex_p++=d; *lex_p++=0;
                goto SeparatorMatched;
            }
            else
            if (r[2]==0)
            {   if (*(r+1) == lookahead)
                {   *lex_p++=d;
                    *lex_p++=(*get_next_char)();
                    *lex_p++=0;
                    goto SeparatorMatched;
                }
            }
            else
            {   if ((*(r+1) == lookahead) && (*(r+2) == lookahead2))
                {   *lex_p++=d;
                    *lex_p++=(*get_next_char)();
                    *lex_p++=(*get_next_char)();
                    *lex_p++=0;
                    goto SeparatorMatched;
                }
            }
        }

        /* The following contingency never in fact arises with the
           current set of separators, but might in future */

        *lex_p++ = d; *lex_p++ = lookahead; *lex_p++ = lookahead2;
        *lex_p++ = 0;
        error_named("Unrecognised combination in source:", lex_p);
        goto StartTokenAgain;

        SeparatorMatched:

        circle[circle_position].type = SEP_TT;
        circle[circle_position].value = j;
        switch(j)
        {   case SEMICOLON_SEP: break;
            /* "#n$", "#w$": a single character follows the separator. */
            case HASHNDOLLAR_SEP:
            case HASHWDOLLAR_SEP:
            if (tokeniser_grid[lookahead] == WHITESPACE_CODE)
            {   error_named("Character expected after",
                    circle[circle_position].text);
                break;
            }
            lex_p--;
            *lex_p++ = (*get_next_char)();
            while ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
                   || (tokeniser_grid[lookahead] == DIGIT_CODE))
                *lex_p++ = (*get_next_char)();
            *lex_p++ = 0;
            break;
            /* "#a$", "#g$", "#r$", "##": an identifier follows. */
            case HASHADOLLAR_SEP:
            case HASHGDOLLAR_SEP:
            case HASHRDOLLAR_SEP:
            case HASHHASH_SEP:
            if (tokeniser_grid[lookahead] != IDENTIFIER_CODE)
            {   error_named("Alphabetic character expected after",
                    circle[circle_position].text);
                break;
            }
            lex_p--;
            while ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
                   || (tokeniser_grid[lookahead] == DIGIT_CODE))
                *lex_p++ = (*get_next_char)();
            *lex_p++ = 0;
            break;
        }
        break;
    }

    i = circle_position;

    ReturnBack:
    /* Publish the token in the interface globals. */
    token_value = circle[i].value;
    token_type = circle[i].type;
    token_text = circle[i].text;
    if (!returning_a_put_back_token)
    {   set_token_location(circle[i].location);
    }
    token_contexts[i] = context;

    /* Optional token tracing (the -k style switches). */
    if (tokens_trace_level > 0)
    {   if (tokens_trace_level == 1)
            printf("'%s' ", circle[i].text);
        else
        {   printf("-> "); describe_token(circle[i]);
            printf(" ");
            if (tokens_trace_level > 2) print_context(token_contexts[i]);
            printf("\n");
        }
    }
}
+
static char veneer_error_title[64];    /* Pseudo-filename shown in errors
                                          while lexing a veneer routine */

/* Reset the lexer.  If lexical_source is NULL, read from the source
   files (building the character pipeline if it does not exist yet);
   otherwise read from the given string, reporting errors under the
   given routine name. */
extern void restart_lexer(char *lexical_source, char *name)
{   int i;
    circle_position = 0;
    /* Invalidate the whole token circle. */
    for (i=0; i<CIRCLE_SIZE; i++)
    {   circle[i].type = 0;
        circle[i].value = 0;
        circle[i].text = "(if this is ever visible, there is a bug)";
        token_contexts[i] = 0;
    }

    lex_p = lexeme_memory;
    tokens_put_back = 0;
    forerrors_pointer = 0;
    dont_enter_into_symbol_table = FALSE;
    return_sp_as_variable = FALSE;
    next_token_begins_syntax_line = TRUE;

    source_to_analyse = lexical_source;

    if (source_to_analyse == NULL)
    {   get_next_char = get_next_char_from_pipeline;
        if (!pipeline_made) create_char_pipeline();
        forerrors_buff[0] = 0; forerrors_pointer = 0;
    }
    else
    {   get_next_char = get_next_char_from_string;
        source_to_analyse_pointer = 0;
        CurrentLB = &StringLB;
        sprintf(veneer_error_title, "<veneer routine '%s'>", name);
        StringLB.filename = veneer_error_title;

        CurrentLB->source_line = 1;
        CurrentLB->line_start = 0;
        CurrentLB->chars_read = 0;
    }
}
+
+/* ========================================================================= */
+/* Data structure management routines */
+/* ------------------------------------------------------------------------- */
+
+extern void init_lexer_vars(void)
+{
+ blank_brief_location.file_index = -1;
+ blank_brief_location.line_number = 0;
+ blank_brief_location.orig_file_index = 0;
+ blank_brief_location.orig_line_number = 0;
+}
+
+extern void lexer_begin_prepass(void)
+{ total_source_line_count = 0;
+ CurrentLB = &NoFileOpen;
+ report_errors_at_current_line();
+}
+
+extern void lexer_begin_pass(void)
+{ no_hash_printed_yet = TRUE;
+ hash_printed_since_newline = FALSE;
+
+ pipeline_made = FALSE;
+
+ restart_lexer(NULL, NULL);
+}
+
+extern void lexer_endpass(void)
+{ CurrentLB = &MakingOutput;
+ report_errors_at_current_line();
+}
+
/* Allocate all of the lexer's working storage, build the tokeniser grid
   and keyword tables, and set up the first token-location record. */
extern void lexer_allocate_arrays(void)
{   int i;

    FileStack = my_malloc(MAX_INCLUSION_DEPTH*sizeof(Sourcefile),
        "filestack buffer");

    /* +4 gives file_load_chars room for its end-of-file padding. */
    for (i=0; i<MAX_INCLUSION_DEPTH; i++)
    FileStack[i].buffer = my_malloc(SOURCE_BUFFER_SIZE+4, "source file buffer");

    /* 5*MAX_QTEXT_SIZE: get_next_token() recycles lex_p once it passes
       the 4*MAX_QTEXT_SIZE mark, leaving room for one maximal token. */
    lexeme_memory = my_malloc(5*MAX_QTEXT_SIZE, "lexeme memory");

    keywords_hash_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
        "keyword hash table");
    keywords_hash_ends_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
        "keyword hash end table");
    keywords_data_table = my_calloc(sizeof(int), 3*MAX_KEYWORDS,
        "keyword hashing linked list");
    local_variable_hash_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
        "local variable hash table");
    local_variable_text_table = my_malloc(
        (MAX_LOCAL_VARIABLES-1)*(MAX_IDENTIFIER_LENGTH+1),
        "text of local variable names");

    local_variable_hash_codes = my_calloc(sizeof(int), MAX_LOCAL_VARIABLES,
        "local variable hash codes");
    local_variable_texts = my_calloc(sizeof(char *), MAX_LOCAL_VARIABLES,
        "local variable text pointers");

    make_tokeniser_grid();
    make_keywords_tables();

    /* The list of token locations starts with a single zeroed record. */
    first_token_locations =
        my_malloc(sizeof(debug_locations), "debug locations of recent tokens");
    first_token_locations->location.file_index = 0;
    first_token_locations->location.beginning_byte_index = 0;
    first_token_locations->location.end_byte_index = 0;
    first_token_locations->location.beginning_line_number = 0;
    first_token_locations->location.end_line_number = 0;
    first_token_locations->location.beginning_character_number = 0;
    first_token_locations->location.end_character_number = 0;
    first_token_locations->location.orig_file_index = 0;
    first_token_locations->location.orig_beg_line_number = 0;
    first_token_locations->location.orig_beg_char_number = 0;
    first_token_locations->next = NULL;
    first_token_locations->reference_count = 0;
    last_token_location = first_token_locations;
}
+
/* Release all storage obtained by lexer_allocate_arrays(). */
extern void lexer_free_arrays(void)
{   int i; char *p;

    for (i=0; i<MAX_INCLUSION_DEPTH; i++)
    {   p = FileStack[i].buffer;
        my_free(&p, "source file buffer");
    }
    my_free(&FileStack, "filestack buffer");
    my_free(&lexeme_memory, "lexeme memory");

    my_free(&keywords_hash_table, "keyword hash table");
    my_free(&keywords_hash_ends_table, "keyword hash end table");
    my_free(&keywords_data_table, "keyword hashing linked list");
    my_free(&local_variable_hash_table, "local variable hash table");
    my_free(&local_variable_text_table, "text of local variable names");

    my_free(&local_variable_hash_codes, "local variable hash codes");
    my_free(&local_variable_texts, "local variable text pointers");

    /* Passing NULL releases the entire token-location list. */
    cleanup_token_locations(NULL);
}
+
+/* ========================================================================= */