1 /* ------------------------------------------------------------------------- */
2 /* "lexer" : Lexical analyser */
4 /* Part of Inform 6.40 */
5 /* copyright (c) Graham Nelson 1993 - 2022 */
7 /* Inform is free software: you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation, either version 3 of the License, or */
10 /* (at your option) any later version. */
12 /* Inform is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with Inform. If not, see https://gnu.org/licenses/ */
20 /* ------------------------------------------------------------------------- */
24 int total_source_line_count, /* Number of source lines so far */
26 no_hash_printed_yet, /* Have not yet printed the first # */
27 hash_printed_since_newline, /* A hash has been printed since the
28 most recent new-line was printed
29 (generally as a result of an error
30 message or the start of pass) */
31 dont_enter_into_symbol_table, /* Return names as text (with
32 token type DQ_TT, i.e., as if
33 they had double-quotes around)
34 and not as entries in the symbol
35 table, when TRUE. If -2, only the
36 keyword table is searched. */
37 return_sp_as_variable; /* When TRUE, the word "sp" denotes
38 the stack pointer variable
39 (used in assembly language only) */
40 int next_token_begins_syntax_line; /* When TRUE, start a new syntax
41 line (for error reporting, etc.)
42 on the source code line where
43 the next token appears */
45 int32 last_mapped_line; /* Last syntax line reported to debugging file */
47 /* ------------------------------------------------------------------------- */
48 /* The lexer's output is a sequence of structs, each called a "token", */
49 /* representing one lexical unit (or "lexeme") each. Instead of providing */
50 /* "lookahead" (that is, always having available the next token after the */
51 /* current one, so that syntax analysers higher up in Inform can have */
52 /* advance knowledge of what is coming), the lexer instead has a system */
53 /* where tokens can be read in and then "put back again". */
54 /* The meaning of the number (and to some extent the text) supplied with */
55 /* a token depends on its type: see "header.h" for the list of types. */
56 /* For example, the lexeme "$1e3" is understood by Inform as a hexadecimal */
57 /* number, and translated to the token: */
58 /* type NUMBER_TT, value 483, text "$1e3" */
59 /* ------------------------------------------------------------------------- */
60 /* These three variables are set to the current token on a call to */
61 /* get_next_token() (but are not changed by a call to put_token_back()). */
62 /* (It would be tidier to use a token_data structure, rather than having */
63 /* get_next_token() unpack three values. But this is the way it is.) */
64 /* ------------------------------------------------------------------------- */
70 /* ------------------------------------------------------------------------- */
71 /* The next two variables are the head and tail of a singly linked list. */
72 /* The tail stores the portion most recently read from the current */
73 /* lexical block; its end values therefore describe the location of the */
74 /* current token, and are updated whenever the three variables above are */
75 /* via set_token_location(...). Earlier vertices, if any, represent the */
76 /* regions of lexical blocks read beforehand, where new vertices are */
77 /* only introduced by interruptions like a file inclusion or an EOF. */
78 /* Vertices are deleted off of the front of the list once they are no */
79 /* longer referenced by pending debug information records. */
80 /* ------------------------------------------------------------------------- */
82 static debug_locations *first_token_locations;
83 static debug_locations *last_token_location;
85 extern debug_location get_token_location(void)
86 { debug_location result;
87 debug_location *location = &(last_token_location->location);
88 result.file_index = location->file_index;
89 result.beginning_byte_index = location->end_byte_index;
90 result.end_byte_index = location->end_byte_index;
91 result.beginning_line_number = location->end_line_number;
92 result.end_line_number = location->end_line_number;
93 result.beginning_character_number = location->end_character_number;
94 result.end_character_number = location->end_character_number;
95 result.orig_file_index = location->orig_file_index;
96 result.orig_beg_line_number = location->orig_beg_line_number;
97 result.orig_beg_char_number = location->orig_beg_char_number;
101 extern debug_locations get_token_locations(void)
102 { debug_locations result;
103 result.location = get_token_location();
105 result.reference_count = 0;
109 static void set_token_location(debug_location location)
110 { if (location.file_index == last_token_location->location.file_index)
111 { last_token_location->location.end_byte_index =
112 location.end_byte_index;
113 last_token_location->location.end_line_number =
114 location.end_line_number;
115 last_token_location->location.end_character_number =
116 location.end_character_number;
117 last_token_location->location.orig_file_index =
118 location.orig_file_index;
119 last_token_location->location.orig_beg_line_number =
120 location.orig_beg_line_number;
121 last_token_location->location.orig_beg_char_number =
122 location.orig_beg_char_number;
124 { debug_locations*successor =
126 (sizeof(debug_locations),
127 "debug locations of recent tokens");
128 successor->location = location;
129 successor->next = NULL;
130 successor->reference_count = 0;
131 last_token_location->next = successor;
132 last_token_location = successor;
136 extern debug_location_beginning get_token_location_beginning(void)
137 { debug_location_beginning result;
138 ++(last_token_location->reference_count);
139 result.head = last_token_location;
140 result.beginning_byte_index =
141 last_token_location->location.end_byte_index;
142 result.beginning_line_number =
143 last_token_location->location.end_line_number;
144 result.beginning_character_number =
145 last_token_location->location.end_character_number;
146 result.orig_file_index = last_token_location->location.orig_file_index;
147 result.orig_beg_line_number = last_token_location->location.orig_beg_line_number;
148 result.orig_beg_char_number = last_token_location->location.orig_beg_char_number;
153 static void cleanup_token_locations(debug_location_beginning*beginning)
154 { if (first_token_locations)
155 { while (first_token_locations &&
156 !first_token_locations->reference_count)
157 { debug_locations*moribund = first_token_locations;
158 first_token_locations = moribund->next;
159 my_free(&moribund, "debug locations of recent tokens");
161 (beginning->head == moribund || !first_token_locations))
163 ("Records needed by a debug_location_beginning are no "
164 "longer allocated, perhaps because of an invalid reuse "
165 "of this or an earlier beginning");
171 ("Attempt to use a debug_location_beginning when no token "
172 "locations are defined");
175 ("Attempt to clean up token locations when no token locations "
181 extern void discard_token_location(debug_location_beginning beginning)
182 { --(beginning.head->reference_count);
185 extern debug_locations get_token_location_end
186 (debug_location_beginning beginning)
187 { debug_locations result;
188 cleanup_token_locations(&beginning);
189 --(beginning.head->reference_count);
190 /* Sometimes we know what we'll read before we switch to the lexical block
191 where we'll read it. In such cases the beginning will be placed in the
192 prior block and last exactly zero bytes there. It's misleading to
193 include such ranges, so we gobble them. */
194 if (beginning.head->location.end_byte_index ==
195 beginning.beginning_byte_index &&
196 beginning.head->next)
197 { beginning.head = beginning.head->next;
198 result.location = beginning.head->location;
199 result.location.beginning_byte_index = 0;
200 result.location.beginning_line_number = 1;
201 result.location.beginning_character_number = 1;
203 { result.location = beginning.head->location;
204 result.location.beginning_byte_index =
205 beginning.beginning_byte_index;
206 result.location.beginning_line_number =
207 beginning.beginning_line_number;
208 result.location.beginning_character_number =
209 beginning.beginning_character_number;
212 result.location.orig_file_index =
213 beginning.orig_file_index;
214 result.location.orig_beg_line_number =
215 beginning.orig_beg_line_number;
216 result.location.orig_beg_char_number =
217 beginning.orig_beg_char_number;
219 result.next = beginning.head->next;
220 result.reference_count = 0;
224 /* ------------------------------------------------------------------------- */
225 /* In order to be able to put tokens back efficiently, the lexer stores */
226 /* tokens in a "circle": the variable circle_position ranges between */
227 /* 0 and CIRCLE_SIZE-1. We only need a circle size as large as the */
228 /* maximum number of tokens ever put back at once, plus 1 (in effect, the */
229 /* maximum token lookahead ever needed in syntax analysis, plus 1). */
231 /* Note that the circle struct type is lexeme_data, whereas the expression */
232 /* code all works in token_data. They have slightly different needs. The */
233 /* data is exported through the token_text, token_value, token_type */
234 /* globals, so there's no need to use the same struct at both levels. */
236 /* Unlike some compilers, Inform does not have a context-free lexer: in */
237 /* fact it has 12288 different possible states. However, the context only */
238 /* affects the interpretation of "identifiers": lexemes beginning with a */
239 /* letter and containing up to 32 chars of alphanumeric and underscore */
240 /* chars. (For example, "default" may refer to the directive or statement */
241 /* of that name, and which token values are returned depends on the */
242 /* current lexical context.) */
244 /* Along with each token, we also store the lexical context it was */
245 /* translated under; because if it is called for again, there may need */
246 /* to be a fresh interpretation of it if the context has changed. */
247 /* ------------------------------------------------------------------------- */
249 #define CIRCLE_SIZE 6
251 /* (The worst case for token lookahead is distinguishing between an
252 old-style "objectloop (a in b)" and a new "objectloop (a in b ...)".) */
254 static int circle_position;
255 static lexeme_data circle[CIRCLE_SIZE];
257 /* ------------------------------------------------------------------------- */
258 /* A complication, however, is that the text of some lexemes needs to be */
259 /* held in Inform's memory for much longer periods: for example, a */
260 /* dictionary word lexeme (like "'south'") must have its text preserved */
261 /* until the code generation time for the expression it occurs in, when */
262 /* the dictionary reference is actually made. We handle this by keeping */
263 /* all lexeme text until the end of the statement (or, for top-level */
264 /* directives, until the end of the directive). Then we call */
265 /* release_token_texts() to start over. The lextexts array will therefore */
266 /* grow to the largest number of lexemes in a single statement or */
268 /* ------------------------------------------------------------------------- */
270 typedef struct lextext_s {
272 size_t size; /* Allocated size (including terminal null)
273 This is always at least MAX_IDENTIFIER_LENGTH+1 */
276 static lextext *lextexts; /* Allocated to no_lextexts */
277 static memory_list lextexts_memlist;
278 static int no_lextexts;
280 static int cur_lextexts; /* Number of lextexts in current use
281 (cur_lextexts <= no_lextexts) */
283 static int lex_index; /* Index of lextext being written to */
284 static int lex_pos; /* Current write position in that lextext */
/* ------------------------------------------------------------------------- */
/*   The lexer itself needs up to 3 characters of lookahead (it uses an      */
/*   LR(3) grammar to translate characters into tokens).                     */
/* ------------------------------------------------------------------------- */

#define LOOKAHEAD_SIZE 3

static int current, lookahead,          /* The latest character read, and    */
    lookahead2, lookahead3;             /* the three characters following it */

static int pipeline_made;               /* Whether or not the pipeline of
                                           characters has been constructed
                                           yet (this pass)                   */

static int (* get_next_char)(void);     /* Routine for reading the stream of
                                           characters: the lexer does not
                                           need any "ungetc" routine for
                                           putting them back again.  End of
                                           stream is signalled by returning
                                           zero.                             */

static char *source_to_analyse;         /* The current lexical source:
                                           NULL for "load from source files",
                                           otherwise this points to a string
                                           containing Inform code            */

static int tokens_put_back;             /* Count of the number of backward
                                           moves made from the last-read
                                           token                             */
316 /* This gets called for both token_data and lexeme_data structs. It prints
317 a description of the common part (the text, value, type fields).
319 extern void describe_token_triple(const char *text, int32 value, int type)
321 /* Many of the token types are not set in this file, but later on in
322 Inform's higher stages (for example, in the expression evaluator);
323 but this routine describes them all. */
329 /* The following token types occur in lexer output: */
331 case SYMBOL_TT: printf("symbol ");
332 describe_symbol(value);
334 case NUMBER_TT: printf("literal number %d", value);
336 case DQ_TT: printf("string \"%s\"", text);
338 case SQ_TT: printf("string '%s'", text);
340 case SEP_TT: printf("separator '%s'", text);
342 case EOF_TT: printf("end of file");
345 case STATEMENT_TT: printf("statement name '%s'", text);
347 case SEGMENT_MARKER_TT: printf("object segment marker '%s'", text);
349 case DIRECTIVE_TT: printf("directive name '%s'", text);
351 case CND_TT: printf("textual conditional '%s'", text);
353 case OPCODE_NAME_TT: printf("opcode name '%s'", text);
355 case SYSFUN_TT: printf("built-in function name '%s'", text);
357 case LOCAL_VARIABLE_TT: printf("local variable name '%s'", text);
359 case MISC_KEYWORD_TT: printf("statement keyword '%s'", text);
361 case DIR_KEYWORD_TT: printf("directive keyword '%s'", text);
363 case TRACE_KEYWORD_TT: printf("'trace' keyword '%s'", text);
365 case SYSTEM_CONSTANT_TT: printf("system constant name '%s'", text);
368 /* The remaining are etoken types, not set by the lexer */
370 case OP_TT: printf("operator '%s'",
371 operators[value].description);
373 case ENDEXP_TT: printf("end of expression");
375 case SUBOPEN_TT: printf("open bracket");
377 case SUBCLOSE_TT: printf("close bracket");
379 case LARGE_NUMBER_TT: printf("large number: '%s'=%d",text,value);
381 case SMALL_NUMBER_TT: printf("small number: '%s'=%d",text,value);
383 case VARIABLE_TT: printf("variable '%s'=%d", text, value);
385 case DICTWORD_TT: printf("dictionary word '%s'", text);
387 case ACTION_TT: printf("action name '%s'", text);
391 printf("** unknown token type %d, text='%s', value=%d **",
397 /* ------------------------------------------------------------------------- */
398 /* All but one of the Inform keywords (most of them opcode names used */
399 /* only by the assembler). (The one left over is "sp", a keyword used in */
400 /* assembly language only.) */
402 /* A "keyword group" is a set of keywords to be searched for. If a match */
403 /* is made on an identifier, the token type becomes that given in the KG */
404 /* and the token value is its index in the KG. */
406 /* The keyword ordering must correspond with the appropriate #define's in */
407 /* "header.h" but is otherwise not significant. */
408 /* ------------------------------------------------------------------------- */
/* This must exceed the total number of keywords across all groups,
   including opcodes. */
#define MAX_KEYWORDS (350)
414 /* The values will be filled in at compile time, when we know
415 which opcode set to use. */
416 keyword_group opcode_names =
418 OPCODE_NAME_TT, FALSE, TRUE
/* The Z-machine opcode names, terminated by an empty string. */
static char *opcode_list_z[] = {
    "je", "jl", "jg", "dec_chk", "inc_chk", "jin", "test", "or", "and",
    "test_attr", "set_attr", "clear_attr", "store", "insert_obj", "loadw",
    "loadb", "get_prop", "get_prop_addr", "get_next_prop", "add", "sub",
    "mul", "div", "mod", "call", "storew", "storeb", "put_prop", "sread",
    "print_char", "print_num", "random", "push", "pull", "split_window",
    "set_window", "output_stream", "input_stream", "sound_effect", "jz",
    "get_sibling", "get_child", "get_parent", "get_prop_len", "inc", "dec",
    "print_addr", "remove_obj", "print_obj", "ret", "jump", "print_paddr",
    "load", "not", "rtrue", "rfalse", "print", "print_ret", "nop", "save",
    "restore", "restart", "ret_popped", "pop", "quit", "new_line",
    "show_status", "verify", "call_2s", "call_vs", "aread", "call_vs2",
    "erase_window", "erase_line", "set_cursor", "get_cursor",
    "set_text_style", "buffer_mode", "read_char", "scan_table", "call_1s",
    "call_2n", "set_colour", "throw", "call_vn", "call_vn2", "tokenise",
    "encode_text", "copy_table", "print_table", "check_arg_count", "call_1n",
    "catch", "piracy", "log_shift", "art_shift", "set_font", "save_undo",
    "restore_undo", "draw_picture", "picture_data", "erase_picture",
    "set_margins", "move_window", "window_size", "window_style",
    "get_wind_prop", "scroll_window", "pop_stack", "read_mouse",
    "mouse_window", "push_stack", "put_wind_prop", "print_form",
    "make_menu", "picture_table", "print_unicode", "check_unicode",
    "set_true_colour", "buffer_screen", ""
};
/* The Glulx opcode names, terminated by an empty string. */
static char *opcode_list_g[] = {
    "nop", "add", "sub", "mul", "div", "mod", "neg", "bitand", "bitor",
    "bitxor", "bitnot", "shiftl", "sshiftr", "ushiftr", "jump", "jz",
    "jnz", "jeq", "jne", "jlt", "jge", "jgt", "jle",
    "jltu", "jgeu", "jgtu", "jleu",
    "call", "return",
    "catch", "throw", "tailcall",
    "copy", "copys", "copyb", "sexs", "sexb", "aload",
    "aloads", "aloadb", "aloadbit", "astore", "astores", "astoreb",
    "astorebit", "stkcount", "stkpeek", "stkswap", "stkroll", "stkcopy",
    "streamchar", "streamnum", "streamstr",
    "gestalt", "debugtrap", "getmemsize", "setmemsize", "jumpabs",
    "random", "setrandom", "quit", "verify",
    "restart", "save", "restore", "saveundo", "restoreundo", "protect",
    "glk", "getstringtbl", "setstringtbl", "getiosys", "setiosys",
    "linearsearch", "binarysearch", "linkedsearch",
    "callf", "callfi", "callfii", "callfiii",
    "streamunichar",
    "mzero", "mcopy", "malloc", "mfree",
    "accelfunc", "accelparam",
    "numtof", "ftonumz", "ftonumn", "ceil", "floor",
    "fadd", "fsub", "fmul", "fdiv", "fmod",
    "sqrt", "exp", "log", "pow",
    "sin", "cos", "tan", "asin", "acos", "atan", "atan2",
    "jfeq", "jfne", "jflt", "jfle", "jfgt", "jfge", "jisnan", "jisinf",
    "hasundo", "discardundo",
    ""
};
475 keyword_group opcode_macros =
477 OPCODE_MACRO_TT, FALSE, TRUE
480 static char *opmacro_list_z[] = { "" };
482 static char *opmacro_list_g[] = {
487 keyword_group directives =
488 { { "abbreviate", "array", "attribute", "class", "constant",
489 "default", "dictionary", "end", "endif", "extend", "fake_action",
490 "global", "ifdef", "ifndef", "ifnot", "ifv3", "ifv5", "iftrue",
491 "iffalse", "import", "include", "link", "lowstring", "message",
492 "nearby", "object", "origsource", "property", "release", "replace",
493 "serial", "switches", "statusline", "stub", "system_file", "trace",
494 "undef", "verb", "version", "zcharacter",
496 DIRECTIVE_TT, FALSE, FALSE
499 keyword_group trace_keywords =
500 { { "dictionary", "symbols", "objects", "verbs",
501 "assembly", "expressions", "lines", "tokens", "linker",
503 TRACE_KEYWORD_TT, FALSE, TRUE
506 keyword_group segment_markers =
507 { { "class", "has", "private", "with", "" },
508 SEGMENT_MARKER_TT, FALSE, TRUE
511 keyword_group directive_keywords =
512 { { "alias", "long", "additive",
514 "noun", "held", "multi", "multiheld", "multiexcept",
515 "multiinside", "creature", "special", "number", "scope", "topic",
516 "reverse", "meta", "only", "replace", "first", "last",
517 "string", "table", "buffer", "data", "initial", "initstr",
518 "with", "private", "has", "class",
519 "error", "fatalerror", "warning",
520 "terminating", "static", "individual",
522 DIR_KEYWORD_TT, FALSE, TRUE
525 keyword_group misc_keywords =
526 { { "char", "name", "the", "a", "an", "The", "number",
527 "roman", "reverse", "bold", "underline", "fixed", "on", "off",
528 "to", "address", "string", "object", "near", "from", "property", "A", "" },
529 MISC_KEYWORD_TT, FALSE, TRUE
532 keyword_group statements =
533 { { "box", "break", "continue", "default", "do", "else", "font", "for",
534 "give", "if", "inversion", "jump", "move", "new_line", "objectloop",
535 "print", "print_ret", "quit", "read", "remove", "restore", "return",
536 "rfalse", "rtrue", "save", "spaces", "string", "style", "switch",
537 "until", "while", "" },
538 STATEMENT_TT, FALSE, TRUE
541 keyword_group conditions =
542 { { "has", "hasnt", "in", "notin", "ofclass", "or", "provides", "" },
546 keyword_group system_functions =
547 { { "child", "children", "elder", "eldest", "indirect", "parent", "random",
548 "sibling", "younger", "youngest", "metaclass", "glk", "" },
549 SYSFUN_TT, FALSE, TRUE
552 keyword_group system_constants =
553 { { "adjectives_table", "actions_table", "classes_table",
554 "identifiers_table", "preactions_table", "version_number",
555 "largest_object", "strings_offset", "code_offset",
556 "dict_par1", "dict_par2", "dict_par3", "actual_largest_object",
557 "static_memory_offset", "array_names_offset", "readable_memory_offset",
558 "cpv__start", "cpv__end", "ipv__start", "ipv__end",
559 "array__start", "array__end",
560 "lowest_attribute_number", "highest_attribute_number",
561 "attribute_names_array",
562 "lowest_property_number", "highest_property_number",
563 "property_names_array",
564 "lowest_action_number", "highest_action_number",
565 "action_names_array",
566 "lowest_fake_action_number", "highest_fake_action_number",
567 "fake_action_names_array",
568 "lowest_routine_number", "highest_routine_number", "routines_array",
569 "routine_names_array", "routine_flags_array",
570 "lowest_global_number", "highest_global_number", "globals_array",
571 "global_names_array", "global_flags_array",
572 "lowest_array_number", "highest_array_number", "arrays_array",
573 "array_names_array", "array_flags_array",
574 "lowest_constant_number", "highest_constant_number", "constants_array",
575 "constant_names_array",
576 "lowest_class_number", "highest_class_number", "class_objects_array",
577 "lowest_object_number", "highest_object_number",
579 "grammar_table", "dictionary_table", "dynam_string_table",
581 SYSTEM_CONSTANT_TT, FALSE, TRUE
584 keyword_group *keyword_groups[12]
585 = { NULL, &opcode_names, &directives, &trace_keywords, &segment_markers,
586 &directive_keywords, &misc_keywords, &statements, &conditions,
587 &system_functions, &system_constants, &opcode_macros};
589 /* These keywords are set to point to local_variable_names entries when
590 a routine header is parsed. See construct_local_variable_tables(). */
591 keyword_group local_variables =
593 LOCAL_VARIABLE_TT, FALSE, FALSE
596 static int lexical_context(void)
598 /* The lexical context is a number representing all of the context
599 information in the lexical analyser: the same input text will
600 always translate to the same output tokens whenever the context
603 In fact, for efficiency reasons this number omits the bit of
604 information held in the variable "dont_enter_into_symbol_table".
605 Inform never needs to backtrack through tokens parsed in that
606 way (thankfully, as it would be expensive indeed to check
610 if (opcode_names.enabled) c |= 1;
611 if (directives.enabled) c |= 2;
612 if (trace_keywords.enabled) c |= 4;
613 if (segment_markers.enabled) c |= 8;
614 if (directive_keywords.enabled) c |= 16;
615 if (misc_keywords.enabled) c |= 32;
616 if (statements.enabled) c |= 64;
617 if (conditions.enabled) c |= 128;
618 if (system_functions.enabled) c |= 256;
619 if (system_constants.enabled) c |= 512;
620 if (local_variables.enabled) c |= 1024;
622 if (return_sp_as_variable) c |= 2048;
/* Print a human-readable decoding of a lexical-context bitmap (for
   tracing purposes). */
static void print_context(int c)
{
    if ((c & 1) != 0) printf("OPC ");
    if ((c & 2) != 0) printf("DIR ");
    if ((c & 4) != 0) printf("TK ");
    if ((c & 8) != 0) printf("SEG ");
    if ((c & 16) != 0) printf("DK ");
    if ((c & 32) != 0) printf("MK ");
    if ((c & 64) != 0) printf("STA ");
    if ((c & 128) != 0) printf("CND ");
    if ((c & 256) != 0) printf("SFUN ");
    if ((c & 512) != 0) printf("SCON ");
    if ((c & 1024) != 0) printf("LV ");
    if ((c & 2048) != 0) printf("sp ");
}
/* Hash chains over the keyword groups (built by make_keywords_tables). */
static int *keywords_hash_table;
static int *keywords_hash_ends_table;
static int *keywords_data_table;

/* Hash data for the current routine's local variables. */
static int *local_variable_hash_table;
static int *local_variable_hash_codes;
649 /* Note that MAX_LOCAL_VARIABLES is the maximum number of local variables
650 for this VM, *including* "sp" (the stack pointer "local").
651 This used to be a memory setting. Now it is a constant: 16 for Z-code,
655 /* Names of local variables in the current routine.
656 This is allocated to MAX_LOCAL_VARIABLES-1. (The stack pointer "local"
657 is not included in this array.)
659 (This could be a memlist, growing as needed up to MAX_LOCAL_VARIABLES-1.
660 But right now we just allocate the max.)
662 identstruct *local_variable_names;
664 static char one_letter_locals[128];
666 static void make_keywords_tables(void)
668 char **oplist, **maclist;
671 oplist = opcode_list_z;
672 maclist = opmacro_list_z;
675 oplist = opcode_list_g;
676 maclist = opmacro_list_g;
679 for (j=0; *(oplist[j]); j++) {
680 if (j >= MAX_KEYWORD_GROUP_SIZE) {
681 /* Gotta increase MAX_KEYWORD_GROUP_SIZE */
682 compiler_error("opcode_list has overflowed opcode_names.keywords");
685 opcode_names.keywords[j] = oplist[j];
687 opcode_names.keywords[j] = "";
689 for (j=0; *(maclist[j]); j++) {
690 if (j >= MAX_KEYWORD_GROUP_SIZE) {
691 /* Gotta increase MAX_KEYWORD_GROUP_SIZE */
692 compiler_error("opmacro_list has overflowed opcode_macros.keywords");
695 opcode_macros.keywords[j] = maclist[j];
697 opcode_macros.keywords[j] = "";
699 for (i=0; i<HASH_TAB_SIZE; i++)
700 { keywords_hash_table[i] = -1;
701 keywords_hash_ends_table[i] = -1;
704 for (i=1; i<=11; i++)
705 { keyword_group *kg = keyword_groups[i];
706 for (j=0; *(kg->keywords[j]) != 0; j++)
708 if (tp >= MAX_KEYWORDS) {
709 /* Gotta increase MAX_KEYWORDS */
710 compiler_error("keywords_data_table has overflowed MAX_KEYWORDS");
713 h = hash_code_from_string(kg->keywords[j]);
714 if (keywords_hash_table[h] == -1)
715 keywords_hash_table[h] = tp;
717 *(keywords_data_table + 3*(keywords_hash_ends_table[h]) + 2) = tp;
718 keywords_hash_ends_table[h] = tp;
719 *(keywords_data_table + 3*tp) = i;
720 *(keywords_data_table + 3*tp+1) = j;
721 *(keywords_data_table + 3*tp+2) = -1;
727 /* Look at the strings stored in local_variable_names (from 0 to no_locals).
728 Set local_variables.keywords to point to these, and also prepare the
730 extern void construct_local_variable_tables(void)
732 for (i=0; i<HASH_TAB_SIZE; i++) local_variable_hash_table[i] = -1;
733 for (i=0; i<128; i++) one_letter_locals[i] = MAX_LOCAL_VARIABLES;
735 for (i=0; i<no_locals; i++)
737 char *p = local_variable_names[i].text;
738 local_variables.keywords[i] = p;
740 { one_letter_locals[(uchar)p[0]] = i;
741 if (isupper(p[0])) one_letter_locals[tolower(p[0])] = i;
742 if (islower(p[0])) one_letter_locals[toupper(p[0])] = i;
744 h = hash_code_from_string(p);
745 if (local_variable_hash_table[h] == -1)
746 local_variable_hash_table[h] = i;
747 local_variable_hash_codes[i] = h;
749 /* Clear the rest. */
750 for (;i<MAX_LOCAL_VARIABLES-1;i++) {
751 local_variables.keywords[i] = "";
752 local_variable_hash_codes[i] = 0;
756 static void interpret_identifier(char *p, int pos, int dirs_only_flag)
757 { int index, hashcode;
759 /* An identifier is either a keyword or a "symbol", a name which the
760 lexical analyser leaves to higher levels of Inform to understand. */
762 hashcode = hash_code_from_string(p);
764 if (dirs_only_flag) goto KeywordSearch;
766 /* If this is assembly language, perhaps it is "sp"? */
768 if (return_sp_as_variable && (p[0]=='s') && (p[1]=='p') && (p[2]==0))
769 { circle[pos].value = 0; circle[pos].type = LOCAL_VARIABLE_TT;
773 /* Test for local variables first, quite quickly. */
775 if (local_variables.enabled)
777 { index = one_letter_locals[(uchar)p[0]];
778 if (index<MAX_LOCAL_VARIABLES)
779 { circle[pos].type = LOCAL_VARIABLE_TT;
780 circle[pos].value = index+1;
784 index = local_variable_hash_table[hashcode];
786 { for (;index<no_locals;index++)
787 { if (hashcode == local_variable_hash_codes[index])
788 { if (strcmpcis(p, local_variable_names[index].text)==0)
789 { circle[pos].type = LOCAL_VARIABLE_TT;
790 circle[pos].value = index+1;
798 /* Now the bulk of the keywords. Note that the lexer doesn't recognise
799 the name of a system function which has been Replaced. */
802 index = keywords_hash_table[hashcode];
804 { int *i = keywords_data_table + 3*index;
805 keyword_group *kg = keyword_groups[*i];
806 if (((!dirs_only_flag) && (kg->enabled))
807 || (dirs_only_flag && (kg == &directives)))
808 { char *q = kg->keywords[*(i+1)];
809 if (((kg->case_sensitive) && (strcmp(p, q)==0))
810 || ((!(kg->case_sensitive)) && (strcmpcis(p, q)==0)))
811 { if ((kg != &system_functions)
812 || (system_function_usage[*(i+1)]!=2))
813 { circle[pos].type = kg->change_token_type;
814 circle[pos].value = *(i+1);
822 if (dirs_only_flag) return;
824 /* Search for the name; create it if necessary. */
826 circle[pos].value = symbol_index(p, hashcode);
827 circle[pos].type = SYMBOL_TT;
831 /* ------------------------------------------------------------------------- */
832 /* The tokeniser grid aids a rapid decision about the consequences of a */
833 /* character reached in the buffer. In effect it is an efficiently stored */
834 /* transition table using an algorithm similar to that of S. C. Johnson's */
835 /* "yacc" lexical analyser (see Aho, Sethi and Ullman, section 3.9). */
836 /* My thanks to Dilip Sequeira for suggesting this. */
838 /* tokeniser_grid[c] is (16*n + m) if c is the first character of */
839 /* separator numbers n, n+1, ..., n+m-1 */
840 /* or certain special values (QUOTE_CODE, etc) */
843 /* Since 1000/16 = 62, the code numbers below will need increasing if the */
844 /* number of separators supported exceeds 61. */
845 /* ------------------------------------------------------------------------- */
/* Per-character dispatch table (see the comment block above). */
static int tokeniser_grid[256];

#define QUOTE_CODE      1000
#define DQUOTE_CODE     1001
#define NULL_CODE       1002
#define SPACE_CODE      1003
#define NEGATIVE_CODE   1004
#define DIGIT_CODE      1005
#define RADIX_CODE      1006
#define KEYWORD_CODE    1007
#define EOF_CODE        1008
#define WHITESPACE_CODE 1009
#define COMMENT_CODE    1010
#define IDENTIFIER_CODE 1011
862 /* This list cannot safely be changed without also changing the header
863 separator #defines. The ordering is significant in that (i) all entries
864 beginning with the same character must be adjacent and (ii) that if
865 X is a an initial substring of Y then X must come before Y.
867 E.g. --> must occur before -- to prevent "-->0" being tokenised
868 wrongly as "--", ">", "0" rather than "-->", "0". */
870 static const char separators[NUMBER_SEPARATORS][4] =
871 { "->", "-->", "--", "-", "++", "+", "*", "/", "%",
872 "||", "|", "&&", "&", "~~",
873 "~=", "~", "==", "=", ">=", ">",
874 "<=", "<", "(", ")", ",",
875 ".&", ".#", "..&", "..#", "..", ".",
876 "::", ":", "@", ";", "[", "]", "{", "}",
878 "#a$", "#g$", "#n$", "#r$", "#w$", "##", "#"
881 static void make_tokeniser_grid(void)
883 /* Construct the grid to the specification above. */
887 for (i=0; i<256; i++) tokeniser_grid[i]=0;
889 for (i=0; i<NUMBER_SEPARATORS; i++)
890 { j=separators[i][0];
891 if (tokeniser_grid[j]==0)
892 tokeniser_grid[j]=i*16+1; else tokeniser_grid[j]++;
894 tokeniser_grid['\''] = QUOTE_CODE;
895 tokeniser_grid['\"'] = DQUOTE_CODE;
896 tokeniser_grid[0] = EOF_CODE;
897 tokeniser_grid[' '] = WHITESPACE_CODE;
898 tokeniser_grid['\n'] = WHITESPACE_CODE;
899 tokeniser_grid['$'] = RADIX_CODE;
900 tokeniser_grid['!'] = COMMENT_CODE;
902 tokeniser_grid['0'] = DIGIT_CODE;
903 tokeniser_grid['1'] = DIGIT_CODE;
904 tokeniser_grid['2'] = DIGIT_CODE;
905 tokeniser_grid['3'] = DIGIT_CODE;
906 tokeniser_grid['4'] = DIGIT_CODE;
907 tokeniser_grid['5'] = DIGIT_CODE;
908 tokeniser_grid['6'] = DIGIT_CODE;
909 tokeniser_grid['7'] = DIGIT_CODE;
910 tokeniser_grid['8'] = DIGIT_CODE;
911 tokeniser_grid['9'] = DIGIT_CODE;
913 tokeniser_grid['a'] = IDENTIFIER_CODE;
914 tokeniser_grid['b'] = IDENTIFIER_CODE;
915 tokeniser_grid['c'] = IDENTIFIER_CODE;
916 tokeniser_grid['d'] = IDENTIFIER_CODE;
917 tokeniser_grid['e'] = IDENTIFIER_CODE;
918 tokeniser_grid['f'] = IDENTIFIER_CODE;
919 tokeniser_grid['g'] = IDENTIFIER_CODE;
920 tokeniser_grid['h'] = IDENTIFIER_CODE;
921 tokeniser_grid['i'] = IDENTIFIER_CODE;
922 tokeniser_grid['j'] = IDENTIFIER_CODE;
923 tokeniser_grid['k'] = IDENTIFIER_CODE;
924 tokeniser_grid['l'] = IDENTIFIER_CODE;
925 tokeniser_grid['m'] = IDENTIFIER_CODE;
926 tokeniser_grid['n'] = IDENTIFIER_CODE;
927 tokeniser_grid['o'] = IDENTIFIER_CODE;
928 tokeniser_grid['p'] = IDENTIFIER_CODE;
929 tokeniser_grid['q'] = IDENTIFIER_CODE;
930 tokeniser_grid['r'] = IDENTIFIER_CODE;
931 tokeniser_grid['s'] = IDENTIFIER_CODE;
932 tokeniser_grid['t'] = IDENTIFIER_CODE;
933 tokeniser_grid['u'] = IDENTIFIER_CODE;
934 tokeniser_grid['v'] = IDENTIFIER_CODE;
935 tokeniser_grid['w'] = IDENTIFIER_CODE;
936 tokeniser_grid['x'] = IDENTIFIER_CODE;
937 tokeniser_grid['y'] = IDENTIFIER_CODE;
938 tokeniser_grid['z'] = IDENTIFIER_CODE;
940 tokeniser_grid['A'] = IDENTIFIER_CODE;
941 tokeniser_grid['B'] = IDENTIFIER_CODE;
942 tokeniser_grid['C'] = IDENTIFIER_CODE;
943 tokeniser_grid['D'] = IDENTIFIER_CODE;
944 tokeniser_grid['E'] = IDENTIFIER_CODE;
945 tokeniser_grid['F'] = IDENTIFIER_CODE;
946 tokeniser_grid['G'] = IDENTIFIER_CODE;
947 tokeniser_grid['H'] = IDENTIFIER_CODE;
948 tokeniser_grid['I'] = IDENTIFIER_CODE;
949 tokeniser_grid['J'] = IDENTIFIER_CODE;
950 tokeniser_grid['K'] = IDENTIFIER_CODE;
951 tokeniser_grid['L'] = IDENTIFIER_CODE;
952 tokeniser_grid['M'] = IDENTIFIER_CODE;
953 tokeniser_grid['N'] = IDENTIFIER_CODE;
954 tokeniser_grid['O'] = IDENTIFIER_CODE;
955 tokeniser_grid['P'] = IDENTIFIER_CODE;
956 tokeniser_grid['Q'] = IDENTIFIER_CODE;
957 tokeniser_grid['R'] = IDENTIFIER_CODE;
958 tokeniser_grid['S'] = IDENTIFIER_CODE;
959 tokeniser_grid['T'] = IDENTIFIER_CODE;
960 tokeniser_grid['U'] = IDENTIFIER_CODE;
961 tokeniser_grid['V'] = IDENTIFIER_CODE;
962 tokeniser_grid['W'] = IDENTIFIER_CODE;
963 tokeniser_grid['X'] = IDENTIFIER_CODE;
964 tokeniser_grid['Y'] = IDENTIFIER_CODE;
965 tokeniser_grid['Z'] = IDENTIFIER_CODE;
967 tokeniser_grid['_'] = IDENTIFIER_CODE;
970 /* ------------------------------------------------------------------------- */
971 /* Definition of a lexical block: a source file or a string containing */
972 /* text for lexical analysis; an independent source from the point of */
973 /* view of issuing error reports. */
974 /* ------------------------------------------------------------------------- */
976 typedef struct LexicalBlock_s
977 { char *filename; /* Full translated name */
978 int main_flag; /* TRUE if the main file
979 (the first one opened) */
980 int sys_flag; /* TRUE if a System_File */
981 int source_line; /* Line number count */
982 int line_start; /* Char number within file
983 where the current line
985 int chars_read; /* Char number of read pos */
986 int file_no; /* Or 255 if not from a
989 char *orig_source; /* From #origsource direct */
995 static LexicalBlock NoFileOpen =
996 { "<before compilation>", FALSE, FALSE, 0, 0, 0, 255, NULL, 0, 0, 0 };
998 static LexicalBlock MakingOutput =
999 { "<constructing output>", FALSE, FALSE, 0, 0, 0, 255, NULL, 0, 0, 0 };
1001 static LexicalBlock StringLB =
1002 { "<veneer routine>", FALSE, TRUE, 0, 0, 0, 255, NULL, 0, 0, 0 };
1004 static LexicalBlock *CurrentLB; /* The current lexical
1005 block of input text */
1007 extern void declare_systemfile(void)
1008 { CurrentLB->sys_flag = TRUE;
1011 extern int is_systemfile(void)
1012 { return ((CurrentLB->sys_flag)?1:0);
1015 extern void set_origsource_location(char *source, int32 line, int32 charnum)
1020 /* Clear the Origsource declaration. */
1021 CurrentLB->orig_file = 0;
1022 CurrentLB->orig_source = NULL;
1023 CurrentLB->orig_line = 0;
1024 CurrentLB->orig_char = 0;
1028 /* Get the file number for a new or existing InputFiles entry. */
1029 file_no = register_orig_sourcefile(source);
1031 CurrentLB->orig_file = file_no;
1032 CurrentLB->orig_source = InputFiles[file_no-1].filename;
1033 CurrentLB->orig_line = line;
1034 CurrentLB->orig_char = charnum;
1037 /* Error locations. */
1039 extern debug_location get_current_debug_location(void)
1040 { debug_location result;
1041 /* Assume that all input characters are one byte. */
1042 result.file_index = CurrentLB->file_no;
1043 result.beginning_byte_index = CurrentLB->chars_read - LOOKAHEAD_SIZE;
1044 result.end_byte_index = result.beginning_byte_index;
1045 result.beginning_line_number = CurrentLB->source_line;
1046 result.end_line_number = result.beginning_line_number;
1047 result.beginning_character_number =
1048 CurrentLB->chars_read - CurrentLB->line_start;
1049 result.end_character_number = result.beginning_character_number;
1050 result.orig_file_index = CurrentLB->orig_file;
1051 result.orig_beg_line_number = CurrentLB->orig_line;
1052 result.orig_beg_char_number = CurrentLB->orig_char;
1056 static debug_location ErrorReport_debug_location;
1058 extern void report_errors_at_current_line(void)
1059 { ErrorReport.line_number = CurrentLB->source_line;
1060 ErrorReport.file_number = CurrentLB->file_no;
1061 if (ErrorReport.file_number == 255)
1062 ErrorReport.file_number = -1;
1063 ErrorReport.source = CurrentLB->filename;
1064 ErrorReport.main_flag = CurrentLB->main_flag;
1065 if (debugfile_switch)
1066 ErrorReport_debug_location = get_current_debug_location();
1067 ErrorReport.orig_file = CurrentLB->orig_file;
1068 ErrorReport.orig_source = CurrentLB->orig_source;
1069 ErrorReport.orig_line = CurrentLB->orig_line;
1070 ErrorReport.orig_char = CurrentLB->orig_char;
1073 extern debug_location get_error_report_debug_location(void)
1074 { return ErrorReport_debug_location;
1077 extern int32 get_current_line_start(void)
1078 { return CurrentLB->line_start;
1081 brief_location blank_brief_location;
1083 extern brief_location get_brief_location(ErrorPosition *errpos)
1086 loc.file_index = errpos->file_number;
1087 loc.line_number = errpos->line_number;
1088 loc.orig_file_index = errpos->orig_file;
1089 loc.orig_line_number = errpos->orig_line;
1093 extern void export_brief_location(brief_location loc, ErrorPosition *errpos)
1095 if (loc.file_index != -1)
1096 { errpos->file_number = loc.file_index;
1097 errpos->line_number = loc.line_number;
1098 errpos->main_flag = (errpos->file_number == 1);
1099 errpos->orig_source = NULL;
1100 errpos->orig_file = loc.orig_file_index;
1101 errpos->orig_line = loc.orig_line_number;
1102 errpos->orig_char = 0;
1106 /* ------------------------------------------------------------------------- */
1107 /* Hash printing and line counting */
1108 /* ------------------------------------------------------------------------- */
1110 static void print_hash(void)
1112 /* Hash-printing is the practice of printing a # character every 100
1113 lines of source code (the -x switch), reassuring the user that
1114 progress is being made */
1116 if (no_hash_printed_yet)
1117 { printf("::"); no_hash_printed_yet = FALSE;
1119 printf("#"); hash_printed_since_newline = TRUE;
1122 /* On some systems, text output is buffered to a line at a time, and
1123 this would frustrate the point of hash-printing, so: */
1129 static void reached_new_line(void)
1131 /* Called to signal that a new line has been reached in the source code */
1133 forerrors_pointer = 0;
1135 CurrentLB->source_line++;
1136 CurrentLB->line_start = CurrentLB->chars_read;
1138 total_source_line_count++;
1140 if (total_source_line_count%100==0)
1141 { if (hash_switch) print_hash();
1143 SpinCursor(32); /* I.e., allow other tasks to run */
1148 if (total_source_line_count%((**g_pm_hndl).linespercheck) == 0)
1149 { ProcessEvents (&g_proc);
1153 abort_transcript_file();
1154 longjmp (g_fallback, 1);
1160 static void new_syntax_line(void)
1161 { if (source_to_analyse != NULL) forerrors_pointer = 0;
1162 report_errors_at_current_line();
1165 /* Return 10 raised to the expo power.
1167 * I'm avoiding the standard pow() function for a rather lame reason:
1168 * it's in the libmath (-lm) library, and I don't want to change the
1169 * build model for the compiler. So, this is implemented with a stupid
1170 * lookup table. It's faster than pow() for small values of expo.
1171 * Probably not as fast if expo is 200, but "$+1e200" is an overflow
1172 * anyway, so I don't expect that to be a problem.
1174 * (For some reason, frexp() and ldexp(), which are used later on, do
1175 * not require libmath to be linked in.)
1177 static double pow10_cheap(int expo)
1179 #define POW10_RANGE (8)
1180 static double powers[POW10_RANGE*2+1] = {
1181 0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1,
1183 10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, 10000000.0, 100000000.0
1189 for (; expo < -POW10_RANGE; expo += POW10_RANGE) {
1192 return res * powers[POW10_RANGE+expo];
1195 for (; expo > POW10_RANGE; expo -= POW10_RANGE) {
1196 res *= powers[POW10_RANGE*2];
1198 return res * powers[POW10_RANGE+expo];
1202 /* Return the IEEE-754 single-precision encoding of a floating-point
1205 * The number is provided in the pieces it was parsed in:
1206 * [+|-] intv "." fracv "e" [+|-]expo
1208 * If the magnitude is too large (beyond about 3.4e+38), this returns
1209 * an infinite value (0x7f800000 or 0xff800000). If the magnitude is too
1210 * small (below about 1e-45), this returns a zero value (0x00000000 or
1211 * 0x80000000). If any of the inputs are NaN, this returns NaN (but the
1212 * lexer should never do that).
1214 * Note that using a float constant does *not* set the uses_float_features
1215 * flag (which would cause the game file to be labelled 3.1.2). There's
1216 * no VM feature here, just an integer. Of course, any use of the float
1217 * *opcodes* will set the flag.
1219 * The math functions in this routine require #including <math.h>, but
1220 * they should not require linking the math library (-lm). At least,
1221 * they do not on OSX and Linux.
1223 static int32 construct_float(int signbit, double intv, double fracv, int expo)
1225 double absval = (intv + fracv) * pow10_cheap(expo);
1226 int32 sign = (signbit ? 0x80000000 : 0x0);
1230 if (isinf(absval)) {
1231 return sign | 0x7f800000; /* infinity */
1233 if (isnan(absval)) {
1234 return sign | 0x7fc00000;
1237 mant = frexp(absval, &expo);
1239 /* Normalize mantissa to be in the range [1.0, 2.0) */
1240 if (0.5 <= mant && mant < 1.0) {
1244 else if (mant == 0.0) {
1248 return sign | 0x7f800000; /* infinity */
1252 return sign | 0x7f800000; /* infinity */
1254 else if (expo < -126) {
1255 /* Denormalized (very small) number */
1256 mant = ldexp(mant, 126 + expo);
1259 else if (!(expo == 0 && mant == 0.0)) {
1261 mant -= 1.0; /* Get rid of leading 1 */
1264 mant *= 8388608.0; /* 2^23 */
1265 fbits = (int32)(mant + 0.5); /* round mant to nearest int */
1267 /* The carry propagated out of a string of 23 1 bits. */
1271 return sign | 0x7f800000; /* infinity */
1275 return (sign) | ((int32)(expo << 23)) | (fbits);
1278 /* ------------------------------------------------------------------------- */
1279 /* Characters are read via a "pipeline" of variables, allowing us to look */
1280 /* up to three characters ahead of the current position. */
1282 /* There are two possible sources: from the source files being loaded in, */
1283 /* and from a string inside Inform (which is where the code for veneer */
1284 /* routines comes from). Each source has its own get-next-character */
1286 /* ------------------------------------------------------------------------- */
1287 /* Source 1: from files */
1289 /* Note that file_load_chars(p, size) loads "size" bytes into buffer "p" */
1290 /* from the current input file. If the file runs out, then if it was */
1291 /* the last source file 4 EOF characters are placed in the buffer: if it */
1292 /* was only an Include file ending, then a '\n' character is placed there */
1293 /* (essentially to force termination of any comment line) followed by */
1294 /* three harmless spaces. */
1296 /* The routine returns the number of characters it has written, and note */
1297 /* that this conveniently ensures that all characters in the buffer come */
1298 /* from the same file. */
1299 /* ------------------------------------------------------------------------- */
1301 #define SOURCE_BUFFER_SIZE 4096 /* Typical disc block size */
1303 typedef struct Sourcefile_s
1304 { char *buffer; /* Input buffer */
1305 int read_pos; /* Read position in buffer */
1306 int size; /* Number of meaningful
1307 characters in buffer */
1308 int la, la2, la3; /* Three characters of
1309 lookahead pipeline */
1310 int file_no; /* Internal file number
1315 static Sourcefile *FileStack; /* Allocated to FileStack_max */
1316 static memory_list FileStack_memlist;
1317 static int FileStack_max; /* The highest value that File_sp has
1319 (Filestack entries to this depth have
1320 a buffer allocated) */
1322 static int File_sp; /* Current stack pointer */
1323 static Sourcefile *CF; /* Top entry on stack (always equal to
1324 FileStack[File_sp-1]) */
1326 static int last_input_file;
1328 /* Set CF and CurrentLB.
1329 This does not increment File_sp; the caller must do that. */
1330 static void begin_buffering_file(int i, int file_no)
1331 { int j, cnt; uchar *p;
1336 ensure_memory_list_available(&FileStack_memlist, i+1);
1337 while (i >= FileStack_max) {
1338 FileStack[FileStack_max++].buffer = my_malloc(SOURCE_BUFFER_SIZE+4, "source file buffer");
1341 p = (uchar *) FileStack[i].buffer;
1344 { FileStack[i-1].la = lookahead;
1345 FileStack[i-1].la2 = lookahead2;
1346 FileStack[i-1].la3 = lookahead3;
1349 FileStack[i].file_no = file_no;
1350 FileStack[i].size = file_load_chars(file_no,
1351 (char *) p, SOURCE_BUFFER_SIZE);
1352 /* If the file is shorter than SOURCE_BUFFER_SIZE, it's now closed already. We still need to set up the file entry though. */
1354 lookahead = source_to_iso_grid[p[0]];
1355 lookahead2 = source_to_iso_grid[p[1]];
1356 lookahead3 = source_to_iso_grid[p[2]];
1357 if (LOOKAHEAD_SIZE != 3)
1359 ("Lexer lookahead size does not match hard-coded lookahead code");
1360 FileStack[i].read_pos = LOOKAHEAD_SIZE;
1362 if (file_no==1) FileStack[i].LB.main_flag = TRUE;
1363 else FileStack[i].LB.main_flag = FALSE;
1364 FileStack[i].LB.sys_flag = FALSE;
1365 FileStack[i].LB.source_line = 1;
1366 FileStack[i].LB.line_start = LOOKAHEAD_SIZE;
1367 FileStack[i].LB.chars_read = LOOKAHEAD_SIZE;
1368 FileStack[i].LB.filename = InputFiles[file_no-1].filename;
1369 FileStack[i].LB.file_no = file_no;
1370 FileStack[i].LB.orig_source = NULL; FileStack[i].LB.orig_file = 0;
1371 FileStack[i].LB.orig_line = 0; FileStack[i].LB.orig_char = 0;
1373 InputFiles[file_no-1].initial_buffering = FALSE;
1375 CurrentLB = &(FileStack[i].LB);
1376 CF = &(FileStack[i]);
1378 /* Check for recursive inclusion */
1381 { if (!strcmp(FileStack[i].LB.filename, FileStack[j].LB.filename))
1385 warning_named("File included more than once",
1386 FileStack[j].LB.filename);
1389 static void create_char_pipeline(void)
1392 begin_buffering_file(File_sp++, 1);
1393 pipeline_made = TRUE;
1394 last_input_file = current_input_file;
1397 static int get_next_char_from_pipeline(void)
1400 while (last_input_file < current_input_file)
1402 /* An "Include" file must have opened since the last character
1403 was read. Perhaps more than one. We run forward through the
1404 list and add them to the include stack. But we ignore
1405 "Origsource" files (which were never actually opened for
1409 if (!InputFiles[last_input_file-1].is_input)
1412 begin_buffering_file(File_sp++, last_input_file);
1414 if (last_input_file != current_input_file)
1415 compiler_error("last_input_file did not match after Include");
1418 { lookahead = 0; lookahead2 = 0; lookahead3 = 0; return 0;
1421 if (CF->read_pos == CF->size)
1423 file_load_chars(CF->file_no, CF->buffer, SOURCE_BUFFER_SIZE);
1427 if (CF->read_pos == -(CF->size))
1428 { set_token_location(get_current_debug_location());
1431 { lookahead = 0; lookahead2 = 0; lookahead3 = 0; return 0;
1433 CF = &(FileStack[File_sp-1]);
1434 CurrentLB = &(FileStack[File_sp-1].LB);
1435 lookahead = CF->la; lookahead2 = CF->la2; lookahead3 = CF->la3;
1436 if (CF->read_pos == CF->size)
1438 file_load_chars(CF->file_no, CF->buffer, SOURCE_BUFFER_SIZE);
1441 set_token_location(get_current_debug_location());
1444 p = (uchar *) (CF->buffer);
1446 current = lookahead;
1447 lookahead = lookahead2;
1448 lookahead2 = lookahead3;
1449 lookahead3 = source_to_iso_grid[p[CF->read_pos++]];
1451 CurrentLB->chars_read++;
1452 if (forerrors_pointer < FORERRORS_SIZE-1)
1453 forerrors_buff[forerrors_pointer++] = current;
1454 if (current == '\n') reached_new_line();
1458 /* ------------------------------------------------------------------------- */
1459 /* Source 2: from a string */
1460 /* ------------------------------------------------------------------------- */
1462 static int source_to_analyse_pointer; /* Current read position */
1464 static int get_next_char_from_string(void)
1465 { uchar *p = (uchar *) source_to_analyse + source_to_analyse_pointer++;
1466 current = source_to_iso_grid[p[0]];
1468 if (current == 0) lookahead = 0;
1469 else lookahead = source_to_iso_grid[p[1]];
1470 if (lookahead == 0) lookahead2 = 0;
1471 else lookahead2 = source_to_iso_grid[p[2]];
1472 if (lookahead2 == 0) lookahead3 = 0;
1473 else lookahead3 = source_to_iso_grid[p[3]];
1475 CurrentLB->chars_read++;
1476 if (forerrors_pointer < FORERRORS_SIZE-1)
1477 forerrors_buff[forerrors_pointer++] = current;
1478 if (current == '\n') reached_new_line();
1482 /* ========================================================================= */
1483 /* The interface between the lexer and Inform's higher levels: */
1485 /* put_token_back() (effectively) move the read position */
1486 /* back by one token */
1488 /* get_next_token() copy the token at the current read */
1489 /* position into the triple */
1490 /* (token_type, token_value, token_text) */
1491 /* and move the read position forward */
1494 /* release_token_texts() discard all the tokens that have been */
1495 /* read in, except for put-back ones */
1497 /* restart_lexer(source, name) if source is NULL, initialise the lexer */
1498 /* to read from source files; */
1499 /* otherwise, to read from this string. */
1500 /* ------------------------------------------------------------------------- */
1502 extern void release_token_texts(void)
1504 /* This is called at the beginning of every (top-level) directive and
1505 every statement. It drops all token usage so that the lextexts
1506 array can be reused.
1508 Call this immediately before a get_next_token() call.
1510 This should *not* be called within parse_expression(). Expression
1511 code generation relies on token data being retained across the whole
1518 if (tokens_put_back == 0) {
1523 /* If any tokens have been put back, we have to preserve their text.
1524 Move their lextext usage to the head of the lextexts array. */
1526 for (ix=0; ix<tokens_put_back; ix++) {
1529 int pos = circle_position - tokens_put_back + 1 + ix;
1530 if (pos < 0) pos += CIRCLE_SIZE;
1532 oldpos = circle[pos].lextext;
1533 circle[pos].lextext = ix;
1534 /* Swap the entire lextext structure (including size) */
1535 temp = lextexts[ix];
1536 lextexts[ix] = lextexts[oldpos];
1537 lextexts[oldpos] = temp;
1539 cur_lextexts = tokens_put_back;
1542 extern void put_token_back(void)
1543 { tokens_put_back++;
1545 if (tokens_trace_level > 0)
1546 { if (tokens_trace_level == 1) printf("<- ");
1547 else printf("<-\n");
1550 /* The following error, of course, should never happen! */
1552 if (tokens_put_back == CIRCLE_SIZE)
1553 { compiler_error("The lexical analyser has collapsed because of a wrong \
1554 assumption inside Inform");
1560 /* The get_next_token() code reads characters into the current lextext,
1561 which is lextexts[lex_index]. It uses these routines to add and remove
1562 characters, reallocing when necessary.
1564 lex_pos is the current number of characters in the lextext. It is
1565 not necessarily null-terminated until get_next_token() completes.
1568 /* Add one character */
1569 static void lexaddc(char ch)
1571 if ((size_t)lex_pos >= lextexts[lex_index].size) {
1572 size_t newsize = lextexts[lex_index].size * 2;
1573 my_realloc(&lextexts[lex_index].text, lextexts[lex_index].size, newsize, "one lexeme text");
1574 lextexts[lex_index].size = newsize;
1576 lextexts[lex_index].text[lex_pos++] = ch;
1579 /* Remove the last character and ensure it's null-terminated */
1580 static void lexdelc(void)
1585 lextexts[lex_index].text[lex_pos] = 0;
1588 /* Return the last character */
1589 static char lexlastc(void)
1594 return lextexts[lex_index].text[lex_pos-1];
1597 /* Add a string of characters (including the null) */
1598 static void lexadds(char *str)
1607 extern void get_next_token(void)
1608 { int d, i, j, k, quoted_size, e, radix, context; int32 n; char *r;
1609 int returning_a_put_back_token = TRUE;
1611 context = lexical_context();
1613 if (tokens_put_back > 0)
1614 { i = circle_position - tokens_put_back + 1;
1615 if (i<0) i += CIRCLE_SIZE;
1617 if (context != circle[i].context)
1618 { j = circle[i].type;
1619 if ((j==0) || ((j>=100) && (j<200)))
1620 interpret_identifier(circle[i].text, i, FALSE);
1621 circle[i].context = context;
1625 returning_a_put_back_token = FALSE;
1627 if (circle_position == CIRCLE_SIZE-1) circle_position = 0;
1628 else circle_position++;
1630 lex_index = cur_lextexts++;
1631 if (lex_index >= no_lextexts) {
1632 /* fresh lextext block; must init it */
1633 no_lextexts = lex_index+1;
1634 ensure_memory_list_available(&lextexts_memlist, no_lextexts);
1635 lextexts[lex_index].size = MAX_IDENTIFIER_LENGTH + 1;
1636 lextexts[lex_index].text = my_malloc(lextexts[lex_index].size, "one lexeme text");
1639 lextexts[lex_index].text[0] = 0; /* start with an empty string */
1641 circle[circle_position].lextext = lex_index;
1642 circle[circle_position].text = NULL; /* will fill in later */
1643 circle[circle_position].value = 0;
1644 circle[circle_position].type = 0;
1645 circle[circle_position].context = context;
1648 d = (*get_next_char)();
1649 e = tokeniser_grid[d];
1651 if (next_token_begins_syntax_line)
1652 { if ((e != WHITESPACE_CODE) && (e != COMMENT_CODE))
1653 { new_syntax_line();
1654 next_token_begins_syntax_line = FALSE;
1658 circle[circle_position].location = get_current_debug_location();
1661 { case 0: char_error("Illegal character found in source:", d);
1662 goto StartTokenAgain;
1664 case WHITESPACE_CODE:
1665 while (tokeniser_grid[lookahead] == WHITESPACE_CODE)
1667 goto StartTokenAgain;
1670 while ((lookahead != '\n') && (lookahead != 0))
1672 goto StartTokenAgain;
1675 circle[circle_position].type = EOF_TT;
1676 lexadds("<end of file>");
1684 { n = n*radix + character_digit_value[d];
1686 } while ((character_digit_value[lookahead] < radix)
1687 && (d = (*get_next_char)(), TRUE));
1690 circle[circle_position].type = NUMBER_TT;
1691 circle[circle_position].value = n;
1695 { int expo=0; double intv=0, fracv=0;
1696 int expocount=0, intcount=0, fraccount=0;
1697 int signbit = (d == '-');
1699 while (character_digit_value[lookahead] < 10) {
1700 intv = 10.0*intv + character_digit_value[lookahead];
1705 if (lookahead == '.') {
1706 double fracpow = 1.0;
1709 while (character_digit_value[lookahead] < 10) {
1711 fracv = fracv + fracpow*character_digit_value[lookahead];
1717 if (lookahead == 'e' || lookahead == 'E') {
1721 if (lookahead == '+' || lookahead == '-') {
1722 exposign = (lookahead == '-');
1726 while (character_digit_value[lookahead] < 10) {
1727 expo = 10*expo + character_digit_value[lookahead];
1733 error("Floating-point literal must have digits after the 'e'");
1734 if (exposign) { expo = -expo; }
1736 if (intcount + fraccount == 0)
1737 error("Floating-point literal must have digits");
1738 n = construct_float(signbit, intv, fracv, expo);
1741 circle[circle_position].type = NUMBER_TT;
1742 circle[circle_position].value = n;
1743 if (!glulx_mode && dont_enter_into_symbol_table != -2) error("Floating-point literals are not available in Z-code");
1747 radix = 16; d = (*get_next_char)();
1748 if (d == '-' || d == '+') { goto FloatNumber; }
1749 if (d == '$') { d = (*get_next_char)(); radix = 2; }
1750 if (character_digit_value[d] >= radix)
1752 error("Binary number expected after '$$'");
1754 error("Hexadecimal number expected after '$'");
1758 case QUOTE_CODE: /* Single-quotes: scan a literal string */
1761 { e = d; d = (*get_next_char)(); lexaddc(d);
1762 if (quoted_size++==64)
1764 "Too much text for one pair of quotations '...' to hold");
1765 lexaddc('\''); break;
1767 if ((d == '\'') && (e != '@'))
1768 { if (quoted_size == 1)
1769 { d = (*get_next_char)(); lexaddc(d);
1771 error("No text between quotation marks ''");
1776 if (d==EOF) ebf_error("'\''", "end of file");
1778 circle[circle_position].type = SQ_TT;
1781 case DQUOTE_CODE: /* Double-quotes: scan a literal string */
1784 { d = (*get_next_char)(); lexaddc(d);
1787 while (lexlastc() == ' ') lex_pos--;
1788 if (lexlastc() != '^') lexaddc(' ');
1789 while ((lookahead != EOF) &&
1790 (tokeniser_grid[lookahead] == WHITESPACE_CODE))
1794 { int newline_passed = FALSE;
1796 while ((lookahead != EOF) &&
1797 (tokeniser_grid[lookahead] == WHITESPACE_CODE))
1798 if ((d = (*get_next_char)()) == '\n')
1799 newline_passed = TRUE;
1800 if (!newline_passed)
1802 chb[0] = '\"'; chb[1] = lookahead;
1803 chb[2] = '\"'; chb[3] = 0;
1804 ebf_error("empty rest of line after '\\' in string",
1808 } while ((d != EOF) && (d!='\"'));
1809 if (d==EOF) ebf_error("'\"'", "end of file");
1811 circle[circle_position].type = DQ_TT;
1814 case IDENTIFIER_CODE: /* Letter or underscore: an identifier */
1817 while ((n<=MAX_IDENTIFIER_LENGTH)
1818 && ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
1819 || (tokeniser_grid[lookahead] == DIGIT_CODE)))
1820 n++, lexaddc((*get_next_char)());
1824 if (n > MAX_IDENTIFIER_LENGTH)
1825 { char bad_length[100];
1827 "Name exceeds the maximum length of %d characters:",
1828 MAX_IDENTIFIER_LENGTH);
1829 error_named(bad_length, lextexts[lex_index].text);
1830 /* Eat any further extra characters in the identifier */
1831 while (((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
1832 || (tokeniser_grid[lookahead] == DIGIT_CODE)))
1834 /* Trim token so that it doesn't violate
1835 MAX_IDENTIFIER_LENGTH during error recovery */
1836 lextexts[lex_index].text[MAX_IDENTIFIER_LENGTH] = 0;
1839 if (dont_enter_into_symbol_table)
1840 { circle[circle_position].type = DQ_TT;
1841 circle[circle_position].value = 0;
1842 if (dont_enter_into_symbol_table == -2)
1843 interpret_identifier(lextexts[lex_index].text, circle_position, TRUE);
1847 interpret_identifier(lextexts[lex_index].text, circle_position, FALSE);
1852 /* The character is initial to at least one of the separators */
1854 for (j=e>>4, k=j+(e&0x0f); j<k; j++)
1855 { r = (char *) separators[j];
1859 goto SeparatorMatched;
1863 { if (*(r+1) == lookahead)
1865 lexaddc((*get_next_char)());
1867 goto SeparatorMatched;
1871 { if ((*(r+1) == lookahead) && (*(r+2) == lookahead2))
1873 lexaddc((*get_next_char)());
1874 lexaddc((*get_next_char)());
1876 goto SeparatorMatched;
1881 /* The following contingency never in fact arises with the
1882 current set of separators, but might in future */
1884 lexaddc(d); lexaddc(lookahead); lexaddc(lookahead2);
1886 error_named("Unrecognised combination in source:",
1887 lextexts[lex_index].text);
1888 goto StartTokenAgain;
1892 circle[circle_position].type = SEP_TT;
1893 circle[circle_position].value = j;
1895 { case SEMICOLON_SEP: break;
1896 case HASHNDOLLAR_SEP:
1897 case HASHWDOLLAR_SEP:
1898 if (tokeniser_grid[lookahead] == WHITESPACE_CODE)
1899 { error_named("Character expected after",
1900 lextexts[lex_index].text);
1904 lexaddc((*get_next_char)());
1905 while ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
1906 || (tokeniser_grid[lookahead] == DIGIT_CODE))
1907 lexaddc((*get_next_char)());
1910 case HASHADOLLAR_SEP:
1911 case HASHGDOLLAR_SEP:
1912 case HASHRDOLLAR_SEP:
1914 if (tokeniser_grid[lookahead] != IDENTIFIER_CODE)
1915 { error_named("Alphabetic character expected after",
1916 lextexts[lex_index].text);
1920 while ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
1921 || (tokeniser_grid[lookahead] == DIGIT_CODE))
1922 lexaddc((*get_next_char)());
1929 /* We can now assign the text pointer, since the lextext has finished reallocing. */
1930 circle[circle_position].text = lextexts[lex_index].text;
1931 i = circle_position;
1934 /* We've either parsed a new token or selected a put-back one.
1935 i is the circle-position of the token in question. Time to
1936 export the token data where higher-level code can find it. */
1937 token_value = circle[i].value;
1938 token_type = circle[i].type;
1939 token_text = circle[i].text;
1940 if (!returning_a_put_back_token)
1941 { set_token_location(circle[i].location);
1944 if (tokens_trace_level > 0)
1945 { if (tokens_trace_level == 1) {
1946 printf("'%s' ", circle[i].text);
1947 if (circle[i].type == EOF_TT) printf("\n");
1950 { printf("-> "); describe_token(&circle[i]);
1952 if (tokens_trace_level > 2) print_context(circle[i].context);
1958 static char veneer_error_title[64];
1960 extern void restart_lexer(char *lexical_source, char *name)
1962 circle_position = 0;
1963 for (i=0; i<CIRCLE_SIZE; i++)
1964 { circle[i].type = 0;
1965 circle[i].value = 0;
1966 circle[i].text = "(if this is ever visible, there is a bug)";
1967 circle[i].lextext = -1;
1968 circle[i].context = 0;
1972 /* But we don't touch no_lextexts; those allocated blocks can be reused */
1976 tokens_put_back = 0;
1977 forerrors_pointer = 0;
1978 dont_enter_into_symbol_table = FALSE;
1979 return_sp_as_variable = FALSE;
1980 next_token_begins_syntax_line = TRUE;
1982 source_to_analyse = lexical_source;
1984 if (source_to_analyse == NULL)
1985 { get_next_char = get_next_char_from_pipeline;
1986 if (!pipeline_made) create_char_pipeline();
1987 forerrors_buff[0] = 0; forerrors_pointer = 0;
1990 { get_next_char = get_next_char_from_string;
1991 source_to_analyse_pointer = 0;
1992 CurrentLB = &StringLB;
1993 sprintf(veneer_error_title, "<veneer routine '%s'>", name);
1994 StringLB.filename = veneer_error_title;
1996 CurrentLB->source_line = 1;
1997 CurrentLB->line_start = 0;
1998 CurrentLB->chars_read = 0;
2002 /* ========================================================================= */
2003 /* Data structure management routines */
2004 /* ------------------------------------------------------------------------- */
2006 extern void init_lexer_vars(void)
2019 blank_brief_location.file_index = -1;
2020 blank_brief_location.line_number = 0;
2021 blank_brief_location.orig_file_index = 0;
2022 blank_brief_location.orig_line_number = 0;
/* Reset lexer state before the prepass: zero the running source-line
   count, mark that no source file is currently open, and flush any
   error reports pending against the previous current line. */
2025 extern void lexer_begin_prepass(void)
2026 { total_source_line_count = 0;
2027 CurrentLB = &NoFileOpen;
2028 report_errors_at_current_line();
/* Begin a compilation pass: reset the progress-hash bookkeeping (see
   the flag comments at the top of the file), clear pipeline_made so
   the character pipeline is rebuilt, and restart the lexer reading
   from the real source files (NULL source). */
2031 extern void lexer_begin_pass(void)
2032 { no_hash_printed_yet = TRUE;
2033 hash_printed_since_newline = FALSE;
2035 pipeline_made = FALSE;
2037 restart_lexer(NULL, NULL);
/* End the pass: switch the current location to the "making output"
   pseudo-file and report any errors still pending against it. */
2040 extern void lexer_endpass(void)
2041 { CurrentLB = &MakingOutput;
2042 report_errors_at_current_line();
/* Allocate the lexer's dynamic data structures: the source-file stack,
   the lexeme-text list, the keyword and local-variable hash tables,
   the tokeniser grid and keyword tables, and the head of the linked
   list of token locations kept for the debugging-information file. */
2045 extern void lexer_allocate_arrays(void)
2047 initialise_memory_list(&FileStack_memlist,
2048 sizeof(Sourcefile), 4, (void**)&FileStack,
2049 "source file stack");
2052 initialise_memory_list(&lextexts_memlist,
2053 sizeof(lextext), 200, (void**)&lextexts,
2054 "lexeme texts list");
/* Keyword hashing: head and tail tables per hash bucket, plus three
   ints of linked-list data per keyword */
2057 keywords_hash_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
2058 "keyword hash table");
2059 keywords_hash_ends_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
2060 "keyword hash end table");
2061 keywords_data_table = my_calloc(sizeof(int), 3*MAX_KEYWORDS,
2062 "keyword hashing linked list");
/* NOTE(review): the names table is sized MAX_LOCAL_VARIABLES-1 while
   the hash-codes table is MAX_LOCAL_VARIABLES; presumably one "local"
   (the stack pointer?) needs no stored name -- confirm before changing */
2064 local_variable_names = my_calloc(sizeof(identstruct), MAX_LOCAL_VARIABLES-1,
2065 "text of local variable names");
2066 local_variable_hash_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
2067 "local variable hash table");
2068 local_variable_hash_codes = my_calloc(sizeof(int), MAX_LOCAL_VARIABLES,
2069 "local variable hash codes");
2071 make_tokeniser_grid();
2072 make_keywords_tables();
/* Start the debug-location list as a single zeroed, unreferenced entry
   which is both head and tail */
2074 first_token_locations =
2075 my_malloc(sizeof(debug_locations), "debug locations of recent tokens");
2076 first_token_locations->location.file_index = 0;
2077 first_token_locations->location.beginning_byte_index = 0;
2078 first_token_locations->location.end_byte_index = 0;
2079 first_token_locations->location.beginning_line_number = 0;
2080 first_token_locations->location.end_line_number = 0;
2081 first_token_locations->location.beginning_character_number = 0;
2082 first_token_locations->location.end_character_number = 0;
2083 first_token_locations->location.orig_file_index = 0;
2084 first_token_locations->location.orig_beg_line_number = 0;
2085 first_token_locations->location.orig_beg_char_number = 0;
2086 first_token_locations->next = NULL;
2087 first_token_locations->reference_count = 0;
2088 last_token_location = first_token_locations;
/* Free everything lexer_allocate_arrays() created.  The per-entry
   buffers inside the two memory lists (file buffers, lexeme texts)
   must be freed individually before the lists themselves are
   deallocated; finally all recorded token locations are released. */
2091 extern void lexer_free_arrays(void)
2096 for (ix=0; ix<FileStack_max; ix++) {
2097 my_free(&FileStack[ix].buffer, "source file buffer");
2099 deallocate_memory_list(&FileStack_memlist);
2101 for (ix=0; ix<no_lextexts; ix++) {
2102 my_free(&lextexts[ix].text, "one lexeme text");
2104 deallocate_memory_list(&lextexts_memlist);
2106 my_free(&keywords_hash_table, "keyword hash table");
2107 my_free(&keywords_hash_ends_table, "keyword hash end table");
2108 my_free(&keywords_data_table, "keyword hashing linked list");
2110 my_free(&local_variable_names, "text of local variable names");
2111 my_free(&local_variable_hash_table, "local variable hash table");
2112 my_free(&local_variable_hash_codes, "local variable hash codes");
/* NULL argument: release the entire token-location list, not just
   locations up to a given point -- TODO(review): confirm semantics */
2114 cleanup_token_locations(NULL);
2117 /* ========================================================================= */