/* ------------------------------------------------------------------------- */
/*   "lexer" : Lexical analyser                                              */
/*                                                                           */
/*   Part of Inform 6.35                                                     */
/*   copyright (c) Graham Nelson 1993 - 2021                                 */
/*                                                                           */
/*   Inform is free software: you can redistribute it and/or modify          */
/*   it under the terms of the GNU General Public License as published by    */
/*   the Free Software Foundation, either version 3 of the License, or       */
/*   (at your option) any later version.                                     */
/*                                                                           */
/*   Inform is distributed in the hope that it will be useful,               */
/*   but WITHOUT ANY WARRANTY; without even the implied warranty of          */
/*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the           */
/*   GNU General Public License for more details.                            */
/*                                                                           */
/*   You should have received a copy of the GNU General Public License       */
/*   along with Inform. If not, see https://gnu.org/licenses/                */
/* ------------------------------------------------------------------------- */
24 int total_source_line_count, /* Number of source lines so far */
26 no_hash_printed_yet, /* Have not yet printed the first # */
27 hash_printed_since_newline, /* A hash has been printed since the
28 most recent new-line was printed
29 (generally as a result of an error
30 message or the start of pass) */
31 dont_enter_into_symbol_table, /* Return names as text (with
32 token type DQ_TT, i.e., as if
33 they had double-quotes around)
34 and not as entries in the symbol
35 table, when TRUE. If -2, only the
36 keyword table is searched. */
37 return_sp_as_variable; /* When TRUE, the word "sp" denotes
38 the stack pointer variable
39 (used in assembly language only) */
40 int next_token_begins_syntax_line; /* When TRUE, start a new syntax
41 line (for error reporting, etc.)
42 on the source code line where
43 the next token appears */
45 int32 last_mapped_line; /* Last syntax line reported to debugging file */
/* ------------------------------------------------------------------------- */
/*   The lexer's output is a sequence of triples, each called a "token",     */
/*   representing one lexical unit (or "lexeme") each.  Instead of providing */
/*   "lookahead" (that is, always having available the next token after the  */
/*   current one, so that syntax analysers higher up in Inform can have      */
/*   advance knowledge of what is coming), the lexer instead has a system    */
/*   where tokens can be read in and then "put back again".                  */
/*   The meaning of the number (and to some extent the text) supplied with   */
/*   a token depends on its type: see "header.h" for the list of types.      */
/*   For example, the lexeme "$1e3" is understood by Inform as a hexadecimal */
/*   number, and translated to the token:                                    */
/*     type NUMBER_TT, value 483, text "$1e3"                                */
/* ------------------------------------------------------------------------- */
/*   These three variables are set to the current token on a call to         */
/*   get_next_token() (but are not changed by a call to put_token_back()).   */
/* ------------------------------------------------------------------------- */
/* ------------------------------------------------------------------------- */
/*   The next two variables are the head and tail of a singly linked list.   */
/*   The tail stores the portion most recently read from the current         */
/*   lexical block; its end values therefore describe the location of the    */
/*   current token, and are updated whenever the three variables above are   */
/*   via set_token_location(...).  Earlier vertices, if any, represent the   */
/*   regions of lexical blocks read beforehand, where new vertices are       */
/*   only introduced by interruptions like a file inclusion or an EOF.       */
/*   Vertices are deleted off of the front of the list once they are no      */
/*   longer referenced by pending debug information records.                 */
/* ------------------------------------------------------------------------- */
80 static debug_locations *first_token_locations;
81 static debug_locations *last_token_location;
83 extern debug_location get_token_location(void)
84 { debug_location result;
85 debug_location *location = &(last_token_location->location);
86 result.file_index = location->file_index;
87 result.beginning_byte_index = location->end_byte_index;
88 result.end_byte_index = location->end_byte_index;
89 result.beginning_line_number = location->end_line_number;
90 result.end_line_number = location->end_line_number;
91 result.beginning_character_number = location->end_character_number;
92 result.end_character_number = location->end_character_number;
93 result.orig_file_index = location->orig_file_index;
94 result.orig_beg_line_number = location->orig_beg_line_number;
95 result.orig_beg_char_number = location->orig_beg_char_number;
99 extern debug_locations get_token_locations(void)
100 { debug_locations result;
101 result.location = get_token_location();
103 result.reference_count = 0;
107 static void set_token_location(debug_location location)
108 { if (location.file_index == last_token_location->location.file_index)
109 { last_token_location->location.end_byte_index =
110 location.end_byte_index;
111 last_token_location->location.end_line_number =
112 location.end_line_number;
113 last_token_location->location.end_character_number =
114 location.end_character_number;
115 last_token_location->location.orig_file_index =
116 location.orig_file_index;
117 last_token_location->location.orig_beg_line_number =
118 location.orig_beg_line_number;
119 last_token_location->location.orig_beg_char_number =
120 location.orig_beg_char_number;
122 { debug_locations*successor =
124 (sizeof(debug_locations),
125 "debug locations of recent tokens");
126 successor->location = location;
127 successor->next = NULL;
128 successor->reference_count = 0;
129 last_token_location->next = successor;
130 last_token_location = successor;
134 extern debug_location_beginning get_token_location_beginning(void)
135 { debug_location_beginning result;
136 ++(last_token_location->reference_count);
137 result.head = last_token_location;
138 result.beginning_byte_index =
139 last_token_location->location.end_byte_index;
140 result.beginning_line_number =
141 last_token_location->location.end_line_number;
142 result.beginning_character_number =
143 last_token_location->location.end_character_number;
144 result.orig_file_index = last_token_location->location.orig_file_index;
145 result.orig_beg_line_number = last_token_location->location.orig_beg_line_number;
146 result.orig_beg_char_number = last_token_location->location.orig_beg_char_number;
151 static void cleanup_token_locations(debug_location_beginning*beginning)
152 { if (first_token_locations)
153 { while (first_token_locations &&
154 !first_token_locations->reference_count)
155 { debug_locations*moribund = first_token_locations;
156 first_token_locations = moribund->next;
157 my_free(&moribund, "debug locations of recent tokens");
159 (beginning->head == moribund || !first_token_locations))
161 ("Records needed by a debug_location_beginning are no "
162 "longer allocated, perhaps because of an invalid reuse "
163 "of this or an earlier beginning");
169 ("Attempt to use a debug_location_beginning when no token "
170 "locations are defined");
173 ("Attempt to clean up token locations when no token locations "
179 extern void discard_token_location(debug_location_beginning beginning)
180 { --(beginning.head->reference_count);
183 extern debug_locations get_token_location_end
184 (debug_location_beginning beginning)
185 { debug_locations result;
186 cleanup_token_locations(&beginning);
187 --(beginning.head->reference_count);
188 /* Sometimes we know what we'll read before we switch to the lexical block
189 where we'll read it. In such cases the beginning will be placed in the
190 prior block and last exactly zero bytes there. It's misleading to
191 include such ranges, so we gobble them. */
192 if (beginning.head->location.end_byte_index ==
193 beginning.beginning_byte_index &&
194 beginning.head->next)
195 { beginning.head = beginning.head->next;
196 result.location = beginning.head->location;
197 result.location.beginning_byte_index = 0;
198 result.location.beginning_line_number = 1;
199 result.location.beginning_character_number = 1;
201 { result.location = beginning.head->location;
202 result.location.beginning_byte_index =
203 beginning.beginning_byte_index;
204 result.location.beginning_line_number =
205 beginning.beginning_line_number;
206 result.location.beginning_character_number =
207 beginning.beginning_character_number;
210 result.location.orig_file_index =
211 beginning.orig_file_index;
212 result.location.orig_beg_line_number =
213 beginning.orig_beg_line_number;
214 result.location.orig_beg_char_number =
215 beginning.orig_beg_char_number;
217 result.next = beginning.head->next;
218 result.reference_count = 0;
/* ------------------------------------------------------------------------- */
/*   In order to be able to put tokens back efficiently, the lexer stores    */
/*   tokens in a "circle": the variable circle_position ranges between       */
/*   0 and CIRCLE_SIZE-1.  We only need a circle size as large as the        */
/*   maximum number of tokens ever put back at once, plus 1 (in effect, the  */
/*   maximum token lookahead ever needed in syntax analysis, plus 1).        */
/*                                                                           */
/*   Unlike some compilers, Inform does not have a context-free lexer: in    */
/*   fact it has 12288 different possible states.  However, the context only */
/*   affects the interpretation of "identifiers": lexemes beginning with a   */
/*   letter and containing up to 32 chars of alphanumeric and underscore     */
/*   chars.  (For example, "default" may refer to the directive or statement */
/*   of that name, and which token values are returned depends on the        */
/*   current lexical context.)                                               */
/*                                                                           */
/*   Along with each token, we also store the lexical context it was         */
/*   translated under; because if it is called for again, there may need     */
/*   to be a fresh interpretation of it if the context has changed.          */
/* ------------------------------------------------------------------------- */
242 #define CIRCLE_SIZE 6
244 /* (The worst case for token lookahead is distinguishing between an
245 old-style "objectloop (a in b)" and a new "objectloop (a in b ...)".) */
247 static int circle_position;
248 static token_data circle[CIRCLE_SIZE];
250 static int token_contexts[CIRCLE_SIZE];
/* ------------------------------------------------------------------------- */
/*   A complication, however, is that the text of some lexemes needs to be   */
/*   held in Inform's memory for much longer periods: for example, a         */
/*   dictionary word lexeme (like "'south'") must have its text preserved    */
/*   until the code generation time for the expression it occurs in, when    */
/*   the dictionary reference is actually made.  Code generation in general  */
/*   occurs as early as possible in Inform: pending some better method of    */
/*   garbage collection, we simply use a buffer so large that unless         */
/*   expressions spread across 10K of source code are found, there can be    */
/*   no problem.                                                             */
/* ------------------------------------------------------------------------- */
264 static char *lexeme_memory;
265 static char *lex_p; /* Current write position */
267 /* ------------------------------------------------------------------------- */
268 /* The lexer itself needs up to 3 characters of lookahead (it uses an */
269 /* LR(3) grammar to translate characters into tokens). */
270 /* ------------------------------------------------------------------------- */
272 #define LOOKAHEAD_SIZE 3
274 static int current, lookahead, /* The latest character read, and */
275 lookahead2, lookahead3; /* the three characters following it */
277 static int pipeline_made; /* Whether or not the pipeline of
278 characters has been constructed
281 static int (* get_next_char)(void); /* Routine for reading the stream of
282 characters: the lexer does not
283 need any "ungetc" routine for
284 putting them back again. End of
285 stream is signalled by returning
288 static char *source_to_analyse; /* The current lexical source:
289 NULL for "load from source files",
290 otherwise this points to a string
291 containing Inform code */
293 static int tokens_put_back; /* Count of the number of backward
294 moves made from the last-read
297 extern void describe_token(token_data t)
299 /* Many of the token types are not set in this file, but later on in
300 Inform's higher stages (for example, in the expression evaluator);
301 but this routine describes them all. */
307 /* The following token types occur in lexer output: */
309 case SYMBOL_TT: printf("symbol ");
310 describe_symbol(t.value);
312 case NUMBER_TT: printf("literal number %d", t.value);
314 case DQ_TT: printf("string \"%s\"", t.text);
316 case SQ_TT: printf("string '%s'", t.text);
318 case SEP_TT: printf("separator '%s'", t.text);
320 case EOF_TT: printf("end of file");
323 case STATEMENT_TT: printf("statement name '%s'", t.text);
325 case SEGMENT_MARKER_TT: printf("object segment marker '%s'", t.text);
327 case DIRECTIVE_TT: printf("directive name '%s'", t.text);
329 case CND_TT: printf("textual conditional '%s'", t.text);
331 case OPCODE_NAME_TT: printf("opcode name '%s'", t.text);
333 case SYSFUN_TT: printf("built-in function name '%s'", t.text);
335 case LOCAL_VARIABLE_TT: printf("local variable name '%s'", t.text);
337 case MISC_KEYWORD_TT: printf("statement keyword '%s'", t.text);
339 case DIR_KEYWORD_TT: printf("directive keyword '%s'", t.text);
341 case TRACE_KEYWORD_TT: printf("'trace' keyword '%s'", t.text);
343 case SYSTEM_CONSTANT_TT: printf("system constant name '%s'", t.text);
346 /* The remaining are etoken types, not set by the lexer */
348 case OP_TT: printf("operator '%s'",
349 operators[t.value].description);
351 case ENDEXP_TT: printf("end of expression");
353 case SUBOPEN_TT: printf("open bracket");
355 case SUBCLOSE_TT: printf("close bracket");
357 case LARGE_NUMBER_TT: printf("large number: '%s'=%d",t.text,t.value);
359 case SMALL_NUMBER_TT: printf("small number: '%s'=%d",t.text,t.value);
361 case VARIABLE_TT: printf("variable '%s'=%d", t.text, t.value);
363 case DICTWORD_TT: printf("dictionary word '%s'", t.text);
365 case ACTION_TT: printf("action name '%s'", t.text);
369 printf("** unknown token type %d, text='%s', value=%d **",
370 t.type, t.text, t.value);
/* ------------------------------------------------------------------------- */
/*   All but one of the 280 Inform keywords (118 of them opcode names used   */
/*   only by the assembler).  (The one left over is "sp", a keyword used in  */
/*   assembly language only.)                                                */
/*                                                                           */
/*   A "keyword group" is a set of keywords to be searched for.  If a match  */
/*   is made on an identifier, the token type becomes that given in the KG   */
/*   and the token value is its index in the KG.                             */
/*                                                                           */
/*   The keyword ordering must correspond with the appropriate #define's in  */
/*   "header.h" but is otherwise not significant.                            */
/* ------------------------------------------------------------------------- */
388 #define MAX_KEYWORDS 350
390 /* The values will be filled in at compile time, when we know
391 which opcode set to use. */
392 keyword_group opcode_names =
394 OPCODE_NAME_TT, FALSE, TRUE
397 static char *opcode_list_z[] = {
398 "je", "jl", "jg", "dec_chk", "inc_chk", "jin", "test", "or", "and",
399 "test_attr", "set_attr", "clear_attr", "store", "insert_obj", "loadw",
400 "loadb", "get_prop", "get_prop_addr", "get_next_prop", "add", "sub",
401 "mul", "div", "mod", "call", "storew", "storeb", "put_prop", "sread",
402 "print_char", "print_num", "random", "push", "pull", "split_window",
403 "set_window", "output_stream", "input_stream", "sound_effect", "jz",
404 "get_sibling", "get_child", "get_parent", "get_prop_len", "inc", "dec",
405 "print_addr", "remove_obj", "print_obj", "ret", "jump", "print_paddr",
406 "load", "not", "rtrue", "rfalse", "print", "print_ret", "nop", "save",
407 "restore", "restart", "ret_popped", "pop", "quit", "new_line",
408 "show_status", "verify", "call_2s", "call_vs", "aread", "call_vs2",
409 "erase_window", "erase_line", "set_cursor", "get_cursor",
410 "set_text_style", "buffer_mode", "read_char", "scan_table", "call_1s",
411 "call_2n", "set_colour", "throw", "call_vn", "call_vn2", "tokenise",
412 "encode_text", "copy_table", "print_table", "check_arg_count", "call_1n",
413 "catch", "piracy", "log_shift", "art_shift", "set_font", "save_undo",
414 "restore_undo", "draw_picture", "picture_data", "erase_picture",
415 "set_margins", "move_window", "window_size", "window_style",
416 "get_wind_prop", "scroll_window", "pop_stack", "read_mouse",
417 "mouse_window", "push_stack", "put_wind_prop", "print_form",
418 "make_menu", "picture_table", "print_unicode", "check_unicode",
422 static char *opcode_list_g[] = {
423 "nop", "add", "sub", "mul", "div", "mod", "neg", "bitand", "bitor",
424 "bitxor", "bitnot", "shiftl", "sshiftr", "ushiftr", "jump", "jz",
425 "jnz", "jeq", "jne", "jlt", "jge", "jgt", "jle",
426 "jltu", "jgeu", "jgtu", "jleu",
428 "catch", "throw", "tailcall",
429 "copy", "copys", "copyb", "sexs", "sexb", "aload",
430 "aloads", "aloadb", "aloadbit", "astore", "astores", "astoreb",
431 "astorebit", "stkcount", "stkpeek", "stkswap", "stkroll", "stkcopy",
432 "streamchar", "streamnum", "streamstr",
433 "gestalt", "debugtrap", "getmemsize", "setmemsize", "jumpabs",
434 "random", "setrandom", "quit", "verify",
435 "restart", "save", "restore", "saveundo", "restoreundo", "protect",
436 "glk", "getstringtbl", "setstringtbl", "getiosys", "setiosys",
437 "linearsearch", "binarysearch", "linkedsearch",
438 "callf", "callfi", "callfii", "callfiii",
440 "mzero", "mcopy", "malloc", "mfree",
441 "accelfunc", "accelparam",
442 "numtof", "ftonumz", "ftonumn", "ceil", "floor",
443 "fadd", "fsub", "fmul", "fdiv", "fmod",
444 "sqrt", "exp", "log", "pow",
445 "sin", "cos", "tan", "asin", "acos", "atan", "atan2",
446 "jfeq", "jfne", "jflt", "jfle", "jfgt", "jfge", "jisnan", "jisinf",
450 keyword_group opcode_macros =
452 OPCODE_MACRO_TT, FALSE, TRUE
455 static char *opmacro_list_z[] = { "" };
457 static char *opmacro_list_g[] = {
462 keyword_group directives =
463 { { "abbreviate", "array", "attribute", "class", "constant",
464 "default", "dictionary", "end", "endif", "extend", "fake_action",
465 "global", "ifdef", "ifndef", "ifnot", "ifv3", "ifv5", "iftrue",
466 "iffalse", "import", "include", "link", "lowstring", "message",
467 "nearby", "object", "origsource", "property", "release", "replace",
468 "serial", "switches", "statusline", "stub", "system_file", "trace",
469 "undef", "verb", "version", "zcharacter",
471 DIRECTIVE_TT, FALSE, FALSE
474 keyword_group trace_keywords =
475 { { "dictionary", "symbols", "objects", "verbs",
476 "assembly", "expressions", "lines", "tokens", "linker",
478 TRACE_KEYWORD_TT, FALSE, TRUE
481 keyword_group segment_markers =
482 { { "class", "has", "private", "with", "" },
483 SEGMENT_MARKER_TT, FALSE, TRUE
486 keyword_group directive_keywords =
487 { { "alias", "long", "additive",
489 "noun", "held", "multi", "multiheld", "multiexcept",
490 "multiinside", "creature", "special", "number", "scope", "topic",
491 "reverse", "meta", "only", "replace", "first", "last",
492 "string", "table", "buffer", "data", "initial", "initstr",
493 "with", "private", "has", "class",
494 "error", "fatalerror", "warning",
495 "terminating", "static",
497 DIR_KEYWORD_TT, FALSE, TRUE
500 keyword_group misc_keywords =
501 { { "char", "name", "the", "a", "an", "The", "number",
502 "roman", "reverse", "bold", "underline", "fixed", "on", "off",
503 "to", "address", "string", "object", "near", "from", "property", "A", "" },
504 MISC_KEYWORD_TT, FALSE, TRUE
507 keyword_group statements =
508 { { "box", "break", "continue", "default", "do", "else", "font", "for",
509 "give", "if", "inversion", "jump", "move", "new_line", "objectloop",
510 "print", "print_ret", "quit", "read", "remove", "restore", "return",
511 "rfalse", "rtrue", "save", "spaces", "string", "style", "switch",
512 "until", "while", "" },
513 STATEMENT_TT, FALSE, TRUE
516 keyword_group conditions =
517 { { "has", "hasnt", "in", "notin", "ofclass", "or", "provides", "" },
521 keyword_group system_functions =
522 { { "child", "children", "elder", "eldest", "indirect", "parent", "random",
523 "sibling", "younger", "youngest", "metaclass", "glk", "" },
524 SYSFUN_TT, FALSE, TRUE
527 keyword_group system_constants =
528 { { "adjectives_table", "actions_table", "classes_table",
529 "identifiers_table", "preactions_table", "version_number",
530 "largest_object", "strings_offset", "code_offset",
531 "dict_par1", "dict_par2", "dict_par3", "actual_largest_object",
532 "static_memory_offset", "array_names_offset", "readable_memory_offset",
533 "cpv__start", "cpv__end", "ipv__start", "ipv__end",
534 "array__start", "array__end",
535 "lowest_attribute_number", "highest_attribute_number",
536 "attribute_names_array",
537 "lowest_property_number", "highest_property_number",
538 "property_names_array",
539 "lowest_action_number", "highest_action_number",
540 "action_names_array",
541 "lowest_fake_action_number", "highest_fake_action_number",
542 "fake_action_names_array",
543 "lowest_routine_number", "highest_routine_number", "routines_array",
544 "routine_names_array", "routine_flags_array",
545 "lowest_global_number", "highest_global_number", "globals_array",
546 "global_names_array", "global_flags_array",
547 "lowest_array_number", "highest_array_number", "arrays_array",
548 "array_names_array", "array_flags_array",
549 "lowest_constant_number", "highest_constant_number", "constants_array",
550 "constant_names_array",
551 "lowest_class_number", "highest_class_number", "class_objects_array",
552 "lowest_object_number", "highest_object_number",
554 "grammar_table", "dictionary_table", "dynam_string_table",
556 SYSTEM_CONSTANT_TT, FALSE, TRUE
559 keyword_group *keyword_groups[12]
560 = { NULL, &opcode_names, &directives, &trace_keywords, &segment_markers,
561 &directive_keywords, &misc_keywords, &statements, &conditions,
562 &system_functions, &system_constants, &opcode_macros};
564 keyword_group local_variables =
565 { { "" }, /* Filled in when routine declared */
566 LOCAL_VARIABLE_TT, FALSE, FALSE
569 static int lexical_context(void)
571 /* The lexical context is a number representing all of the context
572 information in the lexical analyser: the same input text will
573 always translate to the same output tokens whenever the context
576 In fact, for efficiency reasons this number omits the bit of
577 information held in the variable "dont_enter_into_symbol_table".
578 Inform never needs to backtrack through tokens parsed in that
579 way (thankfully, as it would be expensive indeed to check
583 if (opcode_names.enabled) c |= 1;
584 if (directives.enabled) c |= 2;
585 if (trace_keywords.enabled) c |= 4;
586 if (segment_markers.enabled) c |= 8;
587 if (directive_keywords.enabled) c |= 16;
588 if (misc_keywords.enabled) c |= 32;
589 if (statements.enabled) c |= 64;
590 if (conditions.enabled) c |= 128;
591 if (system_functions.enabled) c |= 256;
592 if (system_constants.enabled) c |= 512;
593 if (local_variables.enabled) c |= 1024;
595 if (return_sp_as_variable) c |= 2048;
599 static void print_context(int c)
601 if ((c & 1) != 0) printf("OPC ");
602 if ((c & 2) != 0) printf("DIR ");
603 if ((c & 4) != 0) printf("TK ");
604 if ((c & 8) != 0) printf("SEG ");
605 if ((c & 16) != 0) printf("DK ");
606 if ((c & 32) != 0) printf("MK ");
607 if ((c & 64) != 0) printf("STA ");
608 if ((c & 128) != 0) printf("CND ");
609 if ((c & 256) != 0) printf("SFUN ");
610 if ((c & 512) != 0) printf("SCON ");
611 if ((c & 1024) != 0) printf("LV ");
612 if ((c & 2048) != 0) printf("sp ");
615 static int *keywords_hash_table;
616 static int *keywords_hash_ends_table;
617 static int *keywords_data_table;
619 static int *local_variable_hash_table;
620 static int *local_variable_hash_codes;
621 char **local_variable_texts;
622 static char *local_variable_text_table;
624 static char one_letter_locals[128];
626 static void make_keywords_tables(void)
628 char **oplist, **maclist;
631 oplist = opcode_list_z;
632 maclist = opmacro_list_z;
635 oplist = opcode_list_g;
636 maclist = opmacro_list_g;
639 for (j=0; *(oplist[j]); j++) {
640 opcode_names.keywords[j] = oplist[j];
642 opcode_names.keywords[j] = "";
644 for (j=0; *(maclist[j]); j++) {
645 opcode_macros.keywords[j] = maclist[j];
647 opcode_macros.keywords[j] = "";
649 for (i=0; i<HASH_TAB_SIZE; i++)
650 { keywords_hash_table[i] = -1;
651 keywords_hash_ends_table[i] = -1;
654 for (i=1; i<=11; i++)
655 { keyword_group *kg = keyword_groups[i];
656 for (j=0; *(kg->keywords[j]) != 0; j++)
657 { h = hash_code_from_string(kg->keywords[j]);
658 if (keywords_hash_table[h] == -1)
659 keywords_hash_table[h] = tp;
661 *(keywords_data_table + 3*(keywords_hash_ends_table[h]) + 2) = tp;
662 keywords_hash_ends_table[h] = tp;
663 *(keywords_data_table + 3*tp) = i;
664 *(keywords_data_table + 3*tp+1) = j;
665 *(keywords_data_table + 3*tp+2) = -1;
671 extern void construct_local_variable_tables(void)
672 { int i, h; char *p = local_variable_text_table;
673 for (i=0; i<HASH_TAB_SIZE; i++) local_variable_hash_table[i] = -1;
674 for (i=0; i<128; i++) one_letter_locals[i] = MAX_LOCAL_VARIABLES;
676 for (i=0; i<no_locals; i++)
677 { char *q = local_variables.keywords[i];
679 { one_letter_locals[(uchar)q[0]] = i;
680 if (isupper(q[0])) one_letter_locals[tolower(q[0])] = i;
681 if (islower(q[0])) one_letter_locals[toupper(q[0])] = i;
683 h = hash_code_from_string(q);
684 if (local_variable_hash_table[h] == -1)
685 local_variable_hash_table[h] = i;
686 local_variable_hash_codes[i] = h;
687 local_variable_texts[i] = p;
691 for (;i<MAX_LOCAL_VARIABLES-1;i++)
692 local_variable_texts[i] = "<no such local variable>";
695 static void interpret_identifier(int pos, int dirs_only_flag)
696 { int index, hashcode; char *p = circle[pos].text;
698 /* An identifier is either a keyword or a "symbol", a name which the
699 lexical analyser leaves to higher levels of Inform to understand. */
701 hashcode = hash_code_from_string(p);
703 if (dirs_only_flag) goto KeywordSearch;
705 /* If this is assembly language, perhaps it is "sp"? */
707 if (return_sp_as_variable && (p[0]=='s') && (p[1]=='p') && (p[2]==0))
708 { circle[pos].value = 0; circle[pos].type = LOCAL_VARIABLE_TT;
712 /* Test for local variables first, quite quickly. */
714 if (local_variables.enabled)
716 { index = one_letter_locals[(uchar)p[0]];
717 if (index<MAX_LOCAL_VARIABLES)
718 { circle[pos].type = LOCAL_VARIABLE_TT;
719 circle[pos].value = index+1;
723 index = local_variable_hash_table[hashcode];
725 { for (;index<no_locals;index++)
726 { if (hashcode == local_variable_hash_codes[index])
727 { if (strcmpcis(p, local_variable_texts[index])==0)
728 { circle[pos].type = LOCAL_VARIABLE_TT;
729 circle[pos].value = index+1;
737 /* Now the bulk of the keywords. Note that the lexer doesn't recognise
738 the name of a system function which has been Replaced. */
741 index = keywords_hash_table[hashcode];
743 { int *i = keywords_data_table + 3*index;
744 keyword_group *kg = keyword_groups[*i];
745 if (((!dirs_only_flag) && (kg->enabled))
746 || (dirs_only_flag && (kg == &directives)))
747 { char *q = kg->keywords[*(i+1)];
748 if (((kg->case_sensitive) && (strcmp(p, q)==0))
749 || ((!(kg->case_sensitive)) && (strcmpcis(p, q)==0)))
750 { if ((kg != &system_functions)
751 || (system_function_usage[*(i+1)]!=2))
752 { circle[pos].type = kg->change_token_type;
753 circle[pos].value = *(i+1);
761 if (dirs_only_flag) return;
763 /* Search for the name; create it if necessary. */
765 circle[pos].value = symbol_index(p, hashcode);
766 circle[pos].type = SYMBOL_TT;
/* ------------------------------------------------------------------------- */
/*   The tokeniser grid aids a rapid decision about the consequences of a    */
/*   character reached in the buffer.  In effect it is an efficiently stored */
/*   transition table using an algorithm similar to that of S. C. Johnson's  */
/*   "yacc" lexical analyser (see Aho, Sethi and Ullman, section 3.9).       */
/*   My thanks to Dilip Sequeira for suggesting this.                        */
/*                                                                           */
/*     tokeniser_grid[c]   is   (16*n + m)  if c is the first character of   */
/*                                          separator numbers n, n+1, ...,   */
/*                                          n+m-1                            */
/*                         or   certain special values (QUOTE_CODE, etc)     */
/*                         or   0 otherwise                                  */
/*                                                                           */
/*   Since 1000/16 = 62, the code numbers below will need increasing if the  */
/*   number of separators supported exceeds 61.                              */
/* ------------------------------------------------------------------------- */
786 static int tokeniser_grid[256];
788 #define QUOTE_CODE 1000
789 #define DQUOTE_CODE 1001
790 #define NULL_CODE 1002
791 #define SPACE_CODE 1003
792 #define NEGATIVE_CODE 1004
793 #define DIGIT_CODE 1005
794 #define RADIX_CODE 1006
795 #define KEYWORD_CODE 1007
796 #define EOF_CODE 1008
797 #define WHITESPACE_CODE 1009
798 #define COMMENT_CODE 1010
799 #define IDENTIFIER_CODE 1011
/* This list cannot safely be changed without also changing the header
   separator #defines.  The ordering is significant in that (i) all entries
   beginning with the same character must be adjacent and (ii) that if
   X is an initial substring of Y then X must come before Y.

   E.g. --> must occur before -- to prevent "-->0" being tokenised
   wrongly as "--", ">", "0" rather than "-->", "0". */
809 static const char separators[NUMBER_SEPARATORS][4] =
810 { "->", "-->", "--", "-", "++", "+", "*", "/", "%",
811 "||", "|", "&&", "&", "~~",
812 "~=", "~", "==", "=", ">=", ">",
813 "<=", "<", "(", ")", ",",
814 ".&", ".#", "..&", "..#", "..", ".",
815 "::", ":", "@", ";", "[", "]", "{", "}",
817 "#a$", "#g$", "#n$", "#r$", "#w$", "##", "#"
820 static void make_tokeniser_grid(void)
822 /* Construct the grid to the specification above. */
826 for (i=0; i<256; i++) tokeniser_grid[i]=0;
828 for (i=0; i<NUMBER_SEPARATORS; i++)
829 { j=separators[i][0];
830 if (tokeniser_grid[j]==0)
831 tokeniser_grid[j]=i*16+1; else tokeniser_grid[j]++;
833 tokeniser_grid['\''] = QUOTE_CODE;
834 tokeniser_grid['\"'] = DQUOTE_CODE;
835 tokeniser_grid[0] = EOF_CODE;
836 tokeniser_grid[' '] = WHITESPACE_CODE;
837 tokeniser_grid['\n'] = WHITESPACE_CODE;
838 tokeniser_grid['$'] = RADIX_CODE;
839 tokeniser_grid['!'] = COMMENT_CODE;
841 tokeniser_grid['0'] = DIGIT_CODE;
842 tokeniser_grid['1'] = DIGIT_CODE;
843 tokeniser_grid['2'] = DIGIT_CODE;
844 tokeniser_grid['3'] = DIGIT_CODE;
845 tokeniser_grid['4'] = DIGIT_CODE;
846 tokeniser_grid['5'] = DIGIT_CODE;
847 tokeniser_grid['6'] = DIGIT_CODE;
848 tokeniser_grid['7'] = DIGIT_CODE;
849 tokeniser_grid['8'] = DIGIT_CODE;
850 tokeniser_grid['9'] = DIGIT_CODE;
852 tokeniser_grid['a'] = IDENTIFIER_CODE;
853 tokeniser_grid['b'] = IDENTIFIER_CODE;
854 tokeniser_grid['c'] = IDENTIFIER_CODE;
855 tokeniser_grid['d'] = IDENTIFIER_CODE;
856 tokeniser_grid['e'] = IDENTIFIER_CODE;
857 tokeniser_grid['f'] = IDENTIFIER_CODE;
858 tokeniser_grid['g'] = IDENTIFIER_CODE;
859 tokeniser_grid['h'] = IDENTIFIER_CODE;
860 tokeniser_grid['i'] = IDENTIFIER_CODE;
861 tokeniser_grid['j'] = IDENTIFIER_CODE;
862 tokeniser_grid['k'] = IDENTIFIER_CODE;
863 tokeniser_grid['l'] = IDENTIFIER_CODE;
864 tokeniser_grid['m'] = IDENTIFIER_CODE;
865 tokeniser_grid['n'] = IDENTIFIER_CODE;
866 tokeniser_grid['o'] = IDENTIFIER_CODE;
867 tokeniser_grid['p'] = IDENTIFIER_CODE;
868 tokeniser_grid['q'] = IDENTIFIER_CODE;
869 tokeniser_grid['r'] = IDENTIFIER_CODE;
870 tokeniser_grid['s'] = IDENTIFIER_CODE;
871 tokeniser_grid['t'] = IDENTIFIER_CODE;
872 tokeniser_grid['u'] = IDENTIFIER_CODE;
873 tokeniser_grid['v'] = IDENTIFIER_CODE;
874 tokeniser_grid['w'] = IDENTIFIER_CODE;
875 tokeniser_grid['x'] = IDENTIFIER_CODE;
876 tokeniser_grid['y'] = IDENTIFIER_CODE;
877 tokeniser_grid['z'] = IDENTIFIER_CODE;
879 tokeniser_grid['A'] = IDENTIFIER_CODE;
880 tokeniser_grid['B'] = IDENTIFIER_CODE;
881 tokeniser_grid['C'] = IDENTIFIER_CODE;
882 tokeniser_grid['D'] = IDENTIFIER_CODE;
883 tokeniser_grid['E'] = IDENTIFIER_CODE;
884 tokeniser_grid['F'] = IDENTIFIER_CODE;
885 tokeniser_grid['G'] = IDENTIFIER_CODE;
886 tokeniser_grid['H'] = IDENTIFIER_CODE;
887 tokeniser_grid['I'] = IDENTIFIER_CODE;
888 tokeniser_grid['J'] = IDENTIFIER_CODE;
889 tokeniser_grid['K'] = IDENTIFIER_CODE;
890 tokeniser_grid['L'] = IDENTIFIER_CODE;
891 tokeniser_grid['M'] = IDENTIFIER_CODE;
892 tokeniser_grid['N'] = IDENTIFIER_CODE;
893 tokeniser_grid['O'] = IDENTIFIER_CODE;
894 tokeniser_grid['P'] = IDENTIFIER_CODE;
895 tokeniser_grid['Q'] = IDENTIFIER_CODE;
896 tokeniser_grid['R'] = IDENTIFIER_CODE;
897 tokeniser_grid['S'] = IDENTIFIER_CODE;
898 tokeniser_grid['T'] = IDENTIFIER_CODE;
899 tokeniser_grid['U'] = IDENTIFIER_CODE;
900 tokeniser_grid['V'] = IDENTIFIER_CODE;
901 tokeniser_grid['W'] = IDENTIFIER_CODE;
902 tokeniser_grid['X'] = IDENTIFIER_CODE;
903 tokeniser_grid['Y'] = IDENTIFIER_CODE;
904 tokeniser_grid['Z'] = IDENTIFIER_CODE;
906 tokeniser_grid['_'] = IDENTIFIER_CODE;
909 /* ------------------------------------------------------------------------- */
910 /* Definition of a lexical block: a source file or a string containing */
911 /* text for lexical analysis; an independent source from the point of */
912 /* view of issuing error reports. */
913 /* ------------------------------------------------------------------------- */
/* Per-source bookkeeping: one LexicalBlock per file (or internal string)
   being lexed, holding the name, line/char counters and any #Origsource
   override used when reporting errors.
   NOTE(review): this extract elides several lines (original numbering
   jumps), including some trailing fields and the closing "} LexicalBlock;". */
915 typedef struct LexicalBlock_s
916 { char *filename; /* Full translated name */
917 int main_flag; /* TRUE if the main file
918 (the first one opened) */
919 int sys_flag; /* TRUE if a System_File */
920 int source_line; /* Line number count */
921 int line_start; /* Char number within file
922 where the current line
924 int chars_read; /* Char number of read pos */
925 int file_no; /* Or 255 if not from a
928 char *orig_source; /* From #origsource direct */
/* Sentinel lexical blocks used when no real source file is current:
   before compilation starts, while output is being constructed, and
   while a veneer routine (an internal string) is being lexed.
   file_no 255 marks "not from a file" in all three. */
934 static LexicalBlock NoFileOpen =
935 { "<before compilation>", FALSE, FALSE, 0, 0, 0, 255, NULL, 0, 0, 0 };
937 static LexicalBlock MakingOutput =
938 { "<constructing output>", FALSE, FALSE, 0, 0, 0, 255, NULL, 0, 0, 0 };
940 static LexicalBlock StringLB =
941 { "<veneer routine>", FALSE, TRUE, 0, 0, 0, 255, NULL, 0, 0, 0 };
943 static LexicalBlock *CurrentLB; /* The current lexical
944 block of input text */
/* Mark the current lexical block as a System_File. */
946 extern void declare_systemfile(void)
947 { CurrentLB->sys_flag = TRUE;
/* Return 1 if the current lexical block is a System_File, else 0. */
950 extern int is_systemfile(void)
951 { return ((CurrentLB->sys_flag)?1:0);
/* Handle an Origsource directive: either clear the current block's
   #Origsource override, or register "source" in InputFiles and record
   the overriding file/line/char position.
   NOTE(review): the branch condition separating the clear path from the
   set path (original lines 955-966) is elided in this extract. */
954 extern void set_origsource_location(char *source, int32 line, int32 charnum)
959 /* Clear the Origsource declaration. */
960 CurrentLB->orig_file = 0;
961 CurrentLB->orig_source = NULL;
962 CurrentLB->orig_line = 0;
963 CurrentLB->orig_char = 0;
967 /* Get the file number for a new or existing InputFiles entry. */
968 file_no = register_orig_sourcefile(source);
970 CurrentLB->orig_file = file_no;
971 CurrentLB->orig_source = InputFiles[file_no-1].filename;
972 CurrentLB->orig_line = line;
973 CurrentLB->orig_char = charnum;
976 /* Error locations. */
/* Build a debug_location describing the current read position: a
   zero-length span (beginning == end for byte, line and character
   coordinates), plus any #Origsource override recorded in CurrentLB.
   The byte index is backed up by LOOKAHEAD_SIZE because the pipeline
   reads ahead of the token being reported. */
978 extern debug_location get_current_debug_location(void)
979 { debug_location result;
980 /* Assume that all input characters are one byte. */
981 result.file_index = CurrentLB->file_no;
982 result.beginning_byte_index = CurrentLB->chars_read - LOOKAHEAD_SIZE;
983 result.end_byte_index = result.beginning_byte_index;
984 result.beginning_line_number = CurrentLB->source_line;
985 result.end_line_number = result.beginning_line_number;
986 result.beginning_character_number =
987 CurrentLB->chars_read - CurrentLB->line_start;
988 result.end_character_number = result.beginning_character_number;
989 result.orig_file_index = CurrentLB->orig_file;
990 result.orig_beg_line_number = CurrentLB->orig_line;
991 result.orig_beg_char_number = CurrentLB->orig_char;
995 static debug_location ErrorReport_debug_location;
/* Copy the current lexical position into the global ErrorReport record
   so that subsequently-issued errors point at this line.  file_no 255
   ("not from a file") is translated to -1 for the report. */
997 extern void report_errors_at_current_line(void)
998 { ErrorReport.line_number = CurrentLB->source_line;
999 ErrorReport.file_number = CurrentLB->file_no;
1000 if (ErrorReport.file_number == 255)
1001 ErrorReport.file_number = -1;
1002 ErrorReport.source = CurrentLB->filename;
1003 ErrorReport.main_flag = CurrentLB->main_flag;
1004 if (debugfile_switch)
1005 ErrorReport_debug_location = get_current_debug_location();
1006 ErrorReport.orig_file = CurrentLB->orig_file;
1007 ErrorReport.orig_source = CurrentLB->orig_source;
1008 ErrorReport.orig_line = CurrentLB->orig_line;
1009 ErrorReport.orig_char = CurrentLB->orig_char;
/* Return the debug location captured at the last report_errors_at_current_line
   call (only updated when debugfile_switch is set). */
1012 extern debug_location get_error_report_debug_location(void)
1013 { return ErrorReport_debug_location;
/* Return the char offset within the current file at which the current
   source line began. */
1016 extern int32 get_current_line_start(void)
1017 { return CurrentLB->line_start;
1020 brief_location blank_brief_location;
/* Compress an ErrorPosition into a brief_location (file and line only,
   plus the #Origsource equivalents).
   NOTE(review): the local declaration of "loc" and the return statement
   (original lines 1023-1024, 1029-1030) are elided in this extract. */
1022 extern brief_location get_brief_location(ErrorPosition *errpos)
1025 loc.file_index = errpos->file_number;
1026 loc.line_number = errpos->line_number;
1027 loc.orig_file_index = errpos->orig_file;
1028 loc.orig_line_number = errpos->orig_line;
/* Expand a brief_location back into an ErrorPosition.  A file_index of
   -1 (the blank_brief_location sentinel) leaves errpos untouched; file
   number 1 is by convention the main source file.
   NOTE(review): orig_source cannot be recovered from a brief_location,
   so it is reset to NULL and orig_char to 0 here. */
1032 extern void export_brief_location(brief_location loc, ErrorPosition *errpos)
1034 if (loc.file_index != -1)
1035 { errpos->file_number = loc.file_index;
1036 errpos->line_number = loc.line_number;
1037 errpos->main_flag = (errpos->file_number == 1);
1038 errpos->orig_source = NULL;
1039 errpos->orig_file = loc.orig_file_index;
1040 errpos->orig_line = loc.orig_line_number;
1041 errpos->orig_char = 0;
1045 /* ------------------------------------------------------------------------- */
1046 /* Hash printing and line counting */
1047 /* ------------------------------------------------------------------------- */
/* Emit one progress character: "::" the first time (start of a pass),
   "#" thereafter; sets the flags used by error printing to know a hash
   is pending on the current output line. */
1049 static void print_hash(void)
1051 /* Hash-printing is the practice of printing a # character every 100
1052 lines of source code (the -x switch), reassuring the user that
1053 progress is being made */
1055 if (no_hash_printed_yet)
1056 { printf("::"); no_hash_printed_yet = FALSE;
1058 printf("#"); hash_printed_since_newline = TRUE;
1061 /* On some systems, text output is buffered to a line at a time, and
1062 this would frustrate the point of hash-printing, so: */
/* Advance all per-line counters when a '\n' is consumed: resets the
   "text of current line" buffer, bumps the per-block and global line
   counts, and every 100 lines prints a hash (-x) and/or yields to the
   host environment.
   NOTE(review): the elided lines around 1083-1094 appear to belong to a
   conditionally-compiled (classic-Mac) event-handling/abort path —
   confirm against the full source before editing. */
1068 static void reached_new_line(void)
1070 /* Called to signal that a new line has been reached in the source code */
1072 forerrors_pointer = 0;
1074 CurrentLB->source_line++;
1075 CurrentLB->line_start = CurrentLB->chars_read;
1077 total_source_line_count++;
1079 if (total_source_line_count%100==0)
1080 { if (hash_switch) print_hash();
1082 SpinCursor(32); /* I.e., allow other tasks to run */
1087 if (total_source_line_count%((**g_pm_hndl).linespercheck) == 0)
1088 { ProcessEvents (&g_proc);
1092 if (temporary_files_switch)
1093 remove_temp_files();
1095 my_free(&all_text,"transcription text");
1096 abort_transcript_file();
1097 longjmp (g_fallback, 1);
/* Begin a new syntax line for error reporting: clear the for-errors
   buffer (when lexing from a string) and re-anchor error positions at
   the current line. */
1103 static void new_syntax_line(void)
1104 { if (source_to_analyse != NULL) forerrors_pointer = 0;
1105 report_errors_at_current_line();
1108 /* Return 10 raised to the expo power.
1110 * I'm avoiding the standard pow() function for a rather lame reason:
1111 * it's in the libmath (-lm) library, and I don't want to change the
1112 * build model for the compiler. So, this is implemented with a stupid
1113 * lookup table. It's faster than pow() for small values of expo.
1114 * Probably not as fast if expo is 200, but "$+1e200" is an overflow
1115 * anyway, so I don't expect that to be a problem.
1117 * (For some reason, frexp() and ldexp(), which are used later on, do
1118 * not require libmath to be linked in.)
/* Return 10 raised to the power expo, via a lookup table covering
   10^-8 .. 10^8 and repeated scaling for exponents outside that range
   (see the long comment above for why pow() is avoided).
   NOTE(review): the loop bodies and the in-range early return (original
   lines 1127-1131, 1133-1137, 1140) are elided in this extract. */
1120 static double pow10_cheap(int expo)
1122 #define POW10_RANGE (8)
1123 static double powers[POW10_RANGE*2+1] = {
1124 0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1,
1126 10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, 10000000.0, 100000000.0
1132 for (; expo < -POW10_RANGE; expo += POW10_RANGE) {
1135 return res * powers[POW10_RANGE+expo];
1138 for (; expo > POW10_RANGE; expo -= POW10_RANGE) {
1139 res *= powers[POW10_RANGE*2];
1141 return res * powers[POW10_RANGE+expo];
1145 /* Return the IEEE-754 single-precision encoding of a floating-point
1146 * number. See http://www.psc.edu/general/software/packages/ieee/ieee.php
1147 * for an explanation.
1149 * The number is provided in the pieces it was parsed in:
1150 * [+|-] intv "." fracv "e" [+|-]expo
1152 * If the magnitude is too large (beyond about 3.4e+38), this returns
1153 * an infinite value (0x7f800000 or 0xff800000). If the magnitude is too
1154 * small (below about 1e-45), this returns a zero value (0x00000000 or
1155 * 0x80000000). If any of the inputs are NaN, this returns NaN (but the
1156 * lexer should never do that).
1158 * Note that using a float constant does *not* set the uses_float_features
1159 * flag (which would cause the game file to be labelled 3.1.2). There's
1160 * no VM feature here, just an integer. Of course, any use of the float
1161 * *opcodes* will set the flag.
1163 * The math functions in this routine require #including <math.h>, but
1164 * they should not require linking the math library (-lm). At least,
1165 * they do not on OSX and Linux.
/* Encode sign/integer-part/fraction/exponent as an IEEE-754 single-
   precision bit pattern (see the long comment above for overflow,
   underflow and NaN behaviour).  frexp() splits the value into
   mantissa*2^expo; the mantissa is then normalised, the exponent biased,
   and 23 bits of fraction rounded to nearest.
   NOTE(review): many interior lines (declarations of mant/fbits, the
   exponent-range checks, the rounding-carry handling, the bias
   arithmetic) are elided in this extract — do not infer the exact
   rounding path from what is visible. */
1167 static int32 construct_float(int signbit, double intv, double fracv, int expo)
1169 double absval = (intv + fracv) * pow10_cheap(expo);
1170 int32 sign = (signbit ? 0x80000000 : 0x0);
1174 if (isinf(absval)) {
1175 return sign | 0x7f800000; /* infinity */
1177 if (isnan(absval)) {
1178 return sign | 0x7fc00000;
1181 mant = frexp(absval, &expo);
1183 /* Normalize mantissa to be in the range [1.0, 2.0) */
1184 if (0.5 <= mant && mant < 1.0) {
1188 else if (mant == 0.0) {
1192 return sign | 0x7f800000; /* infinity */
1196 return sign | 0x7f800000; /* infinity */
1198 else if (expo < -126) {
1199 /* Denormalized (very small) number */
1200 mant = ldexp(mant, 126 + expo);
1203 else if (!(expo == 0 && mant == 0.0)) {
1205 mant -= 1.0; /* Get rid of leading 1 */
1208 mant *= 8388608.0; /* 2^23 */
1209 fbits = (int32)(mant + 0.5); /* round mant to nearest int */
1211 /* The carry propagated out of a string of 23 1 bits. */
1215 return sign | 0x7f800000; /* infinity */
1219 return (sign) | ((int32)(expo << 23)) | (fbits);
1222 /* ------------------------------------------------------------------------- */
1223 /* Characters are read via a "pipeline" of variables, allowing us to look */
1224 /* up to three characters ahead of the current position. */
1226 /* There are two possible sources: from the source files being loaded in, */
1227 /* and from a string inside Inform (which is where the code for veneer */
1228 /* routines comes from). Each source has its own get-next-character */
1230 /* ------------------------------------------------------------------------- */
1231 /* Source 1: from files */
1233 /* Note that file_load_chars(p, size) loads "size" bytes into buffer "p" */
1234 /* from the current input file. If the file runs out, then if it was */
1235 /* the last source file 4 EOF characters are placed in the buffer: if it */
1236 /* was only an Include file ending, then a '\n' character is placed there */
1237 /* (essentially to force termination of any comment line) followed by */
1238 /* three harmless spaces. */
1240 /* The routine returns the number of characters it has written, and note */
1241 /* that this conveniently ensures that all characters in the buffer come */
1242 /* from the same file. */
1243 /* ------------------------------------------------------------------------- */
1245 #define SOURCE_BUFFER_SIZE 4096 /* Typical disc block size */
/* One Sourcefile per open file on the inclusion stack: its disc buffer,
   read position, and the three-character lookahead saved while an
   included file is being read.
   NOTE(review): the struct's LB member and closing "} Sourcefile;"
   (original lines 1255-1257) are elided in this extract. */
1247 typedef struct Sourcefile_s
1248 { char *buffer; /* Input buffer */
1249 int read_pos; /* Read position in buffer */
1250 int size; /* Number of meaningful
1251 characters in buffer */
1252 int la, la2, la3; /* Three characters of
1253 lookahead pipeline */
1254 int file_no; /* Internal file number
/* The inclusion stack itself, its stack pointer, and the top entry. */
1259 static Sourcefile *FileStack;
1260 static int File_sp; /* Stack pointer */
1262 static Sourcefile *CF; /* Top entry on stack */
1264 static int last_input_file;
/* Push file "file_no" onto the inclusion stack at depth i: save the
   outer file's lookahead pipeline, load the first buffer-full, prime
   the three lookahead characters, initialise the file's LexicalBlock,
   make it current, and warn about duplicate inclusion. */
1266 static void begin_buffering_file(int i, int file_no)
1267 { int j, cnt; uchar *p;
1269 if (i >= MAX_INCLUSION_DEPTH)
1270 memoryerror("MAX_INCLUSION_DEPTH",MAX_INCLUSION_DEPTH);
1272 p = (uchar *) FileStack[i].buffer;
/* Save the enclosing file's lookahead pipeline so it can resume. */
1275 { FileStack[i-1].la = lookahead;
1276 FileStack[i-1].la2 = lookahead2;
1277 FileStack[i-1].la3 = lookahead3;
1280 FileStack[i].file_no = file_no;
1281 FileStack[i].size = file_load_chars(file_no,
1282 (char *) p, SOURCE_BUFFER_SIZE);
1283 lookahead = source_to_iso_grid[p[0]];
1284 lookahead2 = source_to_iso_grid[p[1]];
1285 lookahead3 = source_to_iso_grid[p[2]];
1286 if (LOOKAHEAD_SIZE != 3)
1288 ("Lexer lookahead size does not match hard-coded lookahead code");
1289 FileStack[i].read_pos = LOOKAHEAD_SIZE;
/* file_no 1 is by convention the main source file. */
1291 if (file_no==1) FileStack[i].LB.main_flag = TRUE;
1292 else FileStack[i].LB.main_flag = FALSE;
1293 FileStack[i].LB.sys_flag = FALSE;
1294 FileStack[i].LB.source_line = 1;
1295 FileStack[i].LB.line_start = LOOKAHEAD_SIZE;
1296 FileStack[i].LB.chars_read = LOOKAHEAD_SIZE;
1297 FileStack[i].LB.filename = InputFiles[file_no-1].filename;
1298 FileStack[i].LB.file_no = file_no;
1299 FileStack[i].LB.orig_source = NULL; FileStack[i].LB.orig_file = 0;
1300 FileStack[i].LB.orig_line = 0; FileStack[i].LB.orig_char = 0;
1302 CurrentLB = &(FileStack[i].LB);
1303 CF = &(FileStack[i]);
1305 /* Check for recursive inclusion */
1308 { if (!strcmp(FileStack[i].LB.filename, FileStack[j].LB.filename))
1312 warning_named("File included more than once",
1313 FileStack[j].LB.filename);
/* Start the character pipeline by pushing the main source file (file 1)
   onto the (empty) inclusion stack. */
1316 static void create_char_pipeline(void)
1319 begin_buffering_file(File_sp++, 1);
1320 pipeline_made = TRUE;
1321 last_input_file = current_input_file;
/* Advance the file-based pipeline by one character: open any Include
   files registered since the last read, refill or pop buffers at
   end-of-buffer/end-of-file, shift the three-character lookahead, log
   the character into forerrors_buff, and count lines.
   NOTE(review): several control-flow lines (loop/if framing, the "size
   negated" end-of-file convention handling around 1344-1359) are elided
   in this extract; the pop-to-enclosing-file path restores CF, CurrentLB
   and the saved lookahead from the stack. */
1324 static int get_next_char_from_pipeline(void)
1327 while (last_input_file < current_input_file)
1329 /* An "Include" file must have opened since the last character
1330 was read. Perhaps more than one. We run forward through the
1331 list and add them to the include stack. But we ignore
1332 "Origsource" files (which were never actually opened for
1336 if (!InputFiles[last_input_file-1].is_input)
1339 begin_buffering_file(File_sp++, last_input_file);
1341 if (last_input_file != current_input_file)
1342 compiler_error("last_input_file did not match after Include");
1345 { lookahead = 0; lookahead2 = 0; lookahead3 = 0; return 0;
1348 if (CF->read_pos == CF->size)
1350 file_load_chars(CF->file_no, CF->buffer, SOURCE_BUFFER_SIZE);
1354 if (CF->read_pos == -(CF->size))
1355 { set_token_location(get_current_debug_location());
1358 { lookahead = 0; lookahead2 = 0; lookahead3 = 0; return 0;
/* Pop back to the enclosing file and restore its pipeline state. */
1360 CF = &(FileStack[File_sp-1]);
1361 CurrentLB = &(FileStack[File_sp-1].LB);
1362 lookahead = CF->la; lookahead2 = CF->la2; lookahead3 = CF->la3;
1363 if (CF->read_pos == CF->size)
1365 file_load_chars(CF->file_no, CF->buffer, SOURCE_BUFFER_SIZE);
1368 set_token_location(get_current_debug_location());
1371 p = (uchar *) (CF->buffer);
/* Shift the lookahead pipeline forward by one character. */
1373 current = lookahead;
1374 lookahead = lookahead2;
1375 lookahead2 = lookahead3;
1376 lookahead3 = source_to_iso_grid[p[CF->read_pos++]];
1378 CurrentLB->chars_read++;
1379 if (forerrors_pointer < 511)
1380 forerrors_buff[forerrors_pointer++] = current;
1381 if (current == '\n') reached_new_line();
1385 /* ------------------------------------------------------------------------- */
1386 /* Source 2: from a string */
1387 /* ------------------------------------------------------------------------- */
1389 static int source_to_analyse_pointer; /* Current read position */
/* Advance the string-based pipeline by one character.  The lookahead
   variables are clamped to 0 at the terminating NUL so the lexer never
   reads past the end of source_to_analyse; the character is logged and
   line-counted exactly as in the file pipeline. */
1391 static int get_next_char_from_string(void)
1392 { uchar *p = (uchar *) source_to_analyse + source_to_analyse_pointer++;
1393 current = source_to_iso_grid[p[0]];
1395 if (current == 0) lookahead = 0;
1396 else lookahead = source_to_iso_grid[p[1]];
1397 if (lookahead == 0) lookahead2 = 0;
1398 else lookahead2 = source_to_iso_grid[p[2]];
1399 if (lookahead2 == 0) lookahead3 = 0;
1400 else lookahead3 = source_to_iso_grid[p[3]];
1402 CurrentLB->chars_read++;
1403 if (forerrors_pointer < 511)
1404 forerrors_buff[forerrors_pointer++] = current;
1405 if (current == '\n') reached_new_line();
1409 /* ========================================================================= */
1410 /* The interface between the lexer and Inform's higher levels: */
1412 /* put_token_back() (effectively) move the read position */
1413 /* back by one token */
1415 /* get_next_token() copy the token at the current read */
1416 /* position into the triple */
1417 /* (token_type, token_value, token_text) */
1418 /* and move the read position forward */
1421 /* restart_lexer(source, name) if source is NULL, initialise the lexer */
1422 /* to read from source files; */
1423 /* otherwise, to read from this string. */
1424 /* ------------------------------------------------------------------------- */
/* Move the read position back by one token (the token circle retains
   recent tokens, so this only bumps a counter).  Putting back an entire
   circle's worth of tokens is a compiler bug, not a user error. */
1426 extern void put_token_back(void)
1427 { tokens_put_back++;
1429 if (tokens_trace_level > 0)
1430 { if (tokens_trace_level == 1) printf("<- ");
1431 else printf("<-\n");
1434 /* The following error, of course, should never happen! */
1436 if (tokens_put_back == CIRCLE_SIZE)
1437 { compiler_error("The lexical analyser has collapsed because of a wrong \
1438 assumption inside Inform");
/* The lexer's main entry point: place the next token's (type, value,
   text) into the token circle and the token_* globals.  First replays
   any put-back tokens (re-interpreting identifiers if the lexical
   context changed); otherwise dispatches on tokeniser_grid[d] to scan
   whitespace/comments (skipped), numbers (decimal, $hex, $$binary,
   $+/-float), 'single'- and "double"-quoted strings, identifiers, and
   separators.
   NOTE(review): this extract elides many interior lines — the switch
   header, several loop/else framings, and chunks of the separator and
   trace logic.  Treat the visible lines as landmarks only. */
1444 extern void get_next_token(void)
1445 { int d, i, j, k, quoted_size, e, radix, context; int32 n; char *r;
1446 int returning_a_put_back_token = TRUE;
1448 context = lexical_context();
/* Replay a put-back token if any are pending. */
1450 if (tokens_put_back > 0)
1451 { i = circle_position - tokens_put_back + 1;
1452 if (i<0) i += CIRCLE_SIZE;
1454 if (context != token_contexts[i])
1455 { j = circle[i].type;
1456 if ((j==0) || ((j>=100) && (j<200)))
1457 interpret_identifier(i, FALSE);
1461 returning_a_put_back_token = FALSE;
1463 if (circle_position == CIRCLE_SIZE-1) circle_position = 0;
1464 else circle_position++;
/* Wrap the lexeme text pool before it can overflow. */
1466 if (lex_p > lexeme_memory + 4*MAX_QTEXT_SIZE)
1467 lex_p = lexeme_memory;
1469 circle[circle_position].text = lex_p;
1470 circle[circle_position].value = 0;
1474 d = (*get_next_char)();
1475 e = tokeniser_grid[d];
1477 if (next_token_begins_syntax_line)
1478 { if ((e != WHITESPACE_CODE) && (e != COMMENT_CODE))
1479 { new_syntax_line();
1480 next_token_begins_syntax_line = FALSE;
1484 circle[circle_position].location = get_current_debug_location();
1487 { case 0: char_error("Illegal character found in source:", d);
1488 goto StartTokenAgain;
1490 case WHITESPACE_CODE:
1491 while (tokeniser_grid[lookahead] == WHITESPACE_CODE)
1493 goto StartTokenAgain;
1496 while ((lookahead != '\n') && (lookahead != 0))
1498 goto StartTokenAgain;
1501 circle[circle_position].type = EOF_TT;
1502 strcpy(lex_p, "<end of file>");
1503 lex_p += strlen(lex_p) + 1;
/* Integer literal: accumulate digits in the current radix. */
1511 { n = n*radix + character_digit_value[d];
1513 } while ((character_digit_value[lookahead] < radix)
1514 && (d = (*get_next_char)(), TRUE));
1517 circle[circle_position].type = NUMBER_TT;
1518 circle[circle_position].value = n;
/* Float literal ($+1.5e3 etc.): parse sign, integer part, fraction
   and exponent, then encode with construct_float(). */
1522 { int expo=0; double intv=0, fracv=0;
1523 int expocount=0, intcount=0, fraccount=0;
1524 int signbit = (d == '-');
1526 while (character_digit_value[lookahead] < 10) {
1527 intv = 10.0*intv + character_digit_value[lookahead];
1529 *lex_p++ = lookahead;
1532 if (lookahead == '.') {
1533 double fracpow = 1.0;
1534 *lex_p++ = lookahead;
1536 while (character_digit_value[lookahead] < 10) {
1538 fracv = fracv + fracpow*character_digit_value[lookahead];
1540 *lex_p++ = lookahead;
1544 if (lookahead == 'e' || lookahead == 'E') {
1546 *lex_p++ = lookahead;
1548 if (lookahead == '+' || lookahead == '-') {
1549 exposign = (lookahead == '-');
1550 *lex_p++ = lookahead;
1553 while (character_digit_value[lookahead] < 10) {
1554 expo = 10*expo + character_digit_value[lookahead];
1556 *lex_p++ = lookahead;
1560 error("Floating-point literal must have digits after the 'e'");
1561 if (exposign) { expo = -expo; }
1563 if (intcount + fraccount == 0)
1564 error("Floating-point literal must have digits");
1565 n = construct_float(signbit, intv, fracv, expo);
1568 circle[circle_position].type = NUMBER_TT;
1569 circle[circle_position].value = n;
1570 if (!glulx_mode && dont_enter_into_symbol_table != -2) error("Floating-point literals are not available in Z-code");
/* RADIX_CODE: '$' hex, '$$' binary, '$'+sign float. */
1574 radix = 16; d = (*get_next_char)();
1575 if (d == '-' || d == '+') { goto FloatNumber; }
1576 if (d == '$') { d = (*get_next_char)(); radix = 2; }
1577 if (character_digit_value[d] >= radix)
1579 error("Binary number expected after '$$'");
1581 error("Hexadecimal number expected after '$'");
1585 case QUOTE_CODE: /* Single-quotes: scan a literal string */
1588 { e = d; d = (*get_next_char)(); *lex_p++ = d;
1589 if (quoted_size++==64)
1591 "Too much text for one pair of quotations '...' to hold");
1594 if ((d == '\'') && (e != '@'))
1595 { if (quoted_size == 1)
1596 { d = (*get_next_char)(); *lex_p++ = d;
1598 error("No text between quotation marks ''");
1603 if (d==EOF) ebf_error("'\''", "end of file");
1605 circle[circle_position].type = SQ_TT;
1608 case DQUOTE_CODE: /* Double-quotes: scan a literal string */
1611 { d = (*get_next_char)(); *lex_p++ = d;
1612 if (quoted_size++==MAX_QTEXT_SIZE)
1613 { memoryerror("MAX_QTEXT_SIZE", MAX_QTEXT_SIZE);
/* Line-wrap inside a string: collapse the newline and following
   whitespace to a single space (unless after '^'). */
1618 while (*(lex_p-1) == ' ') lex_p--;
1619 if (*(lex_p-1) != '^') *lex_p++ = ' ';
1620 while ((lookahead != EOF) &&
1621 (tokeniser_grid[lookahead] == WHITESPACE_CODE))
1625 { int newline_passed = FALSE;
1627 while ((lookahead != EOF) &&
1628 (tokeniser_grid[lookahead] == WHITESPACE_CODE))
1629 if ((d = (*get_next_char)()) == '\n')
1630 newline_passed = TRUE;
1631 if (!newline_passed)
1633 chb[0] = '\"'; chb[1] = lookahead;
1634 chb[2] = '\"'; chb[3] = 0;
1635 ebf_error("empty rest of line after '\\' in string",
1639 } while ((d != EOF) && (d!='\"'));
1640 if (d==EOF) ebf_error("'\"'", "end of file");
1642 circle[circle_position].type = DQ_TT;
1645 case IDENTIFIER_CODE: /* Letter or underscore: an identifier */
1648 while ((n<=MAX_IDENTIFIER_LENGTH)
1649 && ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
1650 || (tokeniser_grid[lookahead] == DIGIT_CODE)))
1651 n++, *lex_p++ = (*get_next_char)();
1655 if (n > MAX_IDENTIFIER_LENGTH)
1656 { char bad_length[100];
1658 "Name exceeds the maximum length of %d characters:",
1659 MAX_IDENTIFIER_LENGTH);
1660 error_named(bad_length, circle[circle_position].text);
1661 /* Trim token so that it doesn't violate
1662 MAX_IDENTIFIER_LENGTH during error recovery */
1663 circle[circle_position].text[MAX_IDENTIFIER_LENGTH] = 0;
1666 if (dont_enter_into_symbol_table)
1667 { circle[circle_position].type = DQ_TT;
1668 circle[circle_position].value = 0;
1669 if (dont_enter_into_symbol_table == -2)
1670 interpret_identifier(circle_position, TRUE);
1674 interpret_identifier(circle_position, FALSE);
1679 /* The character is initial to at least one of the separators */
/* tokeniser_grid packs (first-index << 4) | count for separators
   starting with this character; try 1-, 2- and 3-char matches. */
1681 for (j=e>>4, k=j+(e&0x0f); j<k; j++)
1682 { r = (char *) separators[j];
1684 { *lex_p++=d; *lex_p++=0;
1685 goto SeparatorMatched;
1689 { if (*(r+1) == lookahead)
1691 *lex_p++=(*get_next_char)();
1693 goto SeparatorMatched;
1697 { if ((*(r+1) == lookahead) && (*(r+2) == lookahead2))
1699 *lex_p++=(*get_next_char)();
1700 *lex_p++=(*get_next_char)();
1702 goto SeparatorMatched;
1707 /* The following contingency never in fact arises with the
1708 current set of separators, but might in future */
1710 *lex_p++ = d; *lex_p++ = lookahead; *lex_p++ = lookahead2;
1712 error_named("Unrecognised combination in source:", lex_p);
1713 goto StartTokenAgain;
1717 circle[circle_position].type = SEP_TT;
1718 circle[circle_position].value = j;
/* Some separators (#n$, #w$, #a$, #g$, #r$) absorb a following name. */
1720 { case SEMICOLON_SEP: break;
1721 case HASHNDOLLAR_SEP:
1722 case HASHWDOLLAR_SEP:
1723 if (tokeniser_grid[lookahead] == WHITESPACE_CODE)
1724 { error_named("Character expected after",
1725 circle[circle_position].text);
1729 *lex_p++ = (*get_next_char)();
1730 while ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
1731 || (tokeniser_grid[lookahead] == DIGIT_CODE))
1732 *lex_p++ = (*get_next_char)();
1735 case HASHADOLLAR_SEP:
1736 case HASHGDOLLAR_SEP:
1737 case HASHRDOLLAR_SEP:
1739 if (tokeniser_grid[lookahead] != IDENTIFIER_CODE)
1740 { error_named("Alphabetic character expected after",
1741 circle[circle_position].text);
1745 while ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
1746 || (tokeniser_grid[lookahead] == DIGIT_CODE))
1747 *lex_p++ = (*get_next_char)();
/* Publish the finished token to the token_* globals and trace. */
1754 i = circle_position;
1757 token_value = circle[i].value;
1758 token_type = circle[i].type;
1759 token_text = circle[i].text;
1760 if (!returning_a_put_back_token)
1761 { set_token_location(circle[i].location);
1763 token_contexts[i] = context;
1765 if (tokens_trace_level > 0)
1766 { if (tokens_trace_level == 1)
1767 printf("'%s' ", circle[i].text);
1769 { printf("-> "); describe_token(circle[i]);
1771 if (tokens_trace_level > 2) print_context(token_contexts[i]);
1777 static char veneer_error_title[64];
/* Reset the lexer for a fresh pass.  With lexical_source == NULL it
   reads from the source-file pipeline (creating it if necessary);
   otherwise it reads from the given string, labelled in error messages
   as the veneer routine "name".  Clears the token circle, lexeme pool,
   put-back count and mode flags. */
1779 extern void restart_lexer(char *lexical_source, char *name)
1781 circle_position = 0;
1782 for (i=0; i<CIRCLE_SIZE; i++)
1783 { circle[i].type = 0;
1784 circle[i].value = 0;
1785 circle[i].text = "(if this is ever visible, there is a bug)";
1786 token_contexts[i] = 0;
1789 lex_p = lexeme_memory;
1790 tokens_put_back = 0;
1791 forerrors_pointer = 0;
1792 dont_enter_into_symbol_table = FALSE;
1793 return_sp_as_variable = FALSE;
1794 next_token_begins_syntax_line = TRUE;
1796 source_to_analyse = lexical_source;
1798 if (source_to_analyse == NULL)
1799 { get_next_char = get_next_char_from_pipeline;
1800 if (!pipeline_made) create_char_pipeline();
1801 forerrors_buff[0] = 0; forerrors_pointer = 0;
1804 { get_next_char = get_next_char_from_string;
1805 source_to_analyse_pointer = 0;
1806 CurrentLB = &StringLB;
1807 sprintf(veneer_error_title, "<veneer routine '%s'>", name);
1808 StringLB.filename = veneer_error_title;
1810 CurrentLB->source_line = 1;
1811 CurrentLB->line_start = 0;
1812 CurrentLB->chars_read = 0;
1816 /* ========================================================================= */
1817 /* Data structure management routines */
1818 /* ------------------------------------------------------------------------- */
/* Initialise module globals: set blank_brief_location to its "no file"
   sentinel (file_index -1). */
1820 extern void init_lexer_vars(void)
1822 blank_brief_location.file_index = -1;
1823 blank_brief_location.line_number = 0;
1824 blank_brief_location.orig_file_index = 0;
1825 blank_brief_location.orig_line_number = 0;
/* Called before compilation proper: zero the line count and point error
   reporting at the "<before compilation>" sentinel block. */
1828 extern void lexer_begin_prepass(void)
1829 { total_source_line_count = 0;
1830 CurrentLB = &NoFileOpen;
1831 report_errors_at_current_line();
/* Called at the start of each pass: reset hash-printing state, discard
   any previous character pipeline, and restart the lexer on the source
   files. */
1834 extern void lexer_begin_pass(void)
1835 { no_hash_printed_yet = TRUE;
1836 hash_printed_since_newline = FALSE;
1838 pipeline_made = FALSE;
1840 restart_lexer(NULL, NULL);
/* Called at the end of a pass: point error reporting at the
   "<constructing output>" sentinel block. */
1843 extern void lexer_endpass(void)
1844 { CurrentLB = &MakingOutput;
1845 report_errors_at_current_line();
/* Allocate all lexer working storage (freed in lexer_free_arrays):
   the inclusion stack and its per-file buffers (+4 bytes for the EOF
   padding written by file_load_chars), the lexeme text pool, the
   keyword and local-variable hash tables, the tokeniser grid and
   keyword tables, and the head node of the debug-location list. */
1848 extern void lexer_allocate_arrays(void)
1851 FileStack = my_malloc(MAX_INCLUSION_DEPTH*sizeof(Sourcefile),
1852 "filestack buffer");
1854 for (i=0; i<MAX_INCLUSION_DEPTH; i++)
1855 FileStack[i].buffer = my_malloc(SOURCE_BUFFER_SIZE+4, "source file buffer");
1857 lexeme_memory = my_malloc(5*MAX_QTEXT_SIZE, "lexeme memory");
1859 keywords_hash_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
1860 "keyword hash table");
1861 keywords_hash_ends_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
1862 "keyword hash end table");
1863 keywords_data_table = my_calloc(sizeof(int), 3*MAX_KEYWORDS,
1864 "keyword hashing linked list");
1865 local_variable_hash_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
1866 "local variable hash table");
1867 local_variable_text_table = my_malloc(
1868 (MAX_LOCAL_VARIABLES-1)*(MAX_IDENTIFIER_LENGTH+1),
1869 "text of local variable names");
1871 local_variable_hash_codes = my_calloc(sizeof(int), MAX_LOCAL_VARIABLES,
1872 "local variable hash codes");
1873 local_variable_texts = my_calloc(sizeof(char *), MAX_LOCAL_VARIABLES,
1874 "local variable text pointers");
1876 make_tokeniser_grid();
1877 make_keywords_tables();
/* Head node of the reference-counted debug-location list. */
1879 first_token_locations =
1880 my_malloc(sizeof(debug_locations), "debug locations of recent tokens");
1881 first_token_locations->location.file_index = 0;
1882 first_token_locations->location.beginning_byte_index = 0;
1883 first_token_locations->location.end_byte_index = 0;
1884 first_token_locations->location.beginning_line_number = 0;
1885 first_token_locations->location.end_line_number = 0;
1886 first_token_locations->location.beginning_character_number = 0;
1887 first_token_locations->location.end_character_number = 0;
1888 first_token_locations->location.orig_file_index = 0;
1889 first_token_locations->location.orig_beg_line_number = 0;
1890 first_token_locations->location.orig_beg_char_number = 0;
1891 first_token_locations->next = NULL;
1892 first_token_locations->reference_count = 0;
1893 last_token_location = first_token_locations;
/* Release everything allocated by lexer_allocate_arrays, in the same
   order: per-file buffers, the inclusion stack, the lexeme pool, the
   keyword and local-variable tables, and the debug-location list. */
1896 extern void lexer_free_arrays(void)
1899 for (i=0; i<MAX_INCLUSION_DEPTH; i++)
1900 { p = FileStack[i].buffer;
1901 my_free(&p, "source file buffer");
1903 my_free(&FileStack, "filestack buffer");
1904 my_free(&lexeme_memory, "lexeme memory");
1906 my_free(&keywords_hash_table, "keyword hash table");
1907 my_free(&keywords_hash_ends_table, "keyword hash end table");
1908 my_free(&keywords_data_table, "keyword hashing linked list");
1909 my_free(&local_variable_hash_table, "local variable hash table");
1910 my_free(&local_variable_text_table, "text of local variable names");
1912 my_free(&local_variable_hash_codes, "local variable hash codes");
1913 my_free(&local_variable_texts, "local variable text pointers");
1915 cleanup_token_locations(NULL);
1918 /* ========================================================================= */