1 /* ------------------------------------------------------------------------- */
2 /* "lexer" : Lexical analyser */
4 /* Part of Inform 6.35 */
5 /* copyright (c) Graham Nelson 1993 - 2020 */
7 /* Inform is free software: you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation, either version 3 of the License, or */
10 /* (at your option) any later version. */
12 /* Inform is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
/* along with Inform. If not, see https://gnu.org/licenses/                  */
20 /* ------------------------------------------------------------------------- */
/* Lexer state shared with the rest of the compiler. */
int total_source_line_count,            /* Number of source lines so far */
    no_hash_printed_yet,                /* Have not yet printed the first # */
    hash_printed_since_newline,         /* A hash has been printed since the
                                           most recent new-line was printed
                                           (generally as a result of an error
                                           message or the start of pass) */
    dont_enter_into_symbol_table,       /* Return names as text (with
                                           token type DQ_TT, i.e., as if
                                           they had double-quotes around)
                                           and not as entries in the symbol
                                           table, when TRUE. If -2, only the
                                           keyword table is searched. */
    return_sp_as_variable;              /* When TRUE, the word "sp" denotes
                                           the stack pointer variable
                                           (used in assembly language only) */
int next_token_begins_syntax_line;      /* When TRUE, start a new syntax
                                           line (for error reporting, etc.)
                                           on the source code line where
                                           the next token appears */

int32 last_mapped_line; /* Last syntax line reported to debugging file */
47 /* ------------------------------------------------------------------------- */
48 /* The lexer's output is a sequence of triples, each called a "token", */
49 /* representing one lexical unit (or "lexeme") each. Instead of providing */
50 /* "lookahead" (that is, always having available the next token after the */
51 /* current one, so that syntax analysers higher up in Inform can have */
52 /* advance knowledge of what is coming), the lexer instead has a system */
53 /* where tokens can be read in and then "put back again". */
54 /* The meaning of the number (and to some extent the text) supplied with */
55 /* a token depends on its type: see "header.h" for the list of types. */
56 /* For example, the lexeme "$1e3" is understood by Inform as a hexadecimal */
57 /* number, and translated to the token: */
58 /* type NUMBER_TT, value 483, text "$1e3" */
59 /* ------------------------------------------------------------------------- */
60 /* These three variables are set to the current token on a call to */
61 /* get_next_token() (but are not changed by a call to put_token_back()). */
62 /* ------------------------------------------------------------------------- */
68 /* ------------------------------------------------------------------------- */
69 /* The next two variables are the head and tail of a singly linked list. */
70 /* The tail stores the portion most recently read from the current */
71 /* lexical block; its end values therefore describe the location of the */
72 /* current token, and are updated whenever the three variables above are */
73 /* via set_token_location(...). Earlier vertices, if any, represent the */
74 /* regions of lexical blocks read beforehand, where new vertices are */
75 /* only introduced by interruptions like a file inclusion or an EOF. */
76 /* Vertices are deleted off of the front of the list once they are no */
77 /* longer referenced by pending debug information records. */
78 /* ------------------------------------------------------------------------- */
/* Head and tail of the singly linked list of token locations described
   in the comment above. */
static debug_locations *first_token_locations;
static debug_locations *last_token_location;
83 extern debug_location get_token_location(void)
84 { debug_location result;
85 debug_location *location = &(last_token_location->location);
86 result.file_index = location->file_index;
87 result.beginning_byte_index = location->end_byte_index;
88 result.end_byte_index = location->end_byte_index;
89 result.beginning_line_number = location->end_line_number;
90 result.end_line_number = location->end_line_number;
91 result.beginning_character_number = location->end_character_number;
92 result.end_character_number = location->end_character_number;
93 result.orig_file_index = location->orig_file_index;
94 result.orig_beg_line_number = location->orig_beg_line_number;
95 result.orig_beg_char_number = location->orig_beg_char_number;
99 extern debug_locations get_token_locations(void)
100 { debug_locations result;
101 result.location = get_token_location();
103 result.reference_count = 0;
107 static void set_token_location(debug_location location)
108 { if (location.file_index == last_token_location->location.file_index)
109 { last_token_location->location.end_byte_index =
110 location.end_byte_index;
111 last_token_location->location.end_line_number =
112 location.end_line_number;
113 last_token_location->location.end_character_number =
114 location.end_character_number;
115 last_token_location->location.orig_file_index =
116 location.orig_file_index;
117 last_token_location->location.orig_beg_line_number =
118 location.orig_beg_line_number;
119 last_token_location->location.orig_beg_char_number =
120 location.orig_beg_char_number;
122 { debug_locations*successor =
124 (sizeof(debug_locations),
125 "debug locations of recent tokens");
126 successor->location = location;
127 successor->next = NULL;
128 successor->reference_count = 0;
129 last_token_location->next = successor;
130 last_token_location = successor;
134 extern debug_location_beginning get_token_location_beginning(void)
135 { debug_location_beginning result;
136 ++(last_token_location->reference_count);
137 result.head = last_token_location;
138 result.beginning_byte_index =
139 last_token_location->location.end_byte_index;
140 result.beginning_line_number =
141 last_token_location->location.end_line_number;
142 result.beginning_character_number =
143 last_token_location->location.end_character_number;
144 result.orig_file_index = last_token_location->location.orig_file_index;
145 result.orig_beg_line_number = last_token_location->location.orig_beg_line_number;
146 result.orig_beg_char_number = last_token_location->location.orig_beg_char_number;
151 static void cleanup_token_locations(debug_location_beginning*beginning)
152 { if (first_token_locations)
153 { while (first_token_locations &&
154 !first_token_locations->reference_count)
155 { debug_locations*moribund = first_token_locations;
156 first_token_locations = moribund->next;
157 my_free(&moribund, "debug locations of recent tokens");
159 (beginning->head == moribund || !first_token_locations))
161 ("Records needed by a debug_location_beginning are no "
162 "longer allocated, perhaps because of an invalid reuse "
163 "of this or an earlier beginning");
169 ("Attempt to use a debug_location_beginning when no token "
170 "locations are defined");
173 ("Attempt to clean up token locations when no token locations "
179 extern void discard_token_location(debug_location_beginning beginning)
180 { --(beginning.head->reference_count);
183 extern debug_locations get_token_location_end
184 (debug_location_beginning beginning)
185 { debug_locations result;
186 cleanup_token_locations(&beginning);
187 --(beginning.head->reference_count);
188 /* Sometimes we know what we'll read before we switch to the lexical block
189 where we'll read it. In such cases the beginning will be placed in the
190 prior block and last exactly zero bytes there. It's misleading to
191 include such ranges, so we gobble them. */
192 if (beginning.head->location.end_byte_index ==
193 beginning.beginning_byte_index &&
194 beginning.head->next)
195 { beginning.head = beginning.head->next;
196 result.location = beginning.head->location;
197 result.location.beginning_byte_index = 0;
198 result.location.beginning_line_number = 1;
199 result.location.beginning_character_number = 1;
201 { result.location = beginning.head->location;
202 result.location.beginning_byte_index =
203 beginning.beginning_byte_index;
204 result.location.beginning_line_number =
205 beginning.beginning_line_number;
206 result.location.beginning_character_number =
207 beginning.beginning_character_number;
210 result.location.orig_file_index =
211 beginning.orig_file_index;
212 result.location.orig_beg_line_number =
213 beginning.orig_beg_line_number;
214 result.location.orig_beg_char_number =
215 beginning.orig_beg_char_number;
217 result.next = beginning.head->next;
218 result.reference_count = 0;
222 /* ------------------------------------------------------------------------- */
223 /* In order to be able to put tokens back efficiently, the lexer stores */
224 /* tokens in a "circle": the variable circle_position ranges between */
225 /* 0 and CIRCLE_SIZE-1. We only need a circle size as large as the */
226 /* maximum number of tokens ever put back at once, plus 1 (in effect, the */
227 /* maximum token lookahead ever needed in syntax analysis, plus 1). */
229 /* Unlike some compilers, Inform does not have a context-free lexer: in */
230 /* fact it has 12288 different possible states. However, the context only */
231 /* affects the interpretation of "identifiers": lexemes beginning with a */
232 /* letter and containing up to 32 chars of alphanumeric and underscore */
233 /* chars. (For example, "default" may refer to the directive or statement */
234 /* of that name, and which token values are returned depends on the */
235 /* current lexical context.) */
237 /* Along with each token, we also store the lexical context it was */
238 /* translated under; because if it is called for again, there may need */
239 /* to be a fresh interpretation of it if the context has changed. */
240 /* ------------------------------------------------------------------------- */
#define CIRCLE_SIZE 6

/* (The worst case for token lookahead is distinguishing between an
   old-style "objectloop (a in b)" and a new "objectloop (a in b ...)".) */

static int circle_position;              /* Read position within the circle */
static token_data circle[CIRCLE_SIZE];   /* The token circle itself */

static int token_contexts[CIRCLE_SIZE];  /* Lexical context each circle slot
                                            was tokenised under */
252 /* ------------------------------------------------------------------------- */
253 /* A complication, however, is that the text of some lexemes needs to be */
254 /* held in Inform's memory for much longer periods: for example, a */
255 /* dictionary word lexeme (like "'south'") must have its text preserved */
256 /* until the code generation time for the expression it occurs in, when */
257 /* the dictionary reference is actually made. Code generation in general */
258 /* occurs as early as possible in Inform: pending some better method of */
259 /* garbage collection, we simply use a buffer so large that unless */
260 /* expressions spread across 10K of source code are found, there can be */
262 /* ------------------------------------------------------------------------- */
static char *lexeme_memory;   /* Long-lived buffer holding lexeme text */
static char *lex_p;           /* Current write position */
267 /* ------------------------------------------------------------------------- */
268 /* The lexer itself needs up to 3 characters of lookahead (it uses an */
269 /* LR(3) grammar to translate characters into tokens). */
270 /* ------------------------------------------------------------------------- */
#define LOOKAHEAD_SIZE 3

static int current, lookahead,          /* The latest character read, and */
           lookahead2, lookahead3;      /* the three characters following it */

static int pipeline_made;               /* Whether or not the pipeline of
                                           characters has been constructed
                                           yet */

static int (* get_next_char)(void);     /* Routine for reading the stream of
                                           characters: the lexer does not
                                           need any "ungetc" routine for
                                           putting them back again.  End of
                                           stream is signalled by returning
                                           zero (presumably -- TODO confirm
                                           against the reader routines) */

static char *source_to_analyse;         /* The current lexical source:
                                           NULL for "load from source files",
                                           otherwise this points to a string
                                           containing Inform code */

static int tokens_put_back;             /* Count of the number of backward
                                           moves made from the last-read
                                           token */
297 extern void describe_token(token_data t)
299 /* Many of the token types are not set in this file, but later on in
300 Inform's higher stages (for example, in the expression evaluator);
301 but this routine describes them all. */
307 /* The following token types occur in lexer output: */
309 case SYMBOL_TT: printf("symbol ");
310 describe_symbol(t.value);
312 case NUMBER_TT: printf("literal number %d", t.value);
314 case DQ_TT: printf("string \"%s\"", t.text);
316 case SQ_TT: printf("string '%s'", t.text);
318 case SEP_TT: printf("separator '%s'", t.text);
320 case EOF_TT: printf("end of file");
323 case STATEMENT_TT: printf("statement name '%s'", t.text);
325 case SEGMENT_MARKER_TT: printf("object segment marker '%s'", t.text);
327 case DIRECTIVE_TT: printf("directive name '%s'", t.text);
329 case CND_TT: printf("textual conditional '%s'", t.text);
331 case OPCODE_NAME_TT: printf("opcode name '%s'", t.text);
333 case SYSFUN_TT: printf("built-in function name '%s'", t.text);
335 case LOCAL_VARIABLE_TT: printf("local variable name '%s'", t.text);
337 case MISC_KEYWORD_TT: printf("statement keyword '%s'", t.text);
339 case DIR_KEYWORD_TT: printf("directive keyword '%s'", t.text);
341 case TRACE_KEYWORD_TT: printf("'trace' keyword '%s'", t.text);
343 case SYSTEM_CONSTANT_TT: printf("system constant name '%s'", t.text);
346 /* The remaining are etoken types, not set by the lexer */
348 case OP_TT: printf("operator '%s'",
349 operators[t.value].description);
351 case ENDEXP_TT: printf("end of expression");
353 case SUBOPEN_TT: printf("open bracket");
355 case SUBCLOSE_TT: printf("close bracket");
357 case LARGE_NUMBER_TT: printf("large number: '%s'=%d",t.text,t.value);
359 case SMALL_NUMBER_TT: printf("small number: '%s'=%d",t.text,t.value);
361 case VARIABLE_TT: printf("variable '%s'=%d", t.text, t.value);
363 case DICTWORD_TT: printf("dictionary word '%s'", t.text);
365 case ACTION_TT: printf("action name '%s'", t.text);
369 printf("** unknown token type %d, text='%s', value=%d **",
370 t.type, t.text, t.value);
375 /* ------------------------------------------------------------------------- */
376 /* All but one of the 280 Inform keywords (118 of them opcode names used */
377 /* only by the assembler). (The one left over is "sp", a keyword used in */
378 /* assembly language only.) */
380 /* A "keyword group" is a set of keywords to be searched for. If a match */
381 /* is made on an identifier, the token type becomes that given in the KG */
382 /* and the token value is its index in the KG. */
384 /* The keyword ordering must correspond with the appropriate #define's in */
385 /* "header.h" but is otherwise not significant. */
386 /* ------------------------------------------------------------------------- */
388 #define MAX_KEYWORDS 350
390 /* The values will be filled in at compile time, when we know
391 which opcode set to use. */
392 keyword_group opcode_names =
394 OPCODE_NAME_TT, FALSE, TRUE
/* Z-machine opcode names, in the order expected by the assembler.
   NOTE(review): the tail of this list was truncated in extraction; the
   final entries and the "" terminator are restored from the Inform 6.35
   upstream source -- verify ordering matches asm.c's opcode table. */
static char *opcode_list_z[] = {
    "je", "jl", "jg", "dec_chk", "inc_chk", "jin", "test", "or", "and",
    "test_attr", "set_attr", "clear_attr", "store", "insert_obj", "loadw",
    "loadb", "get_prop", "get_prop_addr", "get_next_prop", "add", "sub",
    "mul", "div", "mod", "call", "storew", "storeb", "put_prop", "sread",
    "print_char", "print_num", "random", "push", "pull", "split_window",
    "set_window", "output_stream", "input_stream", "sound_effect", "jz",
    "get_sibling", "get_child", "get_parent", "get_prop_len", "inc", "dec",
    "print_addr", "remove_obj", "print_obj", "ret", "jump", "print_paddr",
    "load", "not", "rtrue", "rfalse", "print", "print_ret", "nop", "save",
    "restore", "restart", "ret_popped", "pop", "quit", "new_line",
    "show_status", "verify", "call_2s", "call_vs", "aread", "call_vs2",
    "erase_window", "erase_line", "set_cursor", "get_cursor",
    "set_text_style", "buffer_mode", "read_char", "scan_table", "call_1s",
    "call_2n", "set_colour", "throw", "call_vn", "call_vn2", "tokenise",
    "encode_text", "copy_table", "print_table", "check_arg_count", "call_1n",
    "catch", "piracy", "log_shift", "art_shift", "set_font", "save_undo",
    "restore_undo", "draw_picture", "picture_data", "erase_picture",
    "set_margins", "move_window", "window_size", "window_style",
    "get_wind_prop", "scroll_window", "pop_stack", "read_mouse",
    "mouse_window", "push_stack", "put_wind_prop", "print_form",
    "make_menu", "picture_table", "print_unicode", "check_unicode",
    "set_true_colour", "buffer_screen",
    ""
};
/* Glulx opcode names, in the order expected by the assembler.
   NOTE(review): several interior lines and the "" terminator were
   truncated in extraction; "call"/"return", "streamunichar" and the
   terminator are restored from the Inform 6.35 upstream source --
   verify against asm.c's Glulx opcode table. */
static char *opcode_list_g[] = {
    "nop", "add", "sub", "mul", "div", "mod", "neg", "bitand", "bitor",
    "bitxor", "bitnot", "shiftl", "sshiftr", "ushiftr", "jump", "jz",
    "jnz", "jeq", "jne", "jlt", "jge", "jgt", "jle",
    "jltu", "jgeu", "jgtu", "jleu",
    "call", "return",
    "catch", "throw", "tailcall",
    "copy", "copys", "copyb", "sexs", "sexb", "aload",
    "aloads", "aloadb", "aloadbit", "astore", "astores", "astoreb",
    "astorebit", "stkcount", "stkpeek", "stkswap", "stkroll", "stkcopy",
    "streamchar", "streamnum", "streamstr",
    "gestalt", "debugtrap", "getmemsize", "setmemsize", "jumpabs",
    "random", "setrandom", "quit", "verify",
    "restart", "save", "restore", "saveundo", "restoreundo", "protect",
    "glk", "getstringtbl", "setstringtbl", "getiosys", "setiosys",
    "linearsearch", "binarysearch", "linkedsearch",
    "callf", "callfi", "callfii", "callfiii",
    "streamunichar",
    "mzero", "mcopy", "malloc", "mfree",
    "accelfunc", "accelparam",
    "numtof", "ftonumz", "ftonumn", "ceil", "floor",
    "fadd", "fsub", "fmul", "fdiv", "fmod",
    "sqrt", "exp", "log", "pow",
    "sin", "cos", "tan", "asin", "acos", "atan", "atan2",
    "jfeq", "jfne", "jflt", "jfle", "jfgt", "jfge", "jisnan", "jisinf",
    ""
};
450 keyword_group opcode_macros =
452 OPCODE_MACRO_TT, FALSE, TRUE
/* There are no opcode macros in Z-code mode: empty terminated list. */
static char *opmacro_list_z[] = { "" };
/* Glulx opcode macros.
   NOTE(review): the body of this list was truncated in extraction;
   "pull"/"push" and the terminator are restored from the Inform 6.35
   upstream source -- verify. */
static char *opmacro_list_g[] = {
    "pull", "push",
    ""
};
462 keyword_group directives =
463 { { "abbreviate", "array", "attribute", "class", "constant",
464 "default", "dictionary", "end", "endif", "extend", "fake_action",
465 "global", "ifdef", "ifndef", "ifnot", "ifv3", "ifv5", "iftrue",
466 "iffalse", "import", "include", "link", "lowstring", "message",
467 "nearby", "object", "origsource", "property", "release", "replace",
468 "serial", "switches", "statusline", "stub", "system_file", "trace",
469 "undef", "verb", "version", "zcharacter",
471 DIRECTIVE_TT, FALSE, FALSE
474 keyword_group trace_keywords =
475 { { "dictionary", "symbols", "objects", "verbs",
476 "assembly", "expressions", "lines", "tokens", "linker",
478 TRACE_KEYWORD_TT, FALSE, TRUE
481 keyword_group segment_markers =
482 { { "class", "has", "private", "with", "" },
483 SEGMENT_MARKER_TT, FALSE, TRUE
486 keyword_group directive_keywords =
487 { { "alias", "long", "additive",
489 "noun", "held", "multi", "multiheld", "multiexcept",
490 "multiinside", "creature", "special", "number", "scope", "topic",
491 "reverse", "meta", "only", "replace", "first", "last",
492 "string", "table", "buffer", "data", "initial", "initstr",
493 "with", "private", "has", "class",
494 "error", "fatalerror", "warning",
495 "terminating", "static",
497 DIR_KEYWORD_TT, FALSE, TRUE
500 keyword_group misc_keywords =
501 { { "char", "name", "the", "a", "an", "The", "number",
502 "roman", "reverse", "bold", "underline", "fixed", "on", "off",
503 "to", "address", "string", "object", "near", "from", "property", "A", "" },
504 MISC_KEYWORD_TT, FALSE, TRUE
507 keyword_group statements =
508 { { "box", "break", "continue", "default", "do", "else", "font", "for",
509 "give", "if", "inversion", "jump", "move", "new_line", "objectloop",
510 "print", "print_ret", "quit", "read", "remove", "restore", "return",
511 "rfalse", "rtrue", "save", "spaces", "string", "style", "switch",
512 "until", "while", "" },
513 STATEMENT_TT, FALSE, TRUE
516 keyword_group conditions =
517 { { "has", "hasnt", "in", "notin", "ofclass", "or", "provides", "" },
521 keyword_group system_functions =
522 { { "child", "children", "elder", "eldest", "indirect", "parent", "random",
523 "sibling", "younger", "youngest", "metaclass", "glk", "" },
524 SYSFUN_TT, FALSE, TRUE
527 keyword_group system_constants =
528 { { "adjectives_table", "actions_table", "classes_table",
529 "identifiers_table", "preactions_table", "version_number",
530 "largest_object", "strings_offset", "code_offset",
531 "dict_par1", "dict_par2", "dict_par3", "actual_largest_object",
532 "static_memory_offset", "array_names_offset", "readable_memory_offset",
533 "cpv__start", "cpv__end", "ipv__start", "ipv__end",
534 "array__start", "array__end",
535 "lowest_attribute_number", "highest_attribute_number",
536 "attribute_names_array",
537 "lowest_property_number", "highest_property_number",
538 "property_names_array",
539 "lowest_action_number", "highest_action_number",
540 "action_names_array",
541 "lowest_fake_action_number", "highest_fake_action_number",
542 "fake_action_names_array",
543 "lowest_routine_number", "highest_routine_number", "routines_array",
544 "routine_names_array", "routine_flags_array",
545 "lowest_global_number", "highest_global_number", "globals_array",
546 "global_names_array", "global_flags_array",
547 "lowest_array_number", "highest_array_number", "arrays_array",
548 "array_names_array", "array_flags_array",
549 "lowest_constant_number", "highest_constant_number", "constants_array",
550 "constant_names_array",
551 "lowest_class_number", "highest_class_number", "class_objects_array",
552 "lowest_object_number", "highest_object_number",
554 "grammar_table", "dictionary_table", "dynam_string_table",
556 SYSTEM_CONSTANT_TT, FALSE, TRUE
/* Index 0 is unused (NULL): make_keywords_tables() and the keyword hash
   tables index groups from 1 to 11. */
keyword_group *keyword_groups[12]
= { NULL, &opcode_names, &directives, &trace_keywords, &segment_markers,
    &directive_keywords, &misc_keywords, &statements, &conditions,
    &system_functions, &system_constants, &opcode_macros};
564 keyword_group local_variables =
565 { { "" }, /* Filled in when routine declared */
566 LOCAL_VARIABLE_TT, FALSE, FALSE
569 static int lexical_context(void)
571 /* The lexical context is a number representing all of the context
572 information in the lexical analyser: the same input text will
573 always translate to the same output tokens whenever the context
576 In fact, for efficiency reasons this number omits the bit of
577 information held in the variable "dont_enter_into_symbol_table".
578 Inform never needs to backtrack through tokens parsed in that
579 way (thankfully, as it would be expensive indeed to check
583 if (opcode_names.enabled) c |= 1;
584 if (directives.enabled) c |= 2;
585 if (trace_keywords.enabled) c |= 4;
586 if (segment_markers.enabled) c |= 8;
587 if (directive_keywords.enabled) c |= 16;
588 if (misc_keywords.enabled) c |= 32;
589 if (statements.enabled) c |= 64;
590 if (conditions.enabled) c |= 128;
591 if (system_functions.enabled) c |= 256;
592 if (system_constants.enabled) c |= 512;
593 if (local_variables.enabled) c |= 1024;
595 if (return_sp_as_variable) c |= 2048;
/* Print a mnemonic for each keyword group enabled in context number c
   (debugging aid; the bit assignments match lexical_context()).
   (Truncated in extraction: braces restored.) */
static void print_context(int c)
{
    if ((c & 1) != 0)    printf("OPC ");
    if ((c & 2) != 0)    printf("DIR ");
    if ((c & 4) != 0)    printf("TK ");
    if ((c & 8) != 0)    printf("SEG ");
    if ((c & 16) != 0)   printf("DK ");
    if ((c & 32) != 0)   printf("MK ");
    if ((c & 64) != 0)   printf("STA ");
    if ((c & 128) != 0)  printf("CND ");
    if ((c & 256) != 0)  printf("SFUN ");
    if ((c & 512) != 0)  printf("SCON ");
    if ((c & 1024) != 0) printf("LV ");
    if ((c & 2048) != 0) printf("sp ");
}
/* Hash tables mapping hash codes to chains of keyword/local-variable
   entries; built by make_keywords_tables() and
   construct_local_variable_tables(). */
static int *keywords_hash_table;
static int *keywords_hash_ends_table;
static int *keywords_data_table;

static int *local_variable_hash_table;
static int *local_variable_hash_codes;
char **local_variable_texts;
static char *local_variable_text_table;

/* Fast path for single-letter local variable names. */
static char one_letter_locals[128];
626 static void make_keywords_tables(void)
628 char **oplist, **maclist;
631 oplist = opcode_list_z;
632 maclist = opmacro_list_z;
635 oplist = opcode_list_g;
636 maclist = opmacro_list_g;
639 for (j=0; *(oplist[j]); j++) {
640 opcode_names.keywords[j] = oplist[j];
642 opcode_names.keywords[j] = "";
644 for (j=0; *(maclist[j]); j++) {
645 opcode_macros.keywords[j] = maclist[j];
647 opcode_macros.keywords[j] = "";
649 for (i=0; i<HASH_TAB_SIZE; i++)
650 { keywords_hash_table[i] = -1;
651 keywords_hash_ends_table[i] = -1;
654 for (i=1; i<=11; i++)
655 { keyword_group *kg = keyword_groups[i];
656 for (j=0; *(kg->keywords[j]) != 0; j++)
657 { h = hash_code_from_string(kg->keywords[j]);
658 if (keywords_hash_table[h] == -1)
659 keywords_hash_table[h] = tp;
661 *(keywords_data_table + 3*(keywords_hash_ends_table[h]) + 2) = tp;
662 keywords_hash_ends_table[h] = tp;
663 *(keywords_data_table + 3*tp) = i;
664 *(keywords_data_table + 3*tp+1) = j;
665 *(keywords_data_table + 3*tp+2) = -1;
671 extern void construct_local_variable_tables(void)
672 { int i, h; char *p = local_variable_text_table;
673 for (i=0; i<HASH_TAB_SIZE; i++) local_variable_hash_table[i] = -1;
674 for (i=0; i<128; i++) one_letter_locals[i] = MAX_LOCAL_VARIABLES;
676 for (i=0; i<no_locals; i++)
677 { char *q = local_variables.keywords[i];
679 { one_letter_locals[(uchar)q[0]] = i;
680 if (isupper(q[0])) one_letter_locals[tolower(q[0])] = i;
681 if (islower(q[0])) one_letter_locals[toupper(q[0])] = i;
683 h = hash_code_from_string(q);
684 if (local_variable_hash_table[h] == -1)
685 local_variable_hash_table[h] = i;
686 local_variable_hash_codes[i] = h;
687 local_variable_texts[i] = p;
691 for (;i<MAX_LOCAL_VARIABLES-1;i++)
692 local_variable_texts[i] = "<no such local variable>";
695 static void interpret_identifier(int pos, int dirs_only_flag)
696 { int index, hashcode; char *p = circle[pos].text;
698 /* An identifier is either a keyword or a "symbol", a name which the
699 lexical analyser leaves to higher levels of Inform to understand. */
701 hashcode = hash_code_from_string(p);
703 if (dirs_only_flag) goto KeywordSearch;
705 /* If this is assembly language, perhaps it is "sp"? */
707 if (return_sp_as_variable && (p[0]=='s') && (p[1]=='p') && (p[2]==0))
708 { circle[pos].value = 0; circle[pos].type = LOCAL_VARIABLE_TT;
712 /* Test for local variables first, quite quickly. */
714 if (local_variables.enabled)
716 { index = one_letter_locals[(uchar)p[0]];
717 if (index<MAX_LOCAL_VARIABLES)
718 { circle[pos].type = LOCAL_VARIABLE_TT;
719 circle[pos].value = index+1;
723 index = local_variable_hash_table[hashcode];
725 { for (;index<no_locals;index++)
726 { if (hashcode == local_variable_hash_codes[index])
727 { if (strcmpcis(p, local_variable_texts[index])==0)
728 { circle[pos].type = LOCAL_VARIABLE_TT;
729 circle[pos].value = index+1;
737 /* Now the bulk of the keywords. Note that the lexer doesn't recognise
738 the name of a system function which has been Replaced. */
741 index = keywords_hash_table[hashcode];
743 { int *i = keywords_data_table + 3*index;
744 keyword_group *kg = keyword_groups[*i];
745 if (((!dirs_only_flag) && (kg->enabled))
746 || (dirs_only_flag && (kg == &directives)))
747 { char *q = kg->keywords[*(i+1)];
748 if (((kg->case_sensitive) && (strcmp(p, q)==0))
749 || ((!(kg->case_sensitive)) && (strcmpcis(p, q)==0)))
750 { if ((kg != &system_functions)
751 || (system_function_usage[*(i+1)]!=2))
752 { circle[pos].type = kg->change_token_type;
753 circle[pos].value = *(i+1);
761 if (dirs_only_flag) return;
763 /* Search for the name; create it if necessary. */
765 circle[pos].value = symbol_index(p, hashcode);
766 circle[pos].type = SYMBOL_TT;
770 /* ------------------------------------------------------------------------- */
771 /* The tokeniser grid aids a rapid decision about the consequences of a */
772 /* character reached in the buffer. In effect it is an efficiently stored */
773 /* transition table using an algorithm similar to that of S. C. Johnson's */
774 /* "yacc" lexical analyser (see Aho, Sethi and Ullman, section 3.9). */
775 /* My thanks to Dilip Sequeira for suggesting this. */
777 /* tokeniser_grid[c] is (16*n + m) if c is the first character of */
778 /* separator numbers n, n+1, ..., n+m-1 */
779 /* or certain special values (QUOTE_CODE, etc) */
782 /* Since 1000/16 = 62, the code numbers below will need increasing if the */
783 /* number of separators supported exceeds 61. */
784 /* ------------------------------------------------------------------------- */
/* The grid itself: one entry per possible (unsigned) character value. */
static int tokeniser_grid[256];

/* Special codes, deliberately above 16*61 so they cannot collide with
   the packed separator entries described above. */
#define QUOTE_CODE      1000
#define DQUOTE_CODE     1001
#define NULL_CODE       1002
#define SPACE_CODE      1003
#define NEGATIVE_CODE   1004
#define DIGIT_CODE      1005
#define RADIX_CODE      1006
#define KEYWORD_CODE    1007
#define EOF_CODE        1008
#define WHITESPACE_CODE 1009
#define COMMENT_CODE    1010
#define IDENTIFIER_CODE 1011
801 /* This list cannot safely be changed without also changing the header
802 separator #defines. The ordering is significant in that (i) all entries
803 beginning with the same character must be adjacent and (ii) that if
804 X is a an initial substring of Y then X must come before Y.
806 E.g. --> must occur before -- to prevent "-->0" being tokenised
807 wrongly as "--", ">", "0" rather than "-->", "0". */
809 static const char separators[NUMBER_SEPARATORS][4] =
810 { "->", "-->", "--", "-", "++", "+", "*", "/", "%",
811 "||", "|", "&&", "&", "~~",
812 "~=", "~", "==", "=", ">=", ">",
813 "<=", "<", "(", ")", ",",
814 ".&", ".#", "..&", "..#", "..", ".",
815 "::", ":", "@", ";", "[", "]", "{", "}",
817 "#a$", "#g$", "#n$", "#r$", "#w$", "##", "#"
820 static void make_tokeniser_grid(void)
822 /* Construct the grid to the specification above. */
826 for (i=0; i<256; i++) tokeniser_grid[i]=0;
828 for (i=0; i<NUMBER_SEPARATORS; i++)
829 { j=separators[i][0];
830 if (tokeniser_grid[j]==0)
831 tokeniser_grid[j]=i*16+1; else tokeniser_grid[j]++;
833 tokeniser_grid['\''] = QUOTE_CODE;
834 tokeniser_grid['\"'] = DQUOTE_CODE;
835 tokeniser_grid[0] = EOF_CODE;
836 tokeniser_grid[' '] = WHITESPACE_CODE;
837 tokeniser_grid['\n'] = WHITESPACE_CODE;
838 tokeniser_grid['$'] = RADIX_CODE;
839 tokeniser_grid['!'] = COMMENT_CODE;
841 tokeniser_grid['0'] = DIGIT_CODE;
842 tokeniser_grid['1'] = DIGIT_CODE;
843 tokeniser_grid['2'] = DIGIT_CODE;
844 tokeniser_grid['3'] = DIGIT_CODE;
845 tokeniser_grid['4'] = DIGIT_CODE;
846 tokeniser_grid['5'] = DIGIT_CODE;
847 tokeniser_grid['6'] = DIGIT_CODE;
848 tokeniser_grid['7'] = DIGIT_CODE;
849 tokeniser_grid['8'] = DIGIT_CODE;
850 tokeniser_grid['9'] = DIGIT_CODE;
852 tokeniser_grid['a'] = IDENTIFIER_CODE;
853 tokeniser_grid['b'] = IDENTIFIER_CODE;
854 tokeniser_grid['c'] = IDENTIFIER_CODE;
855 tokeniser_grid['d'] = IDENTIFIER_CODE;
856 tokeniser_grid['e'] = IDENTIFIER_CODE;
857 tokeniser_grid['f'] = IDENTIFIER_CODE;
858 tokeniser_grid['g'] = IDENTIFIER_CODE;
859 tokeniser_grid['h'] = IDENTIFIER_CODE;
860 tokeniser_grid['i'] = IDENTIFIER_CODE;
861 tokeniser_grid['j'] = IDENTIFIER_CODE;
862 tokeniser_grid['k'] = IDENTIFIER_CODE;
863 tokeniser_grid['l'] = IDENTIFIER_CODE;
864 tokeniser_grid['m'] = IDENTIFIER_CODE;
865 tokeniser_grid['n'] = IDENTIFIER_CODE;
866 tokeniser_grid['o'] = IDENTIFIER_CODE;
867 tokeniser_grid['p'] = IDENTIFIER_CODE;
868 tokeniser_grid['q'] = IDENTIFIER_CODE;
869 tokeniser_grid['r'] = IDENTIFIER_CODE;
870 tokeniser_grid['s'] = IDENTIFIER_CODE;
871 tokeniser_grid['t'] = IDENTIFIER_CODE;
872 tokeniser_grid['u'] = IDENTIFIER_CODE;
873 tokeniser_grid['v'] = IDENTIFIER_CODE;
874 tokeniser_grid['w'] = IDENTIFIER_CODE;
875 tokeniser_grid['x'] = IDENTIFIER_CODE;
876 tokeniser_grid['y'] = IDENTIFIER_CODE;
877 tokeniser_grid['z'] = IDENTIFIER_CODE;
879 tokeniser_grid['A'] = IDENTIFIER_CODE;
880 tokeniser_grid['B'] = IDENTIFIER_CODE;
881 tokeniser_grid['C'] = IDENTIFIER_CODE;
882 tokeniser_grid['D'] = IDENTIFIER_CODE;
883 tokeniser_grid['E'] = IDENTIFIER_CODE;
884 tokeniser_grid['F'] = IDENTIFIER_CODE;
885 tokeniser_grid['G'] = IDENTIFIER_CODE;
886 tokeniser_grid['H'] = IDENTIFIER_CODE;
887 tokeniser_grid['I'] = IDENTIFIER_CODE;
888 tokeniser_grid['J'] = IDENTIFIER_CODE;
889 tokeniser_grid['K'] = IDENTIFIER_CODE;
890 tokeniser_grid['L'] = IDENTIFIER_CODE;
891 tokeniser_grid['M'] = IDENTIFIER_CODE;
892 tokeniser_grid['N'] = IDENTIFIER_CODE;
893 tokeniser_grid['O'] = IDENTIFIER_CODE;
894 tokeniser_grid['P'] = IDENTIFIER_CODE;
895 tokeniser_grid['Q'] = IDENTIFIER_CODE;
896 tokeniser_grid['R'] = IDENTIFIER_CODE;
897 tokeniser_grid['S'] = IDENTIFIER_CODE;
898 tokeniser_grid['T'] = IDENTIFIER_CODE;
899 tokeniser_grid['U'] = IDENTIFIER_CODE;
900 tokeniser_grid['V'] = IDENTIFIER_CODE;
901 tokeniser_grid['W'] = IDENTIFIER_CODE;
902 tokeniser_grid['X'] = IDENTIFIER_CODE;
903 tokeniser_grid['Y'] = IDENTIFIER_CODE;
904 tokeniser_grid['Z'] = IDENTIFIER_CODE;
906 tokeniser_grid['_'] = IDENTIFIER_CODE;
909 /* ------------------------------------------------------------------------- */
910 /* Definition of a lexical block: a source file or a string containing */
911 /* text for lexical analysis; an independent source from the point of */
912 /* view of issuing error reports. */
913 /* ------------------------------------------------------------------------- */
/* A LexicalBlock records the reading position within one independent
   input source (a source file, or an internal string such as a veneer
   routine), for the purpose of error reporting. NOTE(review): later
   code also reads orig_file, orig_line and orig_char members; those
   declarations fall in lines not visible in this excerpt. */
915 typedef struct LexicalBlock_s
916 { char *filename; /* Full translated name */
917 int main_flag; /* TRUE if the main file
918 (the first one opened) */
919 int sys_flag; /* TRUE if a System_File */
920 int source_line; /* Line number count */
921 int line_start; /* Char number within file
922 where the current line
924 int chars_read; /* Char number of read pos */
925 int file_no; /* Or 255 if not from a
928 char *orig_source; /* From #origsource direct */
/* Sentinel lexical blocks for phases when no real source is open:
   NoFileOpen is installed by lexer_begin_prepass(), MakingOutput by
   lexer_endpass(), and StringLB is used by restart_lexer() when
   lexing from an internal string (veneer routines). All use the
   pseudo file number 255 ("not from a file"). */
934 static LexicalBlock NoFileOpen =
935 { "<before compilation>", FALSE, FALSE, 0, 0, 0, 255, NULL, 0, 0, 0 };
937 static LexicalBlock MakingOutput =
938 { "<constructing output>", FALSE, FALSE, 0, 0, 0, 255, NULL, 0, 0, 0 };
940 static LexicalBlock StringLB =
941 { "<veneer routine>", FALSE, TRUE, 0, 0, 0, 255, NULL, 0, 0, 0 };
943 static LexicalBlock *CurrentLB; /* The current lexical
944 block of input text */
/* Mark the current lexical block as a System_File (suppresses certain
   warnings for library code). */
946 extern void declare_systemfile(void)
947 { CurrentLB->sys_flag = TRUE;
/* Return 1 if the current lexical block is a System_File, else 0. */
950 extern int is_systemfile(void)
951 { return ((CurrentLB->sys_flag)?1:0);
/* Record (or clear) an Origsource declaration for the current lexical
   block: the original file/line/char position that the text being lexed
   was generated from. NOTE(review): the branch that selects clearing
   versus setting (presumably testing whether source is NULL) falls in
   lines not visible in this excerpt — verify against the full file. */
954 extern void set_origsource_location(char *source, int32 line, int32 charnum)
957 /* Clear the Origsource declaration. */
958 CurrentLB->orig_file = 0;
959 CurrentLB->orig_source = NULL;
960 CurrentLB->orig_line = 0;
961 CurrentLB->orig_char = 0;
965 /* Get the file number for a new or existing InputFiles entry. */
966 int file_no = register_orig_sourcefile(source);
968 CurrentLB->orig_file = file_no;
969 CurrentLB->orig_source = InputFiles[file_no-1].filename;
970 CurrentLB->orig_line = line;
971 CurrentLB->orig_char = charnum;
974 /* Error locations. */
/* Build a debug_location describing the current read position. The
   byte index is backed off by LOOKAHEAD_SIZE because chars_read counts
   characters already pulled into the lookahead pipeline. The begin and
   end fields are set equal: the location is a single point. */
976 extern debug_location get_current_debug_location(void)
977 { debug_location result;
978 /* Assume that all input characters are one byte. */
979 result.file_index = CurrentLB->file_no;
980 result.beginning_byte_index = CurrentLB->chars_read - LOOKAHEAD_SIZE;
981 result.end_byte_index = result.beginning_byte_index;
982 result.beginning_line_number = CurrentLB->source_line;
983 result.end_line_number = result.beginning_line_number;
984 result.beginning_character_number =
985 CurrentLB->chars_read - CurrentLB->line_start;
986 result.end_character_number = result.beginning_character_number;
987 result.orig_file_index = CurrentLB->orig_file;
988 result.orig_beg_line_number = CurrentLB->orig_line;
989 result.orig_beg_char_number = CurrentLB->orig_char;
/* Saved debug location matching the most recent ErrorReport snapshot
   (only maintained when -k debug-file output is on). */
993 static debug_location ErrorReport_debug_location;
/* Copy the current lexical block's position into the global ErrorReport
   structure, so that errors raised later are attributed to this line.
   File number 255 ("not from a file") is reported as -1. */
995 extern void report_errors_at_current_line(void)
996 { ErrorReport.line_number = CurrentLB->source_line;
997 ErrorReport.file_number = CurrentLB->file_no;
998 if (ErrorReport.file_number == 255)
999 ErrorReport.file_number = -1;
1000 ErrorReport.source = CurrentLB->filename;
1001 ErrorReport.main_flag = CurrentLB->main_flag;
1002 if (debugfile_switch)
1003 ErrorReport_debug_location = get_current_debug_location();
1004 ErrorReport.orig_file = CurrentLB->orig_file;
1005 ErrorReport.orig_source = CurrentLB->orig_source;
1006 ErrorReport.orig_line = CurrentLB->orig_line;
1007 ErrorReport.orig_char = CurrentLB->orig_char;
/* Return the debug location saved by report_errors_at_current_line(). */
1010 extern debug_location get_error_report_debug_location(void)
1011 { return ErrorReport_debug_location;
/* Return the character offset at which the current source line began. */
1014 extern int32 get_current_line_start(void)
1015 { return CurrentLB->line_start;
/* "No location" sentinel; initialised in init_lexer_vars() with
   file_index -1. */
1018 brief_location blank_brief_location;
/* Compress an ErrorPosition into a brief_location (file + line only,
   plus the Origsource equivalents). */
1020 extern brief_location get_brief_location(ErrorPosition *errpos)
1023 loc.file_index = errpos->file_number;
1024 loc.line_number = errpos->line_number;
1025 loc.orig_file_index = errpos->orig_file;
1026 loc.orig_line_number = errpos->orig_line;
/* Expand a brief_location back into an ErrorPosition. Only done when
   the location is not the blank sentinel (file_index -1). main_flag is
   reconstructed from the convention that file number 1 is the main
   file; orig_source and orig_char have no brief form and are cleared. */
1030 extern void export_brief_location(brief_location loc, ErrorPosition *errpos)
1032 if (loc.file_index != -1)
1033 { errpos->file_number = loc.file_index;
1034 errpos->line_number = loc.line_number;
1035 errpos->main_flag = (errpos->file_number == 1);
1036 errpos->orig_source = NULL;
1037 errpos->orig_file = loc.orig_file_index;
1038 errpos->orig_line = loc.orig_line_number;
1039 errpos->orig_char = 0;
1043 /* ------------------------------------------------------------------------- */
1044 /* Hash printing and line counting */
1045 /* ------------------------------------------------------------------------- */
/* Emit one progress hash mark ("#") to stdout; the very first call in
   a pass prints a "::" prefix. NOTE(review): the explicit flush implied
   by the trailing comment (presumably fflush(stdout)) falls in lines
   not visible in this excerpt. */
1047 static void print_hash(void)
1049 /* Hash-printing is the practice of printing a # character every 100
1050 lines of source code (the -x switch), reassuring the user that
1051 progress is being made */
1053 if (no_hash_printed_yet)
1054 { printf("::"); no_hash_printed_yet = FALSE;
1056 printf("#"); hash_printed_since_newline = TRUE;
1059 /* On some systems, text output is buffered to a line at a time, and
1060 this would frustrate the point of hash-printing, so: */
/* Advance per-block and global line counters when a '\n' has been read,
   reset the "text of current line" error buffer, and print a progress
   hash every 100 lines if -x is on. NOTE(review): the SpinCursor /
   g_pm_hndl / ProcessEvents / longjmp code below is host-specific
   cooperative-multitasking and abort handling, almost certainly inside
   #ifdef guards that fall in lines not visible in this excerpt —
   confirm against the full file before editing. */
1066 static void reached_new_line(void)
1068 /* Called to signal that a new line has been reached in the source code */
1070 forerrors_pointer = 0;
1072 CurrentLB->source_line++;
1073 CurrentLB->line_start = CurrentLB->chars_read;
1075 total_source_line_count++;
1077 if (total_source_line_count%100==0)
1078 { if (hash_switch) print_hash();
1080 SpinCursor(32); /* I.e., allow other tasks to run */
1085 if (total_source_line_count%((**g_pm_hndl).linespercheck) == 0)
1086 { ProcessEvents (&g_proc);
/* Abort path: tidy temporary files and transcript, then jump back
   to the host's fallback point. */
1090 if (temporary_files_switch)
1091 remove_temp_files();
1093 my_free(&all_text,"transcription text");
1094 abort_transcript_file();
1095 longjmp (g_fallback, 1);
/* Begin a new syntax line for error reporting: re-anchor the error
   position at the current line. The forerrors text buffer is only
   reset when lexing from an internal string (file input resets it in
   reached_new_line instead). */
1101 static void new_syntax_line(void)
1102 { if (source_to_analyse != NULL) forerrors_pointer = 0;
1103 report_errors_at_current_line();
1106 /* Return 10 raised to the expo power.
1108 * I'm avoiding the standard pow() function for a rather lame reason:
1109 * it's in the libmath (-lm) library, and I don't want to change the
1110 * build model for the compiler. So, this is implemented with a stupid
1111 * lookup table. It's faster than pow() for small values of expo.
1112 * Probably not as fast if expo is 200, but "$+1e200" is an overflow
1113 * anyway, so I don't expect that to be a problem.
1115 * (For some reason, frexp() and ldexp(), which are used later on, do
1116 * not require libmath to be linked in.)
/* Return 10.0 raised to the power expo, via a small lookup table of
   10^-8 .. 10^8 plus repeated scaling for exponents outside that range
   (see the explanatory comment above). NOTE(review): several body
   lines — including the 1.0 table entry, the running "res" accumulator
   declaration, and the loop-body multiplications — fall in lines not
   visible in this excerpt. */
1118 static double pow10_cheap(int expo)
1120 #define POW10_RANGE (8)
1121 static double powers[POW10_RANGE*2+1] = {
1122 0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1,
1124 10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, 10000000.0, 100000000.0
/* Large negative exponent: scale down POW10_RANGE decades at a time. */
1130 for (; expo < -POW10_RANGE; expo += POW10_RANGE) {
1133 return res * powers[POW10_RANGE+expo];
/* Large positive exponent: scale up POW10_RANGE decades at a time. */
1136 for (; expo > POW10_RANGE; expo -= POW10_RANGE) {
1137 res *= powers[POW10_RANGE*2];
1139 return res * powers[POW10_RANGE+expo];
1143 /* Return the IEEE-754 single-precision encoding of a floating-point
1144 * number. See http://www.psc.edu/general/software/packages/ieee/ieee.php
1145 * for an explanation.
1147 * The number is provided in the pieces it was parsed in:
1148 * [+|-] intv "." fracv "e" [+|-]expo
1150 * If the magnitude is too large (beyond about 3.4e+38), this returns
1151 * an infinite value (0x7f800000 or 0xff800000). If the magnitude is too
1152 * small (below about 1e-45), this returns a zero value (0x00000000 or
1153 * 0x80000000). If any of the inputs are NaN, this returns NaN (but the
1154 * lexer should never do that).
1156 * Note that using a float constant does *not* set the uses_float_features
1157 * flag (which would cause the game file to be labelled 3.1.2). There's
1158 * no VM feature here, just an integer. Of course, any use of the float
1159 * *opcodes* will set the flag.
1161 * The math functions in this routine require #including <math.h>, but
1162 * they should not require linking the math library (-lm). At least,
1163 * they do not on OSX and Linux.
/* Assemble the IEEE-754 single-precision bit pattern for the parsed
   literal [sign] intv "." fracv "e" expo (see the long comment above
   for the overflow/underflow contract). Uses frexp/ldexp to split off
   the binary exponent, then rounds the 23-bit mantissa. NOTE(review):
   several branch-structure lines (the overflow test around line 1194,
   the subnormal/normal else arms, and the carry fix-up after rounding)
   fall in lines not visible in this excerpt. */
1165 static int32 construct_float(int signbit, double intv, double fracv, int expo)
1167 double absval = (intv + fracv) * pow10_cheap(expo);
1168 int32 sign = (signbit ? 0x80000000 : 0x0);
1172 if (isinf(absval)) {
1173 return sign | 0x7f800000; /* infinity */
1175 if (isnan(absval)) {
1176 return sign | 0x7fc00000;
/* Split absval into mantissa in [0.5,1.0) times 2^expo. */
1179 mant = frexp(absval, &expo);
1181 /* Normalize mantissa to be in the range [1.0, 2.0) */
1182 if (0.5 <= mant && mant < 1.0) {
1186 else if (mant == 0.0) {
1190 return sign | 0x7f800000; /* infinity */
1194 return sign | 0x7f800000; /* infinity */
1196 else if (expo < -126) {
1197 /* Denormalized (very small) number */
1198 mant = ldexp(mant, 126 + expo);
1201 else if (!(expo == 0 && mant == 0.0)) {
1203 mant -= 1.0; /* Get rid of leading 1 */
1206 mant *= 8388608.0; /* 2^23 */
1207 fbits = (int32)(mant + 0.5); /* round mant to nearest int */
1209 /* The carry propagated out of a string of 23 1 bits. */
1213 return sign | 0x7f800000; /* infinity */
/* Pack sign (bit 31), biased exponent (bits 30-23), mantissa (22-0). */
1217 return (sign) | ((int32)(expo << 23)) | (fbits);
1220 /* ------------------------------------------------------------------------- */
1221 /* Characters are read via a "pipeline" of variables, allowing us to look */
1222 /* up to three characters ahead of the current position. */
1224 /* There are two possible sources: from the source files being loaded in, */
1225 /* and from a string inside Inform (which is where the code for veneer */
1226 /* routines comes from). Each source has its own get-next-character */
1228 /* ------------------------------------------------------------------------- */
1229 /* Source 1: from files */
1231 /* Note that file_load_chars(p, size) loads "size" bytes into buffer "p" */
1232 /* from the current input file. If the file runs out, then if it was */
1233 /* the last source file 4 EOF characters are placed in the buffer: if it */
1234 /* was only an Include file ending, then a '\n' character is placed there */
1235 /* (essentially to force termination of any comment line) followed by */
1236 /* three harmless spaces. */
1238 /* The routine returns the number of characters it has written, and note */
1239 /* that this conveniently ensures that all characters in the buffer come */
1240 /* from the same file. */
1241 /* ------------------------------------------------------------------------- */
1243 #define SOURCE_BUFFER_SIZE 4096 /* Typical disc block size */
/* One open source file on the inclusion stack: its block-read buffer,
   read position, and a saved copy of the three-character lookahead
   pipeline (restored when an Include below it finishes). The struct
   also embeds a LexicalBlock named LB (referenced throughout
   begin_buffering_file), declared in lines not visible here. */
1245 typedef struct Sourcefile_s
1246 { char *buffer; /* Input buffer */
1247 int read_pos; /* Read position in buffer */
1248 int size; /* Number of meaningful
1249 characters in buffer */
1250 int la, la2, la3; /* Three characters of
1251 lookahead pipeline */
1252 int file_no; /* Internal file number
/* The Include stack: FileStack[0..File_sp-1] are the currently open
   files, CF caches the top entry. last_input_file tracks how far the
   global InputFiles list has been consumed, so newly-Included files
   can be detected in get_next_char_from_pipeline(). */
1257 static Sourcefile *FileStack;
1258 static int File_sp; /* Stack pointer */
1260 static Sourcefile *CF; /* Top entry on stack */
1262 static int last_input_file;
/* Push input file file_no onto the inclusion stack at depth i: save the
   outer file's lookahead pipeline, load the first buffer-full, prime
   the three lookahead characters, initialise the embedded LexicalBlock,
   and warn if the same filename already appears lower on the stack. */
1264 static void begin_buffering_file(int i, int file_no)
1265 { int j, cnt; uchar *p;
1267 if (i >= MAX_INCLUSION_DEPTH)
1268 memoryerror("MAX_INCLUSION_DEPTH",MAX_INCLUSION_DEPTH)
1270 p = (uchar *) FileStack[i].buffer;
/* Save the outer file's lookahead (only when i>0; the guard falls in
   a line not visible here). */
1273 { FileStack[i-1].la = lookahead;
1274 FileStack[i-1].la2 = lookahead2;
1275 FileStack[i-1].la3 = lookahead3;
1278 FileStack[i].file_no = file_no;
1279 FileStack[i].size = file_load_chars(file_no,
1280 (char *) p, SOURCE_BUFFER_SIZE);
/* Prime the pipeline from the first three buffered characters,
   translated through the source-to-ISO grid. */
1281 lookahead = source_to_iso_grid[p[0]];
1282 lookahead2 = source_to_iso_grid[p[1]];
1283 lookahead3 = source_to_iso_grid[p[2]];
1284 if (LOOKAHEAD_SIZE != 3)
1286 ("Lexer lookahead size does not match hard-coded lookahead code");
1287 FileStack[i].read_pos = LOOKAHEAD_SIZE;
/* File number 1 is, by convention, the main source file. */
1289 if (file_no==1) FileStack[i].LB.main_flag = TRUE;
1290 else FileStack[i].LB.main_flag = FALSE;
1291 FileStack[i].LB.sys_flag = FALSE;
1292 FileStack[i].LB.source_line = 1;
1293 FileStack[i].LB.line_start = LOOKAHEAD_SIZE;
1294 FileStack[i].LB.chars_read = LOOKAHEAD_SIZE;
1295 FileStack[i].LB.filename = InputFiles[file_no-1].filename;
1296 FileStack[i].LB.file_no = file_no;
1297 FileStack[i].LB.orig_source = NULL; FileStack[i].LB.orig_file = 0;
1298 FileStack[i].LB.orig_line = 0; FileStack[i].LB.orig_char = 0;
1300 CurrentLB = &(FileStack[i].LB);
1301 CF = &(FileStack[i]);
1303 /* Check for recursive inclusion */
1306 { if (!strcmp(FileStack[i].LB.filename, FileStack[j].LB.filename))
1310 warning_named("File included more than once",
1311 FileStack[j].LB.filename);
/* Open the character pipeline on the main source file (file number 1)
   and record that it exists, so restart_lexer() only builds it once
   per pass. */
1314 static void create_char_pipeline(void)
1317 begin_buffering_file(File_sp++, 1);
1318 pipeline_made = TRUE;
1319 last_input_file = current_input_file;
/* Pull one character from the file pipeline: push any newly-Included
   files, refill or pop buffers at end-of-buffer/end-of-file, then shift
   the three-character lookahead window and return the character that
   falls out. Also appends the character to the "text of current line"
   buffer for error messages and bumps line counters on '\n'.
   NOTE(review): several structural lines (loop/brace lines around the
   Include scan and the stack pop) fall in lines not visible here. */
1322 static int get_next_char_from_pipeline(void)
1325 while (last_input_file < current_input_file)
1327 /* An "Include" file must have opened since the last character
1328 was read. Perhaps more than one. We run forward through the
1329 list and add them to the include stack. But we ignore
1330 "Origsource" files (which were never actually opened for
1334 if (!InputFiles[last_input_file-1].is_input)
1337 begin_buffering_file(File_sp++, last_input_file);
1339 if (last_input_file != current_input_file)
1340 compiler_error("last_input_file did not match after Include");
/* Exhausted all input: zero the pipeline and return NUL. */
1343 { lookahead = 0; lookahead2 = 0; lookahead3 = 0; return 0;
/* Current buffer consumed: read the next block from the same file. */
1346 if (CF->read_pos == CF->size)
1348 file_load_chars(CF->file_no, CF->buffer, SOURCE_BUFFER_SIZE);
/* Negative size convention: the file itself has ended; pop the
   inclusion stack and restore the outer file's lookahead. */
1352 if (CF->read_pos == -(CF->size))
1353 { set_token_location(get_current_debug_location());
1356 { lookahead = 0; lookahead2 = 0; lookahead3 = 0; return 0;
1358 CF = &(FileStack[File_sp-1]);
1359 CurrentLB = &(FileStack[File_sp-1].LB);
1360 lookahead = CF->la; lookahead2 = CF->la2; lookahead3 = CF->la3;
1361 if (CF->read_pos == CF->size)
1363 file_load_chars(CF->file_no, CF->buffer, SOURCE_BUFFER_SIZE);
1366 set_token_location(get_current_debug_location());
1369 p = (uchar *) (CF->buffer);
/* Shift the lookahead pipeline one place and translate the newly
   read byte through the source-to-ISO grid. */
1371 current = lookahead;
1372 lookahead = lookahead2;
1373 lookahead2 = lookahead3;
1374 lookahead3 = source_to_iso_grid[p[CF->read_pos++]];
1376 CurrentLB->chars_read++;
1377 if (forerrors_pointer < 511)
1378 forerrors_buff[forerrors_pointer++] = current;
1379 if (current == '\n') reached_new_line();
1383 /* ------------------------------------------------------------------------- */
1384 /* Source 2: from a string */
1385 /* ------------------------------------------------------------------------- */
1387 static int source_to_analyse_pointer; /* Current read position */
/* Pull one character from the internal string source_to_analyse,
   maintaining the three-character lookahead without reading past the
   string's terminating NUL (each lookahead slot is only filled if the
   previous one was non-zero). Mirrors the bookkeeping of the file
   pipeline: error-line buffer append and newline accounting. */
1389 static int get_next_char_from_string(void)
1390 { uchar *p = (uchar *) source_to_analyse + source_to_analyse_pointer++;
1391 current = source_to_iso_grid[p[0]];
1393 if (current == 0) lookahead = 0;
1394 else lookahead = source_to_iso_grid[p[1]];
1395 if (lookahead == 0) lookahead2 = 0;
1396 else lookahead2 = source_to_iso_grid[p[2]];
1397 if (lookahead2 == 0) lookahead3 = 0;
1398 else lookahead3 = source_to_iso_grid[p[3]];
1400 CurrentLB->chars_read++;
1401 if (forerrors_pointer < 511)
1402 forerrors_buff[forerrors_pointer++] = current;
1403 if (current == '\n') reached_new_line();
1407 /* ========================================================================= */
1408 /* The interface between the lexer and Inform's higher levels: */
1410 /* put_token_back() (effectively) move the read position */
1411 /* back by one token */
1413 /* get_next_token() copy the token at the current read */
1414 /* position into the triple */
1415 /* (token_type, token_value, token_text) */
1416 /* and move the read position forward */
1419 /* restart_lexer(source, name) if source is NULL, initialise the lexer */
1420 /* to read from source files; */
1421 /* otherwise, to read from this string. */
1422 /* ------------------------------------------------------------------------- */
/* Push the last-delivered token back into the circle, so the next
   get_next_token() returns it again. Fails with a compiler error if
   the whole circle has been put back (which would mean a parser bug). */
1424 extern void put_token_back(void)
1425 { tokens_put_back++;
1427 if (tokens_trace_level > 0)
1428 { if (tokens_trace_level == 1) printf("<- ");
1429 else printf("<-\n");
1432 /* The following error, of course, should never happen! */
1434 if (tokens_put_back == CIRCLE_SIZE)
1435 { compiler_error("The lexical analyser has collapsed because of a wrong \
1436 assumption inside Inform");
/* The main tokeniser: fill (token_type, token_value, token_text) with
   the next token and advance. Put-back tokens are re-delivered from the
   circle (re-interpreting identifiers if the lexical context changed);
   otherwise characters are pulled through the pipeline and dispatched
   on the tokeniser grid code of the first character. NOTE(review):
   this excerpt is missing a number of structural lines (switch header,
   several case labels, loop braces, the StartTokenAgain label, default
   arms) — the comments below describe only what the visible lines
   establish. */
1442 extern void get_next_token(void)
1443 { int d, i, j, k, quoted_size, e, radix, context; int32 n; char *r;
1444 int returning_a_put_back_token = TRUE;
1446 context = lexical_context();
/* Case 1: a token was put back — hand it out again from the circle. */
1448 if (tokens_put_back > 0)
1449 { i = circle_position - tokens_put_back + 1;
1450 if (i<0) i += CIRCLE_SIZE;
/* If the context changed since the token was lexed, identifiers
   (type 0 or keyword types 100-199) must be re-interpreted. */
1452 if (context != token_contexts[i])
1453 { j = circle[i].type;
1454 if ((j==0) || ((j>=100) && (j<200)))
1455 interpret_identifier(i, FALSE);
1459 returning_a_put_back_token = FALSE;
/* Case 2: lex a fresh token into the next circle slot; recycle the
   lexeme text arena once it is near full. */
1461 if (circle_position == CIRCLE_SIZE-1) circle_position = 0;
1462 else circle_position++;
1464 if (lex_p > lexeme_memory + 4*MAX_QTEXT_SIZE)
1465 lex_p = lexeme_memory;
1467 circle[circle_position].text = lex_p;
1468 circle[circle_position].value = 0;
1472 d = (*get_next_char)();
1473 e = tokeniser_grid[d];
1475 if (next_token_begins_syntax_line)
1476 { if ((e != WHITESPACE_CODE) && (e != COMMENT_CODE))
1477 { new_syntax_line();
1478 next_token_begins_syntax_line = FALSE;
1482 circle[circle_position].location = get_current_debug_location();
/* Dispatch on the grid code of the first character. */
1485 { case 0: char_error("Illegal character found in source:", d);
1486 goto StartTokenAgain;
1488 case WHITESPACE_CODE:
1489 while (tokeniser_grid[lookahead] == WHITESPACE_CODE)
1491 goto StartTokenAgain;
/* Comment: skip to end of line, then retry. */
1494 while ((lookahead != '\n') && (lookahead != 0))
1496 goto StartTokenAgain;
1499 circle[circle_position].type = EOF_TT;
1500 strcpy(lex_p, "<end of file>");
1501 lex_p += strlen(lex_p) + 1;
/* Decimal (and, via RADIX_CODE below, hex/binary) integers. */
1509 { n = n*radix + character_digit_value[d];
1511 } while ((character_digit_value[lookahead] < radix)
1512 && (d = (*get_next_char)(), TRUE));
1515 circle[circle_position].type = NUMBER_TT;
1516 circle[circle_position].value = n;
/* FloatNumber: parse [+|-] digits [. digits] [e [+|-] digits] and
   encode with construct_float(). Glulx-only feature. */
1520 { int expo=0; double intv=0, fracv=0;
1521 int expocount=0, intcount=0, fraccount=0;
1522 int signbit = (d == '-');
1524 while (character_digit_value[lookahead] < 10) {
1525 intv = 10.0*intv + character_digit_value[lookahead];
1527 *lex_p++ = lookahead;
1530 if (lookahead == '.') {
1531 double fracpow = 1.0;
1532 *lex_p++ = lookahead;
1534 while (character_digit_value[lookahead] < 10) {
1536 fracv = fracv + fracpow*character_digit_value[lookahead];
1538 *lex_p++ = lookahead;
1542 if (lookahead == 'e' || lookahead == 'E') {
1544 *lex_p++ = lookahead;
1546 if (lookahead == '+' || lookahead == '-') {
1547 exposign = (lookahead == '-');
1548 *lex_p++ = lookahead;
1551 while (character_digit_value[lookahead] < 10) {
1552 expo = 10*expo + character_digit_value[lookahead];
1554 *lex_p++ = lookahead;
1558 error("Floating-point literal must have digits after the 'e'");
1559 if (exposign) { expo = -expo; }
1561 if (intcount + fraccount == 0)
1562 error("Floating-point literal must have digits");
1563 n = construct_float(signbit, intv, fracv, expo);
1566 circle[circle_position].type = NUMBER_TT;
1567 circle[circle_position].value = n;
1568 if (!glulx_mode && dont_enter_into_symbol_table != -2) error("Floating-point literals are not available in Z-code");
/* RADIX_CODE: '$' hex, '$$' binary, '$+'/'$-' float literal. */
1572 radix = 16; d = (*get_next_char)();
1573 if (d == '-' || d == '+') { goto FloatNumber; }
1574 if (d == '$') { d = (*get_next_char)(); radix = 2; }
1575 if (character_digit_value[d] >= radix)
1577 error("Binary number expected after '$$'");
1579 error("Hexadecimal number expected after '$'");
1583 case QUOTE_CODE: /* Single-quotes: scan a literal string */
1586 { e = d; d = (*get_next_char)(); *lex_p++ = d;
1587 if (quoted_size++==64)
1589 "Too much text for one pair of quotations '...' to hold");
/* A ' ends the literal unless escaped by a preceding '@'. */
1592 if ((d == '\'') && (e != '@'))
1593 { if (quoted_size == 1)
1594 { d = (*get_next_char)(); *lex_p++ = d;
1596 error("No text between quotation marks ''");
1601 if (d==EOF) ebf_error("'\''", "end of file");
1603 circle[circle_position].type = SQ_TT;
1606 case DQUOTE_CODE: /* Double-quotes: scan a literal string */
1609 { d = (*get_next_char)(); *lex_p++ = d;
1610 if (quoted_size++==MAX_QTEXT_SIZE)
1611 { memoryerror("MAX_QTEXT_SIZE", MAX_QTEXT_SIZE);
/* Newline inside a string: collapse surrounding whitespace to a
   single space (unless the text ends with '^'). */
1616 while (*(lex_p-1) == ' ') lex_p--;
1617 if (*(lex_p-1) != '^') *lex_p++ = ' ';
1618 while ((lookahead != EOF) &&
1619 (tokeniser_grid[lookahead] == WHITESPACE_CODE))
/* Backslash continuation: whitespace up to and beyond a newline
   is swallowed entirely; it is an error if no newline follows. */
1623 { int newline_passed = FALSE;
1625 while ((lookahead != EOF) &&
1626 (tokeniser_grid[lookahead] == WHITESPACE_CODE))
1627 if ((d = (*get_next_char)()) == '\n')
1628 newline_passed = TRUE;
1629 if (!newline_passed)
1631 chb[0] = '\"'; chb[1] = lookahead;
1632 chb[2] = '\"'; chb[3] = 0;
1633 ebf_error("empty rest of line after '\\' in string",
1637 } while ((d != EOF) && (d!='\"'));
1638 if (d==EOF) ebf_error("'\"'", "end of file");
1640 circle[circle_position].type = DQ_TT;
1643 case IDENTIFIER_CODE: /* Letter or underscore: an identifier */
1646 while ((n<=MAX_IDENTIFIER_LENGTH)
1647 && ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
1648 || (tokeniser_grid[lookahead] == DIGIT_CODE)))
1649 n++, *lex_p++ = (*get_next_char)();
1653 if (n > MAX_IDENTIFIER_LENGTH)
1654 { char bad_length[100];
1656 "Name exceeds the maximum length of %d characters:",
1657 MAX_IDENTIFIER_LENGTH);
1658 error_named(bad_length, circle[circle_position].text);
1659 /* Trim token so that it doesn't violate
1660 MAX_IDENTIFIER_LENGTH during error recovery */
1661 circle[circle_position].text[MAX_IDENTIFIER_LENGTH] = 0;
/* dont_enter_into_symbol_table: deliver the name as quoted text
   (DQ_TT); -2 additionally checks the keyword table only. */
1664 if (dont_enter_into_symbol_table)
1665 { circle[circle_position].type = DQ_TT;
1666 circle[circle_position].value = 0;
1667 if (dont_enter_into_symbol_table == -2)
1668 interpret_identifier(circle_position, TRUE);
1672 interpret_identifier(circle_position, FALSE);
1677 /* The character is initial to at least one of the separators */
/* Grid entry encodes (first index << 4) + count; try each candidate
   separator of length 1, 2 then 3 against the lookahead window. */
1679 for (j=e>>4, k=j+(e&0x0f); j<k; j++)
1680 { r = (char *) separators[j];
1682 { *lex_p++=d; *lex_p++=0;
1683 goto SeparatorMatched;
1687 { if (*(r+1) == lookahead)
1689 *lex_p++=(*get_next_char)();
1691 goto SeparatorMatched;
1695 { if ((*(r+1) == lookahead) && (*(r+2) == lookahead2))
1697 *lex_p++=(*get_next_char)();
1698 *lex_p++=(*get_next_char)();
1700 goto SeparatorMatched;
1705 /* The following contingency never in fact arises with the
1706 current set of separators, but might in future */
1708 *lex_p++ = d; *lex_p++ = lookahead; *lex_p++ = lookahead2;
1710 error_named("Unrecognised combination in source:", lex_p);
1711 goto StartTokenAgain;
/* SeparatorMatched: some separators (#n$, #w$, #a$, #g$, #r$) are
   followed immediately by a name, appended to the token text. */
1715 circle[circle_position].type = SEP_TT;
1716 circle[circle_position].value = j;
1718 { case SEMICOLON_SEP: break;
1719 case HASHNDOLLAR_SEP:
1720 case HASHWDOLLAR_SEP:
1721 if (tokeniser_grid[lookahead] == WHITESPACE_CODE)
1722 { error_named("Character expected after",
1723 circle[circle_position].text);
1727 *lex_p++ = (*get_next_char)();
1728 while ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
1729 || (tokeniser_grid[lookahead] == DIGIT_CODE))
1730 *lex_p++ = (*get_next_char)();
1733 case HASHADOLLAR_SEP:
1734 case HASHGDOLLAR_SEP:
1735 case HASHRDOLLAR_SEP:
1737 if (tokeniser_grid[lookahead] != IDENTIFIER_CODE)
1738 { error_named("Alphabetic character expected after",
1739 circle[circle_position].text);
1743 while ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
1744 || (tokeniser_grid[lookahead] == DIGIT_CODE))
1745 *lex_p++ = (*get_next_char)();
/* ReturnBack: publish the token through the three globals and
   optionally trace it (-e switches). */
1752 i = circle_position;
1755 token_value = circle[i].value;
1756 token_type = circle[i].type;
1757 token_text = circle[i].text;
1758 if (!returning_a_put_back_token)
1759 { set_token_location(circle[i].location);
1761 token_contexts[i] = context;
1763 if (tokens_trace_level > 0)
1764 { if (tokens_trace_level == 1)
1765 printf("'%s' ", circle[i].text);
1767 { printf("-> "); describe_token(circle[i]);
1769 if (tokens_trace_level > 2) print_context(token_contexts[i]);
/* Buffer for the synthetic filename shown in errors raised while
   compiling a veneer routine ("<veneer routine 'name'>"). */
1775 static char veneer_error_title[64];
/* Reset the lexer for a new stream of tokens. If lexical_source is
   NULL, read from the source-file pipeline (building it on first use);
   otherwise read from the given string, attributed to the veneer
   routine called name. Clears the token circle, the lexeme arena, the
   put-back count and the per-token mode flags. */
1777 extern void restart_lexer(char *lexical_source, char *name)
1779 circle_position = 0;
1780 for (i=0; i<CIRCLE_SIZE; i++)
1781 { circle[i].type = 0;
1782 circle[i].value = 0;
1783 circle[i].text = "(if this is ever visible, there is a bug)";
1784 token_contexts[i] = 0;
1787 lex_p = lexeme_memory;
1788 tokens_put_back = 0;
1789 forerrors_pointer = 0;
1790 dont_enter_into_symbol_table = FALSE;
1791 return_sp_as_variable = FALSE;
1792 next_token_begins_syntax_line = TRUE;
1794 source_to_analyse = lexical_source;
1796 if (source_to_analyse == NULL)
1797 { get_next_char = get_next_char_from_pipeline;
1798 if (!pipeline_made) create_char_pipeline();
1799 forerrors_buff[0] = 0; forerrors_pointer = 0;
1802 { get_next_char = get_next_char_from_string;
1803 source_to_analyse_pointer = 0;
1804 CurrentLB = &StringLB;
1805 sprintf(veneer_error_title, "<veneer routine '%s'>", name);
1806 StringLB.filename = veneer_error_title;
1808 CurrentLB->source_line = 1;
1809 CurrentLB->line_start = 0;
1810 CurrentLB->chars_read = 0;
1814 /* ========================================================================= */
1815 /* Data structure management routines */
1816 /* ------------------------------------------------------------------------- */
/* One-time initialisation of lexer globals: set up the "no location"
   sentinel (file_index -1) used by the brief_location machinery. */
1818 extern void init_lexer_vars(void)
1820 blank_brief_location.file_index = -1;
1821 blank_brief_location.line_number = 0;
1822 blank_brief_location.orig_file_index = 0;
1823 blank_brief_location.orig_line_number = 0;
/* Start of the pre-pass: zero the global line count and attribute any
   early errors to the "<before compilation>" pseudo-block. */
1826 extern void lexer_begin_prepass(void)
1827 { total_source_line_count = 0;
1828 CurrentLB = &NoFileOpen;
1829 report_errors_at_current_line();
/* Start of a compilation pass: reset hash-printing state, force the
   file pipeline to be rebuilt, and restart the lexer on file input. */
1832 extern void lexer_begin_pass(void)
1833 { no_hash_printed_yet = TRUE;
1834 hash_printed_since_newline = FALSE;
1836 pipeline_made = FALSE;
1838 restart_lexer(NULL, NULL);
/* End of a pass: attribute subsequent errors to the "<constructing
   output>" pseudo-block. */
1841 extern void lexer_endpass(void)
1842 { CurrentLB = &MakingOutput;
1843 report_errors_at_current_line();
/* Allocate every lexer-owned array: the inclusion stack and its per-file
   buffers (+4 bytes of slack for the lookahead), the lexeme text arena,
   the keyword and local-variable hash tables, and the first node of the
   token debug-location list. Finishes by building the tokeniser grid
   and keyword tables. */
1846 extern void lexer_allocate_arrays(void)
1849 FileStack = my_malloc(MAX_INCLUSION_DEPTH*sizeof(Sourcefile),
1850 "filestack buffer");
1852 for (i=0; i<MAX_INCLUSION_DEPTH; i++)
1853 FileStack[i].buffer = my_malloc(SOURCE_BUFFER_SIZE+4, "source file buffer");
/* 5*MAX_QTEXT_SIZE: get_next_token recycles lex_p once it passes
   lexeme_memory + 4*MAX_QTEXT_SIZE, leaving one more token of room. */
1855 lexeme_memory = my_malloc(5*MAX_QTEXT_SIZE, "lexeme memory");
1857 keywords_hash_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
1858 "keyword hash table");
1859 keywords_hash_ends_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
1860 "keyword hash end table");
1861 keywords_data_table = my_calloc(sizeof(int), 3*MAX_KEYWORDS,
1862 "keyword hashing linked list");
1863 local_variable_hash_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
1864 "local variable hash table");
1865 local_variable_text_table = my_malloc(
1866 (MAX_LOCAL_VARIABLES-1)*(MAX_IDENTIFIER_LENGTH+1),
1867 "text of local variable names");
1869 local_variable_hash_codes = my_calloc(sizeof(int), MAX_LOCAL_VARIABLES,
1870 "local variable hash codes");
1871 local_variable_texts = my_calloc(sizeof(char *), MAX_LOCAL_VARIABLES,
1872 "local variable text pointers");
1874 make_tokeniser_grid();
1875 make_keywords_tables();
/* Seed the debug-location list with a single zeroed node. */
1877 first_token_locations =
1878 my_malloc(sizeof(debug_locations), "debug locations of recent tokens");
1879 first_token_locations->location.file_index = 0;
1880 first_token_locations->location.beginning_byte_index = 0;
1881 first_token_locations->location.end_byte_index = 0;
1882 first_token_locations->location.beginning_line_number = 0;
1883 first_token_locations->location.end_line_number = 0;
1884 first_token_locations->location.beginning_character_number = 0;
1885 first_token_locations->location.end_character_number = 0;
1886 first_token_locations->location.orig_file_index = 0;
1887 first_token_locations->location.orig_beg_line_number = 0;
1888 first_token_locations->location.orig_beg_char_number = 0;
1889 first_token_locations->next = NULL;
1890 first_token_locations->reference_count = 0;
1891 last_token_location = first_token_locations;
/* Release everything lexer_allocate_arrays() obtained, in the same
   order, then drop all outstanding token debug locations. */
1894 extern void lexer_free_arrays(void)
1897 for (i=0; i<MAX_INCLUSION_DEPTH; i++)
1898 { p = FileStack[i].buffer;
1899 my_free(&p, "source file buffer");
1901 my_free(&FileStack, "filestack buffer");
1902 my_free(&lexeme_memory, "lexeme memory");
1904 my_free(&keywords_hash_table, "keyword hash table");
1905 my_free(&keywords_hash_ends_table, "keyword hash end table");
1906 my_free(&keywords_data_table, "keyword hashing linked list");
1907 my_free(&local_variable_hash_table, "local variable hash table");
1908 my_free(&local_variable_text_table, "text of local variable names");
1910 my_free(&local_variable_hash_codes, "local variable hash codes");
1911 my_free(&local_variable_texts, "local variable text pointers");
1913 cleanup_token_locations(NULL);
1916 /* ========================================================================= */