1 /* ------------------------------------------------------------------------- */
2 /* "lexer" : Lexical analyser */
4 /* Copyright (c) Graham Nelson 1993 - 2016 */
6 /* This file is part of Inform. */
8 /* Inform is free software: you can redistribute it and/or modify */
9 /* it under the terms of the GNU General Public License as published by */
10 /* the Free Software Foundation, either version 3 of the License, or */
11 /* (at your option) any later version. */
13 /* Inform is distributed in the hope that it will be useful, */
14 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
16 /* GNU General Public License for more details. */
18 /* You should have received a copy of the GNU General Public License */
19 /* along with Inform. If not, see https://gnu.org/licenses/ */
21 /* ------------------------------------------------------------------------- */
/* Global state of the lexer, read by higher compiler stages.
   NOTE(review): this listing appears to be missing its original line 26
   (a further counter in this declaration list) — confirm against upstream. */
25 int total_source_line_count, /* Number of source lines so far */
27 no_hash_printed_yet, /* Have not yet printed the first # */
28 hash_printed_since_newline, /* A hash has been printed since the
29 most recent new-line was printed
30 (generally as a result of an error
31 message or the start of pass) */
32 dont_enter_into_symbol_table, /* Return names as text (with
33 token type DQ_TT, i.e., as if
34 they had double-quotes around)
35 and not as entries in the symbol
36 table, when TRUE. If -2, only the
37 keyword table is searched. */
38 return_sp_as_variable; /* When TRUE, the word "sp" denotes
39 the stack pointer variable
40 (used in assembly language only) */
41 int next_token_begins_syntax_line; /* When TRUE, start a new syntax
42 line (for error reporting, etc.)
43 on the source code line where
44 the next token appears */
46 int32 last_mapped_line; /* Last syntax line reported to debugging file */
48 /* ------------------------------------------------------------------------- */
49 /* The lexer's output is a sequence of triples, each called a "token", */
50 /* representing one lexical unit (or "lexeme") each. Instead of providing */
51 /* "lookahead" (that is, always having available the next token after the */
52 /* current one, so that syntax analysers higher up in Inform can have */
53 /* advance knowledge of what is coming), the lexer instead has a system */
54 /* where tokens can be read in and then "put back again". */
55 /* The meaning of the number (and to some extent the text) supplied with */
56 /* a token depends on its type: see "header.h" for the list of types. */
57 /* For example, the lexeme "$1e3" is understood by Inform as a hexadecimal */
58 /* number, and translated to the token: */
59 /* type NUMBER_TT, value 483, text "$1e3" */
60 /* ------------------------------------------------------------------------- */
61 /* These three variables are set to the current token on a call to */
62 /* get_next_token() (but are not changed by a call to put_token_back()). */
63 /* ------------------------------------------------------------------------- */
69 /* ------------------------------------------------------------------------- */
70 /* The next two variables are the head and tail of a singly linked list. */
71 /* The tail stores the portion most recently read from the current */
72 /* lexical block; its end values therefore describe the location of the */
73 /* current token, and are updated whenever the three variables above are */
74 /* via set_token_location(...). Earlier vertices, if any, represent the */
75 /* regions of lexical blocks read beforehand, where new vertices are */
76 /* only introduced by interruptions like a file inclusion or an EOF. */
77 /* Vertices are deleted off of the front of the list once they are no */
78 /* longer referenced by pending debug information records. */
79 /* ------------------------------------------------------------------------- */
/* Head and tail of the singly linked list of token locations described
   in the comment block above; the tail always describes the current token. */
81 static debug_locations *first_token_locations;
82 static debug_locations *last_token_location;
84 extern debug_location get_token_location(void)
85 { debug_location result;
86 debug_location *location = &(last_token_location->location);
87 result.file_index = location->file_index;
88 result.beginning_byte_index = location->end_byte_index;
89 result.end_byte_index = location->end_byte_index;
90 result.beginning_line_number = location->end_line_number;
91 result.end_line_number = location->end_line_number;
92 result.beginning_character_number = location->end_character_number;
93 result.end_character_number = location->end_character_number;
97 extern debug_locations get_token_locations(void)
98 { debug_locations result;
99 result.location = get_token_location();
101 result.reference_count = 0;
105 static void set_token_location(debug_location location)
106 { if (location.file_index == last_token_location->location.file_index)
107 { last_token_location->location.end_byte_index =
108 location.end_byte_index;
109 last_token_location->location.end_line_number =
110 location.end_line_number;
111 last_token_location->location.end_character_number =
112 location.end_character_number;
114 { debug_locations*successor =
116 (sizeof(debug_locations),
117 "debug locations of recent tokens");
118 successor->location = location;
119 successor->next = NULL;
120 successor->reference_count = 0;
121 last_token_location->next = successor;
122 last_token_location = successor;
126 extern debug_location_beginning get_token_location_beginning(void)
127 { debug_location_beginning result;
128 ++(last_token_location->reference_count);
129 result.head = last_token_location;
130 result.beginning_byte_index =
131 last_token_location->location.end_byte_index;
132 result.beginning_line_number =
133 last_token_location->location.end_line_number;
134 result.beginning_character_number =
135 last_token_location->location.end_character_number;
/* Free unreferenced vertices off the front of the token-location list,
   raising compiler errors if the given beginning (or the list itself)
   turns out to be invalid.
   NOTE(review): this listing is missing several interior lines — the
   conditional guarding the compiler_error calls, the else-branches, and
   the closing braces (original lines 146, 148, 152-156, 159-160, 162+).
   The visible text is kept byte-identical; do not treat this fragment
   as complete control flow. */
139 static void cleanup_token_locations(debug_location_beginning*beginning)
140 { if (first_token_locations)
141 { while (first_token_locations &&
142 !first_token_locations->reference_count)
143 { debug_locations*moribund = first_token_locations;
144 first_token_locations = moribund->next;
145 my_free(&moribund, "debug locations of recent tokens");
147 (beginning->head == moribund || !first_token_locations))
149 ("Records needed by a debug_location_beginning are no "
150 "longer allocated, perhaps because of an invalid reuse "
151 "of this or an earlier beginning");
157 ("Attempt to use a debug_location_beginning when no token "
158 "locations are defined");
161 ("Attempt to clean up token locations when no token locations "
167 extern void discard_token_location(debug_location_beginning beginning)
168 { --(beginning.head->reference_count);
171 extern debug_locations get_token_location_end
172 (debug_location_beginning beginning)
173 { debug_locations result;
174 cleanup_token_locations(&beginning);
175 --(beginning.head->reference_count);
176 /* Sometimes we know what we'll read before we switch to the lexical block
177 where we'll read it. In such cases the beginning will be placed in the
178 prior block and last exactly zero bytes there. It's misleading to
179 include such ranges, so we gobble them. */
180 if (beginning.head->location.end_byte_index ==
181 beginning.beginning_byte_index &&
182 beginning.head->next)
183 { beginning.head = beginning.head->next;
184 result.location = beginning.head->location;
185 result.location.beginning_byte_index = 0;
186 result.location.beginning_line_number = 1;
187 result.location.beginning_character_number = 1;
189 { result.location = beginning.head->location;
190 result.location.beginning_byte_index =
191 beginning.beginning_byte_index;
192 result.location.beginning_line_number =
193 beginning.beginning_line_number;
194 result.location.beginning_character_number =
195 beginning.beginning_character_number;
197 result.next = beginning.head->next;
198 result.reference_count = 0;
202 /* ------------------------------------------------------------------------- */
203 /* In order to be able to put tokens back efficiently, the lexer stores */
204 /* tokens in a "circle": the variable circle_position ranges between */
205 /* 0 and CIRCLE_SIZE-1. We only need a circle size as large as the */
206 /* maximum number of tokens ever put back at once, plus 1 (in effect, the */
207 /* maximum token lookahead ever needed in syntax analysis, plus 1). */
209 /* Unlike some compilers, Inform does not have a context-free lexer: in */
210 /* fact it has 12288 different possible states. However, the context only */
211 /* affects the interpretation of "identifiers": lexemes beginning with a */
212 /* letter and containing up to 32 chars of alphanumeric and underscore */
213 /* chars. (For example, "default" may refer to the directive or statement */
214 /* of that name, and which token values are returned depends on the */
215 /* current lexical context.) */
217 /* Along with each token, we also store the lexical context it was */
218 /* translated under; because if it is called for again, there may need */
219 /* to be a fresh interpretation of it if the context has changed. */
220 /* ------------------------------------------------------------------------- */
/* The token circle: a ring buffer of the most recent tokens, enabling
   put_token_back() without re-lexing.  token_contexts[] remembers the
   lexical_context() under which each slot was interpreted. */
222 #define CIRCLE_SIZE 6
224 /* (The worst case for token lookahead is distinguishing between an
225 old-style "objectloop (a in b)" and a new "objectloop (a in b ...)".) */
227 static int circle_position;
228 static token_data circle[CIRCLE_SIZE];
230 static int token_contexts[CIRCLE_SIZE];
232 /* ------------------------------------------------------------------------- */
233 /* A complication, however, is that the text of some lexemes needs to be */
234 /* held in Inform's memory for much longer periods: for example, a */
235 /* dictionary word lexeme (like "'south'") must have its text preserved */
236 /* until the code generation time for the expression it occurs in, when */
237 /* the dictionary reference is actually made. Code generation in general */
238 /* occurs as early as possible in Inform: pending some better method of */
239 /* garbage collection, we simply use a buffer so large that unless */
240 /* expressions spread across 10K of source code are found, there can be */
242 /* ------------------------------------------------------------------------- */
/* Long-lived lexeme text buffer and the character-input pipeline.
   NOTE(review): several comment tails in this span are truncated in the
   listing (e.g. after "has been constructed" and "moves made from the
   last-read"); code lines kept byte-identical. */
244 static char *lexeme_memory;
245 static char *lex_p; /* Current write position */
247 /* ------------------------------------------------------------------------- */
248 /* The lexer itself needs up to 3 characters of lookahead (it uses an */
249 /* LR(3) grammar to translate characters into tokens). */
250 /* ------------------------------------------------------------------------- */
252 #define LOOKAHEAD_SIZE 3
254 static int current, lookahead, /* The latest character read, and */
255 lookahead2, lookahead3; /* the three characters following it */
257 static int pipeline_made; /* Whether or not the pipeline of
258 characters has been constructed
261 static int (* get_next_char)(void); /* Routine for reading the stream of
262 characters: the lexer does not
263 need any "ungetc" routine for
264 putting them back again. End of
265 stream is signalled by returning
268 static char *source_to_analyse; /* The current lexical source:
269 NULL for "load from source files",
270 otherwise this points to a string
271 containing Inform code */
273 static int tokens_put_back; /* Count of the number of backward
274 moves made from the last-read
/* Print a human-readable description of a token to stdout (used by the
   tracing/debugging options).
   NOTE(review): this listing is missing the switch(t.type) scaffolding —
   the opening switch statement, the "break;" after each case, the
   "default:" label, and the closing braces.  The visible case lines are
   kept byte-identical; the function is not complete as shown. */
277 extern void describe_token(token_data t)
279 /* Many of the token types are not set in this file, but later on in
280 Inform's higher stages (for example, in the expression evaluator);
281 but this routine describes them all. */
287 /* The following token types occur in lexer output: */
289 case SYMBOL_TT: printf("symbol ");
290 describe_symbol(t.value);
292 case NUMBER_TT: printf("literal number %d", t.value);
294 case DQ_TT: printf("string \"%s\"", t.text);
296 case SQ_TT: printf("string '%s'", t.text);
298 case SEP_TT: printf("separator '%s'", t.text);
300 case EOF_TT: printf("end of file");
303 case STATEMENT_TT: printf("statement name '%s'", t.text);
305 case SEGMENT_MARKER_TT: printf("object segment marker '%s'", t.text);
307 case DIRECTIVE_TT: printf("directive name '%s'", t.text);
309 case CND_TT: printf("textual conditional '%s'", t.text);
311 case OPCODE_NAME_TT: printf("opcode name '%s'", t.text);
313 case SYSFUN_TT: printf("built-in function name '%s'", t.text);
315 case LOCAL_VARIABLE_TT: printf("local variable name '%s'", t.text);
317 case MISC_KEYWORD_TT: printf("statement keyword '%s'", t.text);
319 case DIR_KEYWORD_TT: printf("directive keyword '%s'", t.text);
321 case TRACE_KEYWORD_TT: printf("'trace' keyword '%s'", t.text);
323 case SYSTEM_CONSTANT_TT: printf("system constant name '%s'", t.text);
326 /* The remaining are etoken types, not set by the lexer */
328 case OP_TT: printf("operator '%s'",
329 operators[t.value].description);
331 case ENDEXP_TT: printf("end of expression");
333 case SUBOPEN_TT: printf("open bracket");
335 case SUBCLOSE_TT: printf("close bracket");
337 case LARGE_NUMBER_TT: printf("large number: '%s'=%d",t.text,t.value);
339 case SMALL_NUMBER_TT: printf("small number: '%s'=%d",t.text,t.value);
341 case VARIABLE_TT: printf("variable '%s'=%d", t.text, t.value);
343 case DICTWORD_TT: printf("dictionary word '%s'", t.text);
345 case ACTION_TT: printf("action name '%s'", t.text);
349 printf("** unknown token type %d, text='%s', value=%d **",
350 t.type, t.text, t.value);
355 /* ------------------------------------------------------------------------- */
356 /* All but one of the 280 Inform keywords (118 of them opcode names used */
357 /* only by the assembler). (The one left over is "sp", a keyword used in */
358 /* assembly language only.) */
360 /* A "keyword group" is a set of keywords to be searched for. If a match */
361 /* is made on an identifier, the token type becomes that given in the KG */
362 /* and the token value is its index in the KG. */
364 /* The keyword ordering must correspond with the appropriate #define's in */
365 /* "header.h" but is otherwise not significant. */
366 /* ------------------------------------------------------------------------- */
/* Static keyword tables: one keyword_group per token type, plus the raw
   Z-machine and Glulx opcode name lists copied into opcode_names /
   opcode_macros at startup by make_keywords_tables().
   NOTE(review): several aggregate initialisers in this span are missing
   their "" terminator entries and closing "};" lines in this listing
   (e.g. after "check_unicode", "jisinf", and the conditions group is
   missing its CND_TT type line); visible lines kept byte-identical. */
368 #define MAX_KEYWORDS 350
370 /* The values will be filled in at compile time, when we know
371 which opcode set to use. */
372 keyword_group opcode_names =
374 OPCODE_NAME_TT, FALSE, TRUE
377 static char *opcode_list_z[] = {
378 "je", "jl", "jg", "dec_chk", "inc_chk", "jin", "test", "or", "and",
379 "test_attr", "set_attr", "clear_attr", "store", "insert_obj", "loadw",
380 "loadb", "get_prop", "get_prop_addr", "get_next_prop", "add", "sub",
381 "mul", "div", "mod", "call", "storew", "storeb", "put_prop", "sread",
382 "print_char", "print_num", "random", "push", "pull", "split_window",
383 "set_window", "output_stream", "input_stream", "sound_effect", "jz",
384 "get_sibling", "get_child", "get_parent", "get_prop_len", "inc", "dec",
385 "print_addr", "remove_obj", "print_obj", "ret", "jump", "print_paddr",
386 "load", "not", "rtrue", "rfalse", "print", "print_ret", "nop", "save",
387 "restore", "restart", "ret_popped", "pop", "quit", "new_line",
388 "show_status", "verify", "call_2s", "call_vs", "aread", "call_vs2",
389 "erase_window", "erase_line", "set_cursor", "get_cursor",
390 "set_text_style", "buffer_mode", "read_char", "scan_table", "call_1s",
391 "call_2n", "set_colour", "throw", "call_vn", "call_vn2", "tokenise",
392 "encode_text", "copy_table", "print_table", "check_arg_count", "call_1n",
393 "catch", "piracy", "log_shift", "art_shift", "set_font", "save_undo",
394 "restore_undo", "draw_picture", "picture_data", "erase_picture",
395 "set_margins", "move_window", "window_size", "window_style",
396 "get_wind_prop", "scroll_window", "pop_stack", "read_mouse",
397 "mouse_window", "push_stack", "put_wind_prop", "print_form",
398 "make_menu", "picture_table", "print_unicode", "check_unicode",
402 static char *opcode_list_g[] = {
403 "nop", "add", "sub", "mul", "div", "mod", "neg", "bitand", "bitor",
404 "bitxor", "bitnot", "shiftl", "sshiftr", "ushiftr", "jump", "jz",
405 "jnz", "jeq", "jne", "jlt", "jge", "jgt", "jle",
406 "jltu", "jgeu", "jgtu", "jleu",
408 "catch", "throw", "tailcall",
409 "copy", "copys", "copyb", "sexs", "sexb", "aload",
410 "aloads", "aloadb", "aloadbit", "astore", "astores", "astoreb",
411 "astorebit", "stkcount", "stkpeek", "stkswap", "stkroll", "stkcopy",
412 "streamchar", "streamnum", "streamstr",
413 "gestalt", "debugtrap", "getmemsize", "setmemsize", "jumpabs",
414 "random", "setrandom", "quit", "verify",
415 "restart", "save", "restore", "saveundo", "restoreundo", "protect",
416 "glk", "getstringtbl", "setstringtbl", "getiosys", "setiosys",
417 "linearsearch", "binarysearch", "linkedsearch",
418 "callf", "callfi", "callfii", "callfiii",
420 "mzero", "mcopy", "malloc", "mfree",
421 "accelfunc", "accelparam",
422 "numtof", "ftonumz", "ftonumn", "ceil", "floor",
423 "fadd", "fsub", "fmul", "fdiv", "fmod",
424 "sqrt", "exp", "log", "pow",
425 "sin", "cos", "tan", "asin", "acos", "atan", "atan2",
426 "jfeq", "jfne", "jflt", "jfle", "jfgt", "jfge", "jisnan", "jisinf",
430 keyword_group opcode_macros =
432 OPCODE_MACRO_TT, FALSE, TRUE
435 static char *opmacro_list_z[] = { "" };
437 static char *opmacro_list_g[] = {
442 keyword_group directives =
443 { { "abbreviate", "array", "attribute", "class", "constant",
444 "default", "dictionary", "end", "endif", "extend", "fake_action",
445 "global", "ifdef", "ifndef", "ifnot", "ifv3", "ifv5", "iftrue",
446 "iffalse", "import", "include", "link", "lowstring", "message",
447 "nearby", "object", "property", "release", "replace",
448 "serial", "switches", "statusline", "stub", "system_file", "trace",
449 "undef", "verb", "version", "zcharacter",
451 DIRECTIVE_TT, FALSE, FALSE
454 keyword_group trace_keywords =
455 { { "dictionary", "symbols", "objects", "verbs",
456 "assembly", "expressions", "lines", "tokens", "linker",
458 TRACE_KEYWORD_TT, FALSE, TRUE
461 keyword_group segment_markers =
462 { { "class", "has", "private", "with", "" },
463 SEGMENT_MARKER_TT, FALSE, TRUE
466 keyword_group directive_keywords =
467 { { "alias", "long", "additive",
469 "noun", "held", "multi", "multiheld", "multiexcept",
470 "multiinside", "creature", "special", "number", "scope", "topic",
471 "reverse", "meta", "only", "replace", "first", "last",
472 "string", "table", "buffer", "data", "initial", "initstr",
473 "with", "private", "has", "class",
474 "error", "fatalerror", "warning",
477 DIR_KEYWORD_TT, FALSE, TRUE
480 keyword_group misc_keywords =
481 { { "char", "name", "the", "a", "an", "The", "number",
482 "roman", "reverse", "bold", "underline", "fixed", "on", "off",
483 "to", "address", "string", "object", "near", "from", "property", "A", "" },
484 MISC_KEYWORD_TT, FALSE, TRUE
487 keyword_group statements =
488 { { "box", "break", "continue", "default", "do", "else", "font", "for",
489 "give", "if", "inversion", "jump", "move", "new_line", "objectloop",
490 "print", "print_ret", "quit", "read", "remove", "restore", "return",
491 "rfalse", "rtrue", "save", "spaces", "string", "style", "switch",
492 "until", "while", "" },
493 STATEMENT_TT, FALSE, TRUE
496 keyword_group conditions =
497 { { "has", "hasnt", "in", "notin", "ofclass", "or", "provides", "" },
501 keyword_group system_functions =
502 { { "child", "children", "elder", "eldest", "indirect", "parent", "random",
503 "sibling", "younger", "youngest", "metaclass", "glk", "" },
504 SYSFUN_TT, FALSE, TRUE
507 keyword_group system_constants =
508 { { "adjectives_table", "actions_table", "classes_table",
509 "identifiers_table", "preactions_table", "version_number",
510 "largest_object", "strings_offset", "code_offset",
511 "dict_par1", "dict_par2", "dict_par3", "actual_largest_object",
512 "static_memory_offset", "array_names_offset", "readable_memory_offset",
513 "cpv__start", "cpv__end", "ipv__start", "ipv__end",
514 "array__start", "array__end",
515 "lowest_attribute_number", "highest_attribute_number",
516 "attribute_names_array",
517 "lowest_property_number", "highest_property_number",
518 "property_names_array",
519 "lowest_action_number", "highest_action_number",
520 "action_names_array",
521 "lowest_fake_action_number", "highest_fake_action_number",
522 "fake_action_names_array",
523 "lowest_routine_number", "highest_routine_number", "routines_array",
524 "routine_names_array", "routine_flags_array",
525 "lowest_global_number", "highest_global_number", "globals_array",
526 "global_names_array", "global_flags_array",
527 "lowest_array_number", "highest_array_number", "arrays_array",
528 "array_names_array", "array_flags_array",
529 "lowest_constant_number", "highest_constant_number", "constants_array",
530 "constant_names_array",
531 "lowest_class_number", "highest_class_number", "class_objects_array",
532 "lowest_object_number", "highest_object_number",
534 "grammar_table", "dictionary_table", "dynam_string_table",
536 SYSTEM_CONSTANT_TT, FALSE, TRUE
/* Index 0 is NULL so that keyword-group numbers stored in the data table
   can start at 1; local_variables is handled separately from this array. */
539 keyword_group *keyword_groups[12]
540 = { NULL, &opcode_names, &directives, &trace_keywords, &segment_markers,
541 &directive_keywords, &misc_keywords, &statements, &conditions,
542 &system_functions, &system_constants, &opcode_macros};
544 keyword_group local_variables =
545 { { "" }, /* Filled in when routine declared */
546 LOCAL_VARIABLE_TT, FALSE, FALSE
549 static int lexical_context(void)
551 /* The lexical context is a number representing all of the context
552 information in the lexical analyser: the same input text will
553 always translate to the same output tokens whenever the context
556 In fact, for efficiency reasons this number omits the bit of
557 information held in the variable "dont_enter_into_symbol_table".
558 Inform never needs to backtrack through tokens parsed in that
559 way (thankfully, as it would be expensive indeed to check
563 if (opcode_names.enabled) c |= 1;
564 if (directives.enabled) c |= 2;
565 if (trace_keywords.enabled) c |= 4;
566 if (segment_markers.enabled) c |= 8;
567 if (directive_keywords.enabled) c |= 16;
568 if (misc_keywords.enabled) c |= 32;
569 if (statements.enabled) c |= 64;
570 if (conditions.enabled) c |= 128;
571 if (system_functions.enabled) c |= 256;
572 if (system_constants.enabled) c |= 512;
573 if (local_variables.enabled) c |= 1024;
575 if (return_sp_as_variable) c |= 2048;
/* Decode and print (to stdout) the bit-flags of a lexical_context()
   value, one mnemonic per enabled keyword group; used for tracing.
   NOTE(review): opening/closing braces restored (truncated listing). */
static void print_context(int c)
{   if ((c & 1) != 0) printf("OPC ");
    if ((c & 2) != 0) printf("DIR ");
    if ((c & 4) != 0) printf("TK ");
    if ((c & 8) != 0) printf("SEG ");
    if ((c & 16) != 0) printf("DK ");
    if ((c & 32) != 0) printf("MK ");
    if ((c & 64) != 0) printf("STA ");
    if ((c & 128) != 0) printf("CND ");
    if ((c & 256) != 0) printf("SFUN ");
    if ((c & 512) != 0) printf("SCON ");
    if ((c & 1024) != 0) printf("LV ");
    if ((c & 2048) != 0) printf("sp ");
}
/* Hash tables for keyword and local-variable lookup; the keywords data
   table holds (group, index-in-group, next-in-chain) triples.
   one_letter_locals[] fast-paths single-character local variable names. */
595 static int *keywords_hash_table;
596 static int *keywords_hash_ends_table;
597 static int *keywords_data_table;
599 static int *local_variable_hash_table;
600 static int *local_variable_hash_codes;
601 char **local_variable_texts;
602 static char *local_variable_text_table;
604 static char one_letter_locals[128];
/* Build the keyword hash chains: select the Z-machine or Glulx opcode
   list, copy it into opcode_names/opcode_macros, then thread every
   keyword of groups 1..11 into keywords_hash_table /
   keywords_data_table as (group, index, next) triples.
   NOTE(review): this listing is missing the local declarations
   (i, j, h, tp), the target-selection if/else around the Z vs Glulx
   assignments, the "else" lines linking hash chains, the "tp++"
   increment, and closing braces.  Visible lines kept byte-identical. */
606 static void make_keywords_tables(void)
608 char **oplist, **maclist;
611 oplist = opcode_list_z;
612 maclist = opmacro_list_z;
615 oplist = opcode_list_g;
616 maclist = opmacro_list_g;
619 for (j=0; *(oplist[j]); j++) {
620 opcode_names.keywords[j] = oplist[j];
622 opcode_names.keywords[j] = "";
624 for (j=0; *(maclist[j]); j++) {
625 opcode_macros.keywords[j] = maclist[j];
627 opcode_macros.keywords[j] = "";
629 for (i=0; i<HASH_TAB_SIZE; i++)
630 { keywords_hash_table[i] = -1;
631 keywords_hash_ends_table[i] = -1;
634 for (i=1; i<=11; i++)
635 { keyword_group *kg = keyword_groups[i];
636 for (j=0; *(kg->keywords[j]) != 0; j++)
637 { h = hash_code_from_string(kg->keywords[j]);
638 if (keywords_hash_table[h] == -1)
639 keywords_hash_table[h] = tp;
641 *(keywords_data_table + 3*(keywords_hash_ends_table[h]) + 2) = tp;
642 keywords_hash_ends_table[h] = tp;
643 *(keywords_data_table + 3*tp) = i;
644 *(keywords_data_table + 3*tp+1) = j;
645 *(keywords_data_table + 3*tp+2) = -1;
/* Rebuild the local-variable lookup tables after a routine header is
   parsed: reset the hash table and one-letter fast path, then enter
   each declared local (single-letter names go into one_letter_locals in
   both cases; longer names into the hash table).
   NOTE(review): the listing is missing the single-character guard line
   (original 658), the copy/advance of p through
   local_variable_text_table (668-670), and closing braces.  Visible
   lines kept byte-identical. */
651 extern void construct_local_variable_tables(void)
652 { int i, h; char *p = local_variable_text_table;
653 for (i=0; i<HASH_TAB_SIZE; i++) local_variable_hash_table[i] = -1;
654 for (i=0; i<128; i++) one_letter_locals[i] = MAX_LOCAL_VARIABLES;
656 for (i=0; i<no_locals; i++)
657 { char *q = local_variables.keywords[i];
659 { one_letter_locals[(uchar)q[0]] = i;
660 if (isupper(q[0])) one_letter_locals[tolower(q[0])] = i;
661 if (islower(q[0])) one_letter_locals[toupper(q[0])] = i;
663 h = hash_code_from_string(q);
664 if (local_variable_hash_table[h] == -1)
665 local_variable_hash_table[h] = i;
666 local_variable_hash_codes[i] = h;
667 local_variable_texts[i] = p;
671 for (;i<MAX_LOCAL_VARIABLES-1;i++)
672 local_variable_texts[i] = "<no such local variable>";
/* Classify the identifier text in circle[pos]: first (unless
   dirs_only_flag) try "sp", then local variables via the one-letter
   fast path and hash table, then the keyword hash chains, and finally
   fall back to entering it in the symbol table as SYMBOL_TT.
   NOTE(review): this listing is missing substantial control flow — the
   early "return" statements, the single-letter guard around the
   one_letter_locals lookup, the while-loop walking the keyword hash
   chain with "index = *(i+2);", the KeywordSearch label, and closing
   braces.  Visible lines kept byte-identical; do not read this
   fragment as complete logic. */
675 static void interpret_identifier(int pos, int dirs_only_flag)
676 { int index, hashcode; char *p = circle[pos].text;
678 /* An identifier is either a keyword or a "symbol", a name which the
679 lexical analyser leaves to higher levels of Inform to understand. */
681 hashcode = hash_code_from_string(p);
683 if (dirs_only_flag) goto KeywordSearch;
685 /* If this is assembly language, perhaps it is "sp"? */
687 if (return_sp_as_variable && (p[0]=='s') && (p[1]=='p') && (p[2]==0))
688 { circle[pos].value = 0; circle[pos].type = LOCAL_VARIABLE_TT;
692 /* Test for local variables first, quite quickly. */
694 if (local_variables.enabled)
696 { index = one_letter_locals[(uchar)p[0]];
697 if (index<MAX_LOCAL_VARIABLES)
698 { circle[pos].type = LOCAL_VARIABLE_TT;
699 circle[pos].value = index+1;
703 index = local_variable_hash_table[hashcode];
705 { for (;index<no_locals;index++)
706 { if (hashcode == local_variable_hash_codes[index])
707 { if (strcmpcis(p, local_variable_texts[index])==0)
708 { circle[pos].type = LOCAL_VARIABLE_TT;
709 circle[pos].value = index+1;
717 /* Now the bulk of the keywords. Note that the lexer doesn't recognise
718 the name of a system function which has been Replaced. */
721 index = keywords_hash_table[hashcode];
723 { int *i = keywords_data_table + 3*index;
724 keyword_group *kg = keyword_groups[*i];
725 if (((!dirs_only_flag) && (kg->enabled))
726 || (dirs_only_flag && (kg == &directives)))
727 { char *q = kg->keywords[*(i+1)];
728 if (((kg->case_sensitive) && (strcmp(p, q)==0))
729 || ((!(kg->case_sensitive)) && (strcmpcis(p, q)==0)))
730 { if ((kg != &system_functions)
731 || (system_function_usage[*(i+1)]!=2))
732 { circle[pos].type = kg->change_token_type;
733 circle[pos].value = *(i+1);
741 if (dirs_only_flag) return;
743 /* Search for the name; create it if necessary. */
745 circle[pos].value = symbol_index(p, hashcode);
746 circle[pos].type = SYMBOL_TT;
750 /* ------------------------------------------------------------------------- */
751 /* The tokeniser grid aids a rapid decision about the consequences of a */
752 /* character reached in the buffer. In effect it is an efficiently stored */
753 /* transition table using an algorithm similar to that of S. C. Johnson's */
754 /* "yacc" lexical analyser (see Aho, Sethi and Ullman, section 3.9). */
755 /* My thanks to Dilip Sequeira for suggesting this. */
757 /* tokeniser_grid[c] is (16*n + m) if c is the first character of */
758 /* separator numbers n, n+1, ..., n+m-1 */
759 /* or certain special values (QUOTE_CODE, etc) */
762 /* Since 1000/16 = 62, the code numbers below will need increasing if the */
763 /* number of separators supported exceeds 61. */
764 /* ------------------------------------------------------------------------- */
/* The tokeniser grid and its special codes: values below 1000 encode a
   separator-table start index and run length (16*n + m, see comment
   above); values 1000+ select a whole lexing mode for the character. */
766 static int tokeniser_grid[256];
768 #define QUOTE_CODE 1000
769 #define DQUOTE_CODE 1001
770 #define NULL_CODE 1002
771 #define SPACE_CODE 1003
772 #define NEGATIVE_CODE 1004
773 #define DIGIT_CODE 1005
774 #define RADIX_CODE 1006
775 #define KEYWORD_CODE 1007
776 #define EOF_CODE 1008
777 #define WHITESPACE_CODE 1009
778 #define COMMENT_CODE 1010
779 #define IDENTIFIER_CODE 1011
781 /* This list cannot safely be changed without also changing the header
782 separator #defines. The ordering is significant in that (i) all entries
783 beginning with the same character must be adjacent and (ii) that if
784 X is an initial substring of Y then X must come before Y.
786 E.g. --> must occur before -- to prevent "-->0" being tokenised
787 wrongly as "--", ">", "0" rather than "-->", "0". */
/* Separator lexemes, ordered so that all entries sharing a first
   character are adjacent and longer matches precede their prefixes
   (see the constraint comment above).
   NOTE(review): the listing omits original line 796 (the separators
   between "}" and the "#..." entries) and the closing "};". */
789 static const char separators[NUMBER_SEPARATORS][4] =
790 { "->", "-->", "--", "-", "++", "+", "*", "/", "%",
791 "||", "|", "&&", "&", "~~",
792 "~=", "~", "==", "=", ">=", ">",
793 "<=", "<", "(", ")", ",",
794 ".&", ".#", "..&", "..#", "..", ".",
795 "::", ":", "@", ";", "[", "]", "{", "}",
797 "#a$", "#g$", "#n$", "#r$", "#w$", "##", "#"
800 static void make_tokeniser_grid(void)
802 /* Construct the grid to the specification above. */
806 for (i=0; i<256; i++) tokeniser_grid[i]=0;
808 for (i=0; i<NUMBER_SEPARATORS; i++)
809 { j=separators[i][0];
810 if (tokeniser_grid[j]==0)
811 tokeniser_grid[j]=i*16+1; else tokeniser_grid[j]++;
813 tokeniser_grid['\''] = QUOTE_CODE;
814 tokeniser_grid['\"'] = DQUOTE_CODE;
815 tokeniser_grid[0] = EOF_CODE;
816 tokeniser_grid[' '] = WHITESPACE_CODE;
817 tokeniser_grid['\n'] = WHITESPACE_CODE;
818 tokeniser_grid['$'] = RADIX_CODE;
819 tokeniser_grid['!'] = COMMENT_CODE;
821 tokeniser_grid['0'] = DIGIT_CODE;
822 tokeniser_grid['1'] = DIGIT_CODE;
823 tokeniser_grid['2'] = DIGIT_CODE;
824 tokeniser_grid['3'] = DIGIT_CODE;
825 tokeniser_grid['4'] = DIGIT_CODE;
826 tokeniser_grid['5'] = DIGIT_CODE;
827 tokeniser_grid['6'] = DIGIT_CODE;
828 tokeniser_grid['7'] = DIGIT_CODE;
829 tokeniser_grid['8'] = DIGIT_CODE;
830 tokeniser_grid['9'] = DIGIT_CODE;
832 tokeniser_grid['a'] = IDENTIFIER_CODE;
833 tokeniser_grid['b'] = IDENTIFIER_CODE;
834 tokeniser_grid['c'] = IDENTIFIER_CODE;
835 tokeniser_grid['d'] = IDENTIFIER_CODE;
836 tokeniser_grid['e'] = IDENTIFIER_CODE;
837 tokeniser_grid['f'] = IDENTIFIER_CODE;
838 tokeniser_grid['g'] = IDENTIFIER_CODE;
839 tokeniser_grid['h'] = IDENTIFIER_CODE;
840 tokeniser_grid['i'] = IDENTIFIER_CODE;
841 tokeniser_grid['j'] = IDENTIFIER_CODE;
842 tokeniser_grid['k'] = IDENTIFIER_CODE;
843 tokeniser_grid['l'] = IDENTIFIER_CODE;
844 tokeniser_grid['m'] = IDENTIFIER_CODE;
845 tokeniser_grid['n'] = IDENTIFIER_CODE;
846 tokeniser_grid['o'] = IDENTIFIER_CODE;
847 tokeniser_grid['p'] = IDENTIFIER_CODE;
848 tokeniser_grid['q'] = IDENTIFIER_CODE;
849 tokeniser_grid['r'] = IDENTIFIER_CODE;
850 tokeniser_grid['s'] = IDENTIFIER_CODE;
851 tokeniser_grid['t'] = IDENTIFIER_CODE;
852 tokeniser_grid['u'] = IDENTIFIER_CODE;
853 tokeniser_grid['v'] = IDENTIFIER_CODE;
854 tokeniser_grid['w'] = IDENTIFIER_CODE;
855 tokeniser_grid['x'] = IDENTIFIER_CODE;
856 tokeniser_grid['y'] = IDENTIFIER_CODE;
857 tokeniser_grid['z'] = IDENTIFIER_CODE;
859 tokeniser_grid['A'] = IDENTIFIER_CODE;
860 tokeniser_grid['B'] = IDENTIFIER_CODE;
861 tokeniser_grid['C'] = IDENTIFIER_CODE;
862 tokeniser_grid['D'] = IDENTIFIER_CODE;
863 tokeniser_grid['E'] = IDENTIFIER_CODE;
864 tokeniser_grid['F'] = IDENTIFIER_CODE;
865 tokeniser_grid['G'] = IDENTIFIER_CODE;
866 tokeniser_grid['H'] = IDENTIFIER_CODE;
867 tokeniser_grid['I'] = IDENTIFIER_CODE;
868 tokeniser_grid['J'] = IDENTIFIER_CODE;
869 tokeniser_grid['K'] = IDENTIFIER_CODE;
870 tokeniser_grid['L'] = IDENTIFIER_CODE;
871 tokeniser_grid['M'] = IDENTIFIER_CODE;
872 tokeniser_grid['N'] = IDENTIFIER_CODE;
873 tokeniser_grid['O'] = IDENTIFIER_CODE;
874 tokeniser_grid['P'] = IDENTIFIER_CODE;
875 tokeniser_grid['Q'] = IDENTIFIER_CODE;
876 tokeniser_grid['R'] = IDENTIFIER_CODE;
877 tokeniser_grid['S'] = IDENTIFIER_CODE;
878 tokeniser_grid['T'] = IDENTIFIER_CODE;
879 tokeniser_grid['U'] = IDENTIFIER_CODE;
880 tokeniser_grid['V'] = IDENTIFIER_CODE;
881 tokeniser_grid['W'] = IDENTIFIER_CODE;
882 tokeniser_grid['X'] = IDENTIFIER_CODE;
883 tokeniser_grid['Y'] = IDENTIFIER_CODE;
884 tokeniser_grid['Z'] = IDENTIFIER_CODE;
886 tokeniser_grid['_'] = IDENTIFIER_CODE;
889 /* ------------------------------------------------------------------------- */
890 /* Definition of a lexical block: a source file or a string containing */
891 /* text for lexical analysis; an independent source from the point of */
892 /* view of issuing error reports. */
893 /* ------------------------------------------------------------------------- */
/* One independent source of lexed text (a source file, or a string held
   inside Inform) plus the position bookkeeping needed for error reports.
   NOTE(review): this chunk is sampled -- some member comment lines and
   the closing of the struct are elided here. */
895 typedef struct LexicalBlock_s
896 { char *filename; /* Full translated name */
897 int main_flag; /* TRUE if the main file
898 (the first one opened) */
899 int sys_flag; /* TRUE if a System_File */
900 int source_line; /* Line number count */
901 int line_start; /* Char number within file
902 where the current line
904 int chars_read; /* Char number of read pos */
905 int file_no; /* Or 255 if not from a
/* Sentinel blocks used outside normal lexing: before compilation begins,
   while the output file is being constructed, and while a veneer string
   is being lexed.  The trailing 255 is the "not from a source file"
   file number (reported to callers as -1). */
910 static LexicalBlock NoFileOpen =
911 { "<before compilation>", FALSE, FALSE, 0, 0, 0, 255 };
913 static LexicalBlock MakingOutput =
914 { "<constructing output>", FALSE, FALSE, 0, 0, 0, 255 };
/* Veneer text counts as a System_File (sys_flag TRUE). */
916 static LexicalBlock StringLB =
917 { "<veneer routine>", FALSE, TRUE, 0, 0, 0, 255 };
919 static LexicalBlock *CurrentLB; /* The current lexical
920 block of input text */
/* Mark the lexical block currently being read as a System_File. */
922 extern void declare_systemfile(void)
923 { CurrentLB->sys_flag = TRUE;
/* Return 1 if the current lexical block is a System_File, 0 otherwise. */
926 extern int is_systemfile(void)
927 { return ((CurrentLB->sys_flag)?1:0);
/* Build a zero-width debug_location describing the current read
   position (beginning == end for every index).  chars_read counts
   characters already pulled into the lookahead pipeline, so
   LOOKAHEAD_SIZE is subtracted to point at the character actually
   being tokenised.  NOTE(review): the return statement is elided from
   this sampled chunk. */
930 extern debug_location get_current_debug_location(void)
931 { debug_location result;
932 /* Assume that all input characters are one byte. */
933 result.file_index = CurrentLB->file_no;
934 result.beginning_byte_index = CurrentLB->chars_read - LOOKAHEAD_SIZE;
935 result.end_byte_index = result.beginning_byte_index;
936 result.beginning_line_number = CurrentLB->source_line;
937 result.end_line_number = result.beginning_line_number;
938 result.beginning_character_number =
939 CurrentLB->chars_read - CurrentLB->line_start;
940 result.end_character_number = result.beginning_character_number;
/* Snapshot of the location at the last report_errors_at_current_line()
   call, when a debugging file is being written. */
944 static debug_location ErrorReport_debug_location;
/* Copy the current block's position into the global ErrorReport, so
   that any errors issued from now on are attributed to this line.
   The internal file number 255 is translated to -1 for reporting. */
946 extern void report_errors_at_current_line(void)
947 { ErrorReport.line_number = CurrentLB->source_line;
948 ErrorReport.file_number = CurrentLB->file_no;
949 if (ErrorReport.file_number == 255)
950 ErrorReport.file_number = -1;
951 ErrorReport.source = CurrentLB->filename;
952 ErrorReport.main_flag = CurrentLB->main_flag;
953 if (debugfile_switch)
954 ErrorReport_debug_location = get_current_debug_location();
/* Return the location stored by the most recent call to
   report_errors_at_current_line(). */
957 extern debug_location get_error_report_debug_location(void)
958 { return ErrorReport_debug_location;
/* Return the character offset at which the current source line began. */
961 extern int32 get_current_line_start(void)
962 { return CurrentLB->line_start;
965 /* ------------------------------------------------------------------------- */
966 /* Hash printing and line counting */
967 /* ------------------------------------------------------------------------- */
969 static void print_hash(void)
971 /* Hash-printing is the practice of printing a # character every 100
972 lines of source code (the -x switch), reassuring the user that
973 progress is being made */
/* "::" is printed once, before the very first hash. */
975 if (no_hash_printed_yet)
976 { printf("::"); no_hash_printed_yet = FALSE;
978 printf("#"); hash_printed_since_newline = TRUE;
981 /* On some systems, text output is buffered to a line at a time, and
982 this would frustrate the point of hash-printing, so: */
/* NOTE(review): the flush call this comment refers to is elided from
   this sampled chunk. */
988 static void reached_new_line(void)
990 /* Called to signal that a new line has been reached in the source code */
/* Reset the buffer used to echo the offending line in error reports. */
992 forerrors_pointer = 0;
994 CurrentLB->source_line++;
995 CurrentLB->line_start = CurrentLB->chars_read;
997 total_source_line_count++;
/* Every 100 lines: show progress and yield to the host environment. */
999 if (total_source_line_count%100==0)
1000 { if (hash_switch) print_hash();
/* NOTE(review): SpinCursor/ProcessEvents look like classic-Mac-only
   calls, presumably guarded by #ifdefs elided from this chunk --
   confirm against the full source. */
1002 SpinCursor(32); /* I.e., allow other tasks to run */
1007 if (total_source_line_count%((**g_pm_hndl).linespercheck) == 0)
1008 { ProcessEvents (&g_proc);
/* NOTE(review): the code below appears to be an abort path (cleaning
   up temporary files and transcript before longjmp-ing out); its
   triggering condition is elided from this chunk. */
1012 if (temporary_files_switch)
1013 remove_temp_files();
1015 my_free(&all_text,"transcription text");
1016 abort_transcript_file();
1017 longjmp (g_fallback, 1);
/* Begin a new "syntax line" for error-reporting purposes: reset the
   echo buffer (only when lexing from an internal string) and point
   ErrorReport at the current position. */
1023 static void new_syntax_line(void)
1024 { if (source_to_analyse != NULL) forerrors_pointer = 0;
1025 report_errors_at_current_line();
1028 /* Return 10 raised to the expo power.
1030 * I'm avoiding the standard pow() function for a rather lame reason:
1031 * it's in the libmath (-lm) library, and I don't want to change the
1032 * build model for the compiler. So, this is implemented with a stupid
1033 * lookup table. It's faster than pow() for small values of expo.
1034 * Probably not as fast if expo is 200, but "$+1e200" is an overflow
1035 * anyway, so I don't expect that to be a problem.
1037 * (For some reason, frexp() and ldexp(), which are used later on, do
1038 * not require libmath to be linked in.)
/* Return 10.0 raised to the power expo, via a lookup table covering
   10^-8 .. 10^8, stepping in chunks of POW10_RANGE for exponents
   outside that range (see the long comment above).
   NOTE(review): the table's middle entry (1.0), the declaration of
   res, and the if/else around the two loops are elided from this
   sampled chunk. */
1040 static double pow10_cheap(int expo)
1042 #define POW10_RANGE (8)
1043 static double powers[POW10_RANGE*2+1] = {
1044 0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1,
1046 10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, 10000000.0, 100000000.0
/* Negative exponents: multiply down by 10^-8 per step, then finish
   from the table. */
1052 for (; expo < -POW10_RANGE; expo += POW10_RANGE) {
1055 return res * powers[POW10_RANGE+expo];
/* Positive exponents: multiply up by 10^8 (the last table entry)
   per step, then finish from the table. */
1058 for (; expo > POW10_RANGE; expo -= POW10_RANGE) {
1059 res *= powers[POW10_RANGE*2];
1061 return res * powers[POW10_RANGE+expo];
1065 /* Return the IEEE-754 single-precision encoding of a floating-point
1066 * number. See http://www.psc.edu/general/software/packages/ieee/ieee.php
1067 * for an explanation.
1069 * The number is provided in the pieces it was parsed in:
1070 * [+|-] intv "." fracv "e" [+|-]expo
1072 * If the magnitude is too large (beyond about 3.4e+38), this returns
1073 * an infinite value (0x7f800000 or 0xff800000). If the magnitude is too
1074 * small (below about 1e-45), this returns a zero value (0x00000000 or
1075 * 0x80000000). If any of the inputs are NaN, this returns NaN (but the
1076 * lexer should never do that).
1078 * Note that using a float constant does *not* set the uses_float_features
1079 * flag (which would cause the game file to be labelled 3.1.2). There's
1080 * no VM feature here, just an integer. Of course, any use of the float
1081 * *opcodes* will set the flag.
1083 * The math functions in this routine require #including <math.h>, but
1084 * they should not require linking the math library (-lm). At least,
1085 * they do not on OSX and Linux.
/* Assemble the IEEE-754 single-precision bit pattern for the parsed
   float [+|-] intv "." fracv "e" expo; see the long comment above for
   the overflow/underflow contract.  NOTE(review): several lines of
   the exponent-normalisation logic (the assignments to expo/mant in
   each branch, and the declarations of mant/fbits) are elided from
   this sampled chunk. */
1087 static int32 construct_float(int signbit, double intv, double fracv, int expo)
1089 double absval = (intv + fracv) * pow10_cheap(expo);
1090 int32 sign = (signbit ? 0x80000000 : 0x0);
/* Out-of-range and NaN inputs are handled up front. */
1094 if (isinf(absval)) {
1095 return sign | 0x7f800000; /* infinity */
1097 if (isnan(absval)) {
1098 return sign | 0x7fc00000;
/* frexp() splits absval into mant * 2^expo with mant in [0.5, 1). */
1101 mant = frexp(absval, &expo);
1103 /* Normalize mantissa to be in the range [1.0, 2.0) */
1104 if (0.5 <= mant && mant < 1.0) {
1108 else if (mant == 0.0) {
/* Exponent too large for a single-precision biased exponent. */
1112 return sign | 0x7f800000; /* infinity */
1116 return sign | 0x7f800000; /* infinity */
1118 else if (expo < -126) {
1119 /* Denormalized (very small) number */
1120 mant = ldexp(mant, 126 + expo);
1123 else if (!(expo == 0 && mant == 0.0)) {
1125 mant -= 1.0; /* Get rid of leading 1 */
/* Scale the fractional mantissa into the 23-bit field and round. */
1128 mant *= 8388608.0; /* 2^23 */
1129 fbits = (int32)(mant + 0.5); /* round mant to nearest int */
1131 /* The carry propagated out of a string of 23 1 bits. */
1135 return sign | 0x7f800000; /* infinity */
/* Pack sign, biased exponent and mantissa fields. */
1139 return (sign) | ((int32)(expo << 23)) | (fbits);
1142 /* ------------------------------------------------------------------------- */
1143 /* Characters are read via a "pipeline" of variables, allowing us to look */
1144 /* up to three characters ahead of the current position. */
1146 /* There are two possible sources: from the source files being loaded in, */
1147 /* and from a string inside Inform (which is where the code for veneer */
1148 /* routines comes from). Each source has its own get-next-character */
1150 /* ------------------------------------------------------------------------- */
1151 /* Source 1: from files */
1153 /* Note that file_load_chars(p, size) loads "size" bytes into buffer "p" */
1154 /* from the current input file. If the file runs out, then if it was */
1155 /* the last source file 4 EOF characters are placed in the buffer: if it */
1156 /* was only an Include file ending, then a '\n' character is placed there */
1157 /* (essentially to force termination of any comment line) followed by */
1158 /* three harmless spaces. */
1160 /* The routine returns the number of characters it has written, and note */
1161 /* that this conveniently ensures that all characters in the buffer come */
1162 /* from the same file. */
1163 /* ------------------------------------------------------------------------- */
1165 #define SOURCE_BUFFER_SIZE 4096 /* Typical disc block size */
/* One open source file on the inclusion stack: a block-sized input
   buffer plus the three-character lookahead pipeline saved while an
   Include file is being read.  NOTE(review): the struct's LB member
   and closing brace are elided from this sampled chunk. */
1167 typedef struct Sourcefile_s
1168 { char *buffer; /* Input buffer */
1169 int read_pos; /* Read position in buffer */
1170 int size; /* Number of meaningful
1171 characters in buffer */
1172 int la, la2, la3; /* Three characters of
1173 lookahead pipeline */
1174 int file_no; /* Internal file number
/* The stack of files being read: FileStack[0] is the main file and
   each Include pushes another entry. */
1179 static Sourcefile *FileStack;
1180 static int File_sp; /* Stack pointer */
1182 static Sourcefile *CF; /* Top entry on stack */
/* Number of input files seen the last time the pipeline was advanced;
   used to detect newly opened Include files. */
1184 static int last_no_files;
/* Push input file file_no onto the file stack at depth i: save the
   outer file's lookahead pipeline, load the first buffer-full,
   prime the three lookahead characters, and initialise the new
   entry's LexicalBlock.  Also warns if the same filename is already
   open further down the stack. */
1186 static void begin_buffering_file(int i, int file_no)
1187 { int j, cnt; uchar *p;
1189 if (i >= MAX_INCLUSION_DEPTH)
1190 memoryerror("MAX_INCLUSION_DEPTH",MAX_INCLUSION_DEPTH);
1192 p = (uchar *) FileStack[i].buffer;
/* Save the enclosing file's pipeline so it can be restored when this
   Include finishes. */
1195 { FileStack[i-1].la = lookahead;
1196 FileStack[i-1].la2 = lookahead2;
1197 FileStack[i-1].la3 = lookahead3;
1200 FileStack[i].file_no = file_no;
1201 FileStack[i].size = file_load_chars(file_no,
1202 (char *) p, SOURCE_BUFFER_SIZE);
/* Prime the pipeline from the first three buffered characters,
   translated through the source-to-ISO grid. */
1203 lookahead = source_to_iso_grid[p[0]];
1204 lookahead2 = source_to_iso_grid[p[1]];
1205 lookahead3 = source_to_iso_grid[p[2]];
1206 if (LOOKAHEAD_SIZE != 3)
1208 ("Lexer lookahead size does not match hard-coded lookahead code");
1209 FileStack[i].read_pos = LOOKAHEAD_SIZE;
/* File number 1 is by definition the main source file. */
1211 if (file_no==1) FileStack[i].LB.main_flag = TRUE;
1212 else FileStack[i].LB.main_flag = FALSE;
1213 FileStack[i].LB.sys_flag = FALSE;
1214 FileStack[i].LB.source_line = 1;
1215 FileStack[i].LB.line_start = LOOKAHEAD_SIZE;
1216 FileStack[i].LB.chars_read = LOOKAHEAD_SIZE;
1217 FileStack[i].LB.filename = InputFiles[file_no-1].filename;
1218 FileStack[i].LB.file_no = file_no;
1220 CurrentLB = &(FileStack[i].LB);
1221 CF = &(FileStack[i]);
1223 /* Check for recursive inclusion */
1226 { if (!strcmp(FileStack[i].LB.filename, FileStack[j].LB.filename))
1230 warning_named("File included more than once",
1231 FileStack[j].LB.filename);
/* Start the character pipeline by opening the main source file
   (file number 1) at the bottom of the file stack. */
1234 static void create_char_pipeline(void)
1237 begin_buffering_file(File_sp++, 1);
1238 pipeline_made = TRUE; last_no_files = input_file;
/* Advance the pipeline by one character when reading from files:
   returns the new "current" character (0 at final end of input),
   maintaining lookahead/lookahead2/lookahead3 and popping the file
   stack when an Include file ends.  NOTE(review): several control
   lines (the enclosing braces of the pop logic and some read_pos
   resets) are elided from this sampled chunk. */
1241 static int get_next_char_from_pipeline(void)
/* Any Include files opened since the last character was read must be
   pushed onto the stack before reading continues. */
1244 while (last_no_files < input_file)
1246 /* An "Include" file must have opened since the last character
1249 begin_buffering_file(File_sp++, ++last_no_files);
1251 last_no_files = input_file;
/* Empty stack: the whole input is exhausted. */
1254 { lookahead = 0; lookahead2 = 0; lookahead3 = 0; return 0;
/* Buffer exhausted: refill from the current file. */
1257 if (CF->read_pos == CF->size)
1259 file_load_chars(CF->file_no, CF->buffer, SOURCE_BUFFER_SIZE);
/* A negative size marks the end of this file (see file_load_chars);
   pop back to the including file and restore its pipeline. */
1263 if (CF->read_pos == -(CF->size))
1264 { set_token_location(get_current_debug_location());
1267 { lookahead = 0; lookahead2 = 0; lookahead3 = 0; return 0;
1269 CF = &(FileStack[File_sp-1]);
1270 CurrentLB = &(FileStack[File_sp-1].LB);
1271 lookahead = CF->la; lookahead2 = CF->la2; lookahead3 = CF->la3;
1272 if (CF->read_pos == CF->size)
1274 file_load_chars(CF->file_no, CF->buffer, SOURCE_BUFFER_SIZE);
1277 set_token_location(get_current_debug_location());
/* Shift the pipeline along by one character. */
1280 p = (uchar *) (CF->buffer);
1282 current = lookahead;
1283 lookahead = lookahead2;
1284 lookahead2 = lookahead3;
1285 lookahead3 = source_to_iso_grid[p[CF->read_pos++]];
1287 CurrentLB->chars_read++;
/* Echo the character into the error-report line buffer. */
1288 if (forerrors_pointer < 511)
1289 forerrors_buff[forerrors_pointer++] = current;
1290 if (current == '\n') reached_new_line();
1294 /* ------------------------------------------------------------------------- */
1295 /* Source 2: from a string */
1296 /* ------------------------------------------------------------------------- */
1298 static int source_to_analyse_pointer; /* Current read position */
/* Advance the pipeline by one character when reading from an internal
   string (veneer text): a NUL terminator propagates zeroes through
   the lookahead variables so the pipeline drains cleanly at the end.
   NOTE(review): the return statement is elided from this chunk. */
1300 static int get_next_char_from_string(void)
1301 { uchar *p = (uchar *) source_to_analyse + source_to_analyse_pointer++;
1302 current = source_to_iso_grid[p[0]];
1304 if (current == 0) lookahead = 0;
1305 else lookahead = source_to_iso_grid[p[1]];
1306 if (lookahead == 0) lookahead2 = 0;
1307 else lookahead2 = source_to_iso_grid[p[2]];
1308 if (lookahead2 == 0) lookahead3 = 0;
1309 else lookahead3 = source_to_iso_grid[p[3]];
1311 CurrentLB->chars_read++;
/* Echo the character into the error-report line buffer. */
1312 if (forerrors_pointer < 511)
1313 forerrors_buff[forerrors_pointer++] = current;
1314 if (current == '\n') reached_new_line();
1318 /* ========================================================================= */
1319 /* The interface between the lexer and Inform's higher levels: */
1321 /* put_token_back() (effectively) move the read position */
1322 /* back by one token */
1324 /* get_next_token() copy the token at the current read */
1325 /* position into the triple */
1326 /* (token_type, token_value, token_text) */
1327 /* and move the read position forward */
1330 /* restart_lexer(source, name) if source is NULL, initialise the lexer */
1331 /* to read from source files; */
1332 /* otherwise, to read from this string. */
1333 /* ------------------------------------------------------------------------- */
/* Logically move the read position back by one token: the token stays
   in the circle buffer and tokens_put_back makes the next
   get_next_token() call re-deliver it.  Backing up a full
   CIRCLE_SIZE tokens is impossible by design, hence the internal
   error. */
1335 extern void put_token_back(void)
1336 { tokens_put_back++;
1338 if (tokens_trace_level > 0)
1339 { if (tokens_trace_level == 1) printf("<- ");
1340 else printf("<-\n");
1343 /* The following error, of course, should never happen! */
1345 if (tokens_put_back == CIRCLE_SIZE)
1346 { compiler_error("The lexical analyser has collapsed because of a wrong \
1347 assumption inside Inform");
/* The lexer's main entry point: place the next token into the triple
   (token_type, token_value, token_text), either by re-delivering a
   put-back token from the circle buffer or by scanning fresh
   characters through the pipeline.  Dispatch is via tokeniser_grid[],
   indexed by the first character.  NOTE(review): this chunk is
   heavily sampled -- the switch statement itself, many case labels,
   closing braces and several declarations are elided; comments below
   describe only what is visible. */
1353 extern void get_next_token(void)
1354 { int d, i, j, k, quoted_size, e, radix, context; int32 n; char *r;
1355 int returning_a_put_back_token = TRUE;
1357 context = lexical_context();
/* Case 1: a token was put back.  If the lexical context has changed
   since it was first read, identifiers must be re-interpreted. */
1359 if (tokens_put_back > 0)
1360 { i = circle_position - tokens_put_back + 1;
1361 if (i<0) i += CIRCLE_SIZE;
1363 if (context != token_contexts[i])
1364 { j = circle[i].type;
1365 if ((j==0) || ((j>=100) && (j<200)))
1366 interpret_identifier(i, FALSE);
/* Case 2: scan a genuinely new token into the next circle slot. */
1370 returning_a_put_back_token = FALSE;
1372 if (circle_position == CIRCLE_SIZE-1) circle_position = 0;
1373 else circle_position++;
/* Wrap the lexeme text arena before it can overflow. */
1375 if (lex_p > lexeme_memory + 4*MAX_QTEXT_SIZE)
1376 lex_p = lexeme_memory;
1378 circle[circle_position].text = lex_p;
1379 circle[circle_position].value = 0;
1383 d = (*get_next_char)();
1384 e = tokeniser_grid[d];
1386 if (next_token_begins_syntax_line)
1387 { if ((e != WHITESPACE_CODE) && (e != COMMENT_CODE))
1388 { new_syntax_line();
1389 next_token_begins_syntax_line = FALSE;
1393 circle[circle_position].location = get_current_debug_location();
/* Grid code 0: the character is not legal Inform source. */
1396 { case 0: char_error("Illegal character found in source:", d);
1397 goto StartTokenAgain;
/* Whitespace and comments produce no token: restart the scan. */
1399 case WHITESPACE_CODE:
1400 while (tokeniser_grid[lookahead] == WHITESPACE_CODE)
1402 goto StartTokenAgain;
1405 while ((lookahead != '\n') && (lookahead != 0))
1407 goto StartTokenAgain;
1410 circle[circle_position].type = EOF_TT;
1411 strcpy(lex_p, "<end of file>");
1412 lex_p += strlen(lex_p) + 1;
/* Decimal (and, via $ / $$, hex and binary) integer literals. */
1420 { n = n*radix + character_digit_value[d];
1422 } while ((character_digit_value[lookahead] < radix)
1423 && (d = (*get_next_char)(), TRUE));
1426 circle[circle_position].type = NUMBER_TT;
1427 circle[circle_position].value = n;
/* Floating-point literal: sign, integer part, optional fraction,
   optional exponent; assembled by construct_float() below. */
1431 { int expo=0; double intv=0, fracv=0;
1432 int expocount=0, intcount=0, fraccount=0;
1433 int signbit = (d == '-');
1435 while (character_digit_value[lookahead] < 10) {
1436 intv = 10.0*intv + character_digit_value[lookahead];
1438 *lex_p++ = lookahead;
1441 if (lookahead == '.') {
1442 double fracpow = 1.0;
1443 *lex_p++ = lookahead;
1445 while (character_digit_value[lookahead] < 10) {
1447 fracv = fracv + fracpow*character_digit_value[lookahead];
1449 *lex_p++ = lookahead;
1453 if (lookahead == 'e' || lookahead == 'E') {
1455 *lex_p++ = lookahead;
1457 if (lookahead == '+' || lookahead == '-') {
1458 exposign = (lookahead == '-');
1459 *lex_p++ = lookahead;
1462 while (character_digit_value[lookahead] < 10) {
1463 expo = 10*expo + character_digit_value[lookahead];
1465 *lex_p++ = lookahead;
1469 error("Floating-point literal must have digits after the 'e'");
1470 if (exposign) { expo = -expo; }
1472 if (intcount + fraccount == 0)
1473 error("Floating-point literal must have digits");
1474 n = construct_float(signbit, intv, fracv, expo);
1477 circle[circle_position].type = NUMBER_TT;
1478 circle[circle_position].value = n;
/* Floats are a Glulx-only feature (they fit in a 32-bit word). */
1479 if (!glulx_mode && dont_enter_into_symbol_table != -2) error("Floating-point literals are not available in Z-code");
/* '$' introduces hex; '$$' introduces binary; '$+'/'$-' a float. */
1483 radix = 16; d = (*get_next_char)();
1484 if (d == '-' || d == '+') { goto FloatNumber; }
1485 if (d == '$') { d = (*get_next_char)(); radix = 2; }
1486 if (character_digit_value[d] >= radix)
1488 error("Binary number expected after '$$'");
1490 error("Hexadecimal number expected after '$'");
1494 case QUOTE_CODE: /* Single-quotes: scan a literal string */
1497 { e = d; d = (*get_next_char)(); *lex_p++ = d;
1498 if (quoted_size++==64)
1500 "Too much text for one pair of quotations '...' to hold");
/* A closing quote ends the literal unless escaped by '@'. */
1503 if ((d == '\'') && (e != '@'))
1504 { if (quoted_size == 1)
1505 { d = (*get_next_char)(); *lex_p++ = d;
1507 error("No text between quotation marks ''");
1512 if (d==EOF) ebf_error("'\''", "end of file");
1514 circle[circle_position].type = SQ_TT;
1517 case DQUOTE_CODE: /* Double-quotes: scan a literal string */
1520 { d = (*get_next_char)(); *lex_p++ = d;
1521 if (quoted_size++==MAX_QTEXT_SIZE)
1522 { memoryerror("MAX_QTEXT_SIZE", MAX_QTEXT_SIZE);
/* Newline inside a string: collapse trailing spaces and the
   newline itself into a single space (unless after '^'). */
1527 while (*(lex_p-1) == ' ') lex_p--;
1528 if (*(lex_p-1) != '^') *lex_p++ = ' ';
1529 while ((lookahead != EOF) &&
1530 (tokeniser_grid[lookahead] == WHITESPACE_CODE))
/* A '\' at end of line joins lines without inserting a space;
   it must be followed by nothing but whitespace up to a newline. */
1534 { int newline_passed = FALSE;
1536 while ((lookahead != EOF) &&
1537 (tokeniser_grid[lookahead] == WHITESPACE_CODE))
1538 if ((d = (*get_next_char)()) == '\n')
1539 newline_passed = TRUE;
1540 if (!newline_passed)
1542 chb[0] = '\"'; chb[1] = lookahead;
1543 chb[2] = '\"'; chb[3] = 0;
1544 ebf_error("empty rest of line after '\\' in string",
1548 } while ((d != EOF) && (d!='\"'));
1549 if (d==EOF) ebf_error("'\"'", "end of file");
1551 circle[circle_position].type = DQ_TT;
1554 case IDENTIFIER_CODE: /* Letter or underscore: an identifier */
1557 while ((n<=MAX_IDENTIFIER_LENGTH)
1558 && ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
1559 || (tokeniser_grid[lookahead] == DIGIT_CODE)))
1560 n++, *lex_p++ = (*get_next_char)();
1564 if (n > MAX_IDENTIFIER_LENGTH)
1565 { char bad_length[100];
1567 "Name exceeds the maximum length of %d characters:",
1568 MAX_IDENTIFIER_LENGTH);
1569 error_named(bad_length, circle[circle_position].text);
1570 /* Trim token so that it doesn't violate
1571 MAX_IDENTIFIER_LENGTH during error recovery */
1572 circle[circle_position].text[MAX_IDENTIFIER_LENGTH] = 0;
/* See the header comment on dont_enter_into_symbol_table: names
   may be returned as DQ_TT text rather than symbol entries. */
1575 if (dont_enter_into_symbol_table)
1576 { circle[circle_position].type = DQ_TT;
1577 circle[circle_position].value = 0;
1578 if (dont_enter_into_symbol_table == -2)
1579 interpret_identifier(circle_position, TRUE);
1583 interpret_identifier(circle_position, FALSE);
1588 /* The character is initial to at least one of the separators */
/* The grid entry packs an index (high nibble) and count (low
   nibble) into the separators[] table; try each candidate,
   longest first, against the lookahead characters. */
1590 for (j=e>>4, k=j+(e&0x0f); j<k; j++)
1591 { r = (char *) separators[j];
1593 { *lex_p++=d; *lex_p++=0;
1594 goto SeparatorMatched;
1598 { if (*(r+1) == lookahead)
1600 *lex_p++=(*get_next_char)();
1602 goto SeparatorMatched;
1606 { if ((*(r+1) == lookahead) && (*(r+2) == lookahead2))
1608 *lex_p++=(*get_next_char)();
1609 *lex_p++=(*get_next_char)();
1611 goto SeparatorMatched;
1616 /* The following contingency never in fact arises with the
1617 current set of separators, but might in future */
1619 *lex_p++ = d; *lex_p++ = lookahead; *lex_p++ = lookahead2;
1621 error_named("Unrecognised combination in source:", lex_p);
1622 goto StartTokenAgain;
1626 circle[circle_position].type = SEP_TT;
1627 circle[circle_position].value = j;
/* A few separators absorb following text: #n$ / #w$ take any
   character then identifier text, #a$ / #g$ / #r$ a name. */
1629 { case SEMICOLON_SEP: break;
1630 case HASHNDOLLAR_SEP:
1631 case HASHWDOLLAR_SEP:
1632 if (tokeniser_grid[lookahead] == WHITESPACE_CODE)
1633 { error_named("Character expected after",
1634 circle[circle_position].text);
1638 *lex_p++ = (*get_next_char)();
1639 while ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
1640 || (tokeniser_grid[lookahead] == DIGIT_CODE))
1641 *lex_p++ = (*get_next_char)();
1644 case HASHADOLLAR_SEP:
1645 case HASHGDOLLAR_SEP:
1646 case HASHRDOLLAR_SEP:
1648 if (tokeniser_grid[lookahead] != IDENTIFIER_CODE)
1649 { error_named("Alphabetic character expected after",
1650 circle[circle_position].text);
1654 while ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
1655 || (tokeniser_grid[lookahead] == DIGIT_CODE))
1656 *lex_p++ = (*get_next_char)();
/* Both paths converge here: publish the token triple and, for a
   freshly scanned token, its location and context. */
1663 i = circle_position;
1666 token_value = circle[i].value;
1667 token_type = circle[i].type;
1668 token_text = circle[i].text;
1669 if (!returning_a_put_back_token)
1670 { set_token_location(circle[i].location);
1672 token_contexts[i] = context;
1674 if (tokens_trace_level > 0)
1675 { if (tokens_trace_level == 1)
1676 printf("'%s' ", circle[i].text);
1678 { printf("-> "); describe_token(circle[i]);
1680 if (tokens_trace_level > 2) print_context(token_contexts[i]);
/* Buffer for the synthetic "filename" shown in errors raised while
   lexing a veneer routine string. */
1686 static char veneer_error_title[64];
/* Reset the lexer: clear the token circle, the lexeme arena and all
   mode flags, then aim the character source either at the source
   files (lexical_source == NULL) or at the given internal string. */
1688 extern void restart_lexer(char *lexical_source, char *name)
1690 circle_position = 0;
1691 for (i=0; i<CIRCLE_SIZE; i++)
1692 { circle[i].type = 0;
1693 circle[i].value = 0;
1694 circle[i].text = "(if this is ever visible, there is a bug)";
1695 token_contexts[i] = 0;
1698 lex_p = lexeme_memory;
1699 tokens_put_back = 0;
1700 forerrors_pointer = 0;
1701 dont_enter_into_symbol_table = FALSE;
1702 return_sp_as_variable = FALSE;
1703 next_token_begins_syntax_line = TRUE;
1705 source_to_analyse = lexical_source;
1707 if (source_to_analyse == NULL)
1708 { get_next_char = get_next_char_from_pipeline;
1709 if (!pipeline_made) create_char_pipeline();
1710 forerrors_buff[0] = 0; forerrors_pointer = 0;
/* Reading from a string: lex through StringLB under a synthetic
   "<veneer routine '...'>" filename. */
1713 { get_next_char = get_next_char_from_string;
1714 source_to_analyse_pointer = 0;
1715 CurrentLB = &StringLB;
1716 sprintf(veneer_error_title, "<veneer routine '%s'>", name);
1717 StringLB.filename = veneer_error_title;
1719 CurrentLB->source_line = 1;
1720 CurrentLB->line_start = 0;
1721 CurrentLB->chars_read = 0;
1725 /* ========================================================================= */
1726 /* Data structure management routines */
1727 /* ------------------------------------------------------------------------- */
/* Initialise the lexer's static variables.  NOTE(review): the body of
   this function is elided from this sampled chunk. */
1729 extern void init_lexer_vars(void)
/* Called once before the first pass: reset the line count and point
   error reporting at the "<before compilation>" placeholder block. */
1733 extern void lexer_begin_prepass(void)
1734 { total_source_line_count = 0;
1735 CurrentLB = &NoFileOpen;
1736 report_errors_at_current_line();
/* Called at the start of each compilation pass: reset hash-printing
   state, force the character pipeline to be rebuilt, and restart the
   lexer on the source files. */
1739 extern void lexer_begin_pass(void)
1740 { no_hash_printed_yet = TRUE;
1741 hash_printed_since_newline = FALSE;
1743 pipeline_made = FALSE;
1745 restart_lexer(NULL, NULL);
/* Called at the end of a pass: attribute any further errors to the
   "<constructing output>" placeholder block. */
1748 extern void lexer_endpass(void)
1749 { CurrentLB = &MakingOutput;
1750 report_errors_at_current_line();
/* Allocate all of the lexer's dynamic workspace (freed again by
   lexer_free_arrays): the file stack and its buffers, the lexeme
   text arena, the keyword and local-variable hash tables, and the
   initial debug-location record.  NOTE(review): the declaration of
   the loop variable i is elided from this sampled chunk. */
1753 extern void lexer_allocate_arrays(void)
1756 FileStack = my_malloc(MAX_INCLUSION_DEPTH*sizeof(Sourcefile),
1757 "filestack buffer");
/* +4 gives file_load_chars room for the EOF/newline padding. */
1759 for (i=0; i<MAX_INCLUSION_DEPTH; i++)
1760 FileStack[i].buffer = my_malloc(SOURCE_BUFFER_SIZE+4, "source file buffer");
1762 lexeme_memory = my_malloc(5*MAX_QTEXT_SIZE, "lexeme memory");
1764 keywords_hash_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
1765 "keyword hash table");
1766 keywords_hash_ends_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
1767 "keyword hash end table");
1768 keywords_data_table = my_calloc(sizeof(int), 3*MAX_KEYWORDS,
1769 "keyword hashing linked list");
1770 local_variable_hash_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
1771 "local variable hash table");
1772 local_variable_text_table = my_malloc(
1773 (MAX_LOCAL_VARIABLES-1)*(MAX_IDENTIFIER_LENGTH+1),
1774 "text of local variable names");
1776 local_variable_hash_codes = my_calloc(sizeof(int), MAX_LOCAL_VARIABLES,
1777 "local variable hash codes");
1778 local_variable_texts = my_calloc(sizeof(char *), MAX_LOCAL_VARIABLES,
1779 "local variable text pointers");
1781 make_tokeniser_grid();
1782 make_keywords_tables();
/* Seed the linked list of token debug locations with a zeroed record. */
1784 first_token_locations =
1785 my_malloc(sizeof(debug_locations), "debug locations of recent tokens");
1786 first_token_locations->location.file_index = 0;
1787 first_token_locations->location.beginning_byte_index = 0;
1788 first_token_locations->location.end_byte_index = 0;
1789 first_token_locations->location.beginning_line_number = 0;
1790 first_token_locations->location.end_line_number = 0;
1791 first_token_locations->location.beginning_character_number = 0;
1792 first_token_locations->location.end_character_number = 0;
1793 first_token_locations->next = NULL;
1794 first_token_locations->reference_count = 0;
1795 last_token_location = first_token_locations;
/* Release everything allocated by lexer_allocate_arrays, in matching
   order, and drop any remaining token-location records.
   NOTE(review): the declarations of i and p are elided from this
   sampled chunk. */
1798 extern void lexer_free_arrays(void)
1801 for (i=0; i<MAX_INCLUSION_DEPTH; i++)
1802 { p = FileStack[i].buffer;
1803 my_free(&p, "source file buffer");
1805 my_free(&FileStack, "filestack buffer");
1806 my_free(&lexeme_memory, "lexeme memory");
1808 my_free(&keywords_hash_table, "keyword hash table");
1809 my_free(&keywords_hash_ends_table, "keyword hash end table");
1810 my_free(&keywords_data_table, "keyword hashing linked list");
1811 my_free(&local_variable_hash_table, "local variable hash table");
1812 my_free(&local_variable_text_table, "text of local variable names");
1814 my_free(&local_variable_hash_codes, "local variable hash codes");
1815 my_free(&local_variable_texts, "local variable text pointers");
1817 cleanup_token_locations(NULL);
1820 /* ========================================================================= */