/* ------------------------------------------------------------------------- */
/* "lexer" : Lexical analyser */
/* */
-/* Part of Inform 6.41 */
-/* copyright (c) Graham Nelson 1993 - 2022 */
+/* Part of Inform 6.42 */
+/* copyright (c) Graham Nelson 1993 - 2024 */
/* */
/* Inform is free software: you can redistribute it and/or modify */
/* it under the terms of the GNU General Public License as published by */
(generally as a result of an error
message or the start of pass) */
dont_enter_into_symbol_table, /* Return names as text (with
- token type DQ_TT, i.e., as if
- they had double-quotes around)
- and not as entries in the symbol
- table, when TRUE. If -2, only the
+ token type UQ_TT) and not as
+ entries in the symbol table,
+ when TRUE. If -2, only the
keyword table is searched. */
return_sp_as_variable; /* When TRUE, the word "sp" denotes
the stack pointer variable
typedef struct lextext_s {
char *text;
- size_t size; /* Allocated size (including terminal null)
- This is always at least MAX_IDENTIFIER_LENGTH+1 */
+ size_t size; /* Allocated size (including terminal null) */
} lextext;
static lextext *lextexts; /* Allocated to no_lextexts */
/* ------------------------------------------------------------------------- */
/* The lexer itself needs up to 3 characters of lookahead (it uses an */
/* LR(3) grammar to translate characters into tokens). */
+/* */
+/* Past the end of the stream, we fill in zeros. This has the awkward */
+/* side effect that a zero byte in a source file will silently terminate */
+/* it, rather than producing an "illegal source character" error. */
+/* On the up side, we can compile veneer routines (which are null- */
+/* terminated strings) with no extra work. */
/* ------------------------------------------------------------------------- */
#define LOOKAHEAD_SIZE 3
static int current, lookahead, /* The latest character read, and */
lookahead2, lookahead3; /* the three characters following it */
+ /* (zero means end-of-stream) */
static int pipeline_made; /* Whether or not the pipeline of
characters has been constructed
break;
case SQ_TT: printf("string '%s'", text);
break;
+ case UQ_TT: printf("barestring %s", text);
+ break;
case SEP_TT: printf("separator '%s'", text);
break;
case EOF_TT: printf("end of file");
"get_wind_prop", "scroll_window", "pop_stack", "read_mouse",
"mouse_window", "push_stack", "put_wind_prop", "print_form",
"make_menu", "picture_table", "print_unicode", "check_unicode",
+ "set_true_colour", "buffer_screen",
""
};
always translate to the same output tokens whenever the context
is the same.
- In fact, for efficiency reasons this number omits the bit of
- information held in the variable "dont_enter_into_symbol_table".
- Inform never needs to backtrack through tokens parsed in that
- way (thankfully, as it would be expensive indeed to check
- the tokens). */
+ (For many years, the "dont_enter_into_symbol_table" variable
+ was omitted from this number. But now we can include it.) */
int c = 0;
if (opcode_names.enabled) c |= 1;
if (local_variables.enabled) c |= 1024;
if (return_sp_as_variable) c |= 2048;
+ if (dont_enter_into_symbol_table) c |= 4096;
+
return(c);
}
static void print_context(int c)
{
+ if (c < 0) {
+ printf("??? ");
+ return;
+ }
if ((c & 1) != 0) printf("OPC ");
if ((c & 2) != 0) printf("DIR ");
if ((c & 4) != 0) printf("TK ");
if ((c & 512) != 0) printf("SCON ");
if ((c & 1024) != 0) printf("LV ");
if ((c & 2048) != 0) printf("sp ");
+ if ((c & 4096) != 0) printf("dontent ");
}
static int *keywords_hash_table;
119 for Glulx.
*/
+/* The number of local variables in the current routine. */
+int no_locals;
+
/* Names of local variables in the current routine.
+ The values are positions in local_variable_names_memlist.
This is allocated to MAX_LOCAL_VARIABLES-1. (The stack pointer "local"
is not included in this array.)
(This could be a memlist, growing as needed up to MAX_LOCAL_VARIABLES-1.
But right now we just allocate the max.)
*/
-identstruct *local_variable_names;
+int *local_variable_name_offsets;
+
+static memory_list local_variable_names_memlist;
+/* How much of local_variable_names_memlist is used by the no_local locals. */
+static int local_variable_names_usage;
static char one_letter_locals[128];
}
}
+/* Discard all local-variable names recorded so far, resetting both the
+   name count and the text-pool usage. Called when we begin lexing a new
+   routine (before any add_local_variable() calls for it). */
+extern void clear_local_variables(void)
+{
+    no_locals = 0;
+    local_variable_names_usage = 0;
+}
+
+/* Record one local-variable name for the current routine. The text is
+   copied into local_variable_names_memlist (growing it as needed) and
+   its byte offset is stored in local_variable_name_offsets[no_locals].
+   The MAX_LOCAL_VARIABLES-1 limit should already have been enforced by
+   the caller; we report an error rather than overflow if it was not. */
+extern void add_local_variable(char *name)
+{
+    int len;
+
+    if (no_locals >= MAX_LOCAL_VARIABLES-1) {
+        /* This should have been caught before we got here */
+        error("too many local variables");
+        return;
+    }
+
+    /* +1 to include the terminal null in the stored text. */
+    len = strlen(name)+1;
+    ensure_memory_list_available(&local_variable_names_memlist, local_variable_names_usage + len);
+    local_variable_name_offsets[no_locals++] = local_variable_names_usage;
+    strcpy((char *)local_variable_names_memlist.data+local_variable_names_usage, name);
+    local_variable_names_usage += len;
+}
+
+/* Return the name of local variable number "index" (zero-based; the
+   stack pointer "sp" is not counted). The returned pointer aims into
+   local_variable_names_memlist, so it is only valid until the list is
+   next cleared or reallocated. An out-of-range index yields a
+   placeholder string; that case should never occur in practice. */
+extern char *get_local_variable_name(int index)
+{
+    if (index < 0 || index >= no_locals)
+        return "???"; /* shouldn't happen */
+
+    return (char *)local_variable_names_memlist.data + local_variable_name_offsets[index];
+}
+
/* Look at the strings stored in local_variable_names (from 0 to no_locals).
Set local_variables.keywords to point to these, and also prepare the
- hash tables. */
+ hash tables.
+ This must be called after add_local_variable(), but before we start
+ compiling function code. */
extern void construct_local_variable_tables(void)
{ int i, h;
for (i=0; i<HASH_TAB_SIZE; i++) local_variable_hash_table[i] = -1;
for (i=0; i<no_locals; i++)
{
- char *p = local_variable_names[i].text;
+ char *p = (char *)local_variable_names_memlist.data + local_variable_name_offsets[i];
local_variables.keywords[i] = p;
if (p[1] == 0)
{ one_letter_locals[(uchar)p[0]] = i;
}
}
-static void interpret_identifier(char *p, int pos, int dirs_only_flag)
+static void interpret_identifier(char *p, int pos)
{ int index, hashcode;
/* An identifier is either a keyword or a "symbol", a name which the
lexical analyser leaves to higher levels of Inform to understand. */
+ circle[pos].newsymbol = FALSE;
+
hashcode = hash_code_from_string(p);
- if (dirs_only_flag) goto KeywordSearch;
+ /* If dont_enter_into_symbol_table is true, we skip all keywords
+ (and variables) and just mark the name as an unquoted string.
+ Except that if dont_enter_into_symbol_table is -2, we recognize
+ directive keywords (only).
+ */
+ if (dont_enter_into_symbol_table) {
+
+ if (dont_enter_into_symbol_table == -2) {
+ /* This is a simplified version of the keyword-checking loop
+ below. */
+ index = keywords_hash_table[hashcode];
+ while (index >= 0)
+ { int *i = keywords_data_table + 3*index;
+ keyword_group *kg = keyword_groups[*i];
+ if (kg == &directives)
+ { char *q = kg->keywords[*(i+1)];
+ if (((kg->case_sensitive) && (strcmp(p, q)==0))
+ || ((!(kg->case_sensitive)) && (strcmpcis(p, q)==0)))
+ { circle[pos].type = kg->change_token_type;
+ circle[pos].value = *(i+1);
+ return;
+ }
+ }
+ index = *(i+2);
+ }
+ }
+
+ circle[pos].type = UQ_TT;
+ circle[pos].value = 0;
+ return;
+ }
+
/* If this is assembly language, perhaps it is "sp"? */
if (return_sp_as_variable && (p[0]=='s') && (p[1]=='p') && (p[2]==0))
if (index >= 0)
{ for (;index<no_locals;index++)
{ if (hashcode == local_variable_hash_codes[index])
- { if (strcmpcis(p, local_variable_names[index].text)==0)
+ {
+ char *locname = (char *)local_variable_names_memlist.data + local_variable_name_offsets[index];
+ if (strcmpcis(p, locname)==0)
{ circle[pos].type = LOCAL_VARIABLE_TT;
circle[pos].value = index+1;
return;
/* Now the bulk of the keywords. Note that the lexer doesn't recognise
the name of a system function which has been Replaced. */
- KeywordSearch:
index = keywords_hash_table[hashcode];
while (index >= 0)
{ int *i = keywords_data_table + 3*index;
keyword_group *kg = keyword_groups[*i];
- if (((!dirs_only_flag) && (kg->enabled))
- || (dirs_only_flag && (kg == &directives)))
+ if (kg->enabled)
{ char *q = kg->keywords[*(i+1)];
if (((kg->case_sensitive) && (strcmp(p, q)==0))
|| ((!(kg->case_sensitive)) && (strcmpcis(p, q)==0)))
index = *(i+2);
}
- if (dirs_only_flag) return;
-
/* Search for the name; create it if necessary. */
- circle[pos].value = symbol_index(p, hashcode);
+ circle[pos].value = symbol_index(p, hashcode, &circle[pos].newsymbol);
circle[pos].type = SYMBOL_TT;
}
tokeniser_grid[0] = EOF_CODE;
tokeniser_grid[' '] = WHITESPACE_CODE;
tokeniser_grid['\n'] = WHITESPACE_CODE;
+ tokeniser_grid['\r'] = WHITESPACE_CODE;
tokeniser_grid['$'] = RADIX_CODE;
tokeniser_grid['!'] = COMMENT_CODE;
/* */
/* Note that file_load_chars(p, size) loads "size" bytes into buffer "p" */
/* from the current input file. If the file runs out, then if it was */
-/* the last source file 4 EOF characters are placed in the buffer: if it */
+/* the last source file 4 null characters are placed in the buffer: if it */
/* was only an Include file ending, then a '\n' character is placed there */
/* (essentially to force termination of any comment line) followed by */
/* three harmless spaces. */
CurrentLB->chars_read++;
if (forerrors_pointer < FORERRORS_SIZE-1)
forerrors_buff[forerrors_pointer++] = current;
- if (current == '\n') reached_new_line();
+
+ /* The file is open in binary mode, so we have to do our own newline
+ conversion. (We want to do it consistently across all platforms.)
+
+ The strategy is to convert all \r (CR) characters to \n (LF), but
+ *don't* advance the line counter for \r if it's followed by \n.
+ The rest of the lexer treats multiple \n characters the same as
+ one, so the simple conversion will work out okay.
+
+ (Note that, for historical reasons, a ctrl-L (formfeed) is also
+ treated as \r. This conversion has already been handled by
+ source_to_iso_grid[].)
+ */
+ if (current == '\n') {
+ reached_new_line();
+ }
+ else if (current == '\r') {
+ current = '\n';
+ if (lookahead != '\n')
+ reached_new_line();
+ }
+
return(current);
}
/* ------------------------------------------------------------------------- */
-/* Source 2: from a string */
+/* Source 2: from a (null-terminated) string */
/* ------------------------------------------------------------------------- */
static int source_to_analyse_pointer; /* Current read position */
CurrentLB->chars_read++;
if (forerrors_pointer < FORERRORS_SIZE-1)
forerrors_buff[forerrors_pointer++] = current;
+
+ /* We shouldn't have \r when compiling from string (veneer function).
+ If we do, just shove it under the carpet. */
+ if (current == '\r') current = '\n';
if (current == '\n') reached_new_line();
+
return(current);
}
/* */
/* restart_lexer(source, name) if source is NULL, initialise the lexer */
/* to read from source files; */
-/* otherwise, to read from this string. */
+/* otherwise, to read from this null- */
+/* terminated string. */
/* ------------------------------------------------------------------------- */
extern void release_token_texts(void)
extern void put_token_back(void)
{ tokens_put_back++;
+ int pos = circle_position - tokens_put_back + 1;
+ if (pos<0) pos += CIRCLE_SIZE;
+
if (tokens_trace_level > 0)
- { if (tokens_trace_level == 1) printf("<- ");
- else printf("<-\n");
+ {
+ printf("<- ");
+ if (tokens_trace_level > 1) {
+ describe_token(&circle[pos]);
+ printf("\n");
+ }
}
+ if (circle[pos].type == SYMBOL_TT && circle[pos].newsymbol) {
+ /* Remove the symbol from the symbol table. (Or mark it as unreachable
+ anyhow.) */
+ end_symbol_scope(circle[pos].value, TRUE);
+ /* Remove new-symbol flag, and force reinterpretation next time
+ we see the symbol. */
+ circle[pos].newsymbol = FALSE;
+ circle[pos].context = -1;
+ }
+
/* The following error, of course, should never happen! */
if (tokens_put_back == CIRCLE_SIZE)
}
extern void get_next_token(void)
-{ int d, i, j, k, quoted_size, e, radix, context; int32 n; char *r;
+{ int d, i, j, k, quoted_size, e, radix, context;
+ uint32 n;
+ char *r;
int floatend;
int returning_a_put_back_token = TRUE;
if (context != circle[i].context)
{ j = circle[i].type;
if ((j==0) || ((j>=100) && (j<200)))
- interpret_identifier(circle[i].text, i, FALSE);
+ interpret_identifier(circle[i].text, i);
circle[i].context = context;
}
goto ReturnBack;
/* fresh lextext block; must init it */
no_lextexts = lex_index+1;
ensure_memory_list_available(&lextexts_memlist, no_lextexts);
- lextexts[lex_index].size = MAX_IDENTIFIER_LENGTH + 1;
+ lextexts[lex_index].size = 64; /* this can grow */
lextexts[lex_index].text = my_malloc(lextexts[lex_index].size, "one lexeme text");
}
lex_pos = 0;
circle[circle_position].text = NULL; /* will fill in later */
circle[circle_position].value = 0;
circle[circle_position].type = 0;
+ circle[circle_position].newsymbol = FALSE;
circle[circle_position].context = context;
StartTokenAgain:
goto StartTokenAgain;
case COMMENT_CODE:
- while ((lookahead != '\n') && (lookahead != 0))
+ while ((lookahead != '\n') && (lookahead != '\r') && (lookahead != 0))
(*get_next_char)();
goto StartTokenAgain;
lexaddc(0);
circle[circle_position].type = NUMBER_TT;
- circle[circle_position].value = n;
+ circle[circle_position].value = (int32)n;
break;
FloatNumber:
quoted_size=0;
do
{ e = d; d = (*get_next_char)(); lexaddc(d);
- if (quoted_size++==64)
- { error(
- "Too much text for one pair of quotations '...' to hold");
- lexaddc('\''); break;
- }
+ quoted_size++;
if ((d == '\'') && (e != '@'))
{ if (quoted_size == 1)
{ d = (*get_next_char)(); lexaddc(d);
}
break;
}
- } while (d != EOF);
- if (d==EOF) ebf_error("'\''", "end of file");
+ } while (d != 0);
+ if (d==0) ebf_error("'\''", "end of file");
lexdelc();
circle[circle_position].type = SQ_TT;
break;
case DQUOTE_CODE: /* Double-quotes: scan a literal string */
- quoted_size=0;
do
{ d = (*get_next_char)(); lexaddc(d);
if (d == '\n')
{ lex_pos--;
while (lexlastc() == ' ') lex_pos--;
if (lexlastc() != '^') lexaddc(' ');
- while ((lookahead != EOF) &&
+ while ((lookahead != 0) &&
(tokeniser_grid[lookahead] == WHITESPACE_CODE))
(*get_next_char)();
}
else if (d == '\\')
{ int newline_passed = FALSE;
lex_pos--;
- while ((lookahead != EOF) &&
+ while ((lookahead != 0) &&
(tokeniser_grid[lookahead] == WHITESPACE_CODE))
if ((d = (*get_next_char)()) == '\n')
newline_passed = TRUE;
chb);
}
}
- } while ((d != EOF) && (d!='\"'));
- if (d==EOF) ebf_error("'\"'", "end of file");
+ } while ((d != 0) && (d!='\"'));
+ if (d==0) ebf_error("'\"'", "end of file");
lexdelc();
circle[circle_position].type = DQ_TT;
break;
case IDENTIFIER_CODE: /* Letter or underscore: an identifier */
lexaddc(d); n=1;
- while ((n<=MAX_IDENTIFIER_LENGTH)
- && ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
+ while (((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
|| (tokeniser_grid[lookahead] == DIGIT_CODE)))
n++, lexaddc((*get_next_char)());
lexaddc(0);
- if (n > MAX_IDENTIFIER_LENGTH)
- { char bad_length[100];
- sprintf(bad_length,
- "Name exceeds the maximum length of %d characters:",
- MAX_IDENTIFIER_LENGTH);
- error_named(bad_length, lextexts[lex_index].text);
- /* Eat any further extra characters in the identifier */
- while (((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
- || (tokeniser_grid[lookahead] == DIGIT_CODE)))
- (*get_next_char)();
- /* Trim token so that it doesn't violate
- MAX_IDENTIFIER_LENGTH during error recovery */
- lextexts[lex_index].text[MAX_IDENTIFIER_LENGTH] = 0;
- }
-
- if (dont_enter_into_symbol_table)
- { circle[circle_position].type = DQ_TT;
- circle[circle_position].value = 0;
- if (dont_enter_into_symbol_table == -2)
- interpret_identifier(lextexts[lex_index].text, circle_position, TRUE);
- break;
- }
-
- interpret_identifier(lextexts[lex_index].text, circle_position, FALSE);
+ interpret_identifier(lextexts[lex_index].text, circle_position);
break;
default:
else
{ printf("-> "); describe_token(&circle[i]);
printf(" ");
- if (tokens_trace_level > 2) print_context(circle[i].context);
+ if (tokens_trace_level > 2) {
+ if (circle[i].newsymbol) printf("newsym ");
+ print_context(circle[i].context);
+ }
printf("\n");
}
}
for (i=0; i<CIRCLE_SIZE; i++)
{ circle[i].type = 0;
circle[i].value = 0;
+ circle[i].newsymbol = FALSE;
circle[i].text = "(if this is ever visible, there is a bug)";
circle[i].lextext = -1;
circle[i].context = 0;
cur_lextexts = 0;
lex_index = -1;
lex_pos = -1;
+
+ no_locals = 0;
+ local_variable_names_usage = 0;
blank_brief_location.file_index = -1;
blank_brief_location.line_number = 0;
pipeline_made = FALSE;
+ no_locals = 0;
+
restart_lexer(NULL, NULL);
}
keywords_data_table = my_calloc(sizeof(int), 3*MAX_KEYWORDS,
"keyword hashing linked list");
- local_variable_names = my_calloc(sizeof(identstruct), MAX_LOCAL_VARIABLES-1,
+ initialise_memory_list(&local_variable_names_memlist,
+ sizeof(char), MAX_LOCAL_VARIABLES*32, NULL,
"text of local variable names");
+ local_variable_name_offsets = my_calloc(sizeof(int), MAX_LOCAL_VARIABLES-1,
+ "offsets of local variable names");
local_variable_hash_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
"local variable hash table");
local_variable_hash_codes = my_calloc(sizeof(int), MAX_LOCAL_VARIABLES,
my_free(&keywords_hash_ends_table, "keyword hash end table");
my_free(&keywords_data_table, "keyword hashing linked list");
- my_free(&local_variable_names, "text of local variable names");
+ deallocate_memory_list(&local_variable_names_memlist);
+ my_free(&local_variable_name_offsets, "offsets of local variable names");
my_free(&local_variable_hash_table, "local variable hash table");
my_free(&local_variable_hash_codes, "local variable hash codes");