X-Git-Url: https://jxself.org/git/?a=blobdiff_plain;f=src%2Flexer.c;h=1843b0aee63e2cbcc07a4a144c757085e7ad4356;hb=56a5292888e1d46fe3033cd1d5c636051692453f;hp=63cafbcd4a41747c9197193b8b84ab5c38d3ab04;hpb=20cbfff96015938809d0e3da6cd0d83b76d27f14;p=inform.git diff --git a/src/lexer.c b/src/lexer.c index 63cafbc..1843b0a 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -1,8 +1,8 @@ /* ------------------------------------------------------------------------- */ /* "lexer" : Lexical analyser */ /* */ -/* Part of Inform 6.41 */ -/* copyright (c) Graham Nelson 1993 - 2022 */ +/* Part of Inform 6.42 */ +/* copyright (c) Graham Nelson 1993 - 2024 */ /* */ /* Inform is free software: you can redistribute it and/or modify */ /* it under the terms of the GNU General Public License as published by */ @@ -29,10 +29,9 @@ int total_source_line_count, /* Number of source lines so far */ (generally as a result of an error message or the start of pass) */ dont_enter_into_symbol_table, /* Return names as text (with - token type DQ_TT, i.e., as if - they had double-quotes around) - and not as entries in the symbol - table, when TRUE. If -2, only the + token type UQ_TT) and not as + entries in the symbol table, + when TRUE. If -2, only the keyword table is searched. */ return_sp_as_variable; /* When TRUE, the word "sp" denotes the stack pointer variable @@ -269,8 +268,7 @@ static lexeme_data circle[CIRCLE_SIZE]; typedef struct lextext_s { char *text; - size_t size; /* Allocated size (including terminal null) - This is always at least MAX_IDENTIFIER_LENGTH+1 */ + size_t size; /* Allocated size (including terminal null) */ } lextext; static lextext *lextexts; /* Allocated to no_lextexts */ @@ -286,12 +284,19 @@ static int lex_pos; /* Current write position in that lextext */ /* ------------------------------------------------------------------------- */ /* The lexer itself needs up to 3 characters of lookahead (it uses an */ /* LR(3) grammar to translate characters into tokens). */ +/* */ +/* Past the end of the stream, we fill in zeros. This has the awkward */ +/* side effect that a zero byte in a source file will silently terminate */ +/* it, rather than producing an "illegal source character" error. */ +/* On the up side, we can compile veneer routines (which are null- */ +/* terminated strings) with no extra work. */ /* ------------------------------------------------------------------------- */ #define LOOKAHEAD_SIZE 3 static int current, lookahead, /* The latest character read, and */ lookahead2, lookahead3; /* the three characters following it */ + /* (zero means end-of-stream) */ static int pipeline_made; /* Whether or not the pipeline of characters has been constructed @@ -337,6 +342,8 @@ extern void describe_token_triple(const char *text, int32 value, int type) break; case SQ_TT: printf("string '%s'", text); break; + case UQ_TT: printf("barestring %s", text); + break; case SEP_TT: printf("separator '%s'", text); break; case EOF_TT: printf("end of file"); @@ -440,6 +447,7 @@ static char *opcode_list_z[] = { "get_wind_prop", "scroll_window", "pop_stack", "read_mouse", "mouse_window", "push_stack", "put_wind_prop", "print_form", "make_menu", "picture_table", "print_unicode", "check_unicode", + "set_true_colour", "buffer_screen", "" }; @@ -605,11 +613,8 @@ static int lexical_context(void) always translate to the same output tokens whenever the context is the same. - In fact, for efficiency reasons this number omits the bit of - information held in the variable "dont_enter_into_symbol_table". - Inform never needs to backtrack through tokens parsed in that - way (thankfully, as it would be expensive indeed to check - the tokens). */ + (For many years, the "dont_enter_into_symbol_table" variable + was omitted from this number. But now we can include it.) */ int c = 0; if (opcode_names.enabled) c |= 1; @@ -625,11 +630,17 @@ static int lexical_context(void) if (local_variables.enabled) c |= 1024; if (return_sp_as_variable) c |= 2048; + if (dont_enter_into_symbol_table) c |= 4096; + return(c); } static void print_context(int c) { + if (c < 0) { + printf("??? "); + return; + } if ((c & 1) != 0) printf("OPC "); if ((c & 2) != 0) printf("DIR "); if ((c & 4) != 0) printf("TK "); @@ -642,6 +653,7 @@ static void print_context(int c) if ((c & 512) != 0) printf("SCON "); if ((c & 1024) != 0) printf("LV "); if ((c & 2048) != 0) printf("sp "); + if ((c & 4096) != 0) printf("dontent "); } static int *keywords_hash_table; @@ -657,14 +669,22 @@ static int *local_variable_hash_codes; 119 for Glulx. */ +/* The number of local variables in the current routine. */ +int no_locals; + /* Names of local variables in the current routine. + The values are positions in local_variable_names_memlist. This is allocated to MAX_LOCAL_VARIABLES-1. (The stack pointer "local" is not included in this array.) (This could be a memlist, growing as needed up to MAX_LOCAL_VARIABLES-1. But right now we just allocate the max.) */ -identstruct *local_variable_names; +int *local_variable_name_offsets; + +static memory_list local_variable_names_memlist; +/* How much of local_variable_names_memlist is used by the no_local locals. */ +static int local_variable_names_usage; static char one_letter_locals[128]; @@ -729,9 +749,42 @@ static void make_keywords_tables(void) } } +extern void clear_local_variables(void) +{ + no_locals = 0; + local_variable_names_usage = 0; +} + +extern void add_local_variable(char *name) +{ + int len; + + if (no_locals >= MAX_LOCAL_VARIABLES-1) { + /* This should have been caught before we got here */ + error("too many local variables"); + return; + } + + len = strlen(name)+1; + ensure_memory_list_available(&local_variable_names_memlist, local_variable_names_usage + len); + local_variable_name_offsets[no_locals++] = local_variable_names_usage; + strcpy((char *)local_variable_names_memlist.data+local_variable_names_usage, name); + local_variable_names_usage += len; +} + +extern char *get_local_variable_name(int index) +{ + if (index < 0 || index >= no_locals) + return "???"; /* shouldn't happen */ + + return (char *)local_variable_names_memlist.data + local_variable_name_offsets[index]; +} + /* Look at the strings stored in local_variable_names (from 0 to no_locals). Set local_variables.keywords to point to these, and also prepare the - hash tables. */ + hash tables. + This must be called after add_local_variable(), but before we start + compiling function code. */ extern void construct_local_variable_tables(void) { int i, h; for (i=0; i= 0) + { int *i = keywords_data_table + 3*index; + keyword_group *kg = keyword_groups[*i]; + if (kg == &directives) + { char *q = kg->keywords[*(i+1)]; + if (((kg->case_sensitive) && (strcmp(p, q)==0)) + || ((!(kg->case_sensitive)) && (strcmpcis(p, q)==0))) + { circle[pos].type = kg->change_token_type; + circle[pos].value = *(i+1); + return; + } + } + index = *(i+2); + } + } + + circle[pos].type = UQ_TT; + circle[pos].value = 0; + return; + } + /* If this is assembly language, perhaps it is "sp"? */ if (return_sp_as_variable && (p[0]=='s') && (p[1]=='p') && (p[2]==0)) @@ -790,7 +876,9 @@ static void interpret_identifier(char *p, int pos, int dirs_only_flag) if (index >= 0) { for (;index= 0) { int *i = keywords_data_table + 3*index; keyword_group *kg = keyword_groups[*i]; - if (((!dirs_only_flag) && (kg->enabled)) - || (dirs_only_flag && (kg == &directives))) + if (kg->enabled) { char *q = kg->keywords[*(i+1)]; if (((kg->case_sensitive) && (strcmp(p, q)==0)) || ((!(kg->case_sensitive)) && (strcmpcis(p, q)==0))) @@ -824,11 +910,9 @@ static void interpret_identifier(char *p, int pos, int dirs_only_flag) index = *(i+2); } - if (dirs_only_flag) return; - /* Search for the name; create it if necessary. */ - circle[pos].value = symbol_index(p, hashcode); + circle[pos].value = symbol_index(p, hashcode, &circle[pos].newsymbol); circle[pos].type = SYMBOL_TT; } @@ -901,6 +985,7 @@ static void make_tokeniser_grid(void) tokeniser_grid[0] = EOF_CODE; tokeniser_grid[' '] = WHITESPACE_CODE; tokeniser_grid['\n'] = WHITESPACE_CODE; + tokeniser_grid['\r'] = WHITESPACE_CODE; tokeniser_grid['$'] = RADIX_CODE; tokeniser_grid['!'] = COMMENT_CODE; @@ -1378,7 +1463,7 @@ static int32 construct_double(int wanthigh, int signbit, double intv, double fra /* */ /* Note that file_load_chars(p, size) loads "size" bytes into buffer "p" */ /* from the current input file. If the file runs out, then if it was */ -/* the last source file 4 EOF characters are placed in the buffer: if it */ +/* the last source file 4 null characters are placed in the buffer: if it */ /* was only an Include file ending, then a '\n' character is placed there */ /* (essentially to force termination of any comment line) followed by */ /* three harmless spaces. */ @@ -1541,12 +1626,33 @@ static int get_next_char_from_pipeline(void) CurrentLB->chars_read++; if (forerrors_pointer < FORERRORS_SIZE-1) forerrors_buff[forerrors_pointer++] = current; - if (current == '\n') reached_new_line(); + + /* The file is open in binary mode, so we have to do our own newline + conversion. (We want to do it consistently across all platforms.) + + The strategy is to convert all \r (CR) characters to \n (LF), but + *don't* advance the line counter for \r if it's followed by \n. + The rest of the lexer treats multiple \n characters the same as + one, so the simple conversion will work out okay. + + (Note that, for historical reasons, a ctrl-L (formfeed) is also + treated as \r. This conversion has already been handled by + source_to_iso_grid[].) + */ + if (current == '\n') { + reached_new_line(); + } + else if (current == '\r') { + current = '\n'; + if (lookahead != '\n') + reached_new_line(); + } + return(current); } /* ------------------------------------------------------------------------- */ -/* Source 2: from a string */ +/* Source 2: from a (null-terminated) string */ /* ------------------------------------------------------------------------- */ static int source_to_analyse_pointer; /* Current read position */ @@ -1565,7 +1671,12 @@ static int get_next_char_from_string(void) CurrentLB->chars_read++; if (forerrors_pointer < FORERRORS_SIZE-1) forerrors_buff[forerrors_pointer++] = current; + + /* We shouldn't have \r when compiling from string (veneer function). + If we do, just shove it under the carpet. */ + if (current == '\r') current = '\n'; if (current == '\n') reached_new_line(); + return(current); } @@ -1586,7 +1697,8 @@ static int get_next_char_from_string(void) /* */ /* restart_lexer(source, name) if source is NULL, initialise the lexer */ /* to read from source files; */ -/* otherwise, to read from this string. */ +/* otherwise, to read from this null- */ +/* terminated string. */ /* ------------------------------------------------------------------------- */ extern void release_token_texts(void) @@ -1632,11 +1744,28 @@ extern void release_token_texts(void) extern void put_token_back(void) { tokens_put_back++; + int pos = circle_position - tokens_put_back + 1; + if (pos<0) pos += CIRCLE_SIZE; + if (tokens_trace_level > 0) - { if (tokens_trace_level == 1) printf("<- "); - else printf("<-\n"); + { + printf("<- "); + if (tokens_trace_level > 1) { + describe_token(&circle[pos]); + printf("\n"); + } } + if (circle[pos].type == SYMBOL_TT && circle[pos].newsymbol) { + /* Remove the symbol from the symbol table. (Or mark it as unreachable + anyhow.) */ + end_symbol_scope(circle[pos].value, TRUE); + /* Remove new-symbol flag, and force reinterpretation next time + we see the symbol. */ + circle[pos].newsymbol = FALSE; + circle[pos].context = -1; + } + /* The following error, of course, should never happen! */ if (tokens_put_back == CIRCLE_SIZE) @@ -1695,7 +1824,9 @@ static void lexadds(char *str) } extern void get_next_token(void) -{ int d, i, j, k, quoted_size, e, radix, context; int32 n; char *r; +{ int d, i, j, k, quoted_size, e, radix, context; + uint32 n; + char *r; int floatend; int returning_a_put_back_token = TRUE; @@ -1708,7 +1839,7 @@ extern void get_next_token(void) if (context != circle[i].context) { j = circle[i].type; if ((j==0) || ((j>=100) && (j<200))) - interpret_identifier(circle[i].text, i, FALSE); + interpret_identifier(circle[i].text, i); circle[i].context = context; } goto ReturnBack; @@ -1723,7 +1854,7 @@ extern void get_next_token(void) /* fresh lextext block; must init it */ no_lextexts = lex_index+1; ensure_memory_list_available(&lextexts_memlist, no_lextexts); - lextexts[lex_index].size = MAX_IDENTIFIER_LENGTH + 1; + lextexts[lex_index].size = 64; /* this can grow */ lextexts[lex_index].text = my_malloc(lextexts[lex_index].size, "one lexeme text"); } lex_pos = 0; @@ -1733,6 +1864,7 @@ extern void get_next_token(void) circle[circle_position].text = NULL; /* will fill in later */ circle[circle_position].value = 0; circle[circle_position].type = 0; + circle[circle_position].newsymbol = FALSE; circle[circle_position].context = context; StartTokenAgain: @@ -1758,7 +1890,7 @@ extern void get_next_token(void) goto StartTokenAgain; case COMMENT_CODE: - while ((lookahead != '\n') && (lookahead != 0)) + while ((lookahead != '\n') && (lookahead != '\r') && (lookahead != 0)) (*get_next_char)(); goto StartTokenAgain; @@ -1779,7 +1911,7 @@ extern void get_next_token(void) lexaddc(0); circle[circle_position].type = NUMBER_TT; - circle[circle_position].value = n; + circle[circle_position].value = (int32)n; break; FloatNumber: @@ -1869,11 +2001,7 @@ extern void get_next_token(void) quoted_size=0; do { e = d; d = (*get_next_char)(); lexaddc(d); - if (quoted_size++==64) - { error( - "Too much text for one pair of quotations '...' to hold"); - lexaddc('\''); break; - } + quoted_size++; if ((d == '\'') && (e != '@')) { if (quoted_size == 1) { d = (*get_next_char)(); lexaddc(d); @@ -1882,28 +2010,27 @@ extern void get_next_token(void) } break; } - } while (d != EOF); - if (d==EOF) ebf_error("'\''", "end of file"); + } while (d != 0); + if (d==0) ebf_error("'\''", "end of file"); lexdelc(); circle[circle_position].type = SQ_TT; break; case DQUOTE_CODE: /* Double-quotes: scan a literal string */ - quoted_size=0; do { d = (*get_next_char)(); lexaddc(d); if (d == '\n') { lex_pos--; while (lexlastc() == ' ') lex_pos--; if (lexlastc() != '^') lexaddc(' '); - while ((lookahead != EOF) && + while ((lookahead != 0) && (tokeniser_grid[lookahead] == WHITESPACE_CODE)) (*get_next_char)(); } else if (d == '\\') { int newline_passed = FALSE; lex_pos--; - while ((lookahead != EOF) && + while ((lookahead != 0) && (tokeniser_grid[lookahead] == WHITESPACE_CODE)) if ((d = (*get_next_char)()) == '\n') newline_passed = TRUE; @@ -1915,8 +2042,8 @@ extern void get_next_token(void) chb); } } - } while ((d != EOF) && (d!='\"')); - if (d==EOF) ebf_error("'\"'", "end of file"); + } while ((d != 0) && (d!='\"')); + if (d==0) ebf_error("'\"'", "end of file"); lexdelc(); circle[circle_position].type = DQ_TT; break; @@ -1924,37 +2051,13 @@ extern void get_next_token(void) case IDENTIFIER_CODE: /* Letter or underscore: an identifier */ lexaddc(d); n=1; - while ((n<=MAX_IDENTIFIER_LENGTH) - && ((tokeniser_grid[lookahead] == IDENTIFIER_CODE) + while (((tokeniser_grid[lookahead] == IDENTIFIER_CODE) || (tokeniser_grid[lookahead] == DIGIT_CODE))) n++, lexaddc((*get_next_char)()); lexaddc(0); - if (n > MAX_IDENTIFIER_LENGTH) - { char bad_length[100]; - sprintf(bad_length, - "Name exceeds the maximum length of %d characters:", - MAX_IDENTIFIER_LENGTH); - error_named(bad_length, lextexts[lex_index].text); - /* Eat any further extra characters in the identifier */ - while (((tokeniser_grid[lookahead] == IDENTIFIER_CODE) - || (tokeniser_grid[lookahead] == DIGIT_CODE))) - (*get_next_char)(); - /* Trim token so that it doesn't violate - MAX_IDENTIFIER_LENGTH during error recovery */ - lextexts[lex_index].text[MAX_IDENTIFIER_LENGTH] = 0; - } - - if (dont_enter_into_symbol_table) - { circle[circle_position].type = DQ_TT; - circle[circle_position].value = 0; - if (dont_enter_into_symbol_table == -2) - interpret_identifier(lextexts[lex_index].text, circle_position, TRUE); - break; - } - - interpret_identifier(lextexts[lex_index].text, circle_position, FALSE); + interpret_identifier(lextexts[lex_index].text, circle_position); break; default: @@ -2059,7 +2162,10 @@ extern void get_next_token(void) else { printf("-> "); describe_token(&circle[i]); printf(" "); - if (tokens_trace_level > 2) print_context(circle[i].context); + if (tokens_trace_level > 2) { + if (circle[i].newsymbol) printf("newsym "); + print_context(circle[i].context); + } printf("\n"); } } @@ -2073,6 +2179,7 @@ extern void restart_lexer(char *lexical_source, char *name) for (i=0; i