X-Git-Url: https://jxself.org/git/?a=blobdiff_plain;f=src%2Ftext.c;h=525ecec6da03a40df06bc9b28d51f9b25bc28cbc;hb=8e63120c630c94c598d4e2d6ba823dac59bce8fa;hp=88ceaecba825ae552ad94f1f9fc53f69131dd06a;hpb=d11f2f726ed7feea617476d99cf7505ddd9a27ce;p=inform.git diff --git a/src/text.c b/src/text.c index 88ceaec..525ecec 100644 --- a/src/text.c +++ b/src/text.c @@ -1,8 +1,8 @@ /* ------------------------------------------------------------------------- */ /* "text" : Text translation, the abbreviations optimiser, the dictionary */ /* */ -/* Part of Inform 6.35 */ -/* copyright (c) Graham Nelson 1993 - 2021 */ +/* Part of Inform 6.40 */ +/* copyright (c) Graham Nelson 1993 - 2022 */ /* */ /* Inform is free software: you can redistribute it and/or modify */ /* it under the terms of the GNU General Public License as published by */ @@ -15,29 +15,30 @@ /* GNU General Public License for more details. */ /* */ /* You should have received a copy of the GNU General Public License */ -/* along with Inform. If not, see https://gnu.org/licenses/ * +/* along with Inform. If not, see https://gnu.org/licenses/ */ /* */ /* ------------------------------------------------------------------------- */ #include "header.h" -uchar *low_strings, *low_strings_top; /* Start and next free byte in the low - strings pool */ +uchar *low_strings; /* Allocated to low_strings_top */ +int32 low_strings_top; +static memory_list low_strings_memlist; int32 static_strings_extent; /* Number of bytes of static strings made so far */ -memory_block static_strings_area; /* Used if (!temporary_files_switch) to - hold the static strings area so far */ +uchar *static_strings_area; /* Used to hold the static strings + area so far + Allocated to static_strings_extent */ +memory_list static_strings_area_memlist; -static uchar *strings_holding_area; /* Area holding translated strings - until they are moved into either - a temporary file, or the - static_strings_area below */ - -char *all_text, *all_text_top; /* Start and next byte free in (large) - text buffer holding the entire text +static char *all_text; /* Text buffer holding the entire text of the game, when it is being - recorded */ + recorded + (Allocated to all_text_top) */ +static memory_list all_text_memlist; +static int32 all_text_top; + int abbrevs_lookup_table_made, /* The abbreviations lookup table is constructed when the first non- abbreviation string is translated: @@ -48,8 +49,6 @@ int abbrevs_lookup_table_made, /* The abbreviations lookup table is with ASCII character n, or -1 if none of the abbreviations do */ int no_abbreviations; /* No of abbreviations defined so far */ -uchar *abbreviations_at; /* Memory to hold the text of any - abbreviation strings declared */ /* ------------------------------------------------------------------------- */ /* Glulx string compression storage */ /* ------------------------------------------------------------------------- */ @@ -62,7 +61,6 @@ int no_dynamic_strings; /* No. of @.. string escapes used int no_unicode_chars; /* Number of distinct Unicode chars used. (Beyond 0xFF.) */ -static int MAX_CHARACTER_SET; /* Number of possible entities */ huffentity_t *huff_entities; /* The list of entities (characters, abbreviations, @.. escapes, and the terminator) */ @@ -87,11 +85,16 @@ int32 compression_string_size; /* Length of the compressed string int32 *compressed_offsets; /* The beginning of every string in the game, relative to the beginning of the Huffman table. (So entry 0 - is equal to compression_table_size)*/ + is equal to compression_table_size). + Allocated to no_strings at + compress_game_text() time. */ +static memory_list compressed_offsets_memlist; + +unicode_usage_t *unicode_usage_entries; /* Allocated to no_unicode_chars */ +static memory_list unicode_usage_entries_memlist; #define UNICODE_HASH_BUCKETS (64) -unicode_usage_t *unicode_usage_entries; -static unicode_usage_t *unicode_usage_hash[UNICODE_HASH_BUCKETS]; +static int unicode_usage_hash[UNICODE_HASH_BUCKETS]; static int unicode_entity_index(int32 unicode); @@ -99,9 +102,20 @@ static int unicode_entity_index(int32 unicode); /* Abbreviation arrays */ /* ------------------------------------------------------------------------- */ -int *abbrev_values; -int *abbrev_quality; -int *abbrev_freqs; +abbreviation *abbreviations; /* Allocated up to no_abbreviations */ +static memory_list abbreviations_memlist; + +/* Memory to hold the text of any abbreviation strings declared. This is + counted in units of MAX_ABBREV_LENGTH bytes. (An abbreviation must fit + in that many bytes, null included.) */ +uchar *abbreviations_at; /* Allocated up to no_abbreviations */ +static memory_list abbreviations_at_memlist; + +static int *abbreviations_optimal_parse_schedule; +static memory_list abbreviations_optimal_parse_schedule_memlist; + +static int *abbreviations_optimal_parse_scores; +static memory_list abbreviations_optimal_parse_scores_memlist; /* ------------------------------------------------------------------------- */ @@ -110,26 +124,29 @@ int32 total_chars_trans, /* Number of ASCII chars of text in */ zchars_trans_in_last_string; /* Number of Z-chars in last string: needed only for abbrev efficiency calculation in "directs.c" */ -static int32 total_zchars_trans, /* Number of Z-chars of text out +static int32 total_zchars_trans; /* Number of Z-chars of text out (only used to calculate the above) */ - no_chars_transcribed; /* Number of ASCII chars written to - the text transcription area (used - for the -r and -u switches) */ static int zchars_out_buffer[3], /* During text translation, a buffer of 3 Z-chars at a time: when it's full these are written as a 2-byte word */ zob_index; /* Index (0 to 2) into it */ -static unsigned char *text_out_pc; /* The "program counter" during text - translation: the next address to +uchar *translated_text; /* Area holding translated strings + until they are moved into the + static_strings_area below */ +static memory_list translated_text_memlist; + +static int32 text_out_pos; /* The "program counter" during text + translation: the next position to write Z-coded text output to */ -static unsigned char *text_out_limit; /* The upper limit of text_out_pc - during text translation */ +static int32 text_out_limit; /* The upper limit of text_out_pos + during text translation (or -1 + for no limit) */ static int text_out_overflow; /* During text translation, becomes - true if text_out_pc tries to pass + true if text_out_pos tries to pass text_out_limit */ /* ------------------------------------------------------------------------- */ @@ -154,10 +171,10 @@ static void make_abbrevs_lookup(void) p2=(char *)abbreviations_at+k*MAX_ABBREV_LENGTH; if (strcmp(p1,p2)<0) { strcpy(p,p1); strcpy(p1,p2); strcpy(p2,p); - l=abbrev_values[j]; abbrev_values[j]=abbrev_values[k]; - abbrev_values[k]=l; - l=abbrev_quality[j]; abbrev_quality[j]=abbrev_quality[k]; - abbrev_quality[k]=l; + l=abbreviations[j].value; abbreviations[j].value=abbreviations[k].value; + abbreviations[k].value=l; + l=abbreviations[j].quality; abbreviations[j].quality=abbreviations[k].quality; + abbreviations[k].quality=l; bubble_sort = TRUE; } } @@ -166,7 +183,7 @@ static void make_abbrevs_lookup(void) for (j=no_abbreviations-1; j>=0; j--) { p1=(char *)abbreviations_at+j*MAX_ABBREV_LENGTH; abbrevs_lookup[(uchar)p1[0]]=j; - abbrev_freqs[j]=0; + abbreviations[j].freq=0; } abbrevs_lookup_table_made = TRUE; } @@ -197,7 +214,7 @@ static int try_abbreviations_from(unsigned char *text, int i, int from) if (!glulx_mode) { for (k=0; p[k]!=0; k++) text[i+k]=1; } - abbrev_freqs[j]++; + abbreviations[j].freq++; return(j); NotMatched: ; } @@ -207,15 +224,25 @@ static int try_abbreviations_from(unsigned char *text, int i, int from) extern void make_abbreviation(char *text) { + ensure_memory_list_available(&abbreviations_memlist, no_abbreviations+1); + ensure_memory_list_available(&abbreviations_at_memlist, no_abbreviations+1); + strcpy((char *)abbreviations_at + no_abbreviations*MAX_ABBREV_LENGTH, text); - abbrev_values[no_abbreviations] = compile_string(text, STRCTX_ABBREV); + abbreviations[no_abbreviations].value = compile_string(text, STRCTX_ABBREV); + abbreviations[no_abbreviations].freq = 0; /* The quality is the number of Z-chars saved by using this */ /* abbreviation: note that it takes 2 Z-chars to print it. */ - abbrev_quality[no_abbreviations++] = zchars_trans_in_last_string - 2; + abbreviations[no_abbreviations].quality = zchars_trans_in_last_string - 2; + + if (abbreviations[no_abbreviations].quality <= 0) { + warning_named("Abbreviation does not save any characters:", text); + } + + no_abbreviations++; } /* ------------------------------------------------------------------------- */ @@ -226,29 +253,47 @@ extern void make_abbreviation(char *text) /* ------------------------------------------------------------------------- */ extern int32 compile_string(char *b, int strctx) -{ int i, j; uchar *c; - +{ int32 i, j, k; + uchar *c; + int in_low_memory; + + if (execution_never_reaches_here) { + /* No need to put strings into gametext.txt or the static/low + strings areas. */ + if (strctx == STRCTX_GAME || strctx == STRCTX_GAMEOPC || strctx == STRCTX_LOWSTRING || strctx == STRCTX_INFIX) { + /* VENEER and VENEEROPC are only used at the translate_text level, + so we don't have to catch them here. */ + return 0; + } + } + /* In Z-code, abbreviations go in the low memory pool (0x100). So do strings explicitly defined with the Lowstring directive. (In Glulx, the in_low_memory flag is ignored.) */ - int in_low_memory = (strctx == STRCTX_ABBREV || strctx == STRCTX_LOWSTRING); + in_low_memory = (strctx == STRCTX_ABBREV || strctx == STRCTX_LOWSTRING); if (!glulx_mode && in_low_memory) - { j=subtract_pointers(low_strings_top,low_strings); - low_strings_top=translate_text(low_strings_top, low_strings+MAX_LOW_STRINGS, b, strctx); - if (!low_strings_top) - memoryerror("MAX_LOW_STRINGS", MAX_LOW_STRINGS); + { + k = translate_text(-1, b, strctx); + if (k<0) { + error("text translation failed"); + k = 0; + } + ensure_memory_list_available(&low_strings_memlist, low_strings_top+k); + memcpy(low_strings+low_strings_top, translated_text, k); + j = low_strings_top; + low_strings_top += k; return(0x21+(j/2)); } if (glulx_mode && done_compression) compiler_error("Tried to add a string after compression was done."); - c = translate_text(strings_holding_area, strings_holding_area+MAX_STATIC_STRINGS, b, strctx); - if (!c) - memoryerror("MAX_STATIC_STRINGS",MAX_STATIC_STRINGS); - - i = subtract_pointers(c, strings_holding_area); + i = translate_text(-1, b, strctx); + if (i < 0) { + error("text translation failed"); + i = 0; + } /* Insert null bytes as needed to ensure that the next static string */ /* also occurs at an address expressible as a packed address */ @@ -261,23 +306,18 @@ extern int32 compile_string(char *b, int strctx) textalign = scale_factor; while ((i%textalign)!=0) { - if (i+2 > MAX_STATIC_STRINGS) - memoryerror("MAX_STATIC_STRINGS",MAX_STATIC_STRINGS); - i+=2; *c++ = 0; *c++ = 0; + ensure_memory_list_available(&translated_text_memlist, i+2); + translated_text[i++] = 0; + translated_text[i++] = 0; } } j = static_strings_extent; - if (temporary_files_switch) - for (c=strings_holding_area; c text_out_limit) { - text_out_overflow = TRUE; - return; + + if (text_out_limit >= 0) { + if (text_out_pos+2 > text_out_limit) { + text_out_overflow = TRUE; + return; + } } - text_out_pc[0] = j/256; text_out_pc[1] = j%256; text_out_pc+=2; + else { + ensure_memory_list_available(&translated_text_memlist, text_out_pos+2); + } + + translated_text[text_out_pos++] = j/256; translated_text[text_out_pos++] = j%256; total_bytes_trans+=2; } @@ -342,57 +389,84 @@ static void write_zscii(int zsc) /* ------------------------------------------------------------------------- */ static void end_z_chars(void) -{ unsigned char *p; +{ zchars_trans_in_last_string=total_zchars_trans-zchars_trans_in_last_string; while (zob_index!=0) write_z_char_z(5); - p=(unsigned char *) text_out_pc; - *(p-2)= *(p-2)+128; + if (text_out_pos < 2) { + /* Something went wrong. */ + text_out_overflow = TRUE; + return; + } + translated_text[text_out_pos-2] += 128; } /* Glulx handles this much more simply -- compression is done elsewhere. */ static void write_z_char_g(int i) { - ASSERT_GLULX(); - if (text_out_pc+1 > text_out_limit) { - text_out_overflow = TRUE; - return; - } - total_zchars_trans++; - text_out_pc[0] = i; - text_out_pc++; - total_bytes_trans++; + ASSERT_GLULX(); + if (text_out_limit >= 0) { + if (text_out_pos+1 > text_out_limit) { + text_out_overflow = TRUE; + return; + } + } + else { + ensure_memory_list_available(&translated_text_memlist, text_out_pos+1); + } + total_zchars_trans++; + translated_text[text_out_pos++] = i; + total_bytes_trans++; +} + +/* Helper routine to compute the weight, in units, of a character handled by the Z-Machine */ +static int zchar_weight(int c) +{ + int lookup = iso_to_alphabet_grid[c]; + if (lookup < 0) return 4; + if (lookup < 26) return 1; + return 2; } /* ------------------------------------------------------------------------- */ /* The main routine "text.c" provides to the rest of Inform: the text */ -/* translator. p is the address to write output to, s_text the source text */ -/* and the return value is the next free address to write output to. */ -/* The return value will not exceed p_limit. If the translation tries to */ -/* overflow this boundary, the return value will be NULL (and you should */ -/* display an error). */ +/* translator. s_text is the source text and the return value is the */ +/* number of bytes translated. */ +/* The translated text will be stored in translated_text. */ +/* */ +/* If p_limit is >= 0, the text length will not exceed that many bytes. */ +/* If the translation tries to overflow this boundary, the return value */ +/* will be -1. (You should display an error and not read translated_text.) */ +/* */ +/* If p_limit is negative, any amount of text is accepted (up to int32 */ +/* anyway). */ +/* */ /* Note that the source text may be corrupted by this routine. */ /* ------------------------------------------------------------------------- */ -extern uchar *translate_text(uchar *p, uchar *p_limit, char *s_text, int strctx) -{ int i, j, k, in_alphabet, lookup_value; +extern int32 translate_text(int32 p_limit, char *s_text, int strctx) +{ int i, j, k, in_alphabet, lookup_value, is_abbreviation; int32 unicode; int zscii; unsigned char *text_in; + if (p_limit >= 0) { + ensure_memory_list_available(&translated_text_memlist, p_limit); + } + /* For STRCTX_ABBREV, the string being translated is itself an abbreviation string, so it can't make use of abbreviations. Set the is_abbreviation flag to indicate this. The compiler has historically set this flag for the Lowstring directive as well -- the in_low_memory and is_abbreviation flag were always the same. I am preserving that convention. */ - int is_abbreviation = (strctx == STRCTX_ABBREV || strctx == STRCTX_LOWSTRING); + is_abbreviation = (strctx == STRCTX_ABBREV || strctx == STRCTX_LOWSTRING); - /* Cast the input and output streams to unsigned char: text_out_pc will + /* Cast the input and output streams to unsigned char: text_out_pos will advance as bytes of Z-coded text are written, but text_in doesn't */ text_in = (unsigned char *) s_text; - text_out_pc = (unsigned char *) p; - text_out_limit = (unsigned char *) p_limit; + text_out_pos = 0; + text_out_limit = p_limit; text_out_overflow = FALSE; /* Remember the Z-chars total so that later we can subtract to find the @@ -415,14 +489,17 @@ extern uchar *translate_text(uchar *p, uchar *p_limit, char *s_text, int strctx) && (!is_abbreviation)) make_abbrevs_lookup(); - /* If we're storing the whole game text to memory, then add this text */ + /* If we're storing the whole game text to memory, then add this text. + We will put two newlines between each text and four at the very end. + (The optimise code does a lot of sloppy text[i+2], so the extra + two newlines past all_text_top are necessary.) */ if ((!is_abbreviation) && (store_the_text)) - { no_chars_transcribed += strlen(s_text)+2; - if (no_chars_transcribed >= MAX_TRANSCRIPT_SIZE) - memoryerror("MAX_TRANSCRIPT_SIZE", MAX_TRANSCRIPT_SIZE); - sprintf(all_text_top, "%s\n\n", s_text); - all_text_top += strlen(all_text_top); + { int addlen = strlen(s_text); + ensure_memory_list_available(&all_text_memlist, all_text_top+addlen+5); + sprintf(all_text+all_text_top, "%s\n\n\n\n", s_text); + /* Advance past two newlines. */ + all_text_top += (addlen+2); } if (transcript_switch) { @@ -439,6 +516,52 @@ extern uchar *translate_text(uchar *p, uchar *p_limit, char *s_text, int strctx) } } + /* Computing the optimal way to parse strings to insert abbreviations with dynamic programming */ + /* (ref: R.A. Wagner , "Common phrases and minimum-space text storage", Commun. ACM, 16 (3) (1973)) */ + /* We compute this optimal way here; it's stored in abbreviations_optimal_parse_schedule */ + if (economy_switch) + { + uchar *q, c; + int l, min_score, from; + int text_in_length; + + text_in_length = strlen( (char*) text_in); + ensure_memory_list_available(&abbreviations_optimal_parse_schedule_memlist, text_in_length); + ensure_memory_list_available(&abbreviations_optimal_parse_scores_memlist, text_in_length+1); + + abbreviations_optimal_parse_scores[text_in_length] = 0; + for(j=text_in_length-1; j>=0; j--) + { /* Initial values: empty schedule, score = just write the letter without abbreviating. */ + abbreviations_optimal_parse_schedule[j] = -1; + min_score = zchar_weight(text_in[j]) + abbreviations_optimal_parse_scores[j+1]; + /* If there's an abbreviation starting with that letter... */ + if ( (from = abbrevs_lookup[text_in[j]]) != -1) + { + c = text_in[j]; + /* Loop on all abbreviations starting with what is in c. */ + for (k=from, q=(uchar *)abbreviations_at+from*MAX_ABBREV_LENGTH; + (k 2 + abbreviations_optimal_parse_scores[j+l]) + { /* It is indeed smaller, so let's write it down in our schedule. */ + min_score = 2 + abbreviations_optimal_parse_scores[j+l]; + abbreviations_optimal_parse_schedule[j] = k; + } + NotMatched: ; + } + } + /* We gave it our best, this is the smallest we got. */ + abbreviations_optimal_parse_scores[j] = min_score; + } + } + + + if (!glulx_mode) { /* The empty string of Z-text is illegal, since it can't carry an end @@ -466,16 +589,24 @@ extern uchar *translate_text(uchar *p, uchar *p_limit, char *s_text, int strctx) } } - /* Try abbreviations if the economy switch set */ - - if ((economy_switch) && (!is_abbreviation) - && ((k=abbrevs_lookup[text_in[i]])!=-1)) - { if ((j=try_abbreviations_from(text_in, i, k))!=-1) - { /* abbreviations run from MAX_DYNAMIC_STRINGS to 96 */ - j += MAX_DYNAMIC_STRINGS; - write_z_char_z(j/32+1); write_z_char_z(j%32); - } + /* Try abbreviations if the economy switch set. */ + /* Look at the abbreviation schedule to see if we should abbreviate here. */ + /* Note: Just because the schedule has something doesn't mean we should abbreviate there; */ + /* sometimes you abbreviate before because it's better. If we have already replaced the */ + /* char by a '1', it means we're in the middle of an abbreviation; don't try to abbreviate then. */ + if ((economy_switch) && (!is_abbreviation) && text_in[i] != 1 && + ((j = abbreviations_optimal_parse_schedule[i]) != -1)) + { + /* Fill with 1s, which will get ignored by everyone else. */ + uchar *p = (uchar *)abbreviations_at+j*MAX_ABBREV_LENGTH; + for (k=0; p[k]!=0; k++) text_in[i+k]=1; + /* Actually write the abbreviation in the story file. */ + abbreviations[j].freq++; + /* Abbreviations run from MAX_DYNAMIC_STRINGS to 96. */ + j += MAX_DYNAMIC_STRINGS; + write_z_char_z(j/32+1); write_z_char_z(j%32); } + /* If Unicode switch set, use text_to_unicode to perform UTF-8 decoding */ @@ -495,12 +626,11 @@ advance as part of 'Zcharacter table':", unicode); /* '@' is the escape character in Inform string notation: the various possibilities are: - (printing only) @@decimalnumber : write this ZSCII char (0 to 1023) - @twodigits : write the abbreviation string with this - decimal number - - (any string context) + @twodigits or : write the abbreviation string with this + @(digits) decimal number + @(symbol) : write the abbreviation string with this + (constant) value @accentcode : this accented character: e.g., for @'e write an E-acute @{...} : this Unicode char (in hex) */ @@ -508,7 +638,7 @@ advance as part of 'Zcharacter table':", unicode); if (text_in[i]=='@') { if (text_in[i+1]=='@') { - /* @@... */ + /* @@... (ascii value) */ i+=2; j=atoi((char *) (text_in+i)); switch(j) @@ -526,6 +656,55 @@ advance as part of 'Zcharacter table':", unicode); } while (isdigit(text_in[i])) i++; i--; } + else if (text_in[i+1]=='(') + { + /* @(...) (dynamic string) */ + char dsymbol[MAX_IDENTIFIER_LENGTH+1]; + int len = 0, digits = 0; + i += 2; + /* This accepts "12xyz" as a symbol, which it really isn't, + but that just means it won't be found. */ + while ((text_in[i] == '_' || isalnum(text_in[i])) && len < MAX_IDENTIFIER_LENGTH) { + char ch = text_in[i++]; + if (isdigit(ch)) digits++; + dsymbol[len++] = ch; + } + dsymbol[len] = '\0'; + j = -1; + /* We would like to parse dsymbol as *either* a decimal + number or a constant symbol. */ + if (text_in[i] != ')' || len == 0) { + error("'@(...)' abbreviation must contain a symbol"); + } + else if (digits == len) { + /* all digits; parse as decimal */ + j = atoi(dsymbol); + } + else { + int sym = symbol_index(dsymbol, -1); + if ((symbols[sym].flags & UNKNOWN_SFLAG) || symbols[sym].type != CONSTANT_T || symbols[sym].marker) { + error_named("'@(...)' abbreviation expected a known constant value, but contained", dsymbol); + } + else { + symbols[sym].flags |= USED_SFLAG; + j = symbols[sym].value; + } + } + if (!glulx_mode && j >= 96) { + error_max_dynamic_strings(j); + j = -1; + } + if (j >= MAX_DYNAMIC_STRINGS) { + error_max_dynamic_strings(j); + j = -1; + } + if (j >= 0) { + write_z_char_z(j/32+1); write_z_char_z(j%32); + } + else { + write_z_char_z(' '); /* error fallback */ + } + } else if (isdigit(text_in[i+1])!=0) { int d1, d2; @@ -538,16 +717,22 @@ advance as part of 'Zcharacter table':", unicode); else { j = d1*10 + d2; - if (!glulx_mode && j >= 96) - { error("Z-machine dynamic strings are limited to 96"); - j = 0; + if (!glulx_mode && j >= 96) { + error_max_dynamic_strings(j); + j = -1; } if (j >= MAX_DYNAMIC_STRINGS) { - memoryerror("MAX_DYNAMIC_STRINGS", MAX_DYNAMIC_STRINGS); - j = 0; + /* Shouldn't get here with two digits */ + error_max_dynamic_strings(j); + j = -1; } i+=2; - write_z_char_z(j/32+1); write_z_char_z(j%32); + if (j >= 0) { + write_z_char_z(j/32+1); write_z_char_z(j%32); + } + else { + write_z_char_z(' '); /* error fallback */ + } } } else @@ -609,7 +794,6 @@ advance as part of 'Zcharacter table':", unicode); /* Flush the Z-characters output buffer and set the "end" bit */ end_z_chars(); - } else { @@ -673,6 +857,56 @@ string."); write_z_char_g(j); while (isdigit(text_in[i])) i++; i--; } + else if (text_in[i+1]=='(') { + char dsymbol[MAX_IDENTIFIER_LENGTH+1]; + int len = 0, digits = 0; + i += 2; + /* This accepts "12xyz" as a symbol, which it really isn't, + but that just means it won't be found. */ + while ((text_in[i] == '_' || isalnum(text_in[i])) && len < MAX_IDENTIFIER_LENGTH) { + char ch = text_in[i++]; + if (isdigit(ch)) digits++; + dsymbol[len++] = ch; + } + dsymbol[len] = '\0'; + j = -1; + /* We would like to parse dsymbol as *either* a decimal + number or a constant symbol. */ + if (text_in[i] != ')' || len == 0) { + error("'@(...)' abbreviation must contain a symbol"); + } + else if (digits == len) { + /* all digits; parse as decimal */ + j = atoi(dsymbol); + } + else { + int sym = symbol_index(dsymbol, -1); + if ((symbols[sym].flags & UNKNOWN_SFLAG) || symbols[sym].type != CONSTANT_T || symbols[sym].marker) { + error_named("'@(...)' abbreviation expected a known constant value, but contained", dsymbol); + } + else { + symbols[sym].flags |= USED_SFLAG; + j = symbols[sym].value; + } + } + if (j >= MAX_DYNAMIC_STRINGS) { + error_max_dynamic_strings(j); + j = -1; + } + if (j+1 >= no_dynamic_strings) + no_dynamic_strings = j+1; + if (j >= 0) { + write_z_char_g('@'); + write_z_char_g('D'); + write_z_char_g('A' + ((j >>12) & 0x0F)); + write_z_char_g('A' + ((j >> 8) & 0x0F)); + write_z_char_g('A' + ((j >> 4) & 0x0F)); + write_z_char_g('A' + ((j ) & 0x0F)); + } + else { + write_z_char_g(' '); /* error fallback */ + } + } else if (isdigit(text_in[i+1])) { int d1, d2; d1 = character_digit_value[text_in[i+1]]; @@ -687,17 +921,22 @@ string; substituting ' '."); i += 2; j = d1*10 + d2; if (j >= MAX_DYNAMIC_STRINGS) { - memoryerror("MAX_DYNAMIC_STRINGS", MAX_DYNAMIC_STRINGS); - j = 0; + error_max_dynamic_strings(j); + j = -1; } if (j+1 >= no_dynamic_strings) no_dynamic_strings = j+1; - write_z_char_g('@'); - write_z_char_g('D'); - write_z_char_g('A' + ((j >>12) & 0x0F)); - write_z_char_g('A' + ((j >> 8) & 0x0F)); - write_z_char_g('A' + ((j >> 4) & 0x0F)); - write_z_char_g('A' + ((j ) & 0x0F)); + if (j >= 0) { + write_z_char_g('@'); + write_z_char_g('D'); + write_z_char_g('A' + ((j >>12) & 0x0F)); + write_z_char_g('A' + ((j >> 8) & 0x0F)); + write_z_char_g('A' + ((j >> 4) & 0x0F)); + write_z_char_g('A' + ((j ) & 0x0F)); + } + else { + write_z_char_g(' '); /* error fallback */ + } } } else { @@ -784,41 +1023,31 @@ string; substituting '?'."); } } write_z_char_g(0); + zchars_trans_in_last_string=total_zchars_trans-zchars_trans_in_last_string; } if (text_out_overflow) - return NULL; + return -1; else - return((uchar *) text_out_pc); + return text_out_pos; } static int unicode_entity_index(int32 unicode) { - unicode_usage_t *uptr; int j; int buck = unicode % UNICODE_HASH_BUCKETS; - for (uptr = unicode_usage_hash[buck]; uptr; uptr=uptr->next) { - if (uptr->ch == unicode) + for (j = unicode_usage_hash[buck]; j >= 0; j=unicode_usage_entries[j].next) { + if (unicode_usage_entries[j].ch == unicode) break; } - if (uptr) { - j = (uptr - unicode_usage_entries); - } - else { - if (no_unicode_chars >= MAX_UNICODE_CHARS) { - memoryerror("MAX_UNICODE_CHARS", MAX_UNICODE_CHARS); - j = 0; - } - else { - j = no_unicode_chars; - no_unicode_chars++; - uptr = unicode_usage_entries + j; - uptr->ch = unicode; - uptr->next = unicode_usage_hash[buck]; - unicode_usage_hash[buck] = uptr; - } + if (j < 0) { + ensure_memory_list_available(&unicode_usage_entries_memlist, no_unicode_chars+1); + j = no_unicode_chars++; + unicode_usage_entries[j].ch = unicode; + unicode_usage_entries[j].next = unicode_usage_hash[buck]; + unicode_usage_hash[buck] = j; } return j; @@ -841,9 +1070,16 @@ void compress_game_text() int jx; int ch; int32 ix; + int max_char_set; huffbitlist_t bits; if (compression_switch) { + max_char_set = 257 + no_abbreviations + no_dynamic_strings + no_unicode_chars; + + huff_entities = my_calloc(sizeof(huffentity_t), max_char_set*2+1, + "huffman entities"); + hufflist = my_calloc(sizeof(huffentity_t *), max_char_set, + "huffman node list"); /* How many entities have we currently got? Well, 256 plus the string-terminator plus Unicode chars plus abbrevations plus @@ -857,8 +1093,8 @@ void compress_game_text() huff_dynam_start = entities; entities += no_dynamic_strings; - if (entities > MAX_CHARACTER_SET) - memoryerror("MAX_CHARACTER_SET",MAX_CHARACTER_SET); + if (entities > max_char_set) + compiler_error("Too many entities for max_char_set"); /* Characters */ for (jx=0; jx<256; jx++) { @@ -893,17 +1129,10 @@ void compress_game_text() no_huff_entities = 257; huff_unicode_start = 257; huff_abbrev_start = 257; - huff_dynam_start = 257+MAX_ABBREVS; + huff_dynam_start = 257+no_abbreviations; compression_table_size = 0; } - if (temporary_files_switch) { - fclose(Temp1_fp); - Temp1_fp=fopen(Temp1_Name,"rb"); - if (Temp1_fp==NULL) - fatalerror("I/O failure: couldn't reopen temporary file 1"); - } - if (compression_switch) { for (lx=0, ix=0; lx static_strings_extent || ch < 0) compiler_error("Read too much not-yet-compressed text."); @@ -1039,12 +1265,7 @@ void compress_game_text() without actually doing the compression. */ compression_string_size = 0; - if (temporary_files_switch) { - fseek(Temp1_fp, 0, SEEK_SET); - } - - if (no_strings >= MAX_NUM_STATIC_STRINGS) - memoryerror("MAX_NUM_STATIC_STRINGS", MAX_NUM_STATIC_STRINGS); + ensure_memory_list_available(&compressed_offsets_memlist, no_strings); for (lx=0, ix=0; lx static_strings_extent || ch < 0) compiler_error("Read too much not-yet-compressed text."); @@ -1182,11 +1400,16 @@ static void compress_makebits(int entnum, int depth, int prevbit, /* for compatibility with previous releases. */ /* ------------------------------------------------------------------------- */ +/* The complete game text. */ +static char *opttext; +static int32 opttextlen; + typedef struct tlb_s { char text[4]; int32 intab, occurrences; } tlb; -static tlb *tlbtab; +static tlb *tlbtab; /* Three-letter blocks (allocated up to no_occs) */ +static memory_list tlbtab_memlist; static int32 no_occs; static int32 *grandtable; @@ -1198,16 +1421,19 @@ typedef struct optab_s int32 location; char text[MAX_ABBREV_LENGTH]; } optab; -static optab *bestyet, *bestyet2; +static int32 MAX_BESTYET; +static optab *bestyet; /* High-score entries (up to MAX_BESTYET used/allocated) */ +static optab *bestyet2; /* The selected entries (up to selected used; allocated to MAX_ABBREVS) */ static int pass_no; -static char *sub_buffer; - static void optimise_pass(void) -{ int32 i; int t1, t2; +{ + TIMEVALUE t1, t2; + float duration; + int32 i; int32 j, j2, k, nl, matches, noflags, score, min, minat=0, x, scrabble, c; - for (i=0; i<256; i++) bestyet[i].length=0; + for (i=0; i= 2) { + printf("Pass %d, %4ld/%ld '%s' (%ld occurrences) ", + pass_no, (long int) i, (long int) no_occs, tlbtab[i].text, + (long int) tlbtab[i].occurrences); + } + TIMEVALUE_NOW(&t1); for (j=0; j=2)&&(nl<=62)) + while ((noflags>=2)&&(nl-nl)&&(x= 2) { + TIMEVALUE_NOW(&t2); + duration = TIMEVALUE_DIFFERENCE(&t1, &t2); + printf(" (%.4f seconds)\n", duration); + } } } } @@ -1304,22 +1530,35 @@ static int any_overlap(char *s1, char *s2) return(0); } -#define MAX_TLBS 8000 - extern void optimise_abbreviations(void) -{ int32 i, j, t, max=0, MAX_GTABLE; +{ int32 i, j, tcount, max=0, MAX_GTABLE; int32 j2, selected, available, maxat=0, nl; - tlb test; + if (opttext == NULL) + return; + + /* We insist that the first two abbreviations will be ". " and ", ". */ + if (MAX_ABBREVS < 2) + return; + + /* Note that it's safe to access opttext[opttextlen+2]. There are + two newlines and a null beyond opttextlen. */ + printf("Beginning calculation of optimal abbreviations...\n"); pass_no = 0; - tlbtab=my_calloc(sizeof(tlb), MAX_TLBS, "tlb table"); no_occs=0; - sub_buffer=my_calloc(sizeof(char), 4000, "sub_buffer"); - for (i=0; i=2) - { tlbtab[no_occs]=test; - tlbtab[no_occs].intab=t; t+=tlbtab[no_occs].occurrences; + { + ensure_memory_list_available(&tlbtab_memlist, no_occs+1); + tlbtab[no_occs]=test; + tlbtab[no_occs].intab=tcount; + tcount += tlbtab[no_occs].occurrences; if (max= 1) { + printf("Cross-reference table (%ld entries) built...\n", + (long int) no_occs); + } /* for (i=0; i0)&&(selected<64)) - { printf("Pass %d\n", ++pass_no); - + for (i=0; i0)&&(selected= 1) { + printf("Pass %d\n", pass_no); + } + optimise_pass(); available=0; - for (i=0; i<256; i++) + for (i=0; i0) printf("%02d: %4d %4d '%s'\n", i, bestyet[i].score, bestyet[i].popularity, bestyet[i].text); @@ -1438,40 +1685,44 @@ extern void optimise_abbreviations(void) do { max=0; - for (i=0; i<256; i++) + for (i=0; i0) - { bestyet2[selected++]=bestyet[maxat]; - - printf( - "Selection %2ld: '%s' (repeated %ld times, scoring %ld)\n", - (long int) selected,bestyet[maxat].text, - (long int) bestyet[maxat].popularity, - (long int) bestyet[maxat].score); + { + char testtext[4]; + bestyet2[selected++]=bestyet[maxat]; + + if (optabbrevs_trace_setting >= 1) { + printf( + "Selection %2ld: '%s' (repeated %ld times, scoring %ld)\n", + (long int) selected,bestyet[maxat].text, + (long int) bestyet[maxat].popularity, + (long int) bestyet[maxat].score); + } - test.text[0]=bestyet[maxat].text[0]; - test.text[1]=bestyet[maxat].text[1]; - test.text[2]=bestyet[maxat].text[2]; - test.text[3]=0; + testtext[0]=bestyet[maxat].text[0]; + testtext[1]=bestyet[maxat].text[1]; + testtext[2]=bestyet[maxat].text[2]; + testtext[3]=0; for (i=0; i0)&& (any_overlap(bestyet[maxat].text,bestyet[i].text)==1)) { bestyet[i].score=0; @@ -1479,7 +1730,7 @@ extern void optimise_abbreviations(void) bestyet[i].text); */ } } - } while ((max>0)&&(available>0)&&(selected<64)); + } while ((max>0)&&(available>0)&&(selected */ /* 4 or 6 bytes byte byte byte */ /* */ -/* For Glulx, the form is instead: (But see below about Unicode-valued */ -/* dictionaries and my heinie.) */ +/* For Glulx, the form is instead: (See below about Unicode-valued */ +/* dictionaries and DICT_WORD_BYTES.) */ /* */ /* */ -/* $60 DICT_WORD_SIZE short short short */ +/* $60 DICT_WORD_BYTES short short short */ /* */ /* These records are stored in "accession order" (i.e. in order of their */ /* first being received by these routines) and only alphabetically sorted */ @@ -1538,28 +1789,31 @@ extern void optimise_abbreviations(void) /* fields. (The high bytes are $DICT_WORD_SIZE+1/3/5.) */ /* ------------------------------------------------------------------------- */ -uchar *dictionary, /* (These two pointers are externally +uchar *dictionary; /* (These two variables are externally used only in "tables.c" when building the story-file) */ - *dictionary_top; /* Pointer to next free record */ +static memory_list dictionary_memlist; +int32 dictionary_top; /* Position of the next free record + in dictionary (i.e., the current + number of bytes) */ int dict_entries; /* Total number of records entered */ /* ------------------------------------------------------------------------- */ -/* dict_word is a typedef for a struct of 6 unsigned chars (defined in */ -/* "header.h"): it holds the (4 or) 6 bytes of Z-coded text of a word. */ +/* dict_word was originally a typedef for a struct of 6 unsigned chars. */ +/* It held the (4 or) 6 bytes of Z-coded text of a word. */ /* Usefully, because the PAD character 5 is < all alphabetic characters, */ /* alphabetic order corresponds to numeric order. For this reason, the */ /* dict_word is called the "sort code" of the original text word. */ /* */ -/* ###- In modifying the compiler, I've found it easier to discard the */ +/* In modifying the compiler for Glulx, I found it easier to discard the */ /* typedef, and operate directly on uchar arrays of length DICT_WORD_SIZE. */ /* In Z-code, DICT_WORD_SIZE will be 6, so the Z-code compiler will work */ /* as before. In Glulx, it can be any value up to MAX_DICT_WORD_SIZE. */ /* (That limit is defined as 40 in the header; it exists only for a few */ /* static buffers, and can be increased without using significant memory.) */ /* */ -/* ###- Well, that certainly bit me on the butt, didn't it. In further */ +/* ...Well, that certainly bit me on the butt, didn't it. In further */ /* modifying the compiler to generate a Unicode dictionary, I have to */ /* store four-byte values in the uchar array. This is handled by making */ /* the array size DICT_WORD_BYTES (which is DICT_WORD_SIZE*DICT_CHAR_SIZE).*/ @@ -1806,10 +2060,13 @@ typedef struct dict_tree_node_s char colour; /* The colour of the branch to the parent */ } dict_tree_node; -static dict_tree_node *dtree; +static dict_tree_node *dtree; /* Allocated to dict_entries */ +static memory_list dtree_memlist; + +static uchar *dict_sort_codes; /* Allocated to dict_entries*DICT_WORD_BYTES */ +static memory_list dict_sort_codes_memlist; -int *final_dict_order; -static uchar *dict_sort_codes; +int *final_dict_order; /* Allocated at sort_dictionary() time */ static void dictionary_begin_pass(void) { @@ -1817,10 +2074,12 @@ static void dictionary_begin_pass(void) /* Glulx has a 4-byte header instead. */ if (!glulx_mode) - dictionary_top=dictionary+7; + dictionary_top = 7; else - dictionary_top=dictionary+4; + dictionary_top = 4; + ensure_memory_list_available(&dictionary_memlist, dictionary_top); + root = VACANT; dict_entries = 0; } @@ -1836,6 +2095,9 @@ static void recursively_sort(int node) extern void sort_dictionary(void) { int i; + + final_dict_order = my_calloc(sizeof(int), dict_entries, "final dictionary ordering table"); + if (module_switch) { for (i=0; i 3) { p[4]=prepared_sort[4]; p[5]=prepared_sort[5]; } - p[res]=x; p[res+1]=y; p[res+2]=z; + p[res]=x; p[res+1]=y; + if (!ZCODE_LESS_DICT_DATA) p[res+2]=z; if (x & 128) p[res] = (p[res])|number_and_case; - dictionary_top += res+3; + dictionary_top += DICT_ENTRY_BYTE_LENGTH; } else { int i; + ensure_memory_list_available(&dictionary_memlist, dictionary_top + DICT_ENTRY_BYTE_LENGTH); p = dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*dict_entries; p[0] = 0x60; /* type byte -- dict word */ @@ -2052,7 +2319,7 @@ extern void dictionary_set_verb_number(char *dword, int to) if (i!=0) { if (!glulx_mode) { - p=dictionary+7+(i-1)*(3+res)+res; + p=dictionary+7+(i-1)*DICT_ENTRY_BYTE_LENGTH+res; p[1]=to; } else { @@ -2156,6 +2423,10 @@ extern void word_to_ascii(uchar *p, char *results) encoded_word[7] = 8*(((int) p[4])&0x3) + (((int) p[5])&0xe0)/32; encoded_word[8] = ((int) p[5])&0x1f; } + else + { + encoded_word[6] = encoded_word[7] = encoded_word[8] = 0; + } shift = 0; cc = 0; for (i=0; i< ((version_number==3)?6:9); i++) @@ -2185,15 +2456,49 @@ extern void word_to_ascii(uchar *p, char *results) results[cc] = 0; } -static void recursively_show_z(int node) +/* Print a dictionary word to stdout. + (This assumes that d_show_buf is null.) + */ +void print_dict_word(int node) +{ + uchar *p; + int cprinted; + + if (!glulx_mode) { + char textual_form[32]; + p = (uchar *)dictionary + 7 + DICT_ENTRY_BYTE_LENGTH*node; + + word_to_ascii(p, textual_form); + + for (cprinted = 0; textual_form[cprinted]!=0; cprinted++) + show_char(textual_form[cprinted]); + } + else { + p = (uchar *)dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*node; + + for (cprinted = 0; cprinted= 1) + { + if (level >= 2) { + for (i=0; i= 1) + { int flagpos = (DICT_CHAR_SIZE == 1) ? (DICT_WORD_SIZE+1) : (DICT_WORD_BYTES+4); int flags = (p[flagpos+0] << 8) | (p[flagpos+1]); int verbnum = (p[flagpos+2] << 8) | (p[flagpos+3]); + if (level >= 2) { + for (i=0; i