X-Git-Url: https://jxself.org/git/?a=blobdiff_plain;f=src%2Ftext.c;h=149f0f9a98a8ff45ea33b607c0a9deffd21b3a28;hb=56a5292888e1d46fe3033cd1d5c636051692453f;hp=cd2fcd8c7b589fc8a642eb983621cb635f1d6aa1;hpb=81ffe9a7de1db0b3a318a053b38882d1b7ab304c;p=inform.git diff --git a/src/text.c b/src/text.c index cd2fcd8..149f0f9 100644 --- a/src/text.c +++ b/src/text.c @@ -1,9 +1,8 @@ /* ------------------------------------------------------------------------- */ /* "text" : Text translation, the abbreviations optimiser, the dictionary */ /* */ -/* Copyright (c) Graham Nelson 1993 - 2018 */ -/* */ -/* This file is part of Inform. */ +/* Part of Inform 6.42 */ +/* copyright (c) Graham Nelson 1993 - 2024 */ /* */ /* Inform is free software: you can redistribute it and/or modify */ /* it under the terms of the GNU General Public License as published by */ @@ -22,30 +21,25 @@ #include "header.h" -uchar *low_strings, *low_strings_top; /* Start and next free byte in the low - strings pool */ +uchar *low_strings; /* Allocated to low_strings_top */ +int32 low_strings_top; +static memory_list low_strings_memlist; int32 static_strings_extent; /* Number of bytes of static strings made so far */ -memory_block static_strings_area; /* Used if (!temporary_files_switch) to - hold the static strings area so far */ - -static uchar *strings_holding_area; /* Area holding translated strings - until they are moved into either - a temporary file, or the - static_strings_area below */ +uchar *static_strings_area; /* Used to hold the static strings + area so far + Allocated to static_strings_extent */ +memory_list static_strings_area_memlist; -char *all_text, *all_text_top; /* Start and next byte free in (large) - text buffer holding the entire text +static char *all_text; /* Text buffer holding the entire text of the game, when it is being - recorded */ -int put_strings_in_low_memory, /* When TRUE, put static strings in - the low strings pool at 0x100 rather - than in the static strings area */ - is_abbreviation, /* When TRUE, the string being trans - is itself an abbreviation string - so can't make use of abbreviations */ - abbrevs_lookup_table_made, /* The abbreviations lookup table is + recorded + (Allocated to all_text_top) */ +static memory_list all_text_memlist; +static int32 all_text_top; + +int abbrevs_lookup_table_made, /* The abbreviations lookup table is constructed when the first non- abbreviation string is translated: this flag is TRUE after that */ @@ -55,8 +49,6 @@ int put_strings_in_low_memory, /* When TRUE, put static strings in with ASCII character n, or -1 if none of the abbreviations do */ int no_abbreviations; /* No of abbreviations defined so far */ -uchar *abbreviations_at; /* Memory to hold the text of any - abbreviation strings declared */ /* ------------------------------------------------------------------------- */ /* Glulx string compression storage */ /* ------------------------------------------------------------------------- */ @@ -69,7 +61,6 @@ int no_dynamic_strings; /* No. of @.. string escapes used int no_unicode_chars; /* Number of distinct Unicode chars used. (Beyond 0xFF.) */ -static int MAX_CHARACTER_SET; /* Number of possible entities */ huffentity_t *huff_entities; /* The list of entities (characters, abbreviations, @.. escapes, and the terminator) */ @@ -94,11 +85,16 @@ int32 compression_string_size; /* Length of the compressed string int32 *compressed_offsets; /* The beginning of every string in the game, relative to the beginning of the Huffman table. (So entry 0 - is equal to compression_table_size)*/ + is equal to compression_table_size). + Allocated to no_strings at + compress_game_text() time. */ +static memory_list compressed_offsets_memlist; + +unicode_usage_t *unicode_usage_entries; /* Allocated to no_unicode_chars */ +static memory_list unicode_usage_entries_memlist; #define UNICODE_HASH_BUCKETS (64) -unicode_usage_t *unicode_usage_entries; -static unicode_usage_t *unicode_usage_hash[UNICODE_HASH_BUCKETS]; +static int unicode_usage_hash[UNICODE_HASH_BUCKETS]; static int unicode_entity_index(int32 unicode); @@ -106,9 +102,19 @@ static int unicode_entity_index(int32 unicode); /* Abbreviation arrays */ /* ------------------------------------------------------------------------- */ -int *abbrev_values; -int *abbrev_quality; -int *abbrev_freqs; +abbreviation *abbreviations; /* Allocated up to no_abbreviations */ +static memory_list abbreviations_memlist; + +/* Memory to hold the text of any abbreviation strings declared. */ +static int32 abbreviations_totaltext; +static char *abbreviations_text; /* Allocated up to abbreviations_totaltext */ +static memory_list abbreviations_text_memlist; + +static int *abbreviations_optimal_parse_schedule; +static memory_list abbreviations_optimal_parse_schedule_memlist; + +static int *abbreviations_optimal_parse_scores; +static memory_list abbreviations_optimal_parse_scores_memlist; /* ------------------------------------------------------------------------- */ @@ -117,26 +123,34 @@ int32 total_chars_trans, /* Number of ASCII chars of text in */ zchars_trans_in_last_string; /* Number of Z-chars in last string: needed only for abbrev efficiency calculation in "directs.c" */ -static int32 total_zchars_trans, /* Number of Z-chars of text out +static int32 total_zchars_trans; /* Number of Z-chars of text out (only used to calculate the above) */ - no_chars_transcribed; /* Number of ASCII chars written to - the text transcription area (used - for the -r and -u switches) */ static int zchars_out_buffer[3], /* During text translation, a buffer of 3 Z-chars at a time: when it's full these are written as a 2-byte word */ zob_index; /* Index (0 to 2) into it */ -static unsigned char *text_out_pc; /* The "program counter" during text - translation: the next address to +uchar *translated_text; /* Area holding translated strings + until they are moved into the + static_strings_area below */ +static memory_list translated_text_memlist; + +static char *temp_symbol; /* Temporary symbol name used while + processing "@(...)". */ +static memory_list temp_symbol_memlist; + + +static int32 text_out_pos; /* The "program counter" during text + translation: the next position to write Z-coded text output to */ -static unsigned char *text_out_limit; /* The upper limit of text_out_pc - during text translation */ +static int32 text_out_limit; /* The upper limit of text_out_pos + during text translation (or -1 + for no limit) */ static int text_out_overflow; /* During text translation, becomes - true if text_out_pc tries to pass + true if text_out_pos tries to pass text_out_limit */ /* ------------------------------------------------------------------------- */ @@ -152,28 +166,28 @@ static int text_out_overflow; /* During text translation, becomes /* ------------------------------------------------------------------------- */ static void make_abbrevs_lookup(void) -{ int bubble_sort, j, k, l; char p[MAX_ABBREV_LENGTH]; char *p1, *p2; +{ int bubble_sort, j, k; + char *p1, *p2; do { bubble_sort = FALSE; for (j=0; j=0; j--) - { p1=(char *)abbreviations_at+j*MAX_ABBREV_LENGTH; + { p1=abbreviation_text(j); abbrevs_lookup[(uchar)p1[0]]=j; - abbrev_freqs[j]=0; + abbreviations[j].freq=0; } abbrevs_lookup_table_made = TRUE; } @@ -196,15 +210,19 @@ static void make_abbrevs_lookup(void) static int try_abbreviations_from(unsigned char *text, int i, int from) { int j, k; uchar *p, c; c=text[i]; - for (j=from, p=(uchar *)abbreviations_at+from*MAX_ABBREV_LENGTH; - (j= no_abbreviations) { + compiler_error("Invalid abbrev for abbreviation_text()"); + return ""; + } + + return abbreviations_text + abbreviations[num].textpos; } /* ------------------------------------------------------------------------- */ -/* The front end routine for text translation */ +/* The front end routine for text translation. */ +/* strctx indicates the purpose of the string. This is mostly used for */ +/* informational output (gametext.txt), but we treat some string contexts */ +/* specially during compilation. */ /* ------------------------------------------------------------------------- */ -extern int32 compile_string(char *b, int in_low_memory, int is_abbrev) -{ int i, j; uchar *c; - - is_abbreviation = is_abbrev; - - /* Put into the low memory pool (at 0x100 in the Z-machine) of strings */ - /* which may be wanted as possible entries in the abbreviations table */ +/* TODO: When called from a print statement (parse_print()), it would be + nice to detect if the generated string is exactly one character. In that + case, we could return the character value and a flag to indicate the + caller could use @print_char/@streamchar/@new_line/@streamunichar + instead of printing a compiled string. + + We'd need a new STRCTX value or two to distinguish direct-printed strings + from referenceable strings. + + Currently, parse_print() checks for the "^" case manually, which is a + bit icky. */ + +extern int32 compile_string(char *b, int strctx) +{ int32 i, j, k; + uchar *c; + int in_low_memory; + + if (execution_never_reaches_here) { + /* No need to put strings into gametext.txt or the static/low + strings areas. */ + if (strctx == STRCTX_GAME || strctx == STRCTX_GAMEOPC || strctx == STRCTX_LOWSTRING || strctx == STRCTX_INFIX) { + /* VENEER and VENEEROPC are only used at the translate_text level, + so we don't have to catch them here. */ + return 0; + } + } + + /* In Z-code, abbreviations go in the low memory pool (0x100). So + do strings explicitly defined with the Lowstring directive. + (In Glulx, the in_low_memory flag is ignored.) */ + in_low_memory = (strctx == STRCTX_ABBREV || strctx == STRCTX_LOWSTRING); if (!glulx_mode && in_low_memory) - { j=subtract_pointers(low_strings_top,low_strings); - low_strings_top=translate_text(low_strings_top, low_strings+MAX_LOW_STRINGS, b); - if (!low_strings_top) - memoryerror("MAX_LOW_STRINGS", MAX_LOW_STRINGS); - is_abbreviation = FALSE; + { + k = translate_text(-1, b, strctx); + if (k<0) { + error("text translation failed"); + k = 0; + } + ensure_memory_list_available(&low_strings_memlist, low_strings_top+k); + memcpy(low_strings+low_strings_top, translated_text, k); + j = low_strings_top; + low_strings_top += k; return(0x21+(j/2)); } if (glulx_mode && done_compression) compiler_error("Tried to add a string after compression was done."); - c = translate_text(strings_holding_area, strings_holding_area+MAX_STATIC_STRINGS, b); - if (!c) - memoryerror("MAX_STATIC_STRINGS",MAX_STATIC_STRINGS); - - i = subtract_pointers(c, strings_holding_area); + i = translate_text(-1, b, strctx); + if (i < 0) { + error("text translation failed"); + i = 0; + } /* Insert null bytes as needed to ensure that the next static string */ /* also occurs at an address expressible as a packed address */ @@ -268,25 +352,18 @@ extern int32 compile_string(char *b, int in_low_memory, int is_abbrev) textalign = scale_factor; while ((i%textalign)!=0) { - if (i+2 > MAX_STATIC_STRINGS) - memoryerror("MAX_STATIC_STRINGS",MAX_STATIC_STRINGS); - i+=2; *c++ = 0; *c++ = 0; + ensure_memory_list_available(&translated_text_memlist, i+2); + translated_text[i++] = 0; + translated_text[i++] = 0; } } j = static_strings_extent; - if (temporary_files_switch) - for (c=strings_holding_area; c text_out_limit) { - text_out_overflow = TRUE; - return; + + if (text_out_limit >= 0) { + if (text_out_pos+2 > text_out_limit) { + text_out_overflow = TRUE; + return; + } } - text_out_pc[0] = j/256; text_out_pc[1] = j%256; text_out_pc+=2; + else { + ensure_memory_list_available(&translated_text_memlist, text_out_pos+2); + } + + translated_text[text_out_pos++] = j/256; translated_text[text_out_pos++] = j%256; total_bytes_trans+=2; } @@ -351,48 +435,86 @@ static void write_zscii(int zsc) /* ------------------------------------------------------------------------- */ static void end_z_chars(void) -{ unsigned char *p; +{ zchars_trans_in_last_string=total_zchars_trans-zchars_trans_in_last_string; while (zob_index!=0) write_z_char_z(5); - p=(unsigned char *) text_out_pc; - *(p-2)= *(p-2)+128; + if (text_out_pos < 2) { + /* Something went wrong. */ + text_out_overflow = TRUE; + return; + } + translated_text[text_out_pos-2] += 128; } /* Glulx handles this much more simply -- compression is done elsewhere. */ static void write_z_char_g(int i) { - ASSERT_GLULX(); - if (text_out_pc+1 > text_out_limit) { - text_out_overflow = TRUE; - return; - } - total_zchars_trans++; - text_out_pc[0] = i; - text_out_pc++; - total_bytes_trans++; + ASSERT_GLULX(); + if (text_out_limit >= 0) { + if (text_out_pos+1 > text_out_limit) { + text_out_overflow = TRUE; + return; + } + } + else { + ensure_memory_list_available(&translated_text_memlist, text_out_pos+1); + } + total_zchars_trans++; + translated_text[text_out_pos++] = i; + total_bytes_trans++; +} + +/* Helper routine to compute the weight, in units, of a character handled by the Z-Machine */ +static int zchar_weight(int c) +{ + int lookup; + if (c == ' ') return 1; + lookup = iso_to_alphabet_grid[c]; + if (lookup < 0) return 4; + if (lookup < 26) return 1; + return 2; } /* ------------------------------------------------------------------------- */ /* The main routine "text.c" provides to the rest of Inform: the text */ -/* translator. p is the address to write output to, s_text the source text */ -/* and the return value is the next free address to write output to. */ -/* The return value will not exceed p_limit. If the translation tries to */ -/* overflow this boundary, the return value will be NULL (and you should */ -/* display an error). */ +/* translator. s_text is the source text and the return value is the */ +/* number of bytes translated. */ +/* The translated text will be stored in translated_text. */ +/* */ +/* If p_limit is >= 0, the text length will not exceed that many bytes. */ +/* If the translation tries to overflow this boundary, the return value */ +/* will be -1. (You should display an error and not read translated_text.) */ +/* */ +/* If p_limit is negative, any amount of text is accepted (up to int32 */ +/* anyway). */ +/* */ /* Note that the source text may be corrupted by this routine. */ /* ------------------------------------------------------------------------- */ -extern uchar *translate_text(uchar *p, uchar *p_limit, char *s_text) -{ int i, j, k, in_alphabet, lookup_value; +extern int32 translate_text(int32 p_limit, char *s_text, int strctx) +{ int i, j, k, in_alphabet, lookup_value, is_abbreviation; int32 unicode; int zscii; unsigned char *text_in; - /* Cast the input and output streams to unsigned char: text_out_pc will + if (p_limit >= 0) { + ensure_memory_list_available(&translated_text_memlist, p_limit); + } + + /* For STRCTX_ABBREV, the string being translated is itself an + abbreviation string, so it can't make use of abbreviations. Set + the is_abbreviation flag to indicate this. + The compiler has historically set this flag for the Lowstring + directive as well -- the in_low_memory and is_abbreviation flag were + always the same. I am preserving that convention. */ + is_abbreviation = (strctx == STRCTX_ABBREV || strctx == STRCTX_LOWSTRING); + + + /* Cast the input and output streams to unsigned char: text_out_pos will advance as bytes of Z-coded text are written, but text_in doesn't */ text_in = (unsigned char *) s_text; - text_out_pc = (unsigned char *) p; - text_out_limit = (unsigned char *) p_limit; + text_out_pos = 0; + text_out_limit = p_limit; text_out_overflow = FALSE; /* Remember the Z-chars total so that later we can subtract to find the @@ -415,19 +537,82 @@ extern uchar *translate_text(uchar *p, uchar *p_limit, char *s_text) && (!is_abbreviation)) make_abbrevs_lookup(); - /* If we're storing the whole game text to memory, then add this text */ + /* If we're storing the whole game text to memory, then add this text. + We will put two newlines between each text and four at the very end. + (The optimise code does a lot of sloppy text[i+2], so the extra + two newlines past all_text_top are necessary.) */ if ((!is_abbreviation) && (store_the_text)) - { no_chars_transcribed += strlen(s_text)+2; - if (no_chars_transcribed >= MAX_TRANSCRIPT_SIZE) - memoryerror("MAX_TRANSCRIPT_SIZE", MAX_TRANSCRIPT_SIZE); - sprintf(all_text_top, "%s\n\n", s_text); - all_text_top += strlen(all_text_top); + { int addlen = strlen(s_text); + ensure_memory_list_available(&all_text_memlist, all_text_top+addlen+5); + sprintf(all_text+all_text_top, "%s\n\n\n\n", s_text); + /* Advance past two newlines. */ + all_text_top += (addlen+2); + } + + if (transcript_switch) { + /* Omit veneer strings, unless we're using the new transcript format, which includes everything. */ + if ((!veneer_mode) || TRANSCRIPT_FORMAT == 1) { + int label = strctx; + if (veneer_mode) { + if (label == STRCTX_GAME) + label = STRCTX_VENEER; + else if (label == STRCTX_GAMEOPC) + label = STRCTX_VENEEROPC; + } + write_to_transcript_file(s_text, label); + } + } + + /* Computing the optimal way to parse strings to insert abbreviations with dynamic programming */ + /* (ref: R.A. Wagner , "Common phrases and minimum-space text storage", Commun. ACM, 16 (3) (1973)) */ + /* We compute this optimal way here; it's stored in abbreviations_optimal_parse_schedule */ + if (economy_switch) + { + uchar *q, c; + int l, min_score, from; + int text_in_length; + + text_in_length = strlen( (char*) text_in); + ensure_memory_list_available(&abbreviations_optimal_parse_schedule_memlist, text_in_length); + ensure_memory_list_available(&abbreviations_optimal_parse_scores_memlist, text_in_length+1); + + abbreviations_optimal_parse_scores[text_in_length] = 0; + for(j=text_in_length-1; j>=0; j--) + { /* Initial values: empty schedule, score = just write the letter without abbreviating. */ + abbreviations_optimal_parse_schedule[j] = -1; + min_score = zchar_weight(text_in[j]) + abbreviations_optimal_parse_scores[j+1]; + /* If there's an abbreviation starting with that letter... */ + if ( (from = abbrevs_lookup[text_in[j]]) != -1) + { + c = text_in[j]; + /* Loop on all abbreviations starting with what is in c. */ + for (k=from; + k 2 + abbreviations_optimal_parse_scores[j+l]) + { /* It is indeed smaller, so let's write it down in our schedule. */ + min_score = 2 + abbreviations_optimal_parse_scores[j+l]; + abbreviations_optimal_parse_schedule[j] = k; + } + NotMatched: ; + } + } + /* We gave it our best, this is the smallest we got. */ + abbreviations_optimal_parse_scores[j] = min_score; + } } - if (transcript_switch && (!veneer_mode)) - write_to_transcript_file(s_text); + if (!glulx_mode) { /* The empty string of Z-text is illegal, since it can't carry an end @@ -455,15 +640,24 @@ extern uchar *translate_text(uchar *p, uchar *p_limit, char *s_text) } } - /* Try abbreviations if the economy switch set */ - - if ((economy_switch) && (!is_abbreviation) - && ((k=abbrevs_lookup[text_in[i]])!=-1)) - { if ((j=try_abbreviations_from(text_in, i, k))!=-1) - { if (j<32) { write_z_char_z(2); write_z_char_z(j); } - else { write_z_char_z(3); write_z_char_z(j-32); } - } + /* Try abbreviations if the economy switch set. */ + /* Look at the abbreviation schedule to see if we should abbreviate here. */ + /* Note: Just because the schedule has something doesn't mean we should abbreviate there; */ + /* sometimes you abbreviate before because it's better. If we have already replaced the */ + /* char by a '1', it means we're in the middle of an abbreviation; don't try to abbreviate then. */ + if ((economy_switch) && (!is_abbreviation) && text_in[i] != 1 && + ((j = abbreviations_optimal_parse_schedule[i]) != -1)) + { + /* Fill with 1s, which will get ignored by everyone else. */ + uchar *p = (uchar *)abbreviation_text(j); + for (k=0; p[k]!=0; k++) text_in[i+k]=1; + /* Actually write the abbreviation in the story file. */ + abbreviations[j].freq++; + /* Abbreviations run from MAX_DYNAMIC_STRINGS to 96. */ + j += MAX_DYNAMIC_STRINGS; + write_z_char_z(j/32+1); write_z_char_z(j%32); } + /* If Unicode switch set, use text_to_unicode to perform UTF-8 decoding */ @@ -483,12 +677,11 @@ advance as part of 'Zcharacter table':", unicode); /* '@' is the escape character in Inform string notation: the various possibilities are: - (printing only) @@decimalnumber : write this ZSCII char (0 to 1023) - @twodigits : write the abbreviation string with this - decimal number - - (any string context) + @twodigits or : write the abbreviation string with this + @(digits) decimal number + @(symbol) : write the abbreviation string with this + (constant) value @accentcode : this accented character: e.g., for @'e write an E-acute @{...} : this Unicode char (in hex) */ @@ -496,7 +689,7 @@ advance as part of 'Zcharacter table':", unicode); if (text_in[i]=='@') { if (text_in[i+1]=='@') { - /* @@... */ + /* @@... (ascii value) */ i+=2; j=atoi((char *) (text_in+i)); switch(j) @@ -514,18 +707,84 @@ advance as part of 'Zcharacter table':", unicode); } while (isdigit(text_in[i])) i++; i--; } + else if (text_in[i+1]=='(') + { + /* @(...) (dynamic string) */ + int len = 0, digits = 0; + i += 2; + /* This accepts "12xyz" as a symbol, which it really isn't, + but that just means it won't be found. */ + while ((text_in[i] == '_' || isalnum(text_in[i]))) { + char ch = text_in[i++]; + if (isdigit(ch)) digits++; + ensure_memory_list_available(&temp_symbol_memlist, len+1); + temp_symbol[len++] = ch; + } + ensure_memory_list_available(&temp_symbol_memlist, len+1); + temp_symbol[len] = '\0'; + j = -1; + /* We would like to parse temp_symbol as *either* a decimal + number or a constant symbol. */ + if (text_in[i] != ')' || len == 0) { + error("'@(...)' abbreviation must contain a symbol"); + } + else if (digits == len) { + /* all digits; parse as decimal */ + j = atoi(temp_symbol); + } + else { + int sym = get_symbol_index(temp_symbol); + if (sym < 0 || (symbols[sym].flags & UNKNOWN_SFLAG) || symbols[sym].type != CONSTANT_T || symbols[sym].marker) { + error_named("'@(...)' abbreviation expected a known constant value, but contained", temp_symbol); + } + else { + symbols[sym].flags |= USED_SFLAG; + j = symbols[sym].value; + } + } + if (!glulx_mode && j >= 96) { + error_max_dynamic_strings(j); + j = -1; + } + if (j >= MAX_DYNAMIC_STRINGS) { + error_max_dynamic_strings(j); + j = -1; + } + if (j >= 0) { + write_z_char_z(j/32+1); write_z_char_z(j%32); + } + else { + write_z_char_z(' '); /* error fallback */ + } + } else if (isdigit(text_in[i+1])!=0) { int d1, d2; - /* @.. */ + /* @.. (dynamic string) */ d1 = character_digit_value[text_in[i+1]]; d2 = character_digit_value[text_in[i+2]]; if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10)) error("'@..' must have two decimal digits"); else - { i+=2; - write_z_char_z(1); write_z_char_z(d1*10 + d2); + { + j = d1*10 + d2; + if (!glulx_mode && j >= 96) { + error_max_dynamic_strings(j); + j = -1; + } + if (j >= MAX_DYNAMIC_STRINGS) { + /* Shouldn't get here with two digits */ + error_max_dynamic_strings(j); + j = -1; + } + i+=2; + if (j >= 0) { + write_z_char_z(j/32+1); write_z_char_z(j%32); + } + else { + write_z_char_z(' '); /* error fallback */ + } } } else @@ -587,7 +846,6 @@ advance as part of 'Zcharacter table':", unicode); /* Flush the Z-characters output buffer and set the "end" bit */ end_z_chars(); - } else { @@ -626,7 +884,7 @@ advance as part of 'Zcharacter table':", unicode); if ((economy_switch) && (compression_switch) && (!is_abbreviation) && ((k=abbrevs_lookup[text_in[i]])!=-1) && ((j=try_abbreviations_from(text_in, i, k)) != -1)) { - char *cx = (char *)abbreviations_at+j*MAX_ABBREV_LENGTH; + char *cx = abbreviation_text(j); i += (strlen(cx)-1); write_z_char_g('@'); write_z_char_g('A'); @@ -651,6 +909,57 @@ string."); write_z_char_g(j); while (isdigit(text_in[i])) i++; i--; } + else if (text_in[i+1]=='(') { + int len = 0, digits = 0; + i += 2; + /* This accepts "12xyz" as a symbol, which it really isn't, + but that just means it won't be found. */ + while ((text_in[i] == '_' || isalnum(text_in[i]))) { + char ch = text_in[i++]; + if (isdigit(ch)) digits++; + ensure_memory_list_available(&temp_symbol_memlist, len+1); + temp_symbol[len++] = ch; + } + ensure_memory_list_available(&temp_symbol_memlist, len+1); + temp_symbol[len] = '\0'; + j = -1; + /* We would like to parse temp_symbol as *either* a decimal + number or a constant symbol. */ + if (text_in[i] != ')' || len == 0) { + error("'@(...)' abbreviation must contain a symbol"); + } + else if (digits == len) { + /* all digits; parse as decimal */ + j = atoi(temp_symbol); + } + else { + int sym = get_symbol_index(temp_symbol); + if (sym < 0 || (symbols[sym].flags & UNKNOWN_SFLAG) || symbols[sym].type != CONSTANT_T || symbols[sym].marker) { + error_named("'@(...)' abbreviation expected a known constant value, but contained", temp_symbol); + } + else { + symbols[sym].flags |= USED_SFLAG; + j = symbols[sym].value; + } + } + if (j >= MAX_DYNAMIC_STRINGS) { + error_max_dynamic_strings(j); + j = -1; + } + if (j+1 >= no_dynamic_strings) + no_dynamic_strings = j+1; + if (j >= 0) { + write_z_char_g('@'); + write_z_char_g('D'); + write_z_char_g('A' + ((j >>12) & 0x0F)); + write_z_char_g('A' + ((j >> 8) & 0x0F)); + write_z_char_g('A' + ((j >> 4) & 0x0F)); + write_z_char_g('A' + ((j ) & 0x0F)); + } + else { + write_z_char_g(' '); /* error fallback */ + } + } else if (isdigit(text_in[i+1])) { int d1, d2; d1 = character_digit_value[text_in[i+1]]; @@ -665,17 +974,22 @@ string; substituting ' '."); i += 2; j = d1*10 + d2; if (j >= MAX_DYNAMIC_STRINGS) { - memoryerror("MAX_DYNAMIC_STRINGS", MAX_DYNAMIC_STRINGS); - j = 0; + error_max_dynamic_strings(j); + j = -1; } if (j+1 >= no_dynamic_strings) no_dynamic_strings = j+1; - write_z_char_g('@'); - write_z_char_g('D'); - write_z_char_g('A' + ((j >>12) & 0x0F)); - write_z_char_g('A' + ((j >> 8) & 0x0F)); - write_z_char_g('A' + ((j >> 4) & 0x0F)); - write_z_char_g('A' + ((j ) & 0x0F)); + if (j >= 0) { + write_z_char_g('@'); + write_z_char_g('D'); + write_z_char_g('A' + ((j >>12) & 0x0F)); + write_z_char_g('A' + ((j >> 8) & 0x0F)); + write_z_char_g('A' + ((j >> 4) & 0x0F)); + write_z_char_g('A' + ((j ) & 0x0F)); + } + else { + write_z_char_g(' '); /* error fallback */ + } } } else { @@ -762,41 +1076,31 @@ string; substituting '?'."); } } write_z_char_g(0); + zchars_trans_in_last_string=total_zchars_trans-zchars_trans_in_last_string; } if (text_out_overflow) - return NULL; + return -1; else - return((uchar *) text_out_pc); + return text_out_pos; } static int unicode_entity_index(int32 unicode) { - unicode_usage_t *uptr; int j; int buck = unicode % UNICODE_HASH_BUCKETS; - for (uptr = unicode_usage_hash[buck]; uptr; uptr=uptr->next) { - if (uptr->ch == unicode) + for (j = unicode_usage_hash[buck]; j >= 0; j=unicode_usage_entries[j].next) { + if (unicode_usage_entries[j].ch == unicode) break; } - if (uptr) { - j = (uptr - unicode_usage_entries); - } - else { - if (no_unicode_chars >= MAX_UNICODE_CHARS) { - memoryerror("MAX_UNICODE_CHARS", MAX_UNICODE_CHARS); - j = 0; - } - else { - j = no_unicode_chars; - no_unicode_chars++; - uptr = unicode_usage_entries + j; - uptr->ch = unicode; - uptr->next = unicode_usage_hash[buck]; - unicode_usage_hash[buck] = uptr; - } + if (j < 0) { + ensure_memory_list_available(&unicode_usage_entries_memlist, no_unicode_chars+1); + j = no_unicode_chars++; + unicode_usage_entries[j].ch = unicode; + unicode_usage_entries[j].next = unicode_usage_hash[buck]; + unicode_usage_hash[buck] = j; } return j; @@ -819,9 +1123,16 @@ void compress_game_text() int jx; int ch; int32 ix; + int max_char_set; huffbitlist_t bits; if (compression_switch) { + max_char_set = 257 + no_abbreviations + no_dynamic_strings + no_unicode_chars; + + huff_entities = my_calloc(sizeof(huffentity_t), max_char_set*2+1, + "huffman entities"); + hufflist = my_calloc(sizeof(huffentity_t *), max_char_set, + "huffman node list"); /* How many entities have we currently got? Well, 256 plus the string-terminator plus Unicode chars plus abbrevations plus @@ -835,8 +1146,8 @@ void compress_game_text() huff_dynam_start = entities; entities += no_dynamic_strings; - if (entities > MAX_CHARACTER_SET) - memoryerror("MAX_CHARACTER_SET",MAX_CHARACTER_SET); + if (entities > max_char_set) + compiler_error("Too many entities for max_char_set"); /* Characters */ for (jx=0; jx<256; jx++) { @@ -871,17 +1182,10 @@ void compress_game_text() no_huff_entities = 257; huff_unicode_start = 257; huff_abbrev_start = 257; - huff_dynam_start = 257+MAX_ABBREVS; + huff_dynam_start = 257+no_abbreviations; compression_table_size = 0; } - if (temporary_files_switch) { - fclose(Temp1_fp); - Temp1_fp=fopen(Temp1_Name,"rb"); - if (Temp1_fp==NULL) - fatalerror("I/O failure: couldn't reopen temporary file 1"); - } - if (compression_switch) { for (lx=0, ix=0; lx static_strings_extent || ch < 0) compiler_error("Read too much not-yet-compressed text."); @@ -1017,12 +1318,7 @@ void compress_game_text() without actually doing the compression. */ compression_string_size = 0; - if (temporary_files_switch) { - fseek(Temp1_fp, 0, SEEK_SET); - } - - if (no_strings >= MAX_NUM_STATIC_STRINGS) - memoryerror("MAX_NUM_STATIC_STRINGS", MAX_NUM_STATIC_STRINGS); + ensure_memory_list_available(&compressed_offsets_memlist, no_strings); for (lx=0, ix=0; lx static_strings_extent || ch < 0) compiler_error("Read too much not-yet-compressed text."); @@ -1140,7 +1433,7 @@ static void compress_makebits(int entnum, int depth, int prevbit, compression_table_size += 2; break; case 3: - cx = (char *)abbreviations_at + ent->u.val*MAX_ABBREV_LENGTH; + cx = abbreviation_text(ent->u.val); compression_table_size += (1 + 1 + strlen(cx)); break; case 4: @@ -1160,11 +1453,16 @@ static void compress_makebits(int entnum, int depth, int prevbit, /* for compatibility with previous releases. */ /* ------------------------------------------------------------------------- */ +/* The complete game text. */ +static char *opttext; +static int32 opttextlen; + typedef struct tlb_s { char text[4]; int32 intab, occurrences; } tlb; -static tlb *tlbtab; +static tlb *tlbtab; /* Three-letter blocks (allocated up to no_occs) */ +static memory_list tlbtab_memlist; static int32 no_occs; static int32 *grandtable; @@ -1174,18 +1472,36 @@ typedef struct optab_s int32 popularity; int32 score; int32 location; - char text[MAX_ABBREV_LENGTH]; + char *text; /* allocated to textsize, min 4 */ + int32 textsize; } optab; -static optab *bestyet, *bestyet2; +static int32 MAX_BESTYET; +static optab *bestyet; /* High-score entries (up to MAX_BESTYET used/allocated) */ +static optab *bestyet2; /* The selected entries (up to selected used; allocated to MAX_ABBREVS) */ -static int pass_no; +static void optab_copy(optab *dest, const optab *src) +{ + dest->length = src->length; + dest->popularity = src->popularity; + dest->score = src->score; + dest->location = src->location; + if (src->length+1 > dest->textsize) { + int32 oldsize = dest->textsize; + dest->textsize = (src->length+1)*2; + my_realloc(&dest->text, oldsize, dest->textsize, "bestyet2.text"); + } + strcpy(dest->text, src->text); +} -static char *sub_buffer; +static int pass_no; static void optimise_pass(void) -{ int32 i; int t1, t2; +{ + TIMEVALUE t1, t2; + float duration; + int32 i; int32 j, j2, k, nl, matches, noflags, score, min, minat=0, x, scrabble, c; - for (i=0; i<256; i++) bestyet[i].length=0; + for (i=0; i= 2) { + printf("Pass %d, %4ld/%ld '%s' (%ld occurrences) ", + pass_no, (long int) i, (long int) no_occs, tlbtab[i].text, + (long int) tlbtab[i].occurrences); + } + TIMEVALUE_NOW(&t1); for (j=0; j=2)&&(nl<=62)) + while (noflags>=2) { nl++; for (j2=0; j2-nl)&&(x= 2) { + TIMEVALUE_NOW(&t2); + duration = TIMEVALUE_DIFFERENCE(&t1, &t2); + printf(" (%.4f seconds)\n", duration); + } } } } @@ -1282,22 +1598,52 @@ static int any_overlap(char *s1, char *s2) return(0); } -#define MAX_TLBS 8000 - extern void optimise_abbreviations(void) -{ int32 i, j, t, max=0, MAX_GTABLE; +{ int32 i, j, tcount, max=0, MAX_GTABLE; int32 j2, selected, available, maxat=0, nl; - tlb test; + if (opttext == NULL) + return; + + /* We insist that the first two abbreviations will be ". " and ", ". */ + if (MAX_ABBREVS < 2) + return; + + /* Note that it's safe to access opttext[opttextlen+2]. There are + two newlines and a null beyond opttextlen. */ + printf("Beginning calculation of optimal abbreviations...\n"); pass_no = 0; - tlbtab=my_calloc(sizeof(tlb), MAX_TLBS, "tlb table"); no_occs=0; - sub_buffer=my_calloc(sizeof(char), 4000, "sub_buffer"); - for (i=0; i=2) - { tlbtab[no_occs]=test; - tlbtab[no_occs].intab=t; t+=tlbtab[no_occs].occurrences; + { + ensure_memory_list_available(&tlbtab_memlist, no_occs+1); + tlbtab[no_occs]=test; + tlbtab[no_occs].intab=tcount; + tcount += tlbtab[no_occs].occurrences; if (max= 1) { + printf("Cross-reference table (%ld entries) built...\n", + (long int) no_occs); + } /* for (i=0; i0)&&(selected<64)) - { printf("Pass %d\n", ++pass_no); - + for (i=0; i0)&&(selected= 1) { + printf("Pass %d\n", pass_no); + } + optimise_pass(); available=0; - for (i=0; i<256; i++) + for (i=0; i bestyet[i].textsize) { + int32 oldsize = bestyet[i].textsize; + bestyet[i].textsize = (nl+1)*2; + my_realloc(&bestyet[i].text, oldsize, bestyet[i].textsize, "bestyet.text"); + } for (j2=0; j20) printf("%02d: %4d %4d '%s'\n", i, bestyet[i].score, bestyet[i].popularity, bestyet[i].text); @@ -1416,40 +1775,44 @@ extern void optimise_abbreviations(void) do { max=0; - for (i=0; i<256; i++) + for (i=0; i0) - { bestyet2[selected++]=bestyet[maxat]; - - printf( - "Selection %2ld: '%s' (repeated %ld times, scoring %ld)\n", - (long int) selected,bestyet[maxat].text, - (long int) bestyet[maxat].popularity, - (long int) bestyet[maxat].score); + { + char testtext[4]; + optab_copy(&bestyet2[selected++], &bestyet[maxat]); + + if (optabbrevs_trace_setting >= 1) { + printf( + "Selection %2ld: '%s' (repeated %ld times, scoring %ld)\n", + (long int) selected,bestyet[maxat].text, + (long int) bestyet[maxat].popularity, + (long int) bestyet[maxat].score); + } - test.text[0]=bestyet[maxat].text[0]; - test.text[1]=bestyet[maxat].text[1]; - test.text[2]=bestyet[maxat].text[2]; - test.text[3]=0; + testtext[0]=bestyet[maxat].text[0]; + testtext[1]=bestyet[maxat].text[1]; + testtext[2]=bestyet[maxat].text[2]; + testtext[3]=0; for (i=0; i0)&& (any_overlap(bestyet[maxat].text,bestyet[i].text)==1)) { bestyet[i].score=0; @@ -1457,7 +1820,7 @@ extern void optimise_abbreviations(void) bestyet[i].text); */ } } - } while ((max>0)&&(available>0)&&(selected<64)); + } while ((max>0)&&(available>0)&&(selected */ /* 4 or 6 bytes byte byte byte */ /* */ -/* For Glulx, the form is instead: (But see below about Unicode-valued */ -/* dictionaries and my heinie.) */ +/* For Glulx, the form is instead: (See below about Unicode-valued */ +/* dictionaries and DICT_WORD_BYTES.) */ /* */ -/* */ -/* DICT_WORD_SIZE short short short */ +/* */ +/* $60 DICT_WORD_BYTES short short short */ /* */ /* These records are stored in "accession order" (i.e. in order of their */ /* first being received by these routines) and only alphabetically sorted */ @@ -1516,31 +1879,31 @@ extern void optimise_abbreviations(void) /* fields. (The high bytes are $DICT_WORD_SIZE+1/3/5.) */ /* ------------------------------------------------------------------------- */ -uchar *dictionary, /* (These two pointers are externally +uchar *dictionary; /* (These two variables are externally used only in "tables.c" when building the story-file) */ - *dictionary_top; /* Pointer to next free record */ +static memory_list dictionary_memlist; +int32 dictionary_top; /* Position of the next free record + in dictionary (i.e., the current + number of bytes) */ int dict_entries; /* Total number of records entered */ /* ------------------------------------------------------------------------- */ -/* dict_word is a typedef for a struct of 6 unsigned chars (defined in */ -/* "header.h"): it holds the (4 or) 6 bytes of Z-coded text of a word. */ +/* dict_word was originally a typedef for a struct of 6 unsigned chars. */ +/* It held the (4 or) 6 bytes of Z-coded text of a word. */ /* Usefully, because the PAD character 5 is < all alphabetic characters, */ /* alphabetic order corresponds to numeric order. For this reason, the */ /* dict_word is called the "sort code" of the original text word. */ /* */ -/* ###- In modifying the compiler, I've found it easier to discard the */ +/* In modifying the compiler for Glulx, I found it easier to discard the */ /* typedef, and operate directly on uchar arrays of length DICT_WORD_SIZE. */ /* In Z-code, DICT_WORD_SIZE will be 6, so the Z-code compiler will work */ -/* as before. In Glulx, it can be any value up to MAX_DICT_WORD_SIZE. */ -/* (That limit is defined as 40 in the header; it exists only for a few */ -/* static buffers, and can be increased without using significant memory.) */ +/* as before. In Glulx, it can be any value. */ /* */ -/* ###- Well, that certainly bit me on the butt, didn't it. In further */ -/* modifying the compiler to generate a Unicode dictionary, I have to */ -/* store four-byte values in the uchar array. This is handled by making */ -/* the array size DICT_WORD_BYTES (which is DICT_WORD_SIZE*DICT_CHAR_SIZE).*/ +/* In further modifying the compiler to generate a Unicode dictionary, */ +/* I have to store four-byte values in the uchar array. We make the array */ +/* size DICT_WORD_BYTES (which is DICT_WORD_SIZE*DICT_CHAR_SIZE). */ /* Then we store the 32-bit character value big-endian. This lets us */ /* continue to compare arrays bytewise, which is a nice simplification. */ /* ------------------------------------------------------------------------- */ @@ -1560,35 +1923,68 @@ extern void copy_sorts(uchar *d1, uchar *d2) d1[i] = d2[i]; } -static uchar prepared_sort[MAX_DICT_WORD_BYTES]; /* Holds the sort code - of current word */ +static memory_list prepared_sort_memlist; +static uchar *prepared_sort; /* Holds the sort code of current word */ -static int number_and_case; +static int prepared_dictflags_pos; /* Dict flags set by the current word */ +static int prepared_dictflags_neg; /* Dict flags *not* set by the word */ /* Also used by verbs.c */ static void dictionary_prepare_z(char *dword, uchar *optresult) -{ int i, j, k, k2, wd[13]; int32 tot; +{ int i, j, k, k2, wd[13]; + int32 tot; + int negflag; /* A rapid text translation algorithm using only the simplified rules applying to the text of dictionary entries: first produce a sequence of 6 (v3) or 9 (v4+) Z-characters */ - number_and_case = 0; + int dictsize = (version_number==3) ? 6 : 9; + + prepared_dictflags_pos = 0; + prepared_dictflags_neg = 0; - for (i=0, j=0; dword[j]!=0; i++, j++) - { if ((dword[j] == '/') && (dword[j+1] == '/')) - { for (j+=2; dword[j] != 0; j++) - { switch(dword[j]) - { case 'p': number_and_case |= 4; break; + for (i=0, j=0; dword[j]!=0; j++) + { + if ((dword[j] == '/') && (dword[j+1] == '/')) + { + /* The rest of the word is dict flags. Run through them. */ + negflag = FALSE; + for (j+=2; dword[j] != 0; j++) + { + switch(dword[j]) + { + case '~': + if (!dword[j+1]) + error_named("'//~' with no flag character (pn) in dict word", dword); + negflag = !negflag; + break; + case 'p': + if (!negflag) + prepared_dictflags_pos |= 4; + else + prepared_dictflags_neg |= 4; + negflag = FALSE; + break; + case 'n': + if (!negflag) + prepared_dictflags_pos |= 128; + else + prepared_dictflags_neg |= 128; + negflag = FALSE; + break; default: - error_named("Expected 'p' after '//' \ -to give number of dictionary word", dword); + error_named("Expected flag character (pn~) after '//' in dict word", dword); break; } } break; } - if (i>=9) break; + + /* LONG_DICT_FLAG_BUG emulates the old behavior where we stop looping + at dictsize. */ + if (LONG_DICT_FLAG_BUG && i>=dictsize) + break; k=(int) dword[j]; if (k==(int) '\'') @@ -1619,33 +2015,47 @@ apostrophe in", dword); char_error("Character can be printed but not input:", k); else { /* Use 4 more Z-chars to encode a ZSCII escape sequence */ - - wd[i++] = 5; wd[i++] = 6; + if (i dictsize) + compiler_error("dict word buffer overflow"); - for (; i<9; i++) wd[i]=5; + /* Fill up to the end of the dictionary block with PAD characters + (for safety, we right-pad to 9 chars even in V3) */ - /* The array of Z-chars is converted to three 2-byte blocks */ + for (; i<9; i++) wd[i]=5; + /* The array of Z-chars is converted to two or three 2-byte blocks */ + ensure_memory_list_available(&prepared_sort_memlist, DICT_WORD_BYTES); + tot = wd[2] + wd[1]*(1<<5) + wd[0]*(1<<10); prepared_sort[1]=tot%0x100; prepared_sort[0]=(tot/0x100)%0x100; tot = wd[5] + wd[4]*(1<<5) + wd[3]*(1<<10); prepared_sort[3]=tot%0x100; prepared_sort[2]=(tot/0x100)%0x100; - tot = wd[8] + wd[7]*(1<<5) + wd[6]*(1<<10); + if (version_number==3) + tot = 0; + else + tot = wd[8] + wd[7]*(1<<5) + wd[6]*(1<<10); prepared_sort[5]=tot%0x100; prepared_sort[4]=(tot/0x100)%0x100; @@ -1662,25 +2072,48 @@ static void dictionary_prepare_g(char *dword, uchar *optresult) { int i, j, k; int32 unicode; + int negflag; - number_and_case = 0; + prepared_dictflags_pos = 0; + prepared_dictflags_neg = 0; - for (i=0, j=0; (dword[j]!=0); i++, j++) { + for (i=0, j=0; (dword[j]!=0); j++) { if ((dword[j] == '/') && (dword[j+1] == '/')) { + /* The rest of the word is dict flags. Run through them. */ + negflag = FALSE; for (j+=2; dword[j] != 0; j++) { switch(dword[j]) { + case '~': + if (!dword[j+1]) + error_named("'//~' with no flag character (pn) in dict word", dword); + negflag = !negflag; + break; case 'p': - number_and_case |= 4; - break; + if (!negflag) + prepared_dictflags_pos |= 4; + else + prepared_dictflags_neg |= 4; + negflag = FALSE; + break; + case 'n': + if (!negflag) + prepared_dictflags_pos |= 128; + else + prepared_dictflags_neg |= 128; + negflag = FALSE; + break; default: - error_named("Expected 'p' after '//' \ -to give gender or number of dictionary word", dword); + error_named("Expected flag character (pn~) after '//' in dict word", dword); break; } } break; } - if (i>=DICT_WORD_SIZE) break; + + /* LONG_DICT_FLAG_BUG emulates the old behavior where we stop looping + at DICT_WORD_SIZE. */ + if (LONG_DICT_FLAG_BUG && i>=DICT_WORD_SIZE) + break; k= ((unsigned char *)dword)[j]; if (k=='\'') @@ -1711,17 +2144,27 @@ Define DICT_CHAR_SIZE=4 for a Unicode-compatible dictionary."); if (k >= (unsigned)'A' && k <= (unsigned)'Z') k += ('a' - 'A'); + ensure_memory_list_available(&prepared_sort_memlist, DICT_WORD_BYTES); + if (DICT_CHAR_SIZE == 1) { - prepared_sort[i] = k; + if (i> 24) & 0xFF; - prepared_sort[4*i+1] = (k >> 16) & 0xFF; - prepared_sort[4*i+2] = (k >> 8) & 0xFF; - prepared_sort[4*i+3] = (k) & 0xFF; + if (i> 24) & 0xFF; + prepared_sort[4*i+1] = (k >> 16) & 0xFF; + prepared_sort[4*i+2] = (k >> 8) & 0xFF; + prepared_sort[4*i+3] = (k) & 0xFF; + i++; + } } } + if (i > DICT_WORD_SIZE) + compiler_error("dict word buffer overflow"); + + /* Right-pad with zeroes */ if (DICT_CHAR_SIZE == 1) { for (; i 3) { p[4]=prepared_sort[4]; p[5]=prepared_sort[5]; } - p[res]=x; p[res+1]=y; p[res+2]=z; - if (x & 128) p[res] = (p[res])|number_and_case; + p[res]=flag1; p[res+1]=flag2; + if (!ZCODE_LESS_DICT_DATA) p[res+2]=flag3; - dictionary_top += res+3; + dictionary_top += DICT_ENTRY_BYTE_LENGTH; } else { int i; + ensure_memory_list_available(&dictionary_memlist, dictionary_top + DICT_ENTRY_BYTE_LENGTH); p = dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*dict_entries; p[0] = 0x60; /* type byte -- dict word */ @@ -1997,11 +2449,9 @@ extern int dictionary_add(char *dword, int x, int y, int z) p[i] = prepared_sort[i]; p += DICT_WORD_BYTES; - p[0] = 0; p[1] = x; - p[2] = y/256; p[3] = y%256; - p[4] = 0; p[5] = z; - if (x & 128) - p[1] |= number_and_case; + p[0] = (flag1/256); p[1] = (flag1%256); + p[2] = (flag2/256); p[3] = (flag2%256); + p[4] = (flag3/256); p[5] = (flag3%256); dictionary_top += DICT_ENTRY_BYTE_LENGTH; @@ -2025,7 +2475,7 @@ extern void dictionary_set_verb_number(char *dword, int to) if (i!=0) { if (!glulx_mode) { - p=dictionary+7+(i-1)*(3+res)+res; + p=dictionary+7+(i-1)*DICT_ENTRY_BYTE_LENGTH+res; p[1]=to; } else { @@ -2036,22 +2486,85 @@ extern void dictionary_set_verb_number(char *dword, int to) } /* ------------------------------------------------------------------------- */ -/* Tracing code for the dictionary: used not only by "trace" and text */ -/* transcription, but also (in the case of "word_to_ascii") in a vital */ -/* by the linker. */ +/* Tracing code for the dictionary: used by "trace" and text */ +/* transcription. */ /* ------------------------------------------------------------------------- */ -static char *d_show_to; -static int d_show_total; +/* In the dictionary-showing code, if d_show_buf is NULL, the text is + printed directly. (The "Trace dictionary" directive does this.) + If d_show_buf is not NULL, we add words to it (reallocing if necessary) + until it's a page-width. +*/ +static char *d_show_buf = NULL; +static int d_show_size; /* allocated size */ +static int d_show_len; /* current length */ static void show_char(char c) -{ if (d_show_to == NULL) printf("%c", c); - else - { int i = strlen(d_show_to); - d_show_to[i] = c; d_show_to[i+1] = 0; +{ + if (d_show_buf == NULL) { + printf("%c", c); + } + else { + if (d_show_len+2 >= d_show_size) { + int newsize = 2 * d_show_len + 16; + my_realloc(&d_show_buf, d_show_size, newsize, "dictionary display buffer"); + d_show_size = newsize; + } + d_show_buf[d_show_len++] = c; + d_show_buf[d_show_len] = '\0'; } } +/* Display a Unicode character in user-readable form. This uses the same + character encoding as the source code. */ +static void show_uchar(uint32 c) +{ + char buf[16]; + int ix; + + if (c < 0x80) { + /* ASCII always works */ + show_char(c); + return; + } + if (character_set_unicode) { + /* UTF-8 the character */ + if (c < 0x80) { + show_char(c); + } + else if (c < 0x800) { + show_char((0xC0 | ((c & 0x7C0) >> 6))); + show_char((0x80 | (c & 0x03F) )); + } + else if (c < 0x10000) { + show_char((0xE0 | ((c & 0xF000) >> 12))); + show_char((0x80 | ((c & 0x0FC0) >> 6))); + show_char((0x80 | (c & 0x003F) )); + } + else if (c < 0x200000) { + show_char((0xF0 | ((c & 0x1C0000) >> 18))); + show_char((0x80 | ((c & 0x03F000) >> 12))); + show_char((0x80 | ((c & 0x000FC0) >> 6))); + show_char((0x80 | (c & 0x00003F) )); + } + else { + show_char('?'); + } + return; + } + if (character_set_setting == 1 && c < 0x100) { + /* Fits in Latin-1 */ + show_char(c); + return; + } + /* Supporting other character_set_setting is harder; not currently implemented. */ + + /* Use the escaped form */ + sprintf(buf, "@{%x}", c); + for (ix=0; buf[ix]; ix++) + show_char(buf[ix]); +} + extern void word_to_ascii(uchar *p, char *results) { int i, shift, cc, zchar; uchar encoded_word[9]; encoded_word[0] = (((int) p[0])&0x7c)/4; @@ -2065,6 +2578,10 @@ extern void word_to_ascii(uchar *p, char *results) encoded_word[7] = 8*(((int) p[4])&0x3) + (((int) p[5])&0xe0)/32; encoded_word[8] = ((int) p[5])&0x1f; } + else + { + encoded_word[6] = encoded_word[7] = encoded_word[8] = 0; + } shift = 0; cc = 0; for (i=0; i< ((version_number==3)?6:9); i++) @@ -2094,15 +2611,49 @@ extern void word_to_ascii(uchar *p, char *results) results[cc] = 0; } -static void recursively_show_z(int node) +/* Print a dictionary word to stdout. + (This assumes that d_show_buf is null.) + */ +void print_dict_word(int node) +{ + uchar *p; + int cprinted; + + if (!glulx_mode) { + char textual_form[32]; + p = (uchar *)dictionary + 7 + DICT_ENTRY_BYTE_LENGTH*node; + + word_to_ascii(p, textual_form); + + for (cprinted = 0; textual_form[cprinted]!=0; cprinted++) + show_char(textual_form[cprinted]); + } + else { + p = (uchar *)dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*node; + + for (cprinted = 0; cprinted= 1) + { + if (level >= 2) { + for (i=0; i= 64 || TRANSCRIPT_FORMAT == 1)) + { + write_to_transcript_file(d_show_buf, STRCTX_DICT); + d_show_len = 0; } if (dtree[node].branch[1] != VACANT) - recursively_show_z(dtree[node].branch[1]); + recursively_show_z(dtree[node].branch[1], level); } -static void recursively_show_g(int node) -{ - warning("### Glulx dictionary-show not yet implemented.\n"); +static void recursively_show_g(int node, int level) +{ int i, cprinted; + uchar *p; + + if (dtree[node].branch[0] != VACANT) + recursively_show_g(dtree[node].branch[0], level); + + p = (uchar *)dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*node; + + for (cprinted = 0; cprinted= 1) + { int flagpos = (DICT_CHAR_SIZE == 1) ? (DICT_WORD_SIZE+1) : (DICT_WORD_BYTES+4); + int flags = (p[flagpos+0] << 8) | (p[flagpos+1]); + int verbnum = (p[flagpos+2] << 8) | (p[flagpos+3]); + if (level >= 2) { + for (i=0; i= 64 || TRANSCRIPT_FORMAT == 1)) + { + write_to_transcript_file(d_show_buf, STRCTX_DICT); + d_show_len = 0; + } + + if (dtree[node].branch[1] != VACANT) + recursively_show_g(dtree[node].branch[1], level); } static void show_alphabet(int i) @@ -2165,36 +2774,49 @@ static void show_alphabet(int i) printf("\n"); } -extern void show_dictionary(void) -{ printf("Dictionary contains %d entries:\n",dict_entries); +extern void show_dictionary(int level) +{ + /* Level 0: show words only. Level 1: show words and flags. + Level 2: also show bytes.*/ + printf("Dictionary contains %d entries:\n",dict_entries); if (dict_entries != 0) - { d_show_total = 0; d_show_to = NULL; + { d_show_len = 0; d_show_buf = NULL; if (!glulx_mode) - recursively_show_z(root); + recursively_show_z(root, level); else - recursively_show_g(root); + recursively_show_g(root, level); + } + if (!glulx_mode) + { + printf("\nZ-machine alphabet entries:\n"); + show_alphabet(0); + show_alphabet(1); + show_alphabet(2); } - printf("\nZ-machine alphabet entries:\n"); - show_alphabet(0); - show_alphabet(1); - show_alphabet(2); } extern void write_dictionary_to_transcript(void) -{ char d_buffer[81]; - - sprintf(d_buffer, "\n[Dictionary contains %d entries:]\n", dict_entries); +{ + d_show_size = 80; /* initial size */ + d_show_buf = my_malloc(d_show_size, "dictionary display buffer"); - d_buffer[0] = 0; write_to_transcript_file(d_buffer); + write_to_transcript_file("", STRCTX_INFO); + sprintf(d_show_buf, "[Dictionary contains %d entries:]", dict_entries); + write_to_transcript_file(d_show_buf, STRCTX_INFO); + + d_show_len = 0; if (dict_entries != 0) - { d_show_total = 0; d_show_to = d_buffer; + { if (!glulx_mode) - recursively_show_z(root); + recursively_show_z(root, 0); else - recursively_show_g(root); + recursively_show_g(root, 0); } - if (d_show_total != 0) write_to_transcript_file(d_buffer); + if (d_show_len != 0) write_to_transcript_file(d_show_buf, STRCTX_DICT); + + my_free(&d_show_buf, "dictionary display buffer"); + d_show_len = 0; d_show_buf = NULL; } /* ========================================================================= */ @@ -2203,34 +2825,49 @@ extern void write_dictionary_to_transcript(void) extern void init_text_vars(void) { int j; + + opttext = NULL; + opttextlen = 0; bestyet = NULL; bestyet2 = NULL; tlbtab = NULL; grandtable = NULL; grandflags = NULL; - no_chars_transcribed = 0; - is_abbreviation = FALSE; - put_strings_in_low_memory = FALSE; + + translated_text = NULL; + temp_symbol = NULL; + all_text = NULL; for (j=0; j<256; j++) abbrevs_lookup[j] = -1; total_zchars_trans = 0; + dictionary = NULL; + dictionary_top = 0; dtree = NULL; final_dict_order = NULL; dict_sort_codes = NULL; + prepared_sort = NULL; dict_entries=0; - initialise_memory_block(&static_strings_area); + static_strings_area = NULL; + abbreviations_optimal_parse_schedule = NULL; + abbreviations_optimal_parse_scores = NULL; + + compressed_offsets = NULL; + huff_entities = NULL; + hufflist = NULL; + unicode_usage_entries = NULL; } extern void text_begin_pass(void) { abbrevs_lookup_table_made = FALSE; no_abbreviations=0; + abbreviations_totaltext=0; total_chars_trans=0; total_bytes_trans=0; - if (store_the_text) all_text_top=all_text; + all_text_top=0; dictionary_begin_pass(); - low_strings_top = low_strings; + low_strings_top = 0; static_strings_extent = 0; no_strings = 0; @@ -2241,29 +2878,66 @@ extern void text_begin_pass(void) /* Note: for allocation and deallocation of all_the_text, see inform.c */ extern void text_allocate_arrays(void) -{ abbreviations_at = my_malloc(MAX_ABBREVS*MAX_ABBREV_LENGTH, - "abbreviations"); - abbrev_values = my_calloc(sizeof(int), MAX_ABBREVS, "abbrev values"); - abbrev_quality = my_calloc(sizeof(int), MAX_ABBREVS, "abbrev quality"); - abbrev_freqs = my_calloc(sizeof(int), MAX_ABBREVS, "abbrev freqs"); +{ + int ix; - dtree = my_calloc(sizeof(dict_tree_node), MAX_DICT_ENTRIES, - "red-black tree for dictionary"); - final_dict_order = my_calloc(sizeof(int), MAX_DICT_ENTRIES, - "final dictionary ordering table"); - dict_sort_codes = my_calloc(DICT_WORD_BYTES, MAX_DICT_ENTRIES, - "dictionary sort codes"); + initialise_memory_list(&translated_text_memlist, + sizeof(uchar), 8000, (void**)&translated_text, + "translated text holding area"); + + initialise_memory_list(&temp_symbol_memlist, + sizeof(char), 32, (void**)&temp_symbol, + "temporary symbol name"); + + initialise_memory_list(&all_text_memlist, + sizeof(char), 0, (void**)&all_text, + "transcription text for optimise"); + + initialise_memory_list(&static_strings_area_memlist, + sizeof(uchar), 128, (void**)&static_strings_area, + "static strings area"); + + initialise_memory_list(&abbreviations_text_memlist, + sizeof(char), 64, (void**)&abbreviations_text, + "abbreviation text"); - if (!glulx_mode) - dictionary = my_malloc(9*MAX_DICT_ENTRIES+7, - "dictionary"); - else - dictionary = my_malloc(DICT_ENTRY_BYTE_LENGTH*MAX_DICT_ENTRIES+4, - "dictionary"); + initialise_memory_list(&abbreviations_memlist, + sizeof(abbreviation), 64, (void**)&abbreviations, + "abbreviations"); - strings_holding_area - = my_malloc(MAX_STATIC_STRINGS,"static strings holding area"); - low_strings = my_malloc(MAX_LOW_STRINGS,"low (abbreviation) strings"); + initialise_memory_list(&abbreviations_optimal_parse_schedule_memlist, + sizeof(int), 0, (void**)&abbreviations_optimal_parse_schedule, + "abbreviations optimal parse schedule"); + initialise_memory_list(&abbreviations_optimal_parse_scores_memlist, + sizeof(int), 0, (void**)&abbreviations_optimal_parse_scores, + "abbreviations optimal parse scores"); + + initialise_memory_list(&dtree_memlist, + sizeof(dict_tree_node), 1500, (void**)&dtree, + "red-black tree for dictionary"); + initialise_memory_list(&dict_sort_codes_memlist, + sizeof(uchar), 1500*DICT_WORD_BYTES, (void**)&dict_sort_codes, + "dictionary sort codes"); + initialise_memory_list(&prepared_sort_memlist, + sizeof(uchar), DICT_WORD_BYTES, (void**)&prepared_sort, + "prepared sort buffer"); + + final_dict_order = NULL; /* will be allocated at sort_dictionary() time */ + + /* The exact size will be 7+7*num for z3, 7+9*num for z4+, + 4+DICT_ENTRY_BYTE_LENGTH*num for Glulx. But this is just an initial + allocation; we don't have to be precise. */ + initialise_memory_list(&dictionary_memlist, + sizeof(uchar), 1000*DICT_ENTRY_BYTE_LENGTH, (void**)&dictionary, + "dictionary"); + + initialise_memory_list(&low_strings_memlist, + sizeof(uchar), 1024, (void**)&low_strings, + "low (abbreviation) strings"); + + d_show_buf = NULL; + d_show_size = 0; + d_show_len = 0; huff_entities = NULL; hufflist = NULL; @@ -2272,57 +2946,92 @@ extern void text_allocate_arrays(void) compression_table_size = 0; compressed_offsets = NULL; - MAX_CHARACTER_SET = 0; + initialise_memory_list(&unicode_usage_entries_memlist, + sizeof(unicode_usage_t), 0, (void**)&unicode_usage_entries, + "unicode entity entries"); - if (glulx_mode) { - if (compression_switch) { - int ix; - MAX_CHARACTER_SET = 257 + MAX_ABBREVS + MAX_DYNAMIC_STRINGS - + MAX_UNICODE_CHARS; - huff_entities = my_calloc(sizeof(huffentity_t), MAX_CHARACTER_SET*2+1, - "huffman entities"); - hufflist = my_calloc(sizeof(huffentity_t *), MAX_CHARACTER_SET, - "huffman node list"); - unicode_usage_entries = my_calloc(sizeof(unicode_usage_t), - MAX_UNICODE_CHARS, "unicode entity entries"); - for (ix=0; ix