src/text.c

   1 /* ------------------------------------------------------------------------- */
   2 /*   "text" : Text translation, the abbreviations optimiser, the dictionary  */
   3 /*                                                                           */
   4 /*   Part of Inform 6.40                                                     */
   5 /*   copyright (c) Graham Nelson 1993 - 2022                                 */
   6 /*                                                                           */
   7 /* Inform is free software: you can redistribute it and/or modify            */
   8 /* it under the terms of the GNU General Public License as published by      */
   9 /* the Free Software Foundation, either version 3 of the License, or         */
  10 /* (at your option) any later version.                                       */
  11 /*                                                                           */
  12 /* Inform is distributed in the hope that it will be useful,                 */
  13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of            */
  14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the              */
  15 /* GNU General Public License for more details.                              */
  16 /*                                                                           */
  17 /* You should have received a copy of the GNU General Public License         */
  18 /* along with Inform. If not, see https://gnu.org/licenses/                  */
  19 /*                                                                           */
  20 /* ------------------------------------------------------------------------- */
  21
  22 #include "header.h"
  23
  24 uchar *low_strings;                    /* Allocated to low_strings_top       */
  25 int32 low_strings_top;
  26 static memory_list low_strings_memlist;
  27
  28 int32 static_strings_extent;           /* Number of bytes of static strings
  29                                           made so far */
  30 uchar *static_strings_area;            /* Used to hold the static strings
  31                                           area so far
  32                                           Allocated to static_strings_extent */
  33 memory_list static_strings_area_memlist;
  34
  35 static char *all_text;                 /* Text buffer holding the entire text
  36                                           of the game, when it is being
  37                                           recorded
  38                                           (Allocated to all_text_top) */
  39 static memory_list all_text_memlist;
  40 static int32 all_text_top;
  41
  42 int abbrevs_lookup_table_made,         /* The abbreviations lookup table is
  43                                           constructed when the first non-
  44                                           abbreviation string is translated:
  45                                           this flag is TRUE after that       */
  46     abbrevs_lookup[256];               /* Once this has been constructed,
  47                                           abbrevs_lookup[n] = the smallest
  48                                           number of any abbreviation beginning
  49                                           with ASCII character n, or -1
  50                                           if none of the abbreviations do    */
  51 int no_abbreviations;                  /* No of abbreviations defined so far */
  52 /* ------------------------------------------------------------------------- */
  53 /*   Glulx string compression storage                                        */
  54 /* ------------------------------------------------------------------------- */
  55
  56 int no_strings;                        /* No of strings in static strings
  57                                           area.                              */
  58 int no_dynamic_strings;                /* No. of @.. string escapes used
  59                                           (actually, the highest value used
  60                                           plus one)                          */
  61 int no_unicode_chars;                  /* Number of distinct Unicode chars
  62                                           used. (Beyond 0xFF.)               */
  63
  64 huffentity_t *huff_entities;           /* The list of entities (characters,
  65                                           abbreviations, @.. escapes, and
  66                                           the terminator)                    */
  67 static huffentity_t **hufflist;        /* Copy of the list, for sorting      */
  68
  69 int no_huff_entities;                  /* The number of entities in the list */
  70 int huff_unicode_start;                /* Position in the list where Unicode
  71                                           chars begin.                       */
  72 int huff_abbrev_start;                 /* Position in the list where string
  73                                           abbreviations begin.               */
  74 int huff_dynam_start;                  /* Position in the list where @..
  75                                           entities begin.                    */
  76 int huff_entity_root;                  /* The position in the list of the root
  77                                           entry (when considering the table
  78                                           as a tree).                        */
  79
  80 int done_compression;                  /* Has the game text been compressed? */
  81 int32 compression_table_size;          /* Length of the Huffman table, in
  82                                           bytes                              */
  83 int32 compression_string_size;         /* Length of the compressed string
  84                                           data, in bytes                     */
  85 int32 *compressed_offsets;             /* The beginning of every string in
  86                                           the game, relative to the beginning
  87                                           of the Huffman table. (So entry 0
  88                                           is equal to compression_table_size).
  89                                           Allocated to no_strings at
  90                                           compress_game_text() time.         */
  91 static memory_list compressed_offsets_memlist;
  92
  93 unicode_usage_t *unicode_usage_entries; /* Allocated to no_unicode_chars     */
  94 static memory_list unicode_usage_entries_memlist;
  95
  96 #define UNICODE_HASH_BUCKETS (64)
  97 static int unicode_usage_hash[UNICODE_HASH_BUCKETS];
  98
  99 static int unicode_entity_index(int32 unicode);
 100
 101 /* ------------------------------------------------------------------------- */
 102 /*   Abbreviation arrays                                                     */
 103 /* ------------------------------------------------------------------------- */
 104
 105 abbreviation *abbreviations;             /* Allocated up to no_abbreviations */
 106 static memory_list abbreviations_memlist;
 107
 108 /* Memory to hold the text of any abbreviation strings declared. This is
 109    counted in units of MAX_ABBREV_LENGTH bytes. (An abbreviation must fit
 110    in that many bytes, null included.)                                       */
 111 uchar *abbreviations_at;                 /* Allocated up to no_abbreviations */
 112 static memory_list abbreviations_at_memlist;
 113
 114 static int *abbreviations_optimal_parse_schedule;
 115 static memory_list abbreviations_optimal_parse_schedule_memlist;
 116
 117 static int *abbreviations_optimal_parse_scores;
 118 static memory_list abbreviations_optimal_parse_scores_memlist;
 119
 120 /* ------------------------------------------------------------------------- */
 121
 122 int32 total_chars_trans,               /* Number of ASCII chars of text in   */
 123       total_bytes_trans,               /* Number of bytes of Z-code text out */
 124       zchars_trans_in_last_string;     /* Number of Z-chars in last string:
 125                                           needed only for abbrev efficiency
 126                                           calculation in "directs.c"         */
 127 static int32 total_zchars_trans;       /* Number of Z-chars of text out
 128                                           (only used to calculate the above) */
 129
 130 static int zchars_out_buffer[3],       /* During text translation, a buffer of
 131                                           3 Z-chars at a time: when it's full
 132                                           these are written as a 2-byte word */
 133            zob_index;                  /* Index (0 to 2) into it             */
 134
 135 uchar *translated_text;                /* Area holding translated strings
 136                                           until they are moved into the
 137                                           static_strings_area below */
 138 static memory_list translated_text_memlist;
 139
 140 static int32 text_out_pos;             /* The "program counter" during text
 141                                           translation: the next position to
 142                                           write Z-coded text output to       */
 143
 144 static int32 text_out_limit;           /* The upper limit of text_out_pos
 145                                           during text translation (or -1
 146                                           for no limit)                      */
 147
 148 static int text_out_overflow;          /* During text translation, becomes
 149                                           true if text_out_pos tries to pass
 150                                           text_out_limit                     */
 151
 152 /* ------------------------------------------------------------------------- */
 153 /*   For variables/arrays used by the dictionary manager, see below          */
 154 /* ------------------------------------------------------------------------- */
 155
 156 /* ------------------------------------------------------------------------- */
 157 /*   Prepare the abbreviations lookup table (used to speed up abbreviation   */
 158 /*   detection in text translation).  We first bubble-sort the abbrevs into  */
 159 /*   alphabetical order (this is necessary for the detection algorithm to    */
 160 /*   to work).  Since the table is only prepared once, and for a table       */
 161 /*   of size at most 96, there's no point using an efficient sort algorithm. */
 162 /* ------------------------------------------------------------------------- */
 163
 164 static void make_abbrevs_lookup(void)
 165 {   int bubble_sort, j, k, l; char p[MAX_ABBREV_LENGTH]; char *p1, *p2;
 166     do
 167     {   bubble_sort = FALSE;
 168         for (j=0; j<no_abbreviations; j++)
 169             for (k=j+1; k<no_abbreviations; k++)
 170             {   p1=(char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
 171                 p2=(char *)abbreviations_at+k*MAX_ABBREV_LENGTH;
 172                 if (strcmp(p1,p2)<0)
 173                 {   strcpy(p,p1); strcpy(p1,p2); strcpy(p2,p);
 174                     l=abbreviations[j].value; abbreviations[j].value=abbreviations[k].value;
 175                     abbreviations[k].value=l;
 176                     l=abbreviations[j].quality; abbreviations[j].quality=abbreviations[k].quality;
 177                     abbreviations[k].quality=l;
 178                     bubble_sort = TRUE;
 179                 }
 180             }
 181     } while (bubble_sort);
 182
 183     for (j=no_abbreviations-1; j>=0; j--)
 184     {   p1=(char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
 185         abbrevs_lookup[(uchar)p1[0]]=j;
 186         abbreviations[j].freq=0;
 187     }
 188     abbrevs_lookup_table_made = TRUE;
 189 }
 190
 191 /* ------------------------------------------------------------------------- */
 192 /*   Search the abbreviations lookup table (a routine which must be fast).   */
 193 /*   The source text to compare is text[i], text[i+1], ... and this routine  */
 194 /*   is only called if text[i] is indeed the first character of at least one */
 195 /*   abbreviation, "from" begin the least index into the abbreviations table */
 196 /*   of an abbreviation for which text[i] is the first character.  Recall    */
 197 /*   that the abbrevs table is in alphabetical order.                        */
 198 /*                                                                           */
 199 /*   The return value is -1 if there is no match.  If there is a match, the  */
 200 /*   text to be abbreviated out is over-written by a string of null chars    */
 201 /*   with "ASCII" value 1, and the abbreviation number is returned.          */
 202 /*                                                                           */
 203 /*   In Glulx, we *do not* do this overwriting with 1's.                     */
 204 /* ------------------------------------------------------------------------- */
 205
 206 static int try_abbreviations_from(unsigned char *text, int i, int from)
 207 {   int j, k; uchar *p, c;
 208     c=text[i];
 209     for (j=from, p=(uchar *)abbreviations_at+from*MAX_ABBREV_LENGTH;
 210          (j<no_abbreviations)&&(c==p[0]); j++, p+=MAX_ABBREV_LENGTH)
 211     {   if (text[i+1]==p[1])
 212         {   for (k=2; p[k]!=0; k++)
 213                 if (text[i+k]!=p[k]) goto NotMatched;
 214             if (!glulx_mode) {
 215                 for (k=0; p[k]!=0; k++) text[i+k]=1;
 216             }
 217             abbreviations[j].freq++;
 218             return(j);
 219             NotMatched: ;
 220         }
 221     }
 222     return(-1);
 223 }
 224
 225 extern void make_abbreviation(char *text)
 226 {
 227     ensure_memory_list_available(&abbreviations_memlist, no_abbreviations+1);
 228     ensure_memory_list_available(&abbreviations_at_memlist, no_abbreviations+1);
 229
 230     strcpy((char *)abbreviations_at
 231             + no_abbreviations*MAX_ABBREV_LENGTH, text);
 232
 233     abbreviations[no_abbreviations].value = compile_string(text, STRCTX_ABBREV);
 234     abbreviations[no_abbreviations].freq = 0;
 235
 236     /*   The quality is the number of Z-chars saved by using this            */
 237     /*   abbreviation: note that it takes 2 Z-chars to print it.             */
 238
 239     abbreviations[no_abbreviations].quality = zchars_trans_in_last_string - 2;
 240
 241     if (abbreviations[no_abbreviations].quality <= 0) {
 242         warning_named("Abbreviation does not save any characters:", text);
 243     }
 244
 245     no_abbreviations++;
 246 }
 247
 248 /* ------------------------------------------------------------------------- */
 249 /*   The front end routine for text translation.                             */
 250 /*   strctx indicates the purpose of the string. This is mostly used for     */
 251 /*   informational output (gametext.txt), but we treat some string contexts  */
 252 /*   specially during compilation.                                           */
 253 /* ------------------------------------------------------------------------- */
 254
 255 extern int32 compile_string(char *b, int strctx)
 256 {   int32 i, j, k;
 257     uchar *c;
 258     int in_low_memory;
 259
 260     if (execution_never_reaches_here) {
 261         /* No need to put strings into gametext.txt or the static/low
 262            strings areas. */
 263         if (strctx == STRCTX_GAME || strctx == STRCTX_GAMEOPC || strctx == STRCTX_LOWSTRING || strctx == STRCTX_INFIX) {
 264             /* VENEER and VENEEROPC are only used at the translate_text level,
 265                so we don't have to catch them here. */
 266             return 0;
 267         }
 268     }
 269
 270     /* In Z-code, abbreviations go in the low memory pool (0x100). So
 271        do strings explicitly defined with the Lowstring directive.
 272        (In Glulx, the in_low_memory flag is ignored.) */
 273     in_low_memory = (strctx == STRCTX_ABBREV || strctx == STRCTX_LOWSTRING);
 274
 275     if (!glulx_mode && in_low_memory)
 276     {
 277         k = translate_text(-1, b, strctx);
 278         if (k<0) {
 279             error("text translation failed");
 280             k = 0;
 281         }
 282         ensure_memory_list_available(&low_strings_memlist, low_strings_top+k);
 283         memcpy(low_strings+low_strings_top, translated_text, k);
 284         j = low_strings_top;
 285         low_strings_top += k;
 286         return(0x21+(j/2));
 287     }
 288
 289     if (glulx_mode && done_compression)
 290         compiler_error("Tried to add a string after compression was done.");
 291
 292     i = translate_text(-1, b, strctx);
 293     if (i < 0) {
 294         error("text translation failed");
 295         i = 0;
 296     }
 297
 298     /* Insert null bytes as needed to ensure that the next static string */
 299     /* also occurs at an address expressible as a packed address         */
 300
 301     if (!glulx_mode) {
 302         int textalign;
 303         if (oddeven_packing_switch)
 304             textalign = scale_factor*2;
 305         else
 306             textalign = scale_factor;
 307         while ((i%textalign)!=0)
 308         {
 309             ensure_memory_list_available(&translated_text_memlist, i+2);
 310             translated_text[i++] = 0;
 311             translated_text[i++] = 0;
 312         }
 313     }
 314
 315     j = static_strings_extent;
 316
 317     ensure_memory_list_available(&static_strings_area_memlist, static_strings_extent+i);
 318     for (c=translated_text; c<translated_text+i;
 319          c++, static_strings_extent++)
 320         static_strings_area[static_strings_extent] = *c;
 321
 322     if (!glulx_mode) {
 323         return(j/scale_factor);
 324     }
 325     else {
 326         /* The marker value is a one-based string number. (We reserve zero
 327            to mean "not a string at all". */
 328         return (++no_strings);
 329     }
 330 }
 331
 332 /* ------------------------------------------------------------------------- */
 333 /*   Output a single Z-character into the buffer, and flush it if full       */
 334 /* ------------------------------------------------------------------------- */
 335
 336 static void write_z_char_z(int i)
 337 {   uint32 j;
 338     ASSERT_ZCODE();
 339     total_zchars_trans++;
 340     zchars_out_buffer[zob_index++]=(i%32);
 341     if (zob_index!=3) return;
 342     zob_index=0;
 343     j= zchars_out_buffer[0]*0x0400 + zchars_out_buffer[1]*0x0020
 344        + zchars_out_buffer[2];
 345
 346     if (text_out_limit >= 0) {
 347         if (text_out_pos+2 > text_out_limit) {
 348             text_out_overflow = TRUE;
 349             return;
 350         }
 351     }
 352     else {
 353         ensure_memory_list_available(&translated_text_memlist, text_out_pos+2);
 354     }
 355
 356     translated_text[text_out_pos++] = j/256; translated_text[text_out_pos++] = j%256;
 357     total_bytes_trans+=2;
 358 }
 359
 360 static void write_zscii(int zsc)
 361 {
 362     int lookup_value, in_alphabet;
 363
 364     if (zsc==' ')
 365     {   write_z_char_z(0);
 366         return;
 367     }
 368
 369     if (zsc < 0x100) lookup_value = zscii_to_alphabet_grid[zsc];
 370
 371     else lookup_value = -1;
 372
 373     if (lookup_value >= 0)
 374     {   alphabet_used[lookup_value] = 'Y';
 375         in_alphabet = lookup_value/26;
 376         if (in_alphabet==1) write_z_char_z(4);  /* SHIFT to A1 */
 377         if (in_alphabet==2) write_z_char_z(5);  /* SHIFT to A2 */
 378         write_z_char_z(lookup_value%26 + 6);
 379     }
 380     else
 381     {   write_z_char_z(5); write_z_char_z(6);
 382         write_z_char_z(zsc/32); write_z_char_z(zsc%32);
 383     }
 384 }
 385
 386 /* ------------------------------------------------------------------------- */
 387 /*   Finish a Z-coded string, padding out with Z-char 5s if necessary and    */
 388 /*   setting the "end" bit on the final 2-byte word                          */
 389 /* ------------------------------------------------------------------------- */
 390
 391 static void end_z_chars(void)
 392 {
 393     zchars_trans_in_last_string=total_zchars_trans-zchars_trans_in_last_string;
 394     while (zob_index!=0) write_z_char_z(5);
 395     if (text_out_pos < 2) {
 396         /* Something went wrong. */
 397         text_out_overflow = TRUE;
 398         return;
 399     }
 400     translated_text[text_out_pos-2] += 128;
 401 }
 402
 403 /* Glulx handles this much more simply -- compression is done elsewhere. */
 404 static void write_z_char_g(int i)
 405 {
 406     ASSERT_GLULX();
 407     if (text_out_limit >= 0) {
 408         if (text_out_pos+1 > text_out_limit) {
 409             text_out_overflow = TRUE;
 410             return;
 411         }
 412     }
 413     else {
 414         ensure_memory_list_available(&translated_text_memlist, text_out_pos+1);
 415     }
 416     total_zchars_trans++;
 417     translated_text[text_out_pos++] = i;
 418     total_bytes_trans++;
 419 }
 420
 421 /* Helper routine to compute the weight, in units, of a character handled by the Z-Machine */
 422 static int zchar_weight(int c)
 423 {
 424     int lookup = iso_to_alphabet_grid[c];
 425     if (lookup < 0) return 4;
 426     if (lookup < 26) return 1;
 427     return 2;
 428 }
 429
 430 /* ------------------------------------------------------------------------- */
 431 /*   The main routine "text.c" provides to the rest of Inform: the text      */
 432 /*   translator. s_text is the source text and the return value is the       */
 433 /*   number of bytes translated.                                             */
 434 /*   The translated text will be stored in translated_text.                  */
 435 /*                                                                           */
 436 /*   If p_limit is >= 0, the text length will not exceed that many bytes.    */
 437 /*   If the translation tries to overflow this boundary, the return value    */
 438 /*   will be -1. (You should display an error and not read translated_text.) */
 439 /*                                                                           */
 440 /*   If p_limit is negative, any amount of text is accepted (up to int32     */
 441 /*   anyway).                                                                */
 442 /*                                                                           */
 443 /*   Note that the source text may be corrupted by this routine.             */
 444 /* ------------------------------------------------------------------------- */
 445
 446 extern int32 translate_text(int32 p_limit, char *s_text, int strctx)
 447 {   int i, j, k, in_alphabet, lookup_value, is_abbreviation;
 448     int32 unicode; int zscii;
 449     unsigned char *text_in;
 450
 451     if (p_limit >= 0) {
 452         ensure_memory_list_available(&translated_text_memlist, p_limit);
 453     }
 454
 455     /* For STRCTX_ABBREV, the string being translated is itself an
 456        abbreviation string, so it can't make use of abbreviations. Set
 457        the is_abbreviation flag to indicate this.
 458        The compiler has historically set this flag for the Lowstring
 459        directive as well -- the in_low_memory and is_abbreviation flag were
 460        always the same. I am preserving that convention. */
 461     is_abbreviation = (strctx == STRCTX_ABBREV || strctx == STRCTX_LOWSTRING);
 462
 463
 464     /*  Cast the input and output streams to unsigned char: text_out_pos will
 465         advance as bytes of Z-coded text are written, but text_in doesn't    */
 466
 467     text_in     = (unsigned char *) s_text;
 468     text_out_pos = 0;
 469     text_out_limit = p_limit;
 470     text_out_overflow = FALSE;
 471
 472     /*  Remember the Z-chars total so that later we can subtract to find the
 473         number of Z-chars translated on this string                          */
 474
 475     zchars_trans_in_last_string = total_zchars_trans;
 476
 477     /*  Start with the Z-characters output buffer empty                      */
 478
 479     zob_index=0;
 480
 481     /*  If this is the first text translated since the abbreviations were
 482         declared, and if some were declared, then it's time to make the
 483         lookup table for abbreviations
 484
 485         (Except: we don't if the text being translated is itself
 486         the text of an abbreviation currently being defined)                 */
 487
 488     if ((!abbrevs_lookup_table_made) && (no_abbreviations > 0)
 489         && (!is_abbreviation))
 490         make_abbrevs_lookup();
 491
 492     /*  If we're storing the whole game text to memory, then add this text.
 493         We will put two newlines between each text and four at the very end.
 494         (The optimise code does a lot of sloppy text[i+2], so the extra
 495         two newlines past all_text_top are necessary.) */
 496
 497     if ((!is_abbreviation) && (store_the_text))
 498     {   int addlen = strlen(s_text);
 499         ensure_memory_list_available(&all_text_memlist, all_text_top+addlen+5);
 500         sprintf(all_text+all_text_top, "%s\n\n\n\n", s_text);
 501         /* Advance past two newlines. */
 502         all_text_top += (addlen+2);
 503     }
 504
 505     if (transcript_switch) {
 506         /* Omit veneer strings, unless we're using the new transcript format, which includes everything. */
 507         if ((!veneer_mode) || TRANSCRIPT_FORMAT == 1) {
 508             int label = strctx;
 509             if (veneer_mode) {
 510                 if (label == STRCTX_GAME)
 511                     label = STRCTX_VENEER;
 512                 else if (label == STRCTX_GAMEOPC)
 513                     label = STRCTX_VENEEROPC;
 514             }
 515             write_to_transcript_file(s_text, label);
 516         }
 517     }
 518
 519     /* Computing the optimal way to parse strings to insert abbreviations with dynamic programming */
 520     /*  (ref: R.A. Wagner , "Common phrases and minimum-space text storage", Commun. ACM, 16 (3) (1973)) */
 521     /* We compute this optimal way here; it's stored in abbreviations_optimal_parse_schedule */
 522     if (economy_switch)
 523     {
 524         uchar *q, c;
 525         int l, min_score, from;
 526         int text_in_length;
 527
 528         text_in_length = strlen( (char*) text_in);
 529         ensure_memory_list_available(&abbreviations_optimal_parse_schedule_memlist, text_in_length);
 530         ensure_memory_list_available(&abbreviations_optimal_parse_scores_memlist, text_in_length+1);
 531
 532         abbreviations_optimal_parse_scores[text_in_length] = 0;
 533         for(j=text_in_length-1; j>=0; j--)
 534         {   /* Initial values: empty schedule, score = just write the letter without abbreviating. */
 535             abbreviations_optimal_parse_schedule[j] = -1;
 536             min_score = zchar_weight(text_in[j]) + abbreviations_optimal_parse_scores[j+1];
 537             /* If there's an abbreviation starting with that letter... */
 538             if ( (from = abbrevs_lookup[text_in[j]]) != -1)
 539             {
 540                 c = text_in[j];
 541                 /* Loop on all abbreviations starting with what is in c. */
 542                 for (k=from, q=(uchar *)abbreviations_at+from*MAX_ABBREV_LENGTH;
 543                     (k<no_abbreviations)&&(c==q[0]); k++, q+=MAX_ABBREV_LENGTH)
 544                 {
 545                     /* Let's compare; we also keep track of the length of the abbreviation. */
 546                     for (l=1; q[l]!=0; l++)
 547                     {    if (text_in[j+l]!=q[l]) {goto NotMatched;}
 548                     }
 549                     /* We have a match (length l), but is it smaller in size? */
 550                     if (min_score > 2 + abbreviations_optimal_parse_scores[j+l])
 551                     {   /* It is indeed smaller, so let's write it down in our schedule. */
 552                         min_score = 2 + abbreviations_optimal_parse_scores[j+l];
 553                         abbreviations_optimal_parse_schedule[j] = k;
 554                     }
 555                     NotMatched: ;
 556                 }
 557             }
 558             /* We gave it our best, this is the smallest we got. */
 559             abbreviations_optimal_parse_scores[j] = min_score;
 560         }
 561     }
 562
 563
 564
 565   if (!glulx_mode) {
 566
 567     /*  The empty string of Z-text is illegal, since it can't carry an end
 568         bit: so we translate an empty string of ASCII text to just the
 569         pad character 5.  Printing this causes nothing to appear on screen.  */
 570
 571     if (text_in[0]==0) write_z_char_z(5);
 572
 573     /*  Loop through the characters of the null-terminated input text: note
 574         that if 1 is written over a character in the input text, it is
 575         afterwards ignored                                                   */
 576
 577     for (i=0; text_in[i]!=0; i++)
 578     {   total_chars_trans++;
 579
 580         /*  Contract ".  " into ". " if double-space-removing switch set:
 581             likewise "?  " and "!  " if the setting is high enough           */
 582
 583         if ((double_space_setting >= 1)
 584             && (text_in[i+1]==' ') && (text_in[i+2]==' '))
 585         {   if (text_in[i]=='.') text_in[i+2]=1;
 586             if (double_space_setting >= 2)
 587             {   if (text_in[i]=='?') text_in[i+2]=1;
 588                 if (text_in[i]=='!') text_in[i+2]=1;
 589             }
 590         }
 591
 592         /*  Try abbreviations if the economy switch set. */
 593         /*  Look at the abbreviation schedule to see if we should abbreviate here. */
 594         /*  Note: Just because the schedule has something doesn't mean we should abbreviate there; */
 595         /*  sometimes you abbreviate before because it's better. If we have already replaced the */
 596         /*  char by a '1', it means we're in the middle of an abbreviation; don't try to abbreviate then. */
 597         if ((economy_switch) && (!is_abbreviation) && text_in[i] != 1 &&
 598             ((j = abbreviations_optimal_parse_schedule[i]) != -1))
 599         {
 600             /* Fill with 1s, which will get ignored by everyone else. */
 601             uchar *p = (uchar *)abbreviations_at+j*MAX_ABBREV_LENGTH;
 602             for (k=0; p[k]!=0; k++) text_in[i+k]=1;
 603             /* Actually write the abbreviation in the story file. */
 604             abbreviations[j].freq++;
 605             /* Abbreviations run from MAX_DYNAMIC_STRINGS to 96. */
 606             j += MAX_DYNAMIC_STRINGS;
 607             write_z_char_z(j/32+1); write_z_char_z(j%32);
 608         }
 609
 610
 611         /* If Unicode switch set, use text_to_unicode to perform UTF-8
 612            decoding */
 613         if (character_set_unicode && (text_in[i] & 0x80))
 614         {   unicode = text_to_unicode((char *) (text_in+i));
 615             zscii = unicode_to_zscii(unicode);
 616             if (zscii != 5) write_zscii(zscii);
 617             else
 618             {   unicode_char_error(
 619                     "Character can only be used if declared in \
 620 advance as part of 'Zcharacter table':", unicode);
 621             }
 622             i += textual_form_length - 1;
 623             continue;
 624         }
 625
 626         /*  '@' is the escape character in Inform string notation: the various
 627             possibilities are:
 628
 629                 @@decimalnumber  :  write this ZSCII char (0 to 1023)
 630                 @twodigits or    :  write the abbreviation string with this
 631                 @(digits)           decimal number
 632                 @(symbol)        :  write the abbreviation string with this
 633                                     (constant) value
 634                 @accentcode      :  this accented character: e.g.,
 635                                         for @'e write an E-acute
 636                 @{...}           :  this Unicode char (in hex)              */
 637
 638         if (text_in[i]=='@')
 639         {   if (text_in[i+1]=='@')
 640             {
 641                 /*   @@... (ascii value)  */
 642
 643                 i+=2; j=atoi((char *) (text_in+i));
 644                 switch(j)
 645                 {   /* Prevent ~ and ^ from being translated to double-quote
 646                        and new-line, as they ordinarily would be */
 647
 648                     case 94:   write_z_char_z(5); write_z_char_z(6);
 649                                write_z_char_z(94/32); write_z_char_z(94%32);
 650                                break;
 651                     case 126:  write_z_char_z(5); write_z_char_z(6);
 652                                write_z_char_z(126/32); write_z_char_z(126%32);
 653                                break;
 654
 655                     default:   write_zscii(j); break;
 656                 }
 657                 while (isdigit(text_in[i])) i++; i--;
 658             }
 659             else if (text_in[i+1]=='(')
 660             {
 661                 /*   @(...) (dynamic string)   */
 662                 char dsymbol[MAX_IDENTIFIER_LENGTH+1];
 663                 int len = 0, digits = 0;
 664                 i += 2;
 665                 /* This accepts "12xyz" as a symbol, which it really isn't,
 666                    but that just means it won't be found. */
 667                 while ((text_in[i] == '_' || isalnum(text_in[i])) && len < MAX_IDENTIFIER_LENGTH) {
 668                     char ch = text_in[i++];
 669                     if (isdigit(ch)) digits++;
 670                     dsymbol[len++] = ch;
 671                 }
 672                 dsymbol[len] = '\0';
 673                 j = -1;
 674                 /* We would like to parse dsymbol as *either* a decimal
 675                    number or a constant symbol. */
 676                 if (text_in[i] != ')' || len == 0) {
 677                     error("'@(...)' abbreviation must contain a symbol");
 678                 }
 679                 else if (digits == len) {
 680                     /* all digits; parse as decimal */
 681                     j = atoi(dsymbol);
 682                 }
 683                 else {
 684                     int sym = symbol_index(dsymbol, -1);
 685                     if ((symbols[sym].flags & UNKNOWN_SFLAG) || symbols[sym].type != CONSTANT_T || symbols[sym].marker) {
 686                         error_named("'@(...)' abbreviation expected a known constant value, but contained", dsymbol);
 687                     }
 688                     else {
 689                         symbols[sym].flags |= USED_SFLAG;
 690                         j = symbols[sym].value;
 691                     }
 692                 }
 693                 if (!glulx_mode && j >= 96) {
 694                     error_max_dynamic_strings(j);
 695                     j = -1;
 696                 }
 697                 if (j >= MAX_DYNAMIC_STRINGS) {
 698                     error_max_dynamic_strings(j);
 699                     j = -1;
 700                 }
 701                 if (j >= 0) {
 702                     write_z_char_z(j/32+1); write_z_char_z(j%32);
 703                 }
 704                 else {
 705                     write_z_char_z(' '); /* error fallback */
 706                 }
 707             }
 708             else if (isdigit(text_in[i+1])!=0)
 709             {   int d1, d2;
 710
 711                 /*   @.. (dynamic string)   */
 712
 713                 d1 = character_digit_value[text_in[i+1]];
 714                 d2 = character_digit_value[text_in[i+2]];
 715                 if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10))
 716                     error("'@..' must have two decimal digits");
 717                 else
 718                 {
 719                     j = d1*10 + d2;
 720                     if (!glulx_mode && j >= 96) {
 721                         error_max_dynamic_strings(j);
 722                         j = -1;
 723                     }
 724                     if (j >= MAX_DYNAMIC_STRINGS) {
 725                         /* Shouldn't get here with two digits */
 726                         error_max_dynamic_strings(j);
 727                         j = -1;
 728                     }
 729                     i+=2;
 730                     if (j >= 0) {
 731                         write_z_char_z(j/32+1); write_z_char_z(j%32);
 732                     }
 733                     else {
 734                         write_z_char_z(' '); /* error fallback */
 735                     }
 736                 }
 737             }
 738             else
 739             {
 740                 /*   A string escape specifying an unusual character   */
 741
 742                 unicode = text_to_unicode((char *) (text_in+i));
 743                 zscii = unicode_to_zscii(unicode);
 744                 if (zscii != 5) write_zscii(zscii);
 745                 else
 746                 {   unicode_char_error(
 747                        "Character can only be used if declared in \
 748 advance as part of 'Zcharacter table':", unicode);
 749                 }
 750                 i += textual_form_length - 1;
 751             }
 752         }
 753         else
 754         {   /*  Skip a character which has been over-written with the null
 755                 value 1 earlier on                                           */
 756
 757             if (text_in[i]!=1)
 758             {   if (text_in[i]==' ') write_z_char_z(0);
 759                 else
 760                 {   j = (int) text_in[i];
 761                     lookup_value = iso_to_alphabet_grid[j];
 762                     if (lookup_value < 0)
 763                     {   /*  The character isn't in the standard alphabets, so
 764                             we have to use the ZSCII 4-Z-char sequence */
 765
 766                         if (lookup_value == -5)
 767                         {   /*  Character isn't in the ZSCII set at all */
 768
 769                             unicode = iso_to_unicode(j);
 770                             unicode_char_error(
 771                                 "Character can only be used if declared in \
 772 advance as part of 'Zcharacter table':", unicode);
 773                             write_zscii(0x200 + unicode/0x100);
 774                             write_zscii(0x300 + unicode%0x100);
 775                         }
 776                         else write_zscii(-lookup_value);
 777                     }
 778                     else
 779                     {   /*  The character is in one of the standard alphabets:
 780                             write a SHIFT to temporarily change alphabet if
 781                             it isn't in alphabet 0, then write the Z-char    */
 782
 783                         alphabet_used[lookup_value] = 'Y';
 784                         in_alphabet = lookup_value/26;
 785                         if (in_alphabet==1) write_z_char_z(4);  /* SHIFT to A1 */
 786                         if (in_alphabet==2) write_z_char_z(5);  /* SHIFT to A2 */
 787                         write_z_char_z(lookup_value%26 + 6);
 788                     }
 789                 }
 790             }
 791         }
 792     }
 793
 794     /*  Flush the Z-characters output buffer and set the "end" bit           */
 795
 796     end_z_chars();
 797   }
 798   else {
 799
 800     /* The text storage here is, of course, temporary. Compression
 801        will occur when we're finished compiling, so that all the
 802        clever Huffman stuff will work.
 803        In the stored text, we use "@@" to indicate @,
 804        "@0" to indicate a zero byte,
 805        "@ANNNN" to indicate an abbreviation,
 806        "@DNNNN" to indicate a dynamic string thing.
 807        "@UNNNN" to indicate a four-byte Unicode value (0x100 or higher).
 808        (NNNN is a four-digit hex number using the letters A-P... an
 809        ugly representation but a convenient one.)
 810     */
 811
 812     for (i=0; text_in[i]!=0; i++) {
 813
 814       /*  Contract ".  " into ". " if double-space-removing switch set:
 815           likewise "?  " and "!  " if the setting is high enough. */
 816       if ((double_space_setting >= 1)
 817         && (text_in[i+1]==' ') && (text_in[i+2]==' ')) {
 818         if (text_in[i]=='.'
 819           || (double_space_setting >= 2
 820             && (text_in[i]=='?' || text_in[i]=='!'))) {
 821           text_in[i+1] = text_in[i];
 822           i++;
 823         }
 824       }
 825
 826       total_chars_trans++;
 827
 828       /*  Try abbreviations if the economy switch set. We have to be in
 829           compression mode too, since the abbreviation mechanism is part
 830           of string decompression. */
 831
 832       if ((economy_switch) && (compression_switch) && (!is_abbreviation)
 833         && ((k=abbrevs_lookup[text_in[i]])!=-1)
 834         && ((j=try_abbreviations_from(text_in, i, k)) != -1)) {
 835         char *cx = (char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
 836         i += (strlen(cx)-1);
 837         write_z_char_g('@');
 838         write_z_char_g('A');
 839         write_z_char_g('A' + ((j >>12) & 0x0F));
 840         write_z_char_g('A' + ((j >> 8) & 0x0F));
 841         write_z_char_g('A' + ((j >> 4) & 0x0F));
 842         write_z_char_g('A' + ((j     ) & 0x0F));
 843       }
 844       else if (text_in[i] == '@') {
 845         if (text_in[i+1]=='@') {
 846           /* An ASCII code */
 847           i+=2; j=atoi((char *) (text_in+i));
 848           if (j == '@' || j == '\0') {
 849             write_z_char_g('@');
 850             if (j == 0) {
 851               j = '0';
 852               if (!compression_switch)
 853                 warning("Ascii @@0 will prematurely terminate non-compressed \
 854 string.");
 855             }
 856           }
 857           write_z_char_g(j);
 858           while (isdigit(text_in[i])) i++; i--;
 859         }
 860         else if (text_in[i+1]=='(') {
 861             char dsymbol[MAX_IDENTIFIER_LENGTH+1];
 862             int len = 0, digits = 0;
 863             i += 2;
 864             /* This accepts "12xyz" as a symbol, which it really isn't,
 865                but that just means it won't be found. */
 866             while ((text_in[i] == '_' || isalnum(text_in[i])) && len < MAX_IDENTIFIER_LENGTH) {
 867                 char ch = text_in[i++];
 868                 if (isdigit(ch)) digits++;
 869                 dsymbol[len++] = ch;
 870             }
 871             dsymbol[len] = '\0';
 872             j = -1;
 873             /* We would like to parse dsymbol as *either* a decimal
 874                number or a constant symbol. */
 875             if (text_in[i] != ')' || len == 0) {
 876                 error("'@(...)' abbreviation must contain a symbol");
 877             }
 878             else if (digits == len) {
 879                 /* all digits; parse as decimal */
 880                 j = atoi(dsymbol);
 881             }
 882             else {
 883                 int sym = symbol_index(dsymbol, -1);
 884                 if ((symbols[sym].flags & UNKNOWN_SFLAG) || symbols[sym].type != CONSTANT_T || symbols[sym].marker) {
 885                     error_named("'@(...)' abbreviation expected a known constant value, but contained", dsymbol);
 886                 }
 887                 else {
 888                     symbols[sym].flags |= USED_SFLAG;
 889                     j = symbols[sym].value;
 890                 }
 891             }
 892             if (j >= MAX_DYNAMIC_STRINGS) {
 893                 error_max_dynamic_strings(j);
 894                 j = -1;
 895             }
 896             if (j+1 >= no_dynamic_strings)
 897                 no_dynamic_strings = j+1;
 898             if (j >= 0) {
 899                 write_z_char_g('@');
 900                 write_z_char_g('D');
 901                 write_z_char_g('A' + ((j >>12) & 0x0F));
 902                 write_z_char_g('A' + ((j >> 8) & 0x0F));
 903                 write_z_char_g('A' + ((j >> 4) & 0x0F));
 904                 write_z_char_g('A' + ((j     ) & 0x0F));
 905             }
 906             else {
 907                 write_z_char_g(' '); /* error fallback */
 908             }
 909         }
 910         else if (isdigit(text_in[i+1])) {
 911           int d1, d2;
 912           d1 = character_digit_value[text_in[i+1]];
 913           d2 = character_digit_value[text_in[i+2]];
 914           if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10)) {
 915             error("'@..' must have two decimal digits");
 916           }
 917           else {
 918             if (!compression_switch)
 919               warning("'@..' print variable will not work in non-compressed \
 920 string; substituting '   '.");
 921             i += 2;
 922             j = d1*10 + d2;
 923             if (j >= MAX_DYNAMIC_STRINGS) {
 924               error_max_dynamic_strings(j);
 925               j = -1;
 926             }
 927             if (j+1 >= no_dynamic_strings)
 928               no_dynamic_strings = j+1;
 929             if (j >= 0) {
 930                 write_z_char_g('@');
 931                 write_z_char_g('D');
 932                 write_z_char_g('A' + ((j >>12) & 0x0F));
 933                 write_z_char_g('A' + ((j >> 8) & 0x0F));
 934                 write_z_char_g('A' + ((j >> 4) & 0x0F));
 935                 write_z_char_g('A' + ((j     ) & 0x0F));
 936             }
 937             else {
 938                 write_z_char_g(' '); /* error fallback */
 939             }
 940           }
 941         }
 942         else {
 943           unicode = text_to_unicode((char *) (text_in+i));
 944           i += textual_form_length - 1;
 945           if (unicode == '@' || unicode == '\0') {
 946             write_z_char_g('@');
 947             write_z_char_g(unicode ? '@' : '0');
 948           }
 949           else if (unicode >= 0 && unicode < 256) {
 950             write_z_char_g(unicode);
 951           }
 952           else {
 953             if (!compression_switch) {
 954               warning("Unicode characters will not work in non-compressed \
 955 string; substituting '?'.");
 956               write_z_char_g('?');
 957             }
 958             else {
 959               j = unicode_entity_index(unicode);
 960               write_z_char_g('@');
 961               write_z_char_g('U');
 962               write_z_char_g('A' + ((j >>12) & 0x0F));
 963               write_z_char_g('A' + ((j >> 8) & 0x0F));
 964               write_z_char_g('A' + ((j >> 4) & 0x0F));
 965               write_z_char_g('A' + ((j     ) & 0x0F));
 966             }
 967           }
 968         }
 969       }
 970       else if (text_in[i] == '^')
 971         write_z_char_g(0x0A);
 972       else if (text_in[i] == '~')
 973         write_z_char_g('"');
 974       else if (character_set_unicode) {
 975         if (text_in[i] & 0x80) {
 976           unicode = text_to_unicode((char *) (text_in+i));
 977           i += textual_form_length - 1;
 978           if (unicode >= 0 && unicode < 256) {
 979             write_z_char_g(unicode);
 980           }
 981           else {
 982             if (!compression_switch) {
 983               warning("Unicode characters will not work in non-compressed \
 984 string; substituting '?'.");
 985               write_z_char_g('?');
 986             }
 987             else {
 988               j = unicode_entity_index(unicode);
 989               write_z_char_g('@');
 990               write_z_char_g('U');
 991               write_z_char_g('A' + ((j >>12) & 0x0F));
 992               write_z_char_g('A' + ((j >> 8) & 0x0F));
 993               write_z_char_g('A' + ((j >> 4) & 0x0F));
 994               write_z_char_g('A' + ((j     ) & 0x0F));
 995             }
 996           }
 997         }
 998         else {
 999           write_z_char_g(text_in[i]);
1000         }
1001       }
1002       else {
1003         unicode = iso_to_unicode_grid[text_in[i]];
1004         if (unicode >= 0 && unicode < 256) {
1005           write_z_char_g(unicode);
1006         }
1007         else {
1008           if (!compression_switch) {
1009             warning("Unicode characters will not work in non-compressed \
1010 string; substituting '?'.");
1011             write_z_char_g('?');
1012           }
1013           else {
1014             j = unicode_entity_index(unicode);
1015             write_z_char_g('@');
1016             write_z_char_g('U');
1017             write_z_char_g('A' + ((j >>12) & 0x0F));
1018             write_z_char_g('A' + ((j >> 8) & 0x0F));
1019             write_z_char_g('A' + ((j >> 4) & 0x0F));
1020             write_z_char_g('A' + ((j     ) & 0x0F));
1021           }
1022         }
1023       }
1024     }
1025     write_z_char_g(0);
1026     zchars_trans_in_last_string=total_zchars_trans-zchars_trans_in_last_string;
1027
1028   }
1029
1030   if (text_out_overflow)
1031       return -1;
1032   else
1033       return text_out_pos;
1034 }
1035
1036 static int unicode_entity_index(int32 unicode)
1037 {
1038   int j;
1039   int buck = unicode % UNICODE_HASH_BUCKETS;
1040
1041   for (j = unicode_usage_hash[buck]; j >= 0; j=unicode_usage_entries[j].next) {
1042     if (unicode_usage_entries[j].ch == unicode)
1043       break;
1044   }
1045   if (j < 0) {
1046     ensure_memory_list_available(&unicode_usage_entries_memlist, no_unicode_chars+1);
1047     j = no_unicode_chars++;
1048     unicode_usage_entries[j].ch = unicode;
1049     unicode_usage_entries[j].next = unicode_usage_hash[buck];
1050     unicode_usage_hash[buck] = j;
1051   }
1052
1053   return j;
1054 }
1055
1056 /* ------------------------------------------------------------------------- */
1057 /*   Glulx compression code                                                  */
1058 /* ------------------------------------------------------------------------- */
1059
1060
1061 static void compress_makebits(int entnum, int depth, int prevbit,
1062   huffbitlist_t *bits);
1063
1064 /*   The compressor. This uses the usual Huffman compression algorithm. */
1065 void compress_game_text()
1066 {
1067   int entities=0, branchstart, branches;
1068   int numlive;
1069   int32 lx;
1070   int jx;
1071   int ch;
1072   int32 ix;
1073   int max_char_set;
1074   huffbitlist_t bits;
1075
1076   if (compression_switch) {
1077     max_char_set = 257 + no_abbreviations + no_dynamic_strings + no_unicode_chars;
1078
1079     huff_entities = my_calloc(sizeof(huffentity_t), max_char_set*2+1,
1080       "huffman entities");
1081     hufflist = my_calloc(sizeof(huffentity_t *), max_char_set,
1082       "huffman node list");
1083
1084     /* How many entities have we currently got? Well, 256 plus the
1085        string-terminator plus Unicode chars plus abbrevations plus
1086        dynamic strings. */
1087     entities = 256+1;
1088     huff_unicode_start = entities;
1089     entities += no_unicode_chars;
1090     huff_abbrev_start = entities;
1091     if (economy_switch)
1092       entities += no_abbreviations;
1093     huff_dynam_start = entities;
1094     entities += no_dynamic_strings;
1095
1096     if (entities > max_char_set)
1097       compiler_error("Too many entities for max_char_set");
1098
1099     /* Characters */
1100     for (jx=0; jx<256; jx++) {
1101       huff_entities[jx].type = 2;
1102       huff_entities[jx].count = 0;
1103       huff_entities[jx].u.ch = jx;
1104     }
1105     /* Terminator */
1106     huff_entities[256].type = 1;
1107     huff_entities[256].count = 0;
1108     for (jx=0; jx<no_unicode_chars; jx++) {
1109       huff_entities[huff_unicode_start+jx].type = 4;
1110       huff_entities[huff_unicode_start+jx].count = 0;
1111       huff_entities[huff_unicode_start+jx].u.val = jx;
1112     }
1113     if (economy_switch) {
1114       for (jx=0; jx<no_abbreviations; jx++) {
1115         huff_entities[huff_abbrev_start+jx].type = 3;
1116         huff_entities[huff_abbrev_start+jx].count = 0;
1117         huff_entities[huff_abbrev_start+jx].u.val = jx;
1118       }
1119     }
1120     for (jx=0; jx<no_dynamic_strings; jx++) {
1121       huff_entities[huff_dynam_start+jx].type = 9;
1122       huff_entities[huff_dynam_start+jx].count = 0;
1123       huff_entities[huff_dynam_start+jx].u.val = jx;
1124     }
1125   }
1126   else {
1127     /* No compression; use defaults that will make it easy to check
1128        for errors. */
1129     no_huff_entities = 257;
1130     huff_unicode_start = 257;
1131     huff_abbrev_start = 257;
1132     huff_dynam_start = 257+no_abbreviations;
1133     compression_table_size = 0;
1134   }
1135
1136   if (compression_switch) {
1137
1138     for (lx=0, ix=0; lx<no_strings; lx++) {
1139       int escapelen=0, escapetype=0;
1140       int done=FALSE;
1141       int32 escapeval=0;
1142       while (!done) {
1143         ch = static_strings_area[ix];
1144         ix++;
1145         if (ix > static_strings_extent || ch < 0)
1146           compiler_error("Read too much not-yet-compressed text.");
1147         if (escapelen == -1) {
1148           escapelen = 0;
1149           if (ch == '@') {
1150             ch = '@';
1151           }
1152           else if (ch == '0') {
1153             ch = '\0';
1154           }
1155           else if (ch == 'A' || ch == 'D' || ch == 'U') {
1156             escapelen = 4;
1157             escapetype = ch;
1158             escapeval = 0;
1159             continue;
1160           }
1161           else {
1162             compiler_error("Strange @ escape in processed text.");
1163           }
1164         }
1165         else if (escapelen) {
1166           escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
1167           escapelen--;
1168           if (escapelen == 0) {
1169             if (escapetype == 'A') {
1170               ch = huff_abbrev_start+escapeval;
1171             }
1172             else if (escapetype == 'D') {
1173               ch = huff_dynam_start+escapeval;
1174             }
1175             else if (escapetype == 'U') {
1176               ch = huff_unicode_start+escapeval;
1177             }
1178             else {
1179               compiler_error("Strange @ escape in processed text.");
1180             }
1181           }
1182           else
1183             continue;
1184         }
1185         else {
1186           if (ch == '@') {
1187             escapelen = -1;
1188             continue;
1189           }
1190           if (ch == 0) {
1191             ch = 256;
1192             done = TRUE;
1193           }
1194         }
1195         huff_entities[ch].count++;
1196       }
1197     }
1198
1199     numlive = 0;
1200     for (jx=0; jx<entities; jx++) {
1201       if (huff_entities[jx].count) {
1202         hufflist[numlive] = &(huff_entities[jx]);
1203         numlive++;
1204       }
1205     }
1206
1207     branchstart = entities;
1208     branches = 0;
1209
1210     while (numlive > 1) {
1211       int best1, best2;
1212       int best1num, best2num;
1213       huffentity_t *bran;
1214
1215       if (hufflist[0]->count < hufflist[1]->count) {
1216         best1 = 0;
1217         best2 = 1;
1218       }
1219       else {
1220         best2 = 0;
1221         best1 = 1;
1222       }
1223
1224       best1num = hufflist[best1]->count;
1225       best2num = hufflist[best2]->count;
1226
1227       for (jx=2; jx<numlive; jx++) {
1228         if (hufflist[jx]->count < best1num) {
1229           best2 = best1;
1230           best2num = best1num;
1231           best1 = jx;
1232           best1num = hufflist[best1]->count;
1233         }
1234         else if (hufflist[jx]->count < best2num) {
1235           best2 = jx;
1236           best2num = hufflist[best2]->count;
1237         }
1238       }
1239
1240       bran = &(huff_entities[branchstart+branches]);
1241       branches++;
1242       bran->type = 0;
1243       bran->count = hufflist[best1]->count + hufflist[best2]->count;
1244       bran->u.branch[0] = (hufflist[best1] - huff_entities);
1245       bran->u.branch[1] = (hufflist[best2] - huff_entities);
1246       hufflist[best1] = bran;
1247       if (best2 < numlive-1) {
1248         memmove(&(hufflist[best2]), &(hufflist[best2+1]),
1249           ((numlive-1) - best2) * sizeof(huffentity_t *));
1250       }
1251       numlive--;
1252     }
1253
1254     huff_entity_root = (hufflist[0] - huff_entities);
1255
1256     for (ix=0; ix<MAXHUFFBYTES; ix++)
1257       bits.b[ix] = 0;
1258     compression_table_size = 12;
1259
1260     no_huff_entities = 0; /* compress_makebits will total this up */
1261     compress_makebits(huff_entity_root, 0, -1, &bits);
1262   }
1263
1264   /* Now, sadly, we have to compute the size of the string section,
1265      without actually doing the compression. */
1266   compression_string_size = 0;
1267
1268   ensure_memory_list_available(&compressed_offsets_memlist, no_strings);
1269
1270   for (lx=0, ix=0; lx<no_strings; lx++) {
1271     int escapelen=0, escapetype=0;
1272     int done=FALSE;
1273     int32 escapeval=0;
1274     jx = 0;
1275     compressed_offsets[lx] = compression_table_size + compression_string_size;
1276     compression_string_size++; /* for the type byte */
1277     while (!done) {
1278       ch = static_strings_area[ix];
1279       ix++;
1280       if (ix > static_strings_extent || ch < 0)
1281         compiler_error("Read too much not-yet-compressed text.");
1282       if (escapelen == -1) {
1283         escapelen = 0;
1284         if (ch == '@') {
1285           ch = '@';
1286         }
1287         else if (ch == '0') {
1288           ch = '\0';
1289         }
1290         else if (ch == 'A' || ch == 'D' || ch == 'U') {
1291           escapelen = 4;
1292           escapetype = ch;
1293           escapeval = 0;
1294           continue;
1295         }
1296         else {
1297           compiler_error("Strange @ escape in processed text.");
1298         }
1299       }
1300       else if (escapelen) {
1301         escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
1302         escapelen--;
1303         if (escapelen == 0) {
1304           if (escapetype == 'A') {
1305             ch = huff_abbrev_start+escapeval;
1306           }
1307           else if (escapetype == 'D') {
1308             ch = huff_dynam_start+escapeval;
1309           }
1310           else if (escapetype == 'U') {
1311             ch = huff_unicode_start+escapeval;
1312           }
1313           else {
1314             compiler_error("Strange @ escape in processed text.");
1315           }
1316         }
1317         else
1318           continue;
1319       }
1320       else {
1321         if (ch == '@') {
1322           escapelen = -1;
1323           continue;
1324         }
1325         if (ch == 0) {
1326           ch = 256;
1327           done = TRUE;
1328         }
1329       }
1330
1331       if (compression_switch) {
1332         jx += huff_entities[ch].depth;
1333         compression_string_size += (jx/8);
1334         jx = (jx % 8);
1335       }
1336       else {
1337         if (ch >= huff_dynam_start) {
1338           compression_string_size += 3;
1339         }
1340         else if (ch >= huff_unicode_start) {
1341           compiler_error("Abbreviation/Unicode in non-compressed string \
1342 should be impossible.");
1343         }
1344         else
1345           compression_string_size += 1;
1346       }
1347     }
1348     if (compression_switch && jx)
1349       compression_string_size++;
1350   }
1351
1352   done_compression = TRUE;
1353 }
1354
1355 static void compress_makebits(int entnum, int depth, int prevbit,
1356   huffbitlist_t *bits)
1357 {
1358   huffentity_t *ent = &(huff_entities[entnum]);
1359   char *cx;
1360
1361   no_huff_entities++;
1362   ent->addr = compression_table_size;
1363   ent->depth = depth;
1364   ent->bits = *bits;
1365   if (depth > 0) {
1366     if (prevbit)
1367       ent->bits.b[(depth-1) / 8] |= (1 << ((depth-1) % 8));
1368   }
1369
1370   switch (ent->type) {
1371   case 0:
1372     compression_table_size += 9;
1373     compress_makebits(ent->u.branch[0], depth+1, 0, &ent->bits);
1374     compress_makebits(ent->u.branch[1], depth+1, 1, &ent->bits);
1375     break;
1376   case 1:
1377     compression_table_size += 1;
1378     break;
1379   case 2:
1380     compression_table_size += 2;
1381     break;
1382   case 3:
1383     cx = (char *)abbreviations_at + ent->u.val*MAX_ABBREV_LENGTH;
1384     compression_table_size += (1 + 1 + strlen(cx));
1385     break;
1386   case 4:
1387   case 9:
1388     compression_table_size += 5;
1389     break;
1390   }
1391 }
1392
1393 /* ------------------------------------------------------------------------- */
1394 /*   The abbreviations optimiser                                             */
1395 /*                                                                           */
1396 /*   This is a very complex, memory and time expensive algorithm to          */
1397 /*   approximately solve the problem of which abbreviation strings would     */
1398 /*   minimise the total number of Z-chars to which the game text translates. */
1399 /*   It is in some ways a quite separate program but remains inside Inform   */
1400 /*   for compatibility with previous releases.                               */
1401 /* ------------------------------------------------------------------------- */
1402
1403 /* The complete game text. */
1404 static char *opttext;
1405 static int32 opttextlen;
1406
1407 typedef struct tlb_s
1408 {   char text[4];
1409     int32 intab, occurrences;
1410 } tlb;
1411 static tlb *tlbtab; /* Three-letter blocks (allocated up to no_occs) */
1412 static memory_list tlbtab_memlist;
1413 static int32 no_occs;
1414
1415 static int32 *grandtable;
1416 static int32 *grandflags;
1417 typedef struct optab_s
1418 {   int32  length;
1419     int32  popularity;
1420     int32  score;
1421     int32  location;
1422     char text[MAX_ABBREV_LENGTH];
1423 } optab;
1424 static int32 MAX_BESTYET;
1425 static optab *bestyet; /* High-score entries (up to MAX_BESTYET used/allocated) */
1426 static optab *bestyet2; /* The selected entries (up to selected used; allocated to MAX_ABBREVS) */
1427
1428 static int pass_no;
1429
1430 static void optimise_pass(void)
1431 {
1432     TIMEVALUE t1, t2;
1433     float duration;
1434     int32 i;
1435     int32 j, j2, k, nl, matches, noflags, score, min, minat=0, x, scrabble, c;
1436     for (i=0; i<MAX_BESTYET; i++) bestyet[i].length=0;
1437     for (i=0; i<no_occs; i++)
1438     {   if ((*(tlbtab[i].text)!=(int) '\n')&&(tlbtab[i].occurrences!=0))
1439         {
1440 #ifdef MAC_FACE
1441             if (i%((**g_pm_hndl).linespercheck) == 0)
1442             {   ProcessEvents (&g_proc);
1443                 if (g_proc != true)
1444                 {   ao_free_arrays();
1445                     longjmp (g_fallback, 1);
1446                 }
1447             }
1448 #endif
1449             if (optabbrevs_trace_setting >= 2) {
1450                 printf("Pass %d, %4ld/%ld '%s' (%ld occurrences) ",
1451                     pass_no, (long int) i, (long int) no_occs, tlbtab[i].text,
1452                     (long int) tlbtab[i].occurrences);
1453             }
1454             TIMEVALUE_NOW(&t1);
1455             for (j=0; j<tlbtab[i].occurrences; j++)
1456             {   for (j2=0; j2<tlbtab[i].occurrences; j2++) grandflags[j2]=1;
1457                 nl=2; noflags=tlbtab[i].occurrences;
1458                 while ((noflags>=2)&&(nl<MAX_ABBREV_LENGTH-1))
1459                 {   nl++;
1460                     for (j2=0; j2<nl; j2++)
1461                         if (opttext[grandtable[tlbtab[i].intab+j]+j2]=='\n')
1462                             goto FinishEarly;
1463                     matches=0;
1464                     for (j2=j; j2<tlbtab[i].occurrences; j2++)
1465                     {   if (grandflags[j2]==1)
1466                         {   x=grandtable[tlbtab[i].intab+j2]
1467                               - grandtable[tlbtab[i].intab+j];
1468                          if (((x>-nl)&&(x<nl))
1469                             || (memcmp(opttext+grandtable[tlbtab[i].intab+j],
1470                                        opttext+grandtable[tlbtab[i].intab+j2],
1471                                        nl)!=0))
1472                             {   grandflags[j2]=0; noflags--; }
1473                             else matches++;
1474                         }
1475                     }
1476                     scrabble=0;
1477                     for (k=0; k<nl; k++)
1478                     {   scrabble++;
1479                         c=opttext[grandtable[tlbtab[i].intab+j+k]];
1480                         if (c!=(int) ' ')
1481                         {   if (iso_to_alphabet_grid[c]<0)
1482                                 scrabble+=2;
1483                             else
1484                                 if (iso_to_alphabet_grid[c]>=26)
1485                                     scrabble++;
1486                         }
1487                     }
1488                     score=(matches-1)*(scrabble-2);
1489                     min=score;
1490                     for (j2=0; j2<MAX_BESTYET; j2++)
1491                     {   if ((nl==bestyet[j2].length)
1492                                 && (memcmp(opttext+bestyet[j2].location,
1493                                        opttext+grandtable[tlbtab[i].intab+j],
1494                                        nl)==0))
1495                         {   j2=MAX_BESTYET; min=score; }
1496                         else
1497                         {   if (bestyet[j2].score<min)
1498                             {   min=bestyet[j2].score; minat=j2;
1499                             }
1500                         }
1501                     }
1502                     if (min!=score)
1503                     {   bestyet[minat].score=score;
1504                         bestyet[minat].length=nl;
1505                         bestyet[minat].location=grandtable[tlbtab[i].intab+j];
1506                         bestyet[minat].popularity=matches;
1507                     }
1508                 }
1509                 FinishEarly: ;
1510             }
1511             if (optabbrevs_trace_setting >= 2) {
1512                 TIMEVALUE_NOW(&t2);
1513                 duration = TIMEVALUE_DIFFERENCE(&t1, &t2);
1514                 printf(" (%.4f seconds)\n", duration);
1515             }
1516         }
1517     }
1518 }
1519
1520 static int any_overlap(char *s1, char *s2)
1521 {   int a, b, i, j, flag;
1522     a=strlen(s1); b=strlen(s2);
1523     for (i=1-b; i<a; i++)
1524     {   flag=0;
1525         for (j=0; j<b; j++)
1526             if ((0<=i+j)&&(i+j<=a-1))
1527                 if (s1[i+j]!=s2[j]) flag=1;
1528         if (flag==0) return(1);
1529     }
1530     return(0);
1531 }
1532
1533 extern void optimise_abbreviations(void)
1534 {   int32 i, j, tcount, max=0, MAX_GTABLE;
1535     int32 j2, selected, available, maxat=0, nl;
1536
1537     if (opttext == NULL)
1538         return;
1539
1540     /* We insist that the first two abbreviations will be ". " and ", ". */
1541     if (MAX_ABBREVS < 2)
1542         return;
1543
1544     /* Note that it's safe to access opttext[opttextlen+2]. There are
1545        two newlines and a null beyond opttextlen. */
1546
1547     printf("Beginning calculation of optimal abbreviations...\n");
1548
1549     pass_no = 0;
1550
1551     initialise_memory_list(&tlbtab_memlist,
1552         sizeof(tlb), 1000, (void**)&tlbtab,
1553         "three-letter-blocks buffer");
1554
1555     no_occs=0;
1556
1557     /* Not sure what the optimal size is for MAX_BESTYET. The original code always created 64 abbreviations and used MAX_BESTYET=256. I'm guessing that 4*MAX_ABBREVS is reasonable. */
1558     MAX_BESTYET = 4 * MAX_ABBREVS;
1559
1560     bestyet=my_calloc(sizeof(optab), MAX_BESTYET, "bestyet");
1561     bestyet2=my_calloc(sizeof(optab), MAX_ABBREVS, "bestyet2");
1562
1563     bestyet2[0].text[0]='.';
1564     bestyet2[0].text[1]=' ';
1565     bestyet2[0].text[2]=0;
1566
1567     bestyet2[1].text[0]=',';
1568     bestyet2[1].text[1]=' ';
1569     bestyet2[1].text[2]=0;
1570
1571     selected=2;
1572
1573     for (i=0; i<opttextlen; i++)
1574     {
1575         if ((opttext[i]=='.') && (opttext[i+1]==' ') && (opttext[i+2]==' '))
1576         {   opttext[i]='\n'; opttext[i+1]='\n'; opttext[i+2]='\n';
1577             bestyet2[0].popularity++;
1578         }
1579
1580         if ((opttext[i]=='.') && (opttext[i+1]==' '))
1581         {   opttext[i]='\n'; opttext[i+1]='\n';
1582             bestyet2[0].popularity++;
1583         }
1584
1585         if ((opttext[i]==',') && (opttext[i+1]==' '))
1586         {   opttext[i]='\n'; opttext[i+1]='\n';
1587             bestyet2[1].popularity++;
1588         }
1589     }
1590
1591     MAX_GTABLE=opttextlen+1;
1592     grandtable=my_calloc(4*sizeof(int32), MAX_GTABLE/4, "grandtable");
1593
1594     for (i=0, tcount=0; i<opttextlen; i++)
1595     {
1596         tlb test;
1597         test.text[0]=opttext[i];
1598         test.text[1]=opttext[i+1];
1599         test.text[2]=opttext[i+2];
1600         test.text[3]=0;
1601         if ((test.text[0]=='\n')||(test.text[1]=='\n')||(test.text[2]=='\n'))
1602             goto DontKeep;
1603         for (j=0; j<no_occs; j++) {
1604             if (strcmp(test.text,tlbtab[j].text)==0)
1605                 goto DontKeep;
1606         }
1607         test.occurrences=0;
1608         test.intab=0;
1609         for (j=i+3; j<opttextlen; j++)
1610         {
1611 #ifdef MAC_FACE
1612             if (j%((**g_pm_hndl).linespercheck) == 0)
1613             {   ProcessEvents (&g_proc);
1614                 if (g_proc != true)
1615                 {   ao_free_arrays();
1616                     longjmp (g_fallback, 1);
1617                 }
1618             }
1619 #endif
1620             if ((opttext[i]==opttext[j])
1621                  && (opttext[i+1]==opttext[j+1])
1622                  && (opttext[i+2]==opttext[j+2]))
1623                  {   grandtable[tcount+test.occurrences]=j;
1624                      test.occurrences++;
1625                      if (tcount+test.occurrences==MAX_GTABLE)
1626                      {   printf("All %ld cross-references used\n",
1627                              (long int) MAX_GTABLE);
1628                          goto Built;
1629                      }
1630                  }
1631         }
1632         if (test.occurrences>=2)
1633         {
1634             ensure_memory_list_available(&tlbtab_memlist, no_occs+1);
1635             tlbtab[no_occs]=test;
1636             tlbtab[no_occs].intab=tcount;
1637             tcount += tlbtab[no_occs].occurrences;
1638             if (max<tlbtab[no_occs].occurrences)
1639                 max=tlbtab[no_occs].occurrences;
1640             no_occs++;
1641         }
1642         DontKeep: ;
1643     }
1644
1645     Built:
1646     grandflags=my_calloc(sizeof(int), max, "grandflags");
1647
1648
1649     if (optabbrevs_trace_setting >= 1) {
1650         printf("Cross-reference table (%ld entries) built...\n",
1651             (long int) no_occs);
1652     }
1653     /*  for (i=0; i<no_occs; i++)
1654             printf("%4d %4d '%s' %d\n",i,tlbtab[i].intab,tlbtab[i].text,
1655                 tlbtab[i].occurrences);
1656     */
1657
1658     for (i=0; i<MAX_ABBREVS; i++) bestyet2[i].length=0;
1659     available=MAX_BESTYET;
1660     while ((available>0)&&(selected<MAX_ABBREVS))
1661     {
1662         pass_no++;
1663         if (optabbrevs_trace_setting >= 1) {
1664             printf("Pass %d\n", pass_no);
1665         }
1666
1667         optimise_pass();
1668         available=0;
1669         for (i=0; i<MAX_BESTYET; i++)
1670             if (bestyet[i].score!=0)
1671             {   available++;
1672                 nl=bestyet[i].length;
1673                 for (j2=0; j2<nl; j2++) bestyet[i].text[j2]=
1674                     opttext[bestyet[i].location+j2];
1675                 bestyet[i].text[nl]=0;
1676             }
1677
1678     /*  printf("End of pass results:\n");
1679         printf("\nno   score  freq   string\n");
1680         for (i=0; i<MAX_BESTYET; i++)
1681             if (bestyet[i].score>0)
1682                 printf("%02d:  %4d   %4d   '%s'\n", i, bestyet[i].score,
1683                     bestyet[i].popularity, bestyet[i].text);
1684     */
1685
1686         do
1687         {   max=0;
1688             for (i=0; i<MAX_BESTYET; i++)
1689                 if (max<bestyet[i].score)
1690                 {   max=bestyet[i].score;
1691                     maxat=i;
1692                 }
1693
1694             if (max>0)
1695             {
1696                 char testtext[4];
1697                 bestyet2[selected++]=bestyet[maxat];
1698
1699                 if (optabbrevs_trace_setting >= 1) {
1700                     printf(
1701                         "Selection %2ld: '%s' (repeated %ld times, scoring %ld)\n",
1702                         (long int) selected,bestyet[maxat].text,
1703                         (long int) bestyet[maxat].popularity,
1704                         (long int) bestyet[maxat].score);
1705                 }
1706
1707                 testtext[0]=bestyet[maxat].text[0];
1708                 testtext[1]=bestyet[maxat].text[1];
1709                 testtext[2]=bestyet[maxat].text[2];
1710                 testtext[3]=0;
1711
1712                 for (i=0; i<no_occs; i++)
1713                     if (strcmp(testtext,tlbtab[i].text)==0)
1714                         break;
1715
1716                 for (j=0; j<tlbtab[i].occurrences; j++)
1717                 {   if (memcmp(bestyet[maxat].text,
1718                                opttext+grandtable[tlbtab[i].intab+j],
1719                                bestyet[maxat].length)==0)
1720                     {   for (j2=0; j2<bestyet[maxat].length; j2++)
1721                             opttext[grandtable[tlbtab[i].intab+j]+j2]='\n';
1722                     }
1723                 }
1724
1725                 for (i=0; i<MAX_BESTYET; i++)
1726                     if ((bestyet[i].score>0)&&
1727                         (any_overlap(bestyet[maxat].text,bestyet[i].text)==1))
1728                     {   bestyet[i].score=0;
1729                        /* printf("Discarding '%s' as overlapping\n",
1730                             bestyet[i].text); */
1731                     }
1732             }
1733         } while ((max>0)&&(available>0)&&(selected<MAX_ABBREVS));
1734     }
1735
1736     printf("\nChosen abbreviations (in Inform syntax):\n\n");
1737     for (i=0; i<selected; i++)
1738         printf("Abbreviate \"%s\";\n", bestyet2[i].text);
1739
1740     text_free_arrays();
1741 }
1742
1743 /* ------------------------------------------------------------------------- */
1744 /*   The dictionary manager begins here.                                     */
1745 /*                                                                           */
1746 /*   Speed is extremely important in these algorithms.  If a linear-time     */
1747 /*   routine were used to search the dictionary words so far built up, then  */
1748 /*   Inform would crawl.                                                     */
1749 /*                                                                           */
1750 /*   Instead, the dictionary is stored as a binary tree, which is kept       */
1751 /*   balanced with the red-black algorithm.                                  */
1752 /* ------------------------------------------------------------------------- */
1753 /*   A dictionary table similar to the Z-machine format is kept: there is a  */
1754 /*   7-byte header (left blank here to be filled in at the                   */
1755 /*   construct_storyfile() stage in "tables.c") and then a sequence of       */
1756 /*   records, one per word, in the form                                      */
1757 /*                                                                           */
1758 /*        <Z-coded text>    <flags>   <verbnumber>     <adjectivenumber>     */
1759 /*        4 or 6 bytes       byte        byte             byte               */
1760 /*                                                                           */
1761 /*   For Glulx, the form is instead: (See below about Unicode-valued         */
1762 /*   dictionaries and DICT_WORD_BYTES.)                                      */
1763 /*                                                                           */
1764 /*        <tag>  <plain text>    <flags>  <verbnumber>   <adjectivenumber>   */
1765 /*         $60    DICT_WORD_BYTES short    short          short              */
1766 /*                                                                           */
1767 /*   These records are stored in "accession order" (i.e. in order of their   */
1768 /*   first being received by these routines) and only alphabetically sorted  */
1769 /*   by construct_storyfile() (using the array below).                       */
1770 /* ------------------------------------------------------------------------- */
1771 /*                                                                           */
1772 /*   Further notes about the data fields...                                  */
1773 /*   The flags are currently:                                                */
1774 /*     bit 0: word is used as a verb (in verb grammar)                       */
1775 /*     bit 1: word is used as a meta verb                                    */
1776 /*     bit 2: word is plural (set by '//p')                                  */
1777 /*     bit 3: word is used as a preposition (in verb grammar)                */
1778 /*     bit 6: set for all verbs, but not used by the parser?                 */
1779 /*     bit 7: word is used as a noun (set for every word that appears in     */
1780 /*       code or in an object property)                                      */
1781 /*                                                                           */
1782 /*   In grammar version 2, the third field (adjectivenumber) is unused (and  */
1783 /*   zero).                                                                  */
1784 /*                                                                           */
1785 /*   The compiler generates special constants #dict_par1, #dict_par2,        */
1786 /*   #dict_par3 to refer to the byte offsets of the three fields. In         */
1787 /*   Z-code v3, these are 4/5/6; in v4+, they are 6/7/8. In Glulx, they      */
1788 /*   are $DICT_WORD_SIZE+2/4/6, referring to the *low* bytes of the three    */
1789 /*   fields. (The high bytes are $DICT_WORD_SIZE+1/3/5.)                     */
1790 /* ------------------------------------------------------------------------- */
1791
1792 uchar *dictionary;                    /* (These two variables are externally
1793                                          used only in "tables.c" when
1794                                          building the story-file)            */
1795 static memory_list dictionary_memlist;
1796 int32 dictionary_top;                 /* Position of the next free record
1797                                          in dictionary (i.e., the current
1798                                          number of bytes)                    */
1799
1800 int dict_entries;                     /* Total number of records entered     */
1801
1802 /* ------------------------------------------------------------------------- */
1803 /*   dict_word was originally a typedef for a struct of 6 unsigned chars.    */
1804 /*   It held the (4 or) 6 bytes of Z-coded text of a word.                   */
1805 /*   Usefully, because the PAD character 5 is < all alphabetic characters,   */
1806 /*   alphabetic order corresponds to numeric order.  For this reason, the    */
1807 /*   dict_word is called the "sort code" of the original text word.          */
1808 /*                                                                           */
1809 /*   In modifying the compiler for Glulx, I found it easier to discard the   */
1810 /*   typedef, and operate directly on uchar arrays of length DICT_WORD_SIZE. */
1811 /*   In Z-code, DICT_WORD_SIZE will be 6, so the Z-code compiler will work   */
1812 /*   as before. In Glulx, it can be any value up to MAX_DICT_WORD_SIZE.      */
1813 /*   (That limit is defined as 40 in the header; it exists only for a few    */
1814 /*   static buffers, and can be increased without using significant memory.) */
1815 /*                                                                           */
1816 /*   ...Well, that certainly bit me on the butt, didn't it. In further       */
1817 /*   modifying the compiler to generate a Unicode dictionary, I have to      */
1818 /*   store four-byte values in the uchar array. This is handled by making    */
1819 /*   the array size DICT_WORD_BYTES (which is DICT_WORD_SIZE*DICT_CHAR_SIZE).*/
1820 /*   Then we store the 32-bit character value big-endian. This lets us       */
1821 /*   continue to compare arrays bytewise, which is a nice simplification.    */
1822 /* ------------------------------------------------------------------------- */
1823
1824 extern int compare_sorts(uchar *d1, uchar *d2)
1825 {   int i;
1826     for (i=0; i<DICT_WORD_BYTES; i++)
1827         if (d1[i]!=d2[i]) return((int)(d1[i]) - (int)(d2[i]));
1828     /* (since memcmp(d1, d2, DICT_WORD_BYTES); runs into a bug on some Unix
1829        libraries) */
1830     return(0);
1831 }
1832
1833 extern void copy_sorts(uchar *d1, uchar *d2)
1834 {   int i;
1835     for (i=0; i<DICT_WORD_BYTES; i++)
1836         d1[i] = d2[i];
1837 }
1838
1839 static uchar prepared_sort[MAX_DICT_WORD_BYTES];     /* Holds the sort code
1840                                                         of current word */
1841
1842 static int number_and_case;
1843
1844 /* Also used by verbs.c */
1845 static void dictionary_prepare_z(char *dword, uchar *optresult)
1846 {   int i, j, k, k2, wd[13]; int32 tot;
1847
1848     /* A rapid text translation algorithm using only the simplified rules
1849        applying to the text of dictionary entries: first produce a sequence
1850        of 6 (v3) or 9 (v4+) Z-characters                                     */
1851
1852     int dictsize = (version_number==3) ? 6 : 9;
1853
1854     number_and_case = 0;
1855
1856     for (i=0, j=0; dword[j]!=0; i++, j++)
1857     {   if ((dword[j] == '/') && (dword[j+1] == '/'))
1858         {   for (j+=2; dword[j] != 0; j++)
1859             {   switch(dword[j])
1860                 {   case 'p': number_and_case |= 4;  break;
1861                     default:
1862                         error_named("Expected 'p' after '//' \
1863 to give number of dictionary word", dword);
1864                         break;
1865                 }
1866             }
1867             break;
1868         }
1869         if (i>=dictsize) break;
1870
1871         k=(int) dword[j];
1872         if (k==(int) '\'')
1873             warning_named("Obsolete usage: use the ^ character for the \
1874 apostrophe in", dword);
1875         if (k==(int) '^') k=(int) '\'';
1876         if (k=='\"') k='~';
1877
1878         if (k==(int) '@' || (character_set_unicode && (k & 0x80)))
1879         {   int unicode = text_to_unicode(dword+j);
1880             if ((unicode < 128) && isupper(unicode)) unicode = tolower(unicode);
1881             k = unicode_to_zscii(unicode);
1882             j += textual_form_length - 1;
1883             if ((k == 5) || (k >= 0x100))
1884             {   unicode_char_error(
1885                    "Character can be printed but not input:", unicode);
1886                 k = '?';
1887             }
1888             k2 = zscii_to_alphabet_grid[(uchar) k];
1889         }
1890         else
1891         {   if (isupper(k)) k = tolower(k);
1892             k2 = iso_to_alphabet_grid[(uchar) k];
1893         }
1894
1895         if (k2 < 0)
1896         {   if ((k2 == -5) || (k2 <= -0x100))
1897                 char_error("Character can be printed but not input:", k);
1898             else
1899             {   /* Use 4 more Z-chars to encode a ZSCII escape sequence      */
1900
1901                 wd[i++] = 5; wd[i++] = 6;
1902                 k2 = -k2;
1903                 wd[i++] = k2/32; wd[i] = k2%32;
1904             }
1905         }
1906         else
1907         {   alphabet_used[k2] = 'Y';
1908             if ((k2/26)!=0)
1909                 wd[i++]=3+(k2/26);            /* Change alphabet for symbols */
1910             wd[i]=6+(k2%26);                  /* Write the Z character       */
1911         }
1912     }
1913
1914     /* Fill up to the end of the dictionary block with PAD characters        */
1915
1916     for (; i<9; i++) wd[i]=5;
1917
1918     /* The array of Z-chars is converted to two or three 2-byte blocks       */
1919
1920     tot = wd[2] + wd[1]*(1<<5) + wd[0]*(1<<10);
1921     prepared_sort[1]=tot%0x100;
1922     prepared_sort[0]=(tot/0x100)%0x100;
1923     tot = wd[5] + wd[4]*(1<<5) + wd[3]*(1<<10);
1924     prepared_sort[3]=tot%0x100;
1925     prepared_sort[2]=(tot/0x100)%0x100;
1926     if (version_number==3)
1927         tot = 0;
1928     else
1929         tot = wd[8] + wd[7]*(1<<5) + wd[6]*(1<<10);
1930     prepared_sort[5]=tot%0x100;
1931     prepared_sort[4]=(tot/0x100)%0x100;
1932
1933     /* Set the "end bit" on the 2nd (in v3) or the 3rd (v4+) 2-byte block    */
1934
1935     if (version_number==3) prepared_sort[2]+=0x80;
1936                       else prepared_sort[4]+=0x80;
1937
1938     if (optresult) copy_sorts(optresult, prepared_sort);
1939 }
1940
1941 /* Also used by verbs.c */
1942 static void dictionary_prepare_g(char *dword, uchar *optresult)
1943 {
1944   int i, j, k;
1945   int32 unicode;
1946
1947   number_and_case = 0;
1948
1949   for (i=0, j=0; (dword[j]!=0); i++, j++) {
1950     if ((dword[j] == '/') && (dword[j+1] == '/')) {
1951       for (j+=2; dword[j] != 0; j++) {
1952         switch(dword[j]) {
1953         case 'p':
1954           number_and_case |= 4;
1955           break;
1956         default:
1957           error_named("Expected 'p' after '//' \
1958 to give gender or number of dictionary word", dword);
1959           break;
1960         }
1961       }
1962       break;
1963     }
1964     if (i>=DICT_WORD_SIZE) break;
1965
1966     k= ((unsigned char *)dword)[j];
1967     if (k=='\'')
1968       warning_named("Obsolete usage: use the ^ character for the \
1969 apostrophe in", dword);
1970     if (k=='^')
1971       k='\'';
1972     if (k=='~') /* as in iso_to_alphabet_grid */
1973       k='\"';
1974
1975     if (k=='@' || (character_set_unicode && (k & 0x80))) {
1976       unicode = text_to_unicode(dword+j);
1977       j += textual_form_length - 1;
1978     }
1979     else {
1980       unicode = iso_to_unicode_grid[k];
1981     }
1982
1983     if (DICT_CHAR_SIZE != 1 || (unicode >= 0 && unicode < 256)) {
1984       k = unicode;
1985     }
1986     else {
1987       error("The dictionary cannot contain Unicode characters beyond Latin-1. \
1988 Define DICT_CHAR_SIZE=4 for a Unicode-compatible dictionary.");
1989       k = '?';
1990     }
1991
1992     if (k >= (unsigned)'A' && k <= (unsigned)'Z')
1993       k += ('a' - 'A');
1994
1995     if (DICT_CHAR_SIZE == 1) {
1996       prepared_sort[i] = k;
1997     }
1998     else {
1999       prepared_sort[4*i]   = (k >> 24) & 0xFF;
2000       prepared_sort[4*i+1] = (k >> 16) & 0xFF;
2001       prepared_sort[4*i+2] = (k >>  8) & 0xFF;
2002       prepared_sort[4*i+3] = (k)       & 0xFF;
2003     }
2004   }
2005
2006   if (DICT_CHAR_SIZE == 1) {
2007     for (; i<DICT_WORD_SIZE; i++)
2008       prepared_sort[i] = 0;
2009   }
2010   else {
2011     for (; i<DICT_WORD_SIZE; i++) {
2012       prepared_sort[4*i]   = 0;
2013       prepared_sort[4*i+1] = 0;
2014       prepared_sort[4*i+2] = 0;
2015       prepared_sort[4*i+3] = 0;
2016     }
2017   }
2018
2019   if (optresult) copy_sorts(optresult, prepared_sort);
2020 }
2021
2022 extern void dictionary_prepare(char *dword, uchar *optresult)
2023 {
2024   if (!glulx_mode)
2025     dictionary_prepare_z(dword, optresult);
2026   else
2027     dictionary_prepare_g(dword, optresult);
2028 }
2029
2030 /* ------------------------------------------------------------------------- */
2031 /*   The arrays below are all concerned with the problem of alphabetically   */
2032 /*   sorting the dictionary during the compilation pass.                     */
2033 /*   Note that it is not enough simply to apply qsort to the dictionary at   */
2034 /*   the end of the pass: we need to ensure that no duplicates are ever      */
2035 /*   created.                                                                */
2036 /*                                                                           */
2037 /*   dict_sort_codes[n]     the sort code of record n: i.e., of the nth      */
2038 /*                          word to be entered into the dictionary, where    */
2039 /*                          n counts upward from 0                           */
2040 /*                          (n is also called the "accession number")        */
2041 /*                                                                           */
2042 /*   The tree structure encodes an ordering.  The special value VACANT means */
2043 /*   "no node here": otherwise, node numbers are the same as accession       */
2044 /*   numbers.  At all times, "root" holds the node number of the top of the  */
2045 /*   tree; each node has up to two branches, such that the subtree of the    */
2046 /*   left branch is always alphabetically before what's at the node, and     */
2047 /*   the subtree to the right is always after; and all branches are coloured */
2048 /*   either "black" or "red".  These colours are used to detect points where */
2049 /*   the tree is growing asymmetrically (and therefore becoming inefficient  */
2050 /*   to search).                                                             */
2051 /* ------------------------------------------------------------------------- */
2052
2053 #define RED    'r'
2054 #define BLACK  'b'
2055 #define VACANT -1
2056
2057 static int root;
2058 typedef struct dict_tree_node_s
2059 {   int  branch[2];               /* Branch 0 is "left", 1 is "right" */
2060     char colour;                  /* The colour of the branch to the parent */
2061 } dict_tree_node;
2062
2063 static dict_tree_node *dtree;     /* Allocated to dict_entries */
2064 static memory_list dtree_memlist;
2065
2066 static uchar *dict_sort_codes;  /* Allocated to dict_entries*DICT_WORD_BYTES */
2067 static memory_list dict_sort_codes_memlist;
2068
2069 int   *final_dict_order;          /* Allocated at sort_dictionary() time */
2070
2071 static void dictionary_begin_pass(void)
2072 {
2073     /*  Leave room for the 7-byte header (added in "tables.c" much later)    */
2074     /*  Glulx has a 4-byte header instead. */
2075
2076     if (!glulx_mode)
2077         dictionary_top = 7;
2078     else
2079         dictionary_top = 4;
2080
2081     ensure_memory_list_available(&dictionary_memlist, dictionary_top);
2082
2083     root = VACANT;
2084     dict_entries = 0;
2085 }
2086
2087 static int fdo_count;
2088 static void recursively_sort(int node)
2089 {   if (dtree[node].branch[0] != VACANT)
2090         recursively_sort(dtree[node].branch[0]);
2091     final_dict_order[node] = fdo_count++;
2092     if (dtree[node].branch[1] != VACANT)
2093         recursively_sort(dtree[node].branch[1]);
2094 }
2095
2096 extern void sort_dictionary(void)
2097 {   int i;
2098
2099     final_dict_order = my_calloc(sizeof(int), dict_entries, "final dictionary ordering table");
2100
2101     if (module_switch)
2102     {   for (i=0; i<dict_entries; i++)
2103             final_dict_order[i] = i;
2104         return;
2105     }
2106
2107     if (root != VACANT)
2108     {   fdo_count = 0; recursively_sort(root);
2109     }
2110 }
2111
2112 /* ------------------------------------------------------------------------- */
2113 /*   If "dword" is in the dictionary, return its accession number plus 1;    */
2114 /*   If not, return 0.                                                       */
2115 /* ------------------------------------------------------------------------- */
2116
2117 static int dictionary_find(char *dword)
2118 {   int at = root, n;
2119
2120     dictionary_prepare(dword, NULL);
2121
2122     while (at != VACANT)
2123     {   n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
2124         if (n==0) return at + 1;
2125         if (n>0) at = dtree[at].branch[1]; else at = dtree[at].branch[0];
2126     }
2127     return 0;
2128 }
2129
2130 /* ------------------------------------------------------------------------- */
2131 /*  Add "dword" to the dictionary with (x,y,z) as its data fields; unless    */
2132 /*  it already exists, in which case OR the data with (x,y,z)                */
2133 /*                                                                           */
2134 /*  These fields are one byte each in Z-code, two bytes each in Glulx.       */
2135 /*                                                                           */
2136 /*  Returns: the accession number.                                           */
2137 /* ------------------------------------------------------------------------- */
2138
2139 extern int dictionary_add(char *dword, int x, int y, int z)
2140 {   int n; uchar *p;
2141     int ggfr = 0, gfr = 0, fr = 0, r = 0;
2142     int ggf = VACANT, gf = VACANT, f = VACANT, at = root;
2143     int a, b;
2144     int res=((version_number==3)?4:6);
2145
2146     dictionary_prepare(dword, NULL);
2147
2148     if (root == VACANT)
2149     {   root = 0; goto CreateEntry;
2150     }
2151     while (TRUE)
2152     {
2153         n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
2154         if (n==0)
2155         {
2156             if (!glulx_mode) {
2157                 p = dictionary+7 + at*DICT_ENTRY_BYTE_LENGTH + res;
2158                 p[0]=(p[0])|x; p[1]=(p[1])|y;
2159                 if (!ZCODE_LESS_DICT_DATA)
2160                     p[2]=(p[2])|z;
2161                 if (x & 128) p[0] = (p[0])|number_and_case;
2162             }
2163             else {
2164                 p = dictionary+4 + at*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
2165                 p[0]=(p[0])|(x/256); p[1]=(p[1])|(x%256);
2166                 p[2]=(p[2])|(y/256); p[3]=(p[3])|(y%256);
2167                 p[4]=(p[4])|(z/256); p[5]=(p[5])|(z%256);
2168                 if (x & 128) p[1] = (p[1]) | number_and_case;
2169             }
2170             return at;
2171         }
2172         if (n>0) r=1; else r=0;
2173
2174         a = dtree[at].branch[0]; b = dtree[at].branch[1];
2175         if ((a != VACANT) && (dtree[a].colour == RED) &&
2176             (b != VACANT) && (dtree[b].colour == RED))
2177         {   dtree[a].colour = BLACK;
2178             dtree[b].colour = BLACK;
2179
2180             dtree[at].colour = RED;
2181
2182         /* A tree rotation may be needed to avoid two red links in a row:
2183            e.g.
2184              ggf   (or else gf is root)         ggf (or f is root)
2185               |                                  |
2186               gf                                 f
2187              / \(red)                           / \ (both red)
2188                 f            becomes          gf   at
2189                / \(red)                      /  \ /  \
2190                   at
2191                  /  \
2192
2193            In effect we rehang the "gf" subtree from "f".
2194            See the Technical Manual for further details.
2195         */
2196
2197             if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
2198             {
2199               if (fr == gfr)
2200               { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
2201                 dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
2202                 dtree[f].branch[1-fr] = gf;
2203                 dtree[f].colour = BLACK;
2204                 dtree[gf].colour = RED;
2205                 gf = ggf; gfr = ggfr;
2206               }
2207               else
2208               { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
2209                 dtree[at].colour = BLACK;
2210                 dtree[gf].colour = RED;
2211                 dtree[f].branch[fr] = dtree[at].branch[gfr];
2212                 dtree[gf].branch[gfr] = dtree[at].branch[fr];
2213                 dtree[at].branch[gfr] = f;
2214                 dtree[at].branch[fr] = gf;
2215
2216                 r = 1-r; n = at; if (r==fr) at = f; else at = gf;
2217                 f = n; gf = ggf; fr = 1-r; gfr = ggfr;
2218               }
2219             }
2220         }
2221
2222         if (dtree[at].branch[r] == VACANT)
2223         {   dtree[at].colour = RED;
2224
2225             if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
2226             { if (fr == gfr)
2227               { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
2228                 dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
2229                 dtree[f].branch[1-fr] = gf;
2230                 dtree[f].colour = BLACK;
2231                 dtree[gf].colour = RED;
2232               }
2233               else
2234               { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
2235                 dtree[at].colour = BLACK;
2236                 dtree[gf].colour = RED;
2237                 dtree[f].branch[fr] = dtree[at].branch[gfr];
2238                 dtree[gf].branch[gfr] = dtree[at].branch[fr];
2239                 dtree[at].branch[gfr] = f;
2240                 dtree[at].branch[fr] = gf;
2241
2242                 r = 1-r; n = at; if (r==fr) at = f; else at = gf;
2243                 f = n; gf = ggf;
2244               }
2245             }
2246             dtree[at].branch[r] = dict_entries;
2247             goto CreateEntry;
2248         }
2249         ggf = gf; gf = f; f = at; at = dtree[at].branch[r];
2250         ggfr = gfr; gfr = fr; fr = r;
2251     }
2252
2253     CreateEntry:
2254
2255     ensure_memory_list_available(&dtree_memlist, dict_entries+1);
2256     ensure_memory_list_available(&dict_sort_codes_memlist, (dict_entries+1)*DICT_WORD_BYTES);
2257
2258     dtree[dict_entries].branch[0] = VACANT;
2259     dtree[dict_entries].branch[1] = VACANT;
2260     dtree[dict_entries].colour    = BLACK;
2261
2262     /*  Address in Inform's own dictionary table to write the record to      */
2263
2264     if (!glulx_mode) {
2265
2266         ensure_memory_list_available(&dictionary_memlist, dictionary_top + DICT_ENTRY_BYTE_LENGTH);
2267         p = dictionary + DICT_ENTRY_BYTE_LENGTH*dict_entries + 7;
2268
2269         /*  So copy in the 4 (or 6) bytes of Z-coded text and the 3 data
2270             bytes */
2271
2272         p[0]=prepared_sort[0]; p[1]=prepared_sort[1];
2273         p[2]=prepared_sort[2]; p[3]=prepared_sort[3];
2274         if (version_number > 3)
2275           {   p[4]=prepared_sort[4]; p[5]=prepared_sort[5]; }
2276         p[res]=x; p[res+1]=y;
2277         if (!ZCODE_LESS_DICT_DATA) p[res+2]=z;
2278         if (x & 128) p[res] = (p[res])|number_and_case;
2279
2280         dictionary_top += DICT_ENTRY_BYTE_LENGTH;
2281
2282     }
2283     else {
2284         int i;
2285         ensure_memory_list_available(&dictionary_memlist, dictionary_top + DICT_ENTRY_BYTE_LENGTH);
2286         p = dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*dict_entries;
2287         p[0] = 0x60; /* type byte -- dict word */
2288
2289         p += DICT_CHAR_SIZE;
2290         for (i=0; i<DICT_WORD_BYTES; i++)
2291           p[i] = prepared_sort[i];
2292
2293         p += DICT_WORD_BYTES;
2294         p[0] = 0; p[1] = x;
2295         p[2] = y/256; p[3] = y%256;
2296         p[4] = 0; p[5] = z;
2297         if (x & 128)
2298           p[1] |= number_and_case;
2299
2300         dictionary_top += DICT_ENTRY_BYTE_LENGTH;
2301
2302     }
2303
2304     copy_sorts(dict_sort_codes+dict_entries*DICT_WORD_BYTES, prepared_sort);
2305
2306     return dict_entries++;
2307 }
2308
2309 /* ------------------------------------------------------------------------- */
2310 /*   Used in "tables.c" for "Extend ... only", to renumber a verb-word to a  */
2311 /*   new verb syntax of its own.  (Otherwise existing verb-words never       */
2312 /*   change their verb-numbers.)                                             */
2313 /* ------------------------------------------------------------------------- */
2314
2315 extern void dictionary_set_verb_number(char *dword, int to)
2316 {   int i; uchar *p;
2317     int res=((version_number==3)?4:6);
2318     i=dictionary_find(dword);
2319     if (i!=0)
2320     {
2321         if (!glulx_mode) {
2322             p=dictionary+7+(i-1)*DICT_ENTRY_BYTE_LENGTH+res;
2323             p[1]=to;
2324         }
2325         else {
2326             p=dictionary+4 + (i-1)*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
2327             p[2]=to/256; p[3]=to%256;
2328         }
2329     }
2330 }
2331
2332 /* ------------------------------------------------------------------------- */
2333 /*   Tracing code for the dictionary: used not only by "trace" and text      */
2334 /*   transcription, but also (in the case of "word_to_ascii") in a vital     */
2335 /*   by the linker.                                                          */
2336 /* ------------------------------------------------------------------------- */
2337
2338 /* In the dictionary-showing code, if d_show_buf is NULL, the text is
2339    printed directly. (The "Trace dictionary" directive does this.)
2340    If d_show_buf is not NULL, we add words to it (reallocing if necessary)
2341    until it's a page-width.
2342 */
2343 static char *d_show_buf = NULL;
2344 static int d_show_size; /* allocated size */
2345 static int d_show_len;  /* current length */
2346
2347 static void show_char(char c)
2348 {
2349     if (d_show_buf == NULL) {
2350         printf("%c", c);
2351     }
2352     else {
2353         if (d_show_len+2 >= d_show_size) {
2354             int newsize = 2 * d_show_len + 16;
2355             my_realloc(&d_show_buf, d_show_size, newsize, "dictionary display buffer");
2356             d_show_size = newsize;
2357         }
2358         d_show_buf[d_show_len++] = c;
2359         d_show_buf[d_show_len] = '\0';
2360     }
2361 }
2362
2363 /* Display a Unicode character in user-readable form. This uses the same
2364    character encoding as the source code. */
2365 static void show_uchar(uint32 c)
2366 {
2367     char buf[16];
2368     int ix;
2369
2370     if (c < 0x80) {
2371         /* ASCII always works */
2372         show_char(c);
2373         return;
2374     }
2375     if (character_set_unicode) {
2376         /* UTF-8 the character */
2377         if (c < 0x80) {
2378             show_char(c);
2379         }
2380         else if (c < 0x800) {
2381             show_char((0xC0 | ((c & 0x7C0) >> 6)));
2382             show_char((0x80 |  (c & 0x03F)     ));
2383         }
2384         else if (c < 0x10000) {
2385             show_char((0xE0 | ((c & 0xF000) >> 12)));
2386             show_char((0x80 | ((c & 0x0FC0) >>  6)));
2387             show_char((0x80 |  (c & 0x003F)      ));
2388         }
2389         else if (c < 0x200000) {
2390             show_char((0xF0 | ((c & 0x1C0000) >> 18)));
2391             show_char((0x80 | ((c & 0x03F000) >> 12)));
2392             show_char((0x80 | ((c & 0x000FC0) >>  6)));
2393             show_char((0x80 |  (c & 0x00003F)      ));
2394         }
2395         else {
2396             show_char('?');
2397         }
2398         return;
2399     }
2400     if (character_set_setting == 1 && c < 0x100) {
2401         /* Fits in Latin-1 */
2402         show_char(c);
2403         return;
2404     }
2405     /* Supporting other character_set_setting is harder; not currently implemented. */
2406
2407     /* Use the escaped form */
2408     sprintf(buf, "@{%x}", c);
2409     for (ix=0; buf[ix]; ix++)
2410         show_char(buf[ix]);
2411 }
2412
2413 extern void word_to_ascii(uchar *p, char *results)
2414 {   int i, shift, cc, zchar; uchar encoded_word[9];
2415     encoded_word[0] = (((int) p[0])&0x7c)/4;
2416     encoded_word[1] = 8*(((int) p[0])&0x3) + (((int) p[1])&0xe0)/32;
2417     encoded_word[2] = ((int) p[1])&0x1f;
2418     encoded_word[3] = (((int) p[2])&0x7c)/4;
2419     encoded_word[4] = 8*(((int) p[2])&0x3) + (((int) p[3])&0xe0)/32;
2420     encoded_word[5] = ((int) p[3])&0x1f;
2421     if (version_number > 3)
2422     {   encoded_word[6] = (((int) p[4])&0x7c)/4;
2423         encoded_word[7] = 8*(((int) p[4])&0x3) + (((int) p[5])&0xe0)/32;
2424         encoded_word[8] = ((int) p[5])&0x1f;
2425     }
2426     else
2427     {
2428         encoded_word[6] = encoded_word[7] = encoded_word[8] = 0;
2429     }
2430
2431     shift = 0; cc = 0;
2432     for (i=0; i< ((version_number==3)?6:9); i++)
2433     {   zchar = encoded_word[i];
2434
2435         if (zchar == 4) shift = 1;
2436         else
2437         if (zchar == 5) shift = 2;
2438         else
2439         {   if ((shift == 2) && (zchar == 6))
2440             {   zchar = 32*encoded_word[i+1] + encoded_word[i+2];
2441                 i += 2;
2442                 if ((zchar>=32) && (zchar<=126))
2443                     results[cc++] = zchar;
2444                 else
2445                 {   zscii_to_text(results+cc, zchar);
2446                     cc = strlen(results);
2447                 }
2448             }
2449             else
2450             {   zscii_to_text(results+cc, (alphabet[shift])[zchar-6]);
2451                 cc = strlen(results);
2452             }
2453             shift = 0;
2454         }
2455     }
2456     results[cc] = 0;
2457 }
2458
2459 /* Print a dictionary word to stdout.
2460    (This assumes that d_show_buf is null.)
2461  */
2462 void print_dict_word(int node)
2463 {
2464     uchar *p;
2465     int cprinted;
2466
2467     if (!glulx_mode) {
2468         char textual_form[32];
2469         p = (uchar *)dictionary + 7 + DICT_ENTRY_BYTE_LENGTH*node;
2470
2471         word_to_ascii(p, textual_form);
2472
2473         for (cprinted = 0; textual_form[cprinted]!=0; cprinted++)
2474             show_char(textual_form[cprinted]);
2475     }
2476     else {
2477         p = (uchar *)dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*node;
2478
2479         for (cprinted = 0; cprinted<DICT_WORD_SIZE; cprinted++)
2480         {
2481             uint32 ch;
2482             if (DICT_CHAR_SIZE == 1)
2483                 ch = p[1+cprinted];
2484             else
2485                 ch = (p[4*cprinted+4] << 24) + (p[4*cprinted+5] << 16) + (p[4*cprinted+6] << 8) + (p[4*cprinted+7]);
2486             if (!ch)
2487                 break;
2488             show_uchar(ch);
2489         }
2490     }
2491 }
2492
2493 static void recursively_show_z(int node, int level)
2494 {   int i, cprinted, flags; uchar *p;
2495     char textual_form[32];
2496     int res = (version_number == 3)?4:6; /* byte length of encoded text */
2497
2498     if (dtree[node].branch[0] != VACANT)
2499         recursively_show_z(dtree[node].branch[0], level);
2500
2501     p = (uchar *)dictionary + 7 + DICT_ENTRY_BYTE_LENGTH*node;
2502
2503     word_to_ascii(p, textual_form);
2504
2505     for (cprinted = 0; textual_form[cprinted]!=0; cprinted++)
2506         show_char(textual_form[cprinted]);
2507     for (; cprinted < 4 + ((version_number==3)?6:9); cprinted++)
2508         show_char(' ');
2509
2510     /* The level-1 info can only be printfed (d_show_buf must be null). */
2511     if (d_show_buf == NULL && level >= 1)
2512     {
2513         if (level >= 2) {
2514             for (i=0; i<DICT_ENTRY_BYTE_LENGTH; i++) printf("%02x ",p[i]);
2515         }
2516
2517         flags = (int) p[res];
2518         if (flags & 128)
2519         {   printf("noun ");
2520             if (flags & 4)  printf("p"); else printf(" ");
2521             printf(" ");
2522         }
2523         else printf("       ");
2524         if (flags & 8)
2525         {   if (grammar_version_number == 1)
2526                 printf("preposition:%d  ", (int) p[res+2]);
2527             else
2528                 printf("preposition    ");
2529         }
2530         if ((flags & 3) == 3) printf("metaverb:%d  ", (int) p[res+1]);
2531         else if ((flags & 3) == 1) printf("verb:%d  ", (int) p[res+1]);
2532         printf("\n");
2533     }
2534
2535     /* Show five words per line in classic TRANSCRIPT_FORMAT; one per line in the new format. */
2536     if (d_show_buf && (d_show_len >= 64 || TRANSCRIPT_FORMAT == 1))
2537     {
2538         write_to_transcript_file(d_show_buf, STRCTX_DICT);
2539         d_show_len = 0;
2540     }
2541
2542     if (dtree[node].branch[1] != VACANT)
2543         recursively_show_z(dtree[node].branch[1], level);
2544 }
2545
2546 static void recursively_show_g(int node, int level)
2547 {   int i, cprinted;
2548     uchar *p;
2549
2550     if (dtree[node].branch[0] != VACANT)
2551         recursively_show_g(dtree[node].branch[0], level);
2552
2553     p = (uchar *)dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*node;
2554
2555     for (cprinted = 0; cprinted<DICT_WORD_SIZE; cprinted++)
2556     {
2557         uint32 ch;
2558         if (DICT_CHAR_SIZE == 1)
2559             ch = p[1+cprinted];
2560         else
2561             ch = (p[4*cprinted+4] << 24) + (p[4*cprinted+5] << 16) + (p[4*cprinted+6] << 8) + (p[4*cprinted+7]);
2562         if (!ch)
2563             break;
2564         show_uchar(ch);
2565     }
2566     for (; cprinted<DICT_WORD_SIZE+4; cprinted++)
2567         show_char(' ');
2568
2569     /* The level-1 info can only be printfed (d_show_buf must be null). */
2570     if (d_show_buf == NULL && level >= 1)
2571     {   int flagpos = (DICT_CHAR_SIZE == 1) ? (DICT_WORD_SIZE+1) : (DICT_WORD_BYTES+4);
2572         int flags = (p[flagpos+0] << 8) | (p[flagpos+1]);
2573         int verbnum = (p[flagpos+2] << 8) | (p[flagpos+3]);
2574         if (level >= 2) {
2575             for (i=0; i<DICT_ENTRY_BYTE_LENGTH; i++) printf("%02x ",p[i]);
2576         }
2577         if (flags & 128)
2578         {   printf("noun ");
2579             if (flags & 4)  printf("p"); else printf(" ");
2580             printf(" ");
2581         }
2582         else printf("       ");
2583         if (flags & 8)
2584         {   printf("preposition    ");
2585         }
2586         if ((flags & 3) == 3) printf("metaverb:%d  ", verbnum);
2587         else if ((flags & 3) == 1) printf("verb:%d  ", verbnum);
2588         printf("\n");
2589     }
2590
2591     /* Show five words per line in classic TRANSCRIPT_FORMAT; one per line in the new format. */
2592     if (d_show_buf && (d_show_len >= 64 || TRANSCRIPT_FORMAT == 1))
2593     {
2594         write_to_transcript_file(d_show_buf, STRCTX_DICT);
2595         d_show_len = 0;
2596     }
2597
2598     if (dtree[node].branch[1] != VACANT)
2599         recursively_show_g(dtree[node].branch[1], level);
2600 }
2601
2602 static void show_alphabet(int i)
2603 {   int j, c; char chartext[8];
2604
2605     for (j=0; j<26; j++)
2606     {   c = alphabet[i][j];
2607
2608         if (alphabet_used[26*i+j] == 'N') printf("("); else printf(" ");
2609
2610         zscii_to_text(chartext, c);
2611         printf("%s", chartext);
2612
2613         if (alphabet_used[26*i+j] == 'N') printf(")"); else printf(" ");
2614     }
2615     printf("\n");
2616 }
2617
2618 extern void show_dictionary(int level)
2619 {
2620     /* Level 0: show words only. Level 1: show words and flags.
2621        Level 2: also show bytes.*/
2622     printf("Dictionary contains %d entries:\n",dict_entries);
2623     if (dict_entries != 0)
2624     {   d_show_len = 0; d_show_buf = NULL;
2625         if (!glulx_mode)
2626             recursively_show_z(root, level);
2627         else
2628             recursively_show_g(root, level);
2629     }
2630     if (!glulx_mode)
2631     {
2632         printf("\nZ-machine alphabet entries:\n");
2633         show_alphabet(0);
2634         show_alphabet(1);
2635         show_alphabet(2);
2636     }
2637 }
2638
2639 extern void write_dictionary_to_transcript(void)
2640 {
2641     d_show_size = 80; /* initial size */
2642     d_show_buf = my_malloc(d_show_size, "dictionary display buffer");
2643
2644     write_to_transcript_file("", STRCTX_INFO);
2645     sprintf(d_show_buf, "[Dictionary contains %d entries:]", dict_entries);
2646     write_to_transcript_file(d_show_buf, STRCTX_INFO);
2647
2648     d_show_len = 0;
2649
2650     if (dict_entries != 0)
2651     {
2652         if (!glulx_mode)
2653             recursively_show_z(root, 0);
2654         else
2655             recursively_show_g(root, 0);
2656     }
2657     if (d_show_len != 0) write_to_transcript_file(d_show_buf, STRCTX_DICT);
2658
2659     my_free(&d_show_buf, "dictionary display buffer");
2660     d_show_len = 0; d_show_buf = NULL;
2661 }
2662
2663 /* ========================================================================= */
2664 /*   Data structure management routines                                      */
2665 /* ------------------------------------------------------------------------- */
2666
2667 extern void init_text_vars(void)
2668 {   int j;
2669
2670     opttext = NULL;
2671     opttextlen = 0;
2672     bestyet = NULL;
2673     bestyet2 = NULL;
2674     tlbtab = NULL;
2675     grandtable = NULL;
2676     grandflags = NULL;
2677
2678     all_text = NULL;
2679
2680     for (j=0; j<256; j++) abbrevs_lookup[j] = -1;
2681
2682     total_zchars_trans = 0;
2683
2684     dictionary = NULL;
2685     dictionary_top = 0;
2686     dtree = NULL;
2687     final_dict_order = NULL;
2688     dict_sort_codes = NULL;
2689     dict_entries=0;
2690
2691     static_strings_area = NULL;
2692     abbreviations_optimal_parse_schedule = NULL;
2693     abbreviations_optimal_parse_scores = NULL;
2694
2695     compressed_offsets = NULL;
2696     huff_entities = NULL;
2697     hufflist = NULL;
2698     unicode_usage_entries = NULL;
2699 }
2700
2701 extern void text_begin_pass(void)
2702 {   abbrevs_lookup_table_made = FALSE;
2703     no_abbreviations=0;
2704     total_chars_trans=0; total_bytes_trans=0;
2705     all_text_top=0;
2706     dictionary_begin_pass();
2707     low_strings_top = 0;
2708
2709     static_strings_extent = 0;
2710     no_strings = 0;
2711     no_dynamic_strings = 0;
2712     no_unicode_chars = 0;
2713 }
2714
2715 /*  Note: for allocation and deallocation of all_the_text, see inform.c      */
2716
2717 extern void text_allocate_arrays(void)
2718 {
2719     int ix;
2720
2721     initialise_memory_list(&translated_text_memlist,
2722         sizeof(uchar), 8000, (void**)&translated_text,
2723         "translated text holding area");
2724
2725     initialise_memory_list(&all_text_memlist,
2726         sizeof(char), 0, (void**)&all_text,
2727         "transcription text for optimise");
2728
2729     initialise_memory_list(&static_strings_area_memlist,
2730         sizeof(uchar), 128, (void**)&static_strings_area,
2731         "static strings area");
2732
2733     initialise_memory_list(&abbreviations_at_memlist,
2734         MAX_ABBREV_LENGTH, 64, (void**)&abbreviations_at,
2735         "abbreviation text");
2736
2737     initialise_memory_list(&abbreviations_memlist,
2738         sizeof(abbreviation), 64, (void**)&abbreviations,
2739         "abbreviations");
2740
2741     initialise_memory_list(&abbreviations_optimal_parse_schedule_memlist,
2742         sizeof(int), 0, (void**)&abbreviations_optimal_parse_schedule,
2743         "abbreviations optimal parse schedule");
2744     initialise_memory_list(&abbreviations_optimal_parse_scores_memlist,
2745         sizeof(int), 0, (void**)&abbreviations_optimal_parse_scores,
2746         "abbreviations optimal parse scores");
2747
2748     initialise_memory_list(&dtree_memlist,
2749         sizeof(dict_tree_node), 1500, (void**)&dtree,
2750         "red-black tree for dictionary");
2751     initialise_memory_list(&dict_sort_codes_memlist,
2752         sizeof(uchar), 1500*DICT_WORD_BYTES, (void**)&dict_sort_codes,
2753         "dictionary sort codes");
2754
2755     final_dict_order = NULL; /* will be allocated at sort_dictionary() time */
2756
2757     /* The exact size will be 7+7*num for z3, 7+9*num for z4+,
2758        4+DICT_ENTRY_BYTE_LENGTH*num for Glulx. But this is just an initial
2759        allocation; we don't have to be precise. */
2760     initialise_memory_list(&dictionary_memlist,
2761         sizeof(uchar), 1000*DICT_ENTRY_BYTE_LENGTH, (void**)&dictionary,
2762         "dictionary");
2763
2764     initialise_memory_list(&low_strings_memlist,
2765         sizeof(uchar), 1024, (void**)&low_strings,
2766         "low (abbreviation) strings");
2767
2768     d_show_buf = NULL;
2769     d_show_size = 0;
2770     d_show_len = 0;
2771
2772     huff_entities = NULL;
2773     hufflist = NULL;
2774     unicode_usage_entries = NULL;
2775     done_compression = FALSE;
2776     compression_table_size = 0;
2777     compressed_offsets = NULL;
2778
2779     initialise_memory_list(&unicode_usage_entries_memlist,
2780         sizeof(unicode_usage_t), 0, (void**)&unicode_usage_entries,
2781         "unicode entity entries");
2782
2783     /* hufflist and huff_entities will be allocated at compress_game_text() time. */
2784
2785     /* This hash table is only used in Glulx */
2786     for (ix=0; ix<UNICODE_HASH_BUCKETS; ix++)
2787         unicode_usage_hash[ix] = -1;
2788
2789     initialise_memory_list(&compressed_offsets_memlist,
2790         sizeof(int32), 0, (void**)&compressed_offsets,
2791         "static strings index table");
2792 }
2793
2794 extern void extract_all_text()
2795 {
2796     /* optimise_abbreviations() is called after free_arrays(). Therefore,
2797        we need to preserve the text transcript where it will not be
2798        freed up. We do this by copying the pointer to opttext. */
2799     opttext = all_text;
2800     opttextlen = all_text_top;
2801
2802     /* Re-init all_text_memlist. This causes it to forget all about the
2803        old pointer. Deallocating it in text_free_arrays() will be a no-op. */
2804     initialise_memory_list(&all_text_memlist,
2805         sizeof(char), 0, (void**)&all_text,
2806         "dummy transcription text");
2807 }
2808
2809 extern void text_free_arrays(void)
2810 {
2811     deallocate_memory_list(&translated_text_memlist);
2812
2813     deallocate_memory_list(&all_text_memlist);
2814
2815     deallocate_memory_list(&low_strings_memlist);
2816     deallocate_memory_list(&abbreviations_at_memlist);
2817     deallocate_memory_list(&abbreviations_memlist);
2818
2819     deallocate_memory_list(&abbreviations_optimal_parse_schedule_memlist);
2820     deallocate_memory_list(&abbreviations_optimal_parse_scores_memlist);
2821
2822     deallocate_memory_list(&dtree_memlist);
2823     deallocate_memory_list(&dict_sort_codes_memlist);
2824     my_free(&final_dict_order, "final dictionary ordering table");
2825
2826     deallocate_memory_list(&dictionary_memlist);
2827
2828     deallocate_memory_list(&compressed_offsets_memlist);
2829     my_free(&hufflist, "huffman node list");
2830     my_free(&huff_entities, "huffman entities");
2831
2832     deallocate_memory_list(&unicode_usage_entries_memlist);
2833
2834     deallocate_memory_list(&static_strings_area_memlist);
2835 }
2836
2837 extern void ao_free_arrays(void)
2838 {
2839     /* Called only after optimise_abbreviations() runs. */
2840
2841     my_free (&opttext,"stashed transcript for optimisation");
2842     my_free (&bestyet,"bestyet");
2843     my_free (&bestyet2,"bestyet2");
2844     my_free (&grandtable,"grandtable");
2845     my_free (&grandflags,"grandflags");
2846
2847     deallocate_memory_list(&tlbtab_memlist);
2848
2849     /* This was re-inited, so we should re-deallocate it. */
2850     deallocate_memory_list(&all_text_memlist);
2851 }
2852
2853 /* ========================================================================= */