src/text.c

   1 /* ------------------------------------------------------------------------- */
   2 /*   "text" : Text translation, the abbreviations optimiser, the dictionary  */
   3 /*                                                                           */
   4 /*   Part of Inform 6.41                                                     */
   5 /*   copyright (c) Graham Nelson 1993 - 2022                                 */
   6 /*                                                                           */
   7 /* Inform is free software: you can redistribute it and/or modify            */
   8 /* it under the terms of the GNU General Public License as published by      */
   9 /* the Free Software Foundation, either version 3 of the License, or         */
  10 /* (at your option) any later version.                                       */
  11 /*                                                                           */
  12 /* Inform is distributed in the hope that it will be useful,                 */
  13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of            */
  14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the              */
  15 /* GNU General Public License for more details.                              */
  16 /*                                                                           */
  17 /* You should have received a copy of the GNU General Public License         */
  18 /* along with Inform. If not, see https://gnu.org/licenses/                  */
  19 /*                                                                           */
  20 /* ------------------------------------------------------------------------- */
  21
  22 #include "header.h"
  23
  24 uchar *low_strings;                    /* Allocated to low_strings_top       */
  25 int32 low_strings_top;
  26 static memory_list low_strings_memlist;
  27
  28 int32 static_strings_extent;           /* Number of bytes of static strings
  29                                           made so far */
  30 uchar *static_strings_area;            /* Used to hold the static strings
  31                                           area so far
  32                                           Allocated to static_strings_extent */
  33 memory_list static_strings_area_memlist;
  34
  35 static char *all_text;                 /* Text buffer holding the entire text
  36                                           of the game, when it is being
  37                                           recorded
  38                                           (Allocated to all_text_top) */
  39 static memory_list all_text_memlist;
  40 static int32 all_text_top;
  41
  42 int abbrevs_lookup_table_made,         /* The abbreviations lookup table is
  43                                           constructed when the first non-
  44                                           abbreviation string is translated:
  45                                           this flag is TRUE after that       */
  46     abbrevs_lookup[256];               /* Once this has been constructed,
  47                                           abbrevs_lookup[n] = the smallest
  48                                           number of any abbreviation beginning
  49                                           with ASCII character n, or -1
  50                                           if none of the abbreviations do    */
  51 int no_abbreviations;                  /* No of abbreviations defined so far */
  52 /* ------------------------------------------------------------------------- */
  53 /*   Glulx string compression storage                                        */
  54 /* ------------------------------------------------------------------------- */
  55
  56 int no_strings;                        /* No of strings in static strings
  57                                           area.                              */
  58 int no_dynamic_strings;                /* No. of @.. string escapes used
  59                                           (actually, the highest value used
  60                                           plus one)                          */
  61 int no_unicode_chars;                  /* Number of distinct Unicode chars
  62                                           used. (Beyond 0xFF.)               */
  63
  64 huffentity_t *huff_entities;           /* The list of entities (characters,
  65                                           abbreviations, @.. escapes, and
  66                                           the terminator)                    */
  67 static huffentity_t **hufflist;        /* Copy of the list, for sorting      */
  68
  69 int no_huff_entities;                  /* The number of entities in the list */
  70 int huff_unicode_start;                /* Position in the list where Unicode
  71                                           chars begin.                       */
  72 int huff_abbrev_start;                 /* Position in the list where string
  73                                           abbreviations begin.               */
  74 int huff_dynam_start;                  /* Position in the list where @..
  75                                           entities begin.                    */
  76 int huff_entity_root;                  /* The position in the list of the root
  77                                           entry (when considering the table
  78                                           as a tree).                        */
  79
  80 int done_compression;                  /* Has the game text been compressed? */
  81 int32 compression_table_size;          /* Length of the Huffman table, in
  82                                           bytes                              */
  83 int32 compression_string_size;         /* Length of the compressed string
  84                                           data, in bytes                     */
  85 int32 *compressed_offsets;             /* The beginning of every string in
  86                                           the game, relative to the beginning
  87                                           of the Huffman table. (So entry 0
  88                                           is equal to compression_table_size).
  89                                           Allocated to no_strings at
  90                                           compress_game_text() time.         */
  91 static memory_list compressed_offsets_memlist;
  92
  93 unicode_usage_t *unicode_usage_entries; /* Allocated to no_unicode_chars     */
  94 static memory_list unicode_usage_entries_memlist;
  95
  96 #define UNICODE_HASH_BUCKETS (64)
  97 static int unicode_usage_hash[UNICODE_HASH_BUCKETS];
  98
  99 static int unicode_entity_index(int32 unicode);
 100
 101 /* ------------------------------------------------------------------------- */
 102 /*   Abbreviation arrays                                                     */
 103 /* ------------------------------------------------------------------------- */
 104
 105 abbreviation *abbreviations;             /* Allocated up to no_abbreviations */
 106 static memory_list abbreviations_memlist;
 107
 108 /* Memory to hold the text of any abbreviation strings declared. This is
 109    counted in units of MAX_ABBREV_LENGTH bytes. (An abbreviation must fit
 110    in that many bytes, null included.)                                       */
 111 uchar *abbreviations_at;                 /* Allocated up to no_abbreviations */
 112 static memory_list abbreviations_at_memlist;
 113
 114 static int *abbreviations_optimal_parse_schedule;
 115 static memory_list abbreviations_optimal_parse_schedule_memlist;
 116
 117 static int *abbreviations_optimal_parse_scores;
 118 static memory_list abbreviations_optimal_parse_scores_memlist;
 119
 120 /* ------------------------------------------------------------------------- */
 121
 122 int32 total_chars_trans,               /* Number of ASCII chars of text in   */
 123       total_bytes_trans,               /* Number of bytes of Z-code text out */
 124       zchars_trans_in_last_string;     /* Number of Z-chars in last string:
 125                                           needed only for abbrev efficiency
 126                                           calculation in "directs.c"         */
 127 static int32 total_zchars_trans;       /* Number of Z-chars of text out
 128                                           (only used to calculate the above) */
 129
 130 static int zchars_out_buffer[3],       /* During text translation, a buffer of
 131                                           3 Z-chars at a time: when it's full
 132                                           these are written as a 2-byte word */
 133            zob_index;                  /* Index (0 to 2) into it             */
 134
 135 uchar *translated_text;                /* Area holding translated strings
 136                                           until they are moved into the
 137                                           static_strings_area below */
 138 static memory_list translated_text_memlist;
 139
 140 static int32 text_out_pos;             /* The "program counter" during text
 141                                           translation: the next position to
 142                                           write Z-coded text output to       */
 143
 144 static int32 text_out_limit;           /* The upper limit of text_out_pos
 145                                           during text translation (or -1
 146                                           for no limit)                      */
 147
 148 static int text_out_overflow;          /* During text translation, becomes
 149                                           true if text_out_pos tries to pass
 150                                           text_out_limit                     */
 151
 152 /* ------------------------------------------------------------------------- */
 153 /*   For variables/arrays used by the dictionary manager, see below          */
 154 /* ------------------------------------------------------------------------- */
 155
 156 /* ------------------------------------------------------------------------- */
 157 /*   Prepare the abbreviations lookup table (used to speed up abbreviation   */
 158 /*   detection in text translation).  We first bubble-sort the abbrevs into  */
 159 /*   alphabetical order (this is necessary for the detection algorithm to    */
 160 /*   to work).  Since the table is only prepared once, and for a table       */
 161 /*   of size at most 96, there's no point using an efficient sort algorithm. */
 162 /* ------------------------------------------------------------------------- */
 163
 164 static void make_abbrevs_lookup(void)
 165 {   int bubble_sort, j, k, l; char p[MAX_ABBREV_LENGTH]; char *p1, *p2;
 166     do
 167     {   bubble_sort = FALSE;
 168         for (j=0; j<no_abbreviations; j++)
 169             for (k=j+1; k<no_abbreviations; k++)
 170             {   p1=(char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
 171                 p2=(char *)abbreviations_at+k*MAX_ABBREV_LENGTH;
 172                 if (strcmp(p1,p2)<0)
 173                 {   strcpy(p,p1); strcpy(p1,p2); strcpy(p2,p);
 174                     l=abbreviations[j].value; abbreviations[j].value=abbreviations[k].value;
 175                     abbreviations[k].value=l;
 176                     l=abbreviations[j].quality; abbreviations[j].quality=abbreviations[k].quality;
 177                     abbreviations[k].quality=l;
 178                     bubble_sort = TRUE;
 179                 }
 180             }
 181     } while (bubble_sort);
 182
 183     for (j=no_abbreviations-1; j>=0; j--)
 184     {   p1=(char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
 185         abbrevs_lookup[(uchar)p1[0]]=j;
 186         abbreviations[j].freq=0;
 187     }
 188     abbrevs_lookup_table_made = TRUE;
 189 }
 190
 191 /* ------------------------------------------------------------------------- */
 192 /*   Search the abbreviations lookup table (a routine which must be fast).   */
 193 /*   The source text to compare is text[i], text[i+1], ... and this routine  */
 194 /*   is only called if text[i] is indeed the first character of at least one */
 195 /*   abbreviation, "from" begin the least index into the abbreviations table */
 196 /*   of an abbreviation for which text[i] is the first character.  Recall    */
 197 /*   that the abbrevs table is in alphabetical order.                        */
 198 /*                                                                           */
 199 /*   The return value is -1 if there is no match.  If there is a match, the  */
 200 /*   text to be abbreviated out is over-written by a string of null chars    */
 201 /*   with "ASCII" value 1, and the abbreviation number is returned.          */
 202 /*                                                                           */
 203 /*   In Glulx, we *do not* do this overwriting with 1's.                     */
 204 /* ------------------------------------------------------------------------- */
 205
 206 static int try_abbreviations_from(unsigned char *text, int i, int from)
 207 {   int j, k; uchar *p, c;
 208     c=text[i];
 209     for (j=from, p=(uchar *)abbreviations_at+from*MAX_ABBREV_LENGTH;
 210          (j<no_abbreviations)&&(c==p[0]); j++, p+=MAX_ABBREV_LENGTH)
 211     {   if (text[i+1]==p[1])
 212         {   for (k=2; p[k]!=0; k++)
 213                 if (text[i+k]!=p[k]) goto NotMatched;
 214             if (!glulx_mode) {
 215                 for (k=0; p[k]!=0; k++) text[i+k]=1;
 216             }
 217             abbreviations[j].freq++;
 218             return(j);
 219             NotMatched: ;
 220         }
 221     }
 222     return(-1);
 223 }
 224
 225 extern void make_abbreviation(char *text)
 226 {
 227     /* If -e mode is off, we won't waste space creating an abbreviation entry. */
 228     if (!economy_switch)
 229         return;
 230
 231     ensure_memory_list_available(&abbreviations_memlist, no_abbreviations+1);
 232     ensure_memory_list_available(&abbreviations_at_memlist, no_abbreviations+1);
 233
 234     strcpy((char *)abbreviations_at
 235             + no_abbreviations*MAX_ABBREV_LENGTH, text);
 236
 237     abbreviations[no_abbreviations].value = compile_string(text, STRCTX_ABBREV);
 238     abbreviations[no_abbreviations].freq = 0;
 239
 240     /*   The quality is the number of Z-chars saved by using this            */
 241     /*   abbreviation: note that it takes 2 Z-chars to print it.             */
 242
 243     abbreviations[no_abbreviations].quality = zchars_trans_in_last_string - 2;
 244
 245     if (abbreviations[no_abbreviations].quality <= 0) {
 246         warning_named("Abbreviation does not save any characters:", text);
 247     }
 248
 249     no_abbreviations++;
 250 }
 251
 252 /* ------------------------------------------------------------------------- */
 253 /*   The front end routine for text translation.                             */
 254 /*   strctx indicates the purpose of the string. This is mostly used for     */
 255 /*   informational output (gametext.txt), but we treat some string contexts  */
 256 /*   specially during compilation.                                           */
 257 /* ------------------------------------------------------------------------- */
 258
 259 extern int32 compile_string(char *b, int strctx)
 260 {   int32 i, j, k;
 261     uchar *c;
 262     int in_low_memory;
 263
 264     if (execution_never_reaches_here) {
 265         /* No need to put strings into gametext.txt or the static/low
 266            strings areas. */
 267         if (strctx == STRCTX_GAME || strctx == STRCTX_GAMEOPC || strctx == STRCTX_LOWSTRING || strctx == STRCTX_INFIX) {
 268             /* VENEER and VENEEROPC are only used at the translate_text level,
 269                so we don't have to catch them here. */
 270             return 0;
 271         }
 272     }
 273
 274     /* In Z-code, abbreviations go in the low memory pool (0x100). So
 275        do strings explicitly defined with the Lowstring directive.
 276        (In Glulx, the in_low_memory flag is ignored.) */
 277     in_low_memory = (strctx == STRCTX_ABBREV || strctx == STRCTX_LOWSTRING);
 278
 279     if (!glulx_mode && in_low_memory)
 280     {
 281         k = translate_text(-1, b, strctx);
 282         if (k<0) {
 283             error("text translation failed");
 284             k = 0;
 285         }
 286         ensure_memory_list_available(&low_strings_memlist, low_strings_top+k);
 287         memcpy(low_strings+low_strings_top, translated_text, k);
 288         j = low_strings_top;
 289         low_strings_top += k;
 290         return(0x21+(j/2));
 291     }
 292
 293     if (glulx_mode && done_compression)
 294         compiler_error("Tried to add a string after compression was done.");
 295
 296     i = translate_text(-1, b, strctx);
 297     if (i < 0) {
 298         error("text translation failed");
 299         i = 0;
 300     }
 301
 302     /* Insert null bytes as needed to ensure that the next static string */
 303     /* also occurs at an address expressible as a packed address         */
 304
 305     if (!glulx_mode) {
 306         int textalign;
 307         if (oddeven_packing_switch)
 308             textalign = scale_factor*2;
 309         else
 310             textalign = scale_factor;
 311         while ((i%textalign)!=0)
 312         {
 313             ensure_memory_list_available(&translated_text_memlist, i+2);
 314             translated_text[i++] = 0;
 315             translated_text[i++] = 0;
 316         }
 317     }
 318
 319     j = static_strings_extent;
 320
 321     ensure_memory_list_available(&static_strings_area_memlist, static_strings_extent+i);
 322     for (c=translated_text; c<translated_text+i;
 323          c++, static_strings_extent++)
 324         static_strings_area[static_strings_extent] = *c;
 325
 326     if (!glulx_mode) {
 327         return(j/scale_factor);
 328     }
 329     else {
 330         /* The marker value is a one-based string number. (We reserve zero
 331            to mean "not a string at all". */
 332         return (++no_strings);
 333     }
 334 }
 335
 336 /* ------------------------------------------------------------------------- */
 337 /*   Output a single Z-character into the buffer, and flush it if full       */
 338 /* ------------------------------------------------------------------------- */
 339
 340 static void write_z_char_z(int i)
 341 {   uint32 j;
 342     ASSERT_ZCODE();
 343     total_zchars_trans++;
 344     zchars_out_buffer[zob_index++]=(i%32);
 345     if (zob_index!=3) return;
 346     zob_index=0;
 347     j= zchars_out_buffer[0]*0x0400 + zchars_out_buffer[1]*0x0020
 348        + zchars_out_buffer[2];
 349
 350     if (text_out_limit >= 0) {
 351         if (text_out_pos+2 > text_out_limit) {
 352             text_out_overflow = TRUE;
 353             return;
 354         }
 355     }
 356     else {
 357         ensure_memory_list_available(&translated_text_memlist, text_out_pos+2);
 358     }
 359
 360     translated_text[text_out_pos++] = j/256; translated_text[text_out_pos++] = j%256;
 361     total_bytes_trans+=2;
 362 }
 363
 364 static void write_zscii(int zsc)
 365 {
 366     int lookup_value, in_alphabet;
 367
 368     if (zsc==' ')
 369     {   write_z_char_z(0);
 370         return;
 371     }
 372
 373     if (zsc < 0x100) lookup_value = zscii_to_alphabet_grid[zsc];
 374
 375     else lookup_value = -1;
 376
 377     if (lookup_value >= 0)
 378     {   alphabet_used[lookup_value] = 'Y';
 379         in_alphabet = lookup_value/26;
 380         if (in_alphabet==1) write_z_char_z(4);  /* SHIFT to A1 */
 381         if (in_alphabet==2) write_z_char_z(5);  /* SHIFT to A2 */
 382         write_z_char_z(lookup_value%26 + 6);
 383     }
 384     else
 385     {   write_z_char_z(5); write_z_char_z(6);
 386         write_z_char_z(zsc/32); write_z_char_z(zsc%32);
 387     }
 388 }
 389
 390 /* ------------------------------------------------------------------------- */
 391 /*   Finish a Z-coded string, padding out with Z-char 5s if necessary and    */
 392 /*   setting the "end" bit on the final 2-byte word                          */
 393 /* ------------------------------------------------------------------------- */
 394
 395 static void end_z_chars(void)
 396 {
 397     zchars_trans_in_last_string=total_zchars_trans-zchars_trans_in_last_string;
 398     while (zob_index!=0) write_z_char_z(5);
 399     if (text_out_pos < 2) {
 400         /* Something went wrong. */
 401         text_out_overflow = TRUE;
 402         return;
 403     }
 404     translated_text[text_out_pos-2] += 128;
 405 }
 406
 407 /* Glulx handles this much more simply -- compression is done elsewhere. */
 408 static void write_z_char_g(int i)
 409 {
 410     ASSERT_GLULX();
 411     if (text_out_limit >= 0) {
 412         if (text_out_pos+1 > text_out_limit) {
 413             text_out_overflow = TRUE;
 414             return;
 415         }
 416     }
 417     else {
 418         ensure_memory_list_available(&translated_text_memlist, text_out_pos+1);
 419     }
 420     total_zchars_trans++;
 421     translated_text[text_out_pos++] = i;
 422     total_bytes_trans++;
 423 }
 424
 425 /* Helper routine to compute the weight, in units, of a character handled by the Z-Machine */
 426 static int zchar_weight(int c)
 427 {
 428     int lookup = iso_to_alphabet_grid[c];
 429     if (lookup < 0) return 4;
 430     if (lookup < 26) return 1;
 431     return 2;
 432 }
 433
 434 /* ------------------------------------------------------------------------- */
 435 /*   The main routine "text.c" provides to the rest of Inform: the text      */
 436 /*   translator. s_text is the source text and the return value is the       */
 437 /*   number of bytes translated.                                             */
 438 /*   The translated text will be stored in translated_text.                  */
 439 /*                                                                           */
 440 /*   If p_limit is >= 0, the text length will not exceed that many bytes.    */
 441 /*   If the translation tries to overflow this boundary, the return value    */
 442 /*   will be -1. (You should display an error and not read translated_text.) */
 443 /*                                                                           */
 444 /*   If p_limit is negative, any amount of text is accepted (up to int32     */
 445 /*   anyway).                                                                */
 446 /*                                                                           */
 447 /*   Note that the source text may be corrupted by this routine.             */
 448 /* ------------------------------------------------------------------------- */
 449
 450 extern int32 translate_text(int32 p_limit, char *s_text, int strctx)
 451 {   int i, j, k, in_alphabet, lookup_value, is_abbreviation;
 452     int32 unicode; int zscii;
 453     unsigned char *text_in;
 454
 455     if (p_limit >= 0) {
 456         ensure_memory_list_available(&translated_text_memlist, p_limit);
 457     }
 458
 459     /* For STRCTX_ABBREV, the string being translated is itself an
 460        abbreviation string, so it can't make use of abbreviations. Set
 461        the is_abbreviation flag to indicate this.
 462        The compiler has historically set this flag for the Lowstring
 463        directive as well -- the in_low_memory and is_abbreviation flag were
 464        always the same. I am preserving that convention. */
 465     is_abbreviation = (strctx == STRCTX_ABBREV || strctx == STRCTX_LOWSTRING);
 466
 467
 468     /*  Cast the input and output streams to unsigned char: text_out_pos will
 469         advance as bytes of Z-coded text are written, but text_in doesn't    */
 470
 471     text_in     = (unsigned char *) s_text;
 472     text_out_pos = 0;
 473     text_out_limit = p_limit;
 474     text_out_overflow = FALSE;
 475
 476     /*  Remember the Z-chars total so that later we can subtract to find the
 477         number of Z-chars translated on this string                          */
 478
 479     zchars_trans_in_last_string = total_zchars_trans;
 480
 481     /*  Start with the Z-characters output buffer empty                      */
 482
 483     zob_index=0;
 484
 485     /*  If this is the first text translated since the abbreviations were
 486         declared, and if some were declared, then it's time to make the
 487         lookup table for abbreviations
 488
 489         (Except: we don't if the text being translated is itself
 490         the text of an abbreviation currently being defined)                 */
 491
 492     if ((!abbrevs_lookup_table_made) && (no_abbreviations > 0)
 493         && (!is_abbreviation))
 494         make_abbrevs_lookup();
 495
 496     /*  If we're storing the whole game text to memory, then add this text.
 497         We will put two newlines between each text and four at the very end.
 498         (The optimise code does a lot of sloppy text[i+2], so the extra
 499         two newlines past all_text_top are necessary.) */
 500
 501     if ((!is_abbreviation) && (store_the_text))
 502     {   int addlen = strlen(s_text);
 503         ensure_memory_list_available(&all_text_memlist, all_text_top+addlen+5);
 504         sprintf(all_text+all_text_top, "%s\n\n\n\n", s_text);
 505         /* Advance past two newlines. */
 506         all_text_top += (addlen+2);
 507     }
 508
 509     if (transcript_switch) {
 510         /* Omit veneer strings, unless we're using the new transcript format, which includes everything. */
 511         if ((!veneer_mode) || TRANSCRIPT_FORMAT == 1) {
 512             int label = strctx;
 513             if (veneer_mode) {
 514                 if (label == STRCTX_GAME)
 515                     label = STRCTX_VENEER;
 516                 else if (label == STRCTX_GAMEOPC)
 517                     label = STRCTX_VENEEROPC;
 518             }
 519             write_to_transcript_file(s_text, label);
 520         }
 521     }
 522
 523     /* Computing the optimal way to parse strings to insert abbreviations with dynamic programming */
 524     /*  (ref: R.A. Wagner , "Common phrases and minimum-space text storage", Commun. ACM, 16 (3) (1973)) */
 525     /* We compute this optimal way here; it's stored in abbreviations_optimal_parse_schedule */
 526     if (economy_switch)
 527     {
 528         uchar *q, c;
 529         int l, min_score, from;
 530         int text_in_length;
 531
 532         text_in_length = strlen( (char*) text_in);
 533         ensure_memory_list_available(&abbreviations_optimal_parse_schedule_memlist, text_in_length);
 534         ensure_memory_list_available(&abbreviations_optimal_parse_scores_memlist, text_in_length+1);
 535
 536         abbreviations_optimal_parse_scores[text_in_length] = 0;
 537         for(j=text_in_length-1; j>=0; j--)
 538         {   /* Initial values: empty schedule, score = just write the letter without abbreviating. */
 539             abbreviations_optimal_parse_schedule[j] = -1;
 540             min_score = zchar_weight(text_in[j]) + abbreviations_optimal_parse_scores[j+1];
 541             /* If there's an abbreviation starting with that letter... */
 542             if ( (from = abbrevs_lookup[text_in[j]]) != -1)
 543             {
 544                 c = text_in[j];
 545                 /* Loop on all abbreviations starting with what is in c. */
 546                 for (k=from, q=(uchar *)abbreviations_at+from*MAX_ABBREV_LENGTH;
 547                     (k<no_abbreviations)&&(c==q[0]); k++, q+=MAX_ABBREV_LENGTH)
 548                 {
 549                     /* Let's compare; we also keep track of the length of the abbreviation. */
 550                     for (l=1; q[l]!=0; l++)
 551                     {    if (text_in[j+l]!=q[l]) {goto NotMatched;}
 552                     }
 553                     /* We have a match (length l), but is it smaller in size? */
 554                     if (min_score > 2 + abbreviations_optimal_parse_scores[j+l])
 555                     {   /* It is indeed smaller, so let's write it down in our schedule. */
 556                         min_score = 2 + abbreviations_optimal_parse_scores[j+l];
 557                         abbreviations_optimal_parse_schedule[j] = k;
 558                     }
 559                     NotMatched: ;
 560                 }
 561             }
 562             /* We gave it our best, this is the smallest we got. */
 563             abbreviations_optimal_parse_scores[j] = min_score;
 564         }
 565     }
 566
 567
 568
 569   if (!glulx_mode) {
 570
 571     /*  The empty string of Z-text is illegal, since it can't carry an end
 572         bit: so we translate an empty string of ASCII text to just the
 573         pad character 5.  Printing this causes nothing to appear on screen.  */
 574
 575     if (text_in[0]==0) write_z_char_z(5);
 576
 577     /*  Loop through the characters of the null-terminated input text: note
 578         that if 1 is written over a character in the input text, it is
 579         afterwards ignored                                                   */
 580
 581     for (i=0; text_in[i]!=0; i++)
 582     {   total_chars_trans++;
 583
 584         /*  Contract ".  " into ". " if double-space-removing switch set:
 585             likewise "?  " and "!  " if the setting is high enough           */
 586
 587         if ((double_space_setting >= 1)
 588             && (text_in[i+1]==' ') && (text_in[i+2]==' '))
 589         {   if (text_in[i]=='.') text_in[i+2]=1;
 590             if (double_space_setting >= 2)
 591             {   if (text_in[i]=='?') text_in[i+2]=1;
 592                 if (text_in[i]=='!') text_in[i+2]=1;
 593             }
 594         }
 595
 596         /*  Try abbreviations if the economy switch set. */
 597         /*  Look at the abbreviation schedule to see if we should abbreviate here. */
 598         /*  Note: Just because the schedule has something doesn't mean we should abbreviate there; */
 599         /*  sometimes you abbreviate before because it's better. If we have already replaced the */
 600         /*  char by a '1', it means we're in the middle of an abbreviation; don't try to abbreviate then. */
 601         if ((economy_switch) && (!is_abbreviation) && text_in[i] != 1 &&
 602             ((j = abbreviations_optimal_parse_schedule[i]) != -1))
 603         {
 604             /* Fill with 1s, which will get ignored by everyone else. */
 605             uchar *p = (uchar *)abbreviations_at+j*MAX_ABBREV_LENGTH;
 606             for (k=0; p[k]!=0; k++) text_in[i+k]=1;
 607             /* Actually write the abbreviation in the story file. */
 608             abbreviations[j].freq++;
 609             /* Abbreviations run from MAX_DYNAMIC_STRINGS to 96. */
 610             j += MAX_DYNAMIC_STRINGS;
 611             write_z_char_z(j/32+1); write_z_char_z(j%32);
 612         }
 613
 614
 615         /* If Unicode switch set, use text_to_unicode to perform UTF-8
 616            decoding */
 617         if (character_set_unicode && (text_in[i] & 0x80))
 618         {   unicode = text_to_unicode((char *) (text_in+i));
 619             zscii = unicode_to_zscii(unicode);
 620             if (zscii != 5) write_zscii(zscii);
 621             else
 622             {   unicode_char_error(
 623                     "Character can only be used if declared in \
 624 advance as part of 'Zcharacter table':", unicode);
 625             }
 626             i += textual_form_length - 1;
 627             continue;
 628         }
 629
 630         /*  '@' is the escape character in Inform string notation: the various
 631             possibilities are:
 632
 633                 @@decimalnumber  :  write this ZSCII char (0 to 1023)
 634                 @twodigits or    :  write the abbreviation string with this
 635                 @(digits)           decimal number
 636                 @(symbol)        :  write the abbreviation string with this
 637                                     (constant) value
 638                 @accentcode      :  this accented character: e.g.,
 639                                         for @'e write an E-acute
 640                 @{...}           :  this Unicode char (in hex)              */
 641
 642         if (text_in[i]=='@')
 643         {   if (text_in[i+1]=='@')
 644             {
 645                 /*   @@... (ascii value)  */
 646
 647                 i+=2; j=atoi((char *) (text_in+i));
 648                 switch(j)
 649                 {   /* Prevent ~ and ^ from being translated to double-quote
 650                        and new-line, as they ordinarily would be */
 651
 652                     case 94:   write_z_char_z(5); write_z_char_z(6);
 653                                write_z_char_z(94/32); write_z_char_z(94%32);
 654                                break;
 655                     case 126:  write_z_char_z(5); write_z_char_z(6);
 656                                write_z_char_z(126/32); write_z_char_z(126%32);
 657                                break;
 658
 659                     default:   write_zscii(j); break;
 660                 }
 661                 while (isdigit(text_in[i])) i++; i--;
 662             }
 663             else if (text_in[i+1]=='(')
 664             {
 665                 /*   @(...) (dynamic string)   */
 666                 char dsymbol[MAX_IDENTIFIER_LENGTH+1];
 667                 int len = 0, digits = 0;
 668                 i += 2;
 669                 /* This accepts "12xyz" as a symbol, which it really isn't,
 670                    but that just means it won't be found. */
 671                 while ((text_in[i] == '_' || isalnum(text_in[i])) && len < MAX_IDENTIFIER_LENGTH) {
 672                     char ch = text_in[i++];
 673                     if (isdigit(ch)) digits++;
 674                     dsymbol[len++] = ch;
 675                 }
 676                 dsymbol[len] = '\0';
 677                 j = -1;
 678                 /* We would like to parse dsymbol as *either* a decimal
 679                    number or a constant symbol. */
 680                 if (text_in[i] != ')' || len == 0) {
 681                     error("'@(...)' abbreviation must contain a symbol");
 682                 }
 683                 else if (digits == len) {
 684                     /* all digits; parse as decimal */
 685                     j = atoi(dsymbol);
 686                 }
 687                 else {
 688                     int sym = symbol_index(dsymbol, -1);
 689                     if ((symbols[sym].flags & UNKNOWN_SFLAG) || symbols[sym].type != CONSTANT_T || symbols[sym].marker) {
 690                         error_named("'@(...)' abbreviation expected a known constant value, but contained", dsymbol);
 691                     }
 692                     else {
 693                         symbols[sym].flags |= USED_SFLAG;
 694                         j = symbols[sym].value;
 695                     }
 696                 }
 697                 if (!glulx_mode && j >= 96) {
 698                     error_max_dynamic_strings(j);
 699                     j = -1;
 700                 }
 701                 if (j >= MAX_DYNAMIC_STRINGS) {
 702                     error_max_dynamic_strings(j);
 703                     j = -1;
 704                 }
 705                 if (j >= 0) {
 706                     write_z_char_z(j/32+1); write_z_char_z(j%32);
 707                 }
 708                 else {
 709                     write_z_char_z(' '); /* error fallback */
 710                 }
 711             }
 712             else if (isdigit(text_in[i+1])!=0)
 713             {   int d1, d2;
 714
 715                 /*   @.. (dynamic string)   */
 716
 717                 d1 = character_digit_value[text_in[i+1]];
 718                 d2 = character_digit_value[text_in[i+2]];
 719                 if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10))
 720                     error("'@..' must have two decimal digits");
 721                 else
 722                 {
 723                     j = d1*10 + d2;
 724                     if (!glulx_mode && j >= 96) {
 725                         error_max_dynamic_strings(j);
 726                         j = -1;
 727                     }
 728                     if (j >= MAX_DYNAMIC_STRINGS) {
 729                         /* Shouldn't get here with two digits */
 730                         error_max_dynamic_strings(j);
 731                         j = -1;
 732                     }
 733                     i+=2;
 734                     if (j >= 0) {
 735                         write_z_char_z(j/32+1); write_z_char_z(j%32);
 736                     }
 737                     else {
 738                         write_z_char_z(' '); /* error fallback */
 739                     }
 740                 }
 741             }
 742             else
 743             {
 744                 /*   A string escape specifying an unusual character   */
 745
 746                 unicode = text_to_unicode((char *) (text_in+i));
 747                 zscii = unicode_to_zscii(unicode);
 748                 if (zscii != 5) write_zscii(zscii);
 749                 else
 750                 {   unicode_char_error(
 751                        "Character can only be used if declared in \
 752 advance as part of 'Zcharacter table':", unicode);
 753                 }
 754                 i += textual_form_length - 1;
 755             }
 756         }
 757         else
 758         {   /*  Skip a character which has been over-written with the null
 759                 value 1 earlier on                                           */
 760
 761             if (text_in[i]!=1)
 762             {   if (text_in[i]==' ') write_z_char_z(0);
 763                 else
 764                 {   j = (int) text_in[i];
 765                     lookup_value = iso_to_alphabet_grid[j];
 766                     if (lookup_value < 0)
 767                     {   /*  The character isn't in the standard alphabets, so
 768                             we have to use the ZSCII 4-Z-char sequence */
 769
 770                         if (lookup_value == -5)
 771                         {   /*  Character isn't in the ZSCII set at all */
 772
 773                             unicode = iso_to_unicode(j);
 774                             unicode_char_error(
 775                                 "Character can only be used if declared in \
 776 advance as part of 'Zcharacter table':", unicode);
 777                             write_zscii(0x200 + unicode/0x100);
 778                             write_zscii(0x300 + unicode%0x100);
 779                         }
 780                         else write_zscii(-lookup_value);
 781                     }
 782                     else
 783                     {   /*  The character is in one of the standard alphabets:
 784                             write a SHIFT to temporarily change alphabet if
 785                             it isn't in alphabet 0, then write the Z-char    */
 786
 787                         alphabet_used[lookup_value] = 'Y';
 788                         in_alphabet = lookup_value/26;
 789                         if (in_alphabet==1) write_z_char_z(4);  /* SHIFT to A1 */
 790                         if (in_alphabet==2) write_z_char_z(5);  /* SHIFT to A2 */
 791                         write_z_char_z(lookup_value%26 + 6);
 792                     }
 793                 }
 794             }
 795         }
 796     }
 797
 798     /*  Flush the Z-characters output buffer and set the "end" bit           */
 799
 800     end_z_chars();
 801   }
 802   else {
 803
 804     /* The text storage here is, of course, temporary. Compression
 805        will occur when we're finished compiling, so that all the
 806        clever Huffman stuff will work.
 807        In the stored text, we use "@@" to indicate @,
 808        "@0" to indicate a zero byte,
 809        "@ANNNN" to indicate an abbreviation,
 810        "@DNNNN" to indicate a dynamic string thing.
 811        "@UNNNN" to indicate a four-byte Unicode value (0x100 or higher).
 812        (NNNN is a four-digit hex number using the letters A-P... an
 813        ugly representation but a convenient one.)
 814     */
 815
 816     for (i=0; text_in[i]!=0; i++) {
 817
 818       /*  Contract ".  " into ". " if double-space-removing switch set:
 819           likewise "?  " and "!  " if the setting is high enough. */
 820       if ((double_space_setting >= 1)
 821         && (text_in[i+1]==' ') && (text_in[i+2]==' ')) {
 822         if (text_in[i]=='.'
 823           || (double_space_setting >= 2
 824             && (text_in[i]=='?' || text_in[i]=='!'))) {
 825           text_in[i+1] = text_in[i];
 826           i++;
 827         }
 828       }
 829
 830       total_chars_trans++;
 831
 832       /*  Try abbreviations if the economy switch set. We have to be in
 833           compression mode too, since the abbreviation mechanism is part
 834           of string decompression. */
 835
 836       if ((economy_switch) && (compression_switch) && (!is_abbreviation)
 837         && ((k=abbrevs_lookup[text_in[i]])!=-1)
 838         && ((j=try_abbreviations_from(text_in, i, k)) != -1)) {
 839         char *cx = (char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
 840         i += (strlen(cx)-1);
 841         write_z_char_g('@');
 842         write_z_char_g('A');
 843         write_z_char_g('A' + ((j >>12) & 0x0F));
 844         write_z_char_g('A' + ((j >> 8) & 0x0F));
 845         write_z_char_g('A' + ((j >> 4) & 0x0F));
 846         write_z_char_g('A' + ((j     ) & 0x0F));
 847       }
 848       else if (text_in[i] == '@') {
 849         if (text_in[i+1]=='@') {
 850           /* An ASCII code */
 851           i+=2; j=atoi((char *) (text_in+i));
 852           if (j == '@' || j == '\0') {
 853             write_z_char_g('@');
 854             if (j == 0) {
 855               j = '0';
 856               if (!compression_switch)
 857                 warning("Ascii @@0 will prematurely terminate non-compressed \
 858 string.");
 859             }
 860           }
 861           write_z_char_g(j);
 862           while (isdigit(text_in[i])) i++; i--;
 863         }
 864         else if (text_in[i+1]=='(') {
 865             char dsymbol[MAX_IDENTIFIER_LENGTH+1];
 866             int len = 0, digits = 0;
 867             i += 2;
 868             /* This accepts "12xyz" as a symbol, which it really isn't,
 869                but that just means it won't be found. */
 870             while ((text_in[i] == '_' || isalnum(text_in[i])) && len < MAX_IDENTIFIER_LENGTH) {
 871                 char ch = text_in[i++];
 872                 if (isdigit(ch)) digits++;
 873                 dsymbol[len++] = ch;
 874             }
 875             dsymbol[len] = '\0';
 876             j = -1;
 877             /* We would like to parse dsymbol as *either* a decimal
 878                number or a constant symbol. */
 879             if (text_in[i] != ')' || len == 0) {
 880                 error("'@(...)' abbreviation must contain a symbol");
 881             }
 882             else if (digits == len) {
 883                 /* all digits; parse as decimal */
 884                 j = atoi(dsymbol);
 885             }
 886             else {
 887                 int sym = symbol_index(dsymbol, -1);
 888                 if ((symbols[sym].flags & UNKNOWN_SFLAG) || symbols[sym].type != CONSTANT_T || symbols[sym].marker) {
 889                     error_named("'@(...)' abbreviation expected a known constant value, but contained", dsymbol);
 890                 }
 891                 else {
 892                     symbols[sym].flags |= USED_SFLAG;
 893                     j = symbols[sym].value;
 894                 }
 895             }
 896             if (j >= MAX_DYNAMIC_STRINGS) {
 897                 error_max_dynamic_strings(j);
 898                 j = -1;
 899             }
 900             if (j+1 >= no_dynamic_strings)
 901                 no_dynamic_strings = j+1;
 902             if (j >= 0) {
 903                 write_z_char_g('@');
 904                 write_z_char_g('D');
 905                 write_z_char_g('A' + ((j >>12) & 0x0F));
 906                 write_z_char_g('A' + ((j >> 8) & 0x0F));
 907                 write_z_char_g('A' + ((j >> 4) & 0x0F));
 908                 write_z_char_g('A' + ((j     ) & 0x0F));
 909             }
 910             else {
 911                 write_z_char_g(' '); /* error fallback */
 912             }
 913         }
 914         else if (isdigit(text_in[i+1])) {
 915           int d1, d2;
 916           d1 = character_digit_value[text_in[i+1]];
 917           d2 = character_digit_value[text_in[i+2]];
 918           if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10)) {
 919             error("'@..' must have two decimal digits");
 920           }
 921           else {
 922             if (!compression_switch)
 923               warning("'@..' print variable will not work in non-compressed \
 924 string; substituting '   '.");
 925             i += 2;
 926             j = d1*10 + d2;
 927             if (j >= MAX_DYNAMIC_STRINGS) {
 928               error_max_dynamic_strings(j);
 929               j = -1;
 930             }
 931             if (j+1 >= no_dynamic_strings)
 932               no_dynamic_strings = j+1;
 933             if (j >= 0) {
 934                 write_z_char_g('@');
 935                 write_z_char_g('D');
 936                 write_z_char_g('A' + ((j >>12) & 0x0F));
 937                 write_z_char_g('A' + ((j >> 8) & 0x0F));
 938                 write_z_char_g('A' + ((j >> 4) & 0x0F));
 939                 write_z_char_g('A' + ((j     ) & 0x0F));
 940             }
 941             else {
 942                 write_z_char_g(' '); /* error fallback */
 943             }
 944           }
 945         }
 946         else {
 947           unicode = text_to_unicode((char *) (text_in+i));
 948           i += textual_form_length - 1;
 949           if (unicode == '@' || unicode == '\0') {
 950             write_z_char_g('@');
 951             write_z_char_g(unicode ? '@' : '0');
 952           }
 953           else if (unicode >= 0 && unicode < 256) {
 954             write_z_char_g(unicode);
 955           }
 956           else {
 957             if (!compression_switch) {
 958               warning("Unicode characters will not work in non-compressed \
 959 string; substituting '?'.");
 960               write_z_char_g('?');
 961             }
 962             else {
 963               j = unicode_entity_index(unicode);
 964               write_z_char_g('@');
 965               write_z_char_g('U');
 966               write_z_char_g('A' + ((j >>12) & 0x0F));
 967               write_z_char_g('A' + ((j >> 8) & 0x0F));
 968               write_z_char_g('A' + ((j >> 4) & 0x0F));
 969               write_z_char_g('A' + ((j     ) & 0x0F));
 970             }
 971           }
 972         }
 973       }
 974       else if (text_in[i] == '^')
 975         write_z_char_g(0x0A);
 976       else if (text_in[i] == '~')
 977         write_z_char_g('"');
 978       else if (character_set_unicode) {
 979         if (text_in[i] & 0x80) {
 980           unicode = text_to_unicode((char *) (text_in+i));
 981           i += textual_form_length - 1;
 982           if (unicode >= 0 && unicode < 256) {
 983             write_z_char_g(unicode);
 984           }
 985           else {
 986             if (!compression_switch) {
 987               warning("Unicode characters will not work in non-compressed \
 988 string; substituting '?'.");
 989               write_z_char_g('?');
 990             }
 991             else {
 992               j = unicode_entity_index(unicode);
 993               write_z_char_g('@');
 994               write_z_char_g('U');
 995               write_z_char_g('A' + ((j >>12) & 0x0F));
 996               write_z_char_g('A' + ((j >> 8) & 0x0F));
 997               write_z_char_g('A' + ((j >> 4) & 0x0F));
 998               write_z_char_g('A' + ((j     ) & 0x0F));
 999             }
1000           }
1001         }
1002         else {
1003           write_z_char_g(text_in[i]);
1004         }
1005       }
1006       else {
1007         unicode = iso_to_unicode_grid[text_in[i]];
1008         if (unicode >= 0 && unicode < 256) {
1009           write_z_char_g(unicode);
1010         }
1011         else {
1012           if (!compression_switch) {
1013             warning("Unicode characters will not work in non-compressed \
1014 string; substituting '?'.");
1015             write_z_char_g('?');
1016           }
1017           else {
1018             j = unicode_entity_index(unicode);
1019             write_z_char_g('@');
1020             write_z_char_g('U');
1021             write_z_char_g('A' + ((j >>12) & 0x0F));
1022             write_z_char_g('A' + ((j >> 8) & 0x0F));
1023             write_z_char_g('A' + ((j >> 4) & 0x0F));
1024             write_z_char_g('A' + ((j     ) & 0x0F));
1025           }
1026         }
1027       }
1028     }
1029     write_z_char_g(0);
1030     zchars_trans_in_last_string=total_zchars_trans-zchars_trans_in_last_string;
1031
1032   }
1033
1034   if (text_out_overflow)
1035       return -1;
1036   else
1037       return text_out_pos;
1038 }
1039
1040 static int unicode_entity_index(int32 unicode)
1041 {
1042   int j;
1043   int buck = unicode % UNICODE_HASH_BUCKETS;
1044
1045   for (j = unicode_usage_hash[buck]; j >= 0; j=unicode_usage_entries[j].next) {
1046     if (unicode_usage_entries[j].ch == unicode)
1047       break;
1048   }
1049   if (j < 0) {
1050     ensure_memory_list_available(&unicode_usage_entries_memlist, no_unicode_chars+1);
1051     j = no_unicode_chars++;
1052     unicode_usage_entries[j].ch = unicode;
1053     unicode_usage_entries[j].next = unicode_usage_hash[buck];
1054     unicode_usage_hash[buck] = j;
1055   }
1056
1057   return j;
1058 }
1059
1060 /* ------------------------------------------------------------------------- */
1061 /*   Glulx compression code                                                  */
1062 /* ------------------------------------------------------------------------- */
1063
1064
1065 static void compress_makebits(int entnum, int depth, int prevbit,
1066   huffbitlist_t *bits);
1067
1068 /*   The compressor. This uses the usual Huffman compression algorithm. */
1069 void compress_game_text()
1070 {
1071   int entities=0, branchstart, branches;
1072   int numlive;
1073   int32 lx;
1074   int jx;
1075   int ch;
1076   int32 ix;
1077   int max_char_set;
1078   huffbitlist_t bits;
1079
1080   if (compression_switch) {
1081     max_char_set = 257 + no_abbreviations + no_dynamic_strings + no_unicode_chars;
1082
1083     huff_entities = my_calloc(sizeof(huffentity_t), max_char_set*2+1,
1084       "huffman entities");
1085     hufflist = my_calloc(sizeof(huffentity_t *), max_char_set,
1086       "huffman node list");
1087
1088     /* How many entities have we currently got? Well, 256 plus the
1089        string-terminator plus Unicode chars plus abbrevations plus
1090        dynamic strings. */
1091     entities = 256+1;
1092     huff_unicode_start = entities;
1093     entities += no_unicode_chars;
1094     huff_abbrev_start = entities;
1095     if (economy_switch)
1096       entities += no_abbreviations;
1097     huff_dynam_start = entities;
1098     entities += no_dynamic_strings;
1099
1100     if (entities > max_char_set)
1101       compiler_error("Too many entities for max_char_set");
1102
1103     /* Characters */
1104     for (jx=0; jx<256; jx++) {
1105       huff_entities[jx].type = 2;
1106       huff_entities[jx].count = 0;
1107       huff_entities[jx].u.ch = jx;
1108     }
1109     /* Terminator */
1110     huff_entities[256].type = 1;
1111     huff_entities[256].count = 0;
1112     for (jx=0; jx<no_unicode_chars; jx++) {
1113       huff_entities[huff_unicode_start+jx].type = 4;
1114       huff_entities[huff_unicode_start+jx].count = 0;
1115       huff_entities[huff_unicode_start+jx].u.val = jx;
1116     }
1117     if (economy_switch) {
1118       for (jx=0; jx<no_abbreviations; jx++) {
1119         huff_entities[huff_abbrev_start+jx].type = 3;
1120         huff_entities[huff_abbrev_start+jx].count = 0;
1121         huff_entities[huff_abbrev_start+jx].u.val = jx;
1122       }
1123     }
1124     for (jx=0; jx<no_dynamic_strings; jx++) {
1125       huff_entities[huff_dynam_start+jx].type = 9;
1126       huff_entities[huff_dynam_start+jx].count = 0;
1127       huff_entities[huff_dynam_start+jx].u.val = jx;
1128     }
1129   }
1130   else {
1131     /* No compression; use defaults that will make it easy to check
1132        for errors. */
1133     no_huff_entities = 257;
1134     huff_unicode_start = 257;
1135     huff_abbrev_start = 257;
1136     huff_dynam_start = 257+no_abbreviations;
1137     compression_table_size = 0;
1138   }
1139
1140   if (compression_switch) {
1141
1142     for (lx=0, ix=0; lx<no_strings; lx++) {
1143       int escapelen=0, escapetype=0;
1144       int done=FALSE;
1145       int32 escapeval=0;
1146       while (!done) {
1147         ch = static_strings_area[ix];
1148         ix++;
1149         if (ix > static_strings_extent || ch < 0)
1150           compiler_error("Read too much not-yet-compressed text.");
1151         if (escapelen == -1) {
1152           escapelen = 0;
1153           if (ch == '@') {
1154             ch = '@';
1155           }
1156           else if (ch == '0') {
1157             ch = '\0';
1158           }
1159           else if (ch == 'A' || ch == 'D' || ch == 'U') {
1160             escapelen = 4;
1161             escapetype = ch;
1162             escapeval = 0;
1163             continue;
1164           }
1165           else {
1166             compiler_error("Strange @ escape in processed text.");
1167           }
1168         }
1169         else if (escapelen) {
1170           escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
1171           escapelen--;
1172           if (escapelen == 0) {
1173             if (escapetype == 'A') {
1174               ch = huff_abbrev_start+escapeval;
1175             }
1176             else if (escapetype == 'D') {
1177               ch = huff_dynam_start+escapeval;
1178             }
1179             else if (escapetype == 'U') {
1180               ch = huff_unicode_start+escapeval;
1181             }
1182             else {
1183               compiler_error("Strange @ escape in processed text.");
1184             }
1185           }
1186           else
1187             continue;
1188         }
1189         else {
1190           if (ch == '@') {
1191             escapelen = -1;
1192             continue;
1193           }
1194           if (ch == 0) {
1195             ch = 256;
1196             done = TRUE;
1197           }
1198         }
1199         huff_entities[ch].count++;
1200       }
1201     }
1202
1203     numlive = 0;
1204     for (jx=0; jx<entities; jx++) {
1205       if (huff_entities[jx].count) {
1206         hufflist[numlive] = &(huff_entities[jx]);
1207         numlive++;
1208       }
1209     }
1210
1211     branchstart = entities;
1212     branches = 0;
1213
1214     while (numlive > 1) {
1215       int best1, best2;
1216       int best1num, best2num;
1217       huffentity_t *bran;
1218
1219       if (hufflist[0]->count < hufflist[1]->count) {
1220         best1 = 0;
1221         best2 = 1;
1222       }
1223       else {
1224         best2 = 0;
1225         best1 = 1;
1226       }
1227
1228       best1num = hufflist[best1]->count;
1229       best2num = hufflist[best2]->count;
1230
1231       for (jx=2; jx<numlive; jx++) {
1232         if (hufflist[jx]->count < best1num) {
1233           best2 = best1;
1234           best2num = best1num;
1235           best1 = jx;
1236           best1num = hufflist[best1]->count;
1237         }
1238         else if (hufflist[jx]->count < best2num) {
1239           best2 = jx;
1240           best2num = hufflist[best2]->count;
1241         }
1242       }
1243
1244       bran = &(huff_entities[branchstart+branches]);
1245       branches++;
1246       bran->type = 0;
1247       bran->count = hufflist[best1]->count + hufflist[best2]->count;
1248       bran->u.branch[0] = (hufflist[best1] - huff_entities);
1249       bran->u.branch[1] = (hufflist[best2] - huff_entities);
1250       hufflist[best1] = bran;
1251       if (best2 < numlive-1) {
1252         memmove(&(hufflist[best2]), &(hufflist[best2+1]),
1253           ((numlive-1) - best2) * sizeof(huffentity_t *));
1254       }
1255       numlive--;
1256     }
1257
1258     huff_entity_root = (hufflist[0] - huff_entities);
1259
1260     for (ix=0; ix<MAXHUFFBYTES; ix++)
1261       bits.b[ix] = 0;
1262     compression_table_size = 12;
1263
1264     no_huff_entities = 0; /* compress_makebits will total this up */
1265     compress_makebits(huff_entity_root, 0, -1, &bits);
1266   }
1267
1268   /* Now, sadly, we have to compute the size of the string section,
1269      without actually doing the compression. */
1270   compression_string_size = 0;
1271
1272   ensure_memory_list_available(&compressed_offsets_memlist, no_strings);
1273
1274   for (lx=0, ix=0; lx<no_strings; lx++) {
1275     int escapelen=0, escapetype=0;
1276     int done=FALSE;
1277     int32 escapeval=0;
1278     jx = 0;
1279     compressed_offsets[lx] = compression_table_size + compression_string_size;
1280     compression_string_size++; /* for the type byte */
1281     while (!done) {
1282       ch = static_strings_area[ix];
1283       ix++;
1284       if (ix > static_strings_extent || ch < 0)
1285         compiler_error("Read too much not-yet-compressed text.");
1286       if (escapelen == -1) {
1287         escapelen = 0;
1288         if (ch == '@') {
1289           ch = '@';
1290         }
1291         else if (ch == '0') {
1292           ch = '\0';
1293         }
1294         else if (ch == 'A' || ch == 'D' || ch == 'U') {
1295           escapelen = 4;
1296           escapetype = ch;
1297           escapeval = 0;
1298           continue;
1299         }
1300         else {
1301           compiler_error("Strange @ escape in processed text.");
1302         }
1303       }
1304       else if (escapelen) {
1305         escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
1306         escapelen--;
1307         if (escapelen == 0) {
1308           if (escapetype == 'A') {
1309             ch = huff_abbrev_start+escapeval;
1310           }
1311           else if (escapetype == 'D') {
1312             ch = huff_dynam_start+escapeval;
1313           }
1314           else if (escapetype == 'U') {
1315             ch = huff_unicode_start+escapeval;
1316           }
1317           else {
1318             compiler_error("Strange @ escape in processed text.");
1319           }
1320         }
1321         else
1322           continue;
1323       }
1324       else {
1325         if (ch == '@') {
1326           escapelen = -1;
1327           continue;
1328         }
1329         if (ch == 0) {
1330           ch = 256;
1331           done = TRUE;
1332         }
1333       }
1334
1335       if (compression_switch) {
1336         jx += huff_entities[ch].depth;
1337         compression_string_size += (jx/8);
1338         jx = (jx % 8);
1339       }
1340       else {
1341         if (ch >= huff_dynam_start) {
1342           compression_string_size += 3;
1343         }
1344         else if (ch >= huff_unicode_start) {
1345           compiler_error("Abbreviation/Unicode in non-compressed string \
1346 should be impossible.");
1347         }
1348         else
1349           compression_string_size += 1;
1350       }
1351     }
1352     if (compression_switch && jx)
1353       compression_string_size++;
1354   }
1355
1356   done_compression = TRUE;
1357 }
1358
1359 static void compress_makebits(int entnum, int depth, int prevbit,
1360   huffbitlist_t *bits)
1361 {
1362   huffentity_t *ent = &(huff_entities[entnum]);
1363   char *cx;
1364
1365   no_huff_entities++;
1366   ent->addr = compression_table_size;
1367   ent->depth = depth;
1368   ent->bits = *bits;
1369   if (depth > 0) {
1370     if (prevbit)
1371       ent->bits.b[(depth-1) / 8] |= (1 << ((depth-1) % 8));
1372   }
1373
1374   switch (ent->type) {
1375   case 0:
1376     compression_table_size += 9;
1377     compress_makebits(ent->u.branch[0], depth+1, 0, &ent->bits);
1378     compress_makebits(ent->u.branch[1], depth+1, 1, &ent->bits);
1379     break;
1380   case 1:
1381     compression_table_size += 1;
1382     break;
1383   case 2:
1384     compression_table_size += 2;
1385     break;
1386   case 3:
1387     cx = (char *)abbreviations_at + ent->u.val*MAX_ABBREV_LENGTH;
1388     compression_table_size += (1 + 1 + strlen(cx));
1389     break;
1390   case 4:
1391   case 9:
1392     compression_table_size += 5;
1393     break;
1394   }
1395 }
1396
1397 /* ------------------------------------------------------------------------- */
1398 /*   The abbreviations optimiser                                             */
1399 /*                                                                           */
1400 /*   This is a very complex, memory and time expensive algorithm to          */
1401 /*   approximately solve the problem of which abbreviation strings would     */
1402 /*   minimise the total number of Z-chars to which the game text translates. */
1403 /*   It is in some ways a quite separate program but remains inside Inform   */
1404 /*   for compatibility with previous releases.                               */
1405 /* ------------------------------------------------------------------------- */
1406
1407 /* The complete game text. */
1408 static char *opttext;
1409 static int32 opttextlen;
1410
1411 typedef struct tlb_s
1412 {   char text[4];
1413     int32 intab, occurrences;
1414 } tlb;
1415 static tlb *tlbtab; /* Three-letter blocks (allocated up to no_occs) */
1416 static memory_list tlbtab_memlist;
1417 static int32 no_occs;
1418
1419 static int32 *grandtable;
1420 static int32 *grandflags;
1421 typedef struct optab_s
1422 {   int32  length;
1423     int32  popularity;
1424     int32  score;
1425     int32  location;
1426     char text[MAX_ABBREV_LENGTH];
1427 } optab;
1428 static int32 MAX_BESTYET;
1429 static optab *bestyet; /* High-score entries (up to MAX_BESTYET used/allocated) */
1430 static optab *bestyet2; /* The selected entries (up to selected used; allocated to MAX_ABBREVS) */
1431
1432 static int pass_no;
1433
1434 static void optimise_pass(void)
1435 {
1436     TIMEVALUE t1, t2;
1437     float duration;
1438     int32 i;
1439     int32 j, j2, k, nl, matches, noflags, score, min, minat=0, x, scrabble, c;
1440     for (i=0; i<MAX_BESTYET; i++) bestyet[i].length=0;
1441     for (i=0; i<no_occs; i++)
1442     {   if ((*(tlbtab[i].text)!=(int) '\n')&&(tlbtab[i].occurrences!=0))
1443         {
1444 #ifdef MAC_FACE
1445             if (i%((**g_pm_hndl).linespercheck) == 0)
1446             {   ProcessEvents (&g_proc);
1447                 if (g_proc != true)
1448                 {   ao_free_arrays();
1449                     longjmp (g_fallback, 1);
1450                 }
1451             }
1452 #endif
1453             if (optabbrevs_trace_setting >= 2) {
1454                 printf("Pass %d, %4ld/%ld '%s' (%ld occurrences) ",
1455                     pass_no, (long int) i, (long int) no_occs, tlbtab[i].text,
1456                     (long int) tlbtab[i].occurrences);
1457             }
1458             TIMEVALUE_NOW(&t1);
1459             for (j=0; j<tlbtab[i].occurrences; j++)
1460             {   for (j2=0; j2<tlbtab[i].occurrences; j2++) grandflags[j2]=1;
1461                 nl=2; noflags=tlbtab[i].occurrences;
1462                 while ((noflags>=2)&&(nl<MAX_ABBREV_LENGTH-1))
1463                 {   nl++;
1464                     for (j2=0; j2<nl; j2++)
1465                         if (opttext[grandtable[tlbtab[i].intab+j]+j2]=='\n')
1466                             goto FinishEarly;
1467                     matches=0;
1468                     for (j2=j; j2<tlbtab[i].occurrences; j2++)
1469                     {   if (grandflags[j2]==1)
1470                         {   x=grandtable[tlbtab[i].intab+j2]
1471                               - grandtable[tlbtab[i].intab+j];
1472                          if (((x>-nl)&&(x<nl))
1473                             || (memcmp(opttext+grandtable[tlbtab[i].intab+j],
1474                                        opttext+grandtable[tlbtab[i].intab+j2],
1475                                        nl)!=0))
1476                             {   grandflags[j2]=0; noflags--; }
1477                             else matches++;
1478                         }
1479                     }
1480                     scrabble=0;
1481                     for (k=0; k<nl; k++)
1482                     {   scrabble++;
1483                         c=opttext[grandtable[tlbtab[i].intab+j+k]];
1484                         if (c!=(int) ' ')
1485                         {   if (iso_to_alphabet_grid[c]<0)
1486                                 scrabble+=2;
1487                             else
1488                                 if (iso_to_alphabet_grid[c]>=26)
1489                                     scrabble++;
1490                         }
1491                     }
1492                     score=(matches-1)*(scrabble-2);
1493                     min=score;
1494                     for (j2=0; j2<MAX_BESTYET; j2++)
1495                     {   if ((nl==bestyet[j2].length)
1496                                 && (memcmp(opttext+bestyet[j2].location,
1497                                        opttext+grandtable[tlbtab[i].intab+j],
1498                                        nl)==0))
1499                         {   j2=MAX_BESTYET; min=score; }
1500                         else
1501                         {   if (bestyet[j2].score<min)
1502                             {   min=bestyet[j2].score; minat=j2;
1503                             }
1504                         }
1505                     }
1506                     if (min!=score)
1507                     {   bestyet[minat].score=score;
1508                         bestyet[minat].length=nl;
1509                         bestyet[minat].location=grandtable[tlbtab[i].intab+j];
1510                         bestyet[minat].popularity=matches;
1511                     }
1512                 }
1513                 FinishEarly: ;
1514             }
1515             if (optabbrevs_trace_setting >= 2) {
1516                 TIMEVALUE_NOW(&t2);
1517                 duration = TIMEVALUE_DIFFERENCE(&t1, &t2);
1518                 printf(" (%.4f seconds)\n", duration);
1519             }
1520         }
1521     }
1522 }
1523
1524 static int any_overlap(char *s1, char *s2)
1525 {   int a, b, i, j, flag;
1526     a=strlen(s1); b=strlen(s2);
1527     for (i=1-b; i<a; i++)
1528     {   flag=0;
1529         for (j=0; j<b; j++)
1530             if ((0<=i+j)&&(i+j<=a-1))
1531                 if (s1[i+j]!=s2[j]) flag=1;
1532         if (flag==0) return(1);
1533     }
1534     return(0);
1535 }
1536
1537 extern void optimise_abbreviations(void)
1538 {   int32 i, j, tcount, max=0, MAX_GTABLE;
1539     int32 j2, selected, available, maxat=0, nl;
1540
1541     if (opttext == NULL)
1542         return;
1543
1544     /* We insist that the first two abbreviations will be ". " and ", ". */
1545     if (MAX_ABBREVS < 2)
1546         return;
1547
1548     /* Note that it's safe to access opttext[opttextlen+2]. There are
1549        two newlines and a null beyond opttextlen. */
1550
1551     printf("Beginning calculation of optimal abbreviations...\n");
1552
1553     pass_no = 0;
1554
1555     initialise_memory_list(&tlbtab_memlist,
1556         sizeof(tlb), 1000, (void**)&tlbtab,
1557         "three-letter-blocks buffer");
1558
1559     no_occs=0;
1560
1561     /* Not sure what the optimal size is for MAX_BESTYET. The original code always created 64 abbreviations and used MAX_BESTYET=256. I'm guessing that 4*MAX_ABBREVS is reasonable. */
1562     MAX_BESTYET = 4 * MAX_ABBREVS;
1563
1564     bestyet=my_calloc(sizeof(optab), MAX_BESTYET, "bestyet");
1565     bestyet2=my_calloc(sizeof(optab), MAX_ABBREVS, "bestyet2");
1566
1567     bestyet2[0].text[0]='.';
1568     bestyet2[0].text[1]=' ';
1569     bestyet2[0].text[2]=0;
1570
1571     bestyet2[1].text[0]=',';
1572     bestyet2[1].text[1]=' ';
1573     bestyet2[1].text[2]=0;
1574
1575     selected=2;
1576
1577     for (i=0; i<opttextlen; i++)
1578     {
1579         if ((opttext[i]=='.') && (opttext[i+1]==' ') && (opttext[i+2]==' '))
1580         {   opttext[i]='\n'; opttext[i+1]='\n'; opttext[i+2]='\n';
1581             bestyet2[0].popularity++;
1582         }
1583
1584         if ((opttext[i]=='.') && (opttext[i+1]==' '))
1585         {   opttext[i]='\n'; opttext[i+1]='\n';
1586             bestyet2[0].popularity++;
1587         }
1588
1589         if ((opttext[i]==',') && (opttext[i+1]==' '))
1590         {   opttext[i]='\n'; opttext[i+1]='\n';
1591             bestyet2[1].popularity++;
1592         }
1593     }
1594
1595     MAX_GTABLE=opttextlen+1;
1596     grandtable=my_calloc(4*sizeof(int32), MAX_GTABLE/4, "grandtable");
1597
1598     for (i=0, tcount=0; i<opttextlen; i++)
1599     {
1600         tlb test;
1601         test.text[0]=opttext[i];
1602         test.text[1]=opttext[i+1];
1603         test.text[2]=opttext[i+2];
1604         test.text[3]=0;
1605         if ((test.text[0]=='\n')||(test.text[1]=='\n')||(test.text[2]=='\n'))
1606             goto DontKeep;
1607         for (j=0; j<no_occs; j++) {
1608             if (strcmp(test.text,tlbtab[j].text)==0)
1609                 goto DontKeep;
1610         }
1611         test.occurrences=0;
1612         test.intab=0;
1613         for (j=i+3; j<opttextlen; j++)
1614         {
1615 #ifdef MAC_FACE
1616             if (j%((**g_pm_hndl).linespercheck) == 0)
1617             {   ProcessEvents (&g_proc);
1618                 if (g_proc != true)
1619                 {   ao_free_arrays();
1620                     longjmp (g_fallback, 1);
1621                 }
1622             }
1623 #endif
1624             if ((opttext[i]==opttext[j])
1625                  && (opttext[i+1]==opttext[j+1])
1626                  && (opttext[i+2]==opttext[j+2]))
1627                  {   grandtable[tcount+test.occurrences]=j;
1628                      test.occurrences++;
1629                      if (tcount+test.occurrences==MAX_GTABLE)
1630                      {   printf("All %ld cross-references used\n",
1631                              (long int) MAX_GTABLE);
1632                          goto Built;
1633                      }
1634                  }
1635         }
1636         if (test.occurrences>=2)
1637         {
1638             ensure_memory_list_available(&tlbtab_memlist, no_occs+1);
1639             tlbtab[no_occs]=test;
1640             tlbtab[no_occs].intab=tcount;
1641             tcount += tlbtab[no_occs].occurrences;
1642             if (max<tlbtab[no_occs].occurrences)
1643                 max=tlbtab[no_occs].occurrences;
1644             no_occs++;
1645         }
1646         DontKeep: ;
1647     }
1648
1649     Built:
1650     grandflags=my_calloc(sizeof(int), max, "grandflags");
1651
1652
1653     if (optabbrevs_trace_setting >= 1) {
1654         printf("Cross-reference table (%ld entries) built...\n",
1655             (long int) no_occs);
1656     }
1657     /*  for (i=0; i<no_occs; i++)
1658             printf("%4d %4d '%s' %d\n",i,tlbtab[i].intab,tlbtab[i].text,
1659                 tlbtab[i].occurrences);
1660     */
1661
1662     for (i=0; i<MAX_ABBREVS; i++) bestyet2[i].length=0;
1663     available=MAX_BESTYET;
1664     while ((available>0)&&(selected<MAX_ABBREVS))
1665     {
1666         pass_no++;
1667         if (optabbrevs_trace_setting >= 1) {
1668             printf("Pass %d\n", pass_no);
1669         }
1670
1671         optimise_pass();
1672         available=0;
1673         for (i=0; i<MAX_BESTYET; i++)
1674             if (bestyet[i].score!=0)
1675             {   available++;
1676                 nl=bestyet[i].length;
1677                 for (j2=0; j2<nl; j2++) bestyet[i].text[j2]=
1678                     opttext[bestyet[i].location+j2];
1679                 bestyet[i].text[nl]=0;
1680             }
1681
1682     /*  printf("End of pass results:\n");
1683         printf("\nno   score  freq   string\n");
1684         for (i=0; i<MAX_BESTYET; i++)
1685             if (bestyet[i].score>0)
1686                 printf("%02d:  %4d   %4d   '%s'\n", i, bestyet[i].score,
1687                     bestyet[i].popularity, bestyet[i].text);
1688     */
1689
1690         do
1691         {   max=0;
1692             for (i=0; i<MAX_BESTYET; i++)
1693                 if (max<bestyet[i].score)
1694                 {   max=bestyet[i].score;
1695                     maxat=i;
1696                 }
1697
1698             if (max>0)
1699             {
1700                 char testtext[4];
1701                 bestyet2[selected++]=bestyet[maxat];
1702
1703                 if (optabbrevs_trace_setting >= 1) {
1704                     printf(
1705                         "Selection %2ld: '%s' (repeated %ld times, scoring %ld)\n",
1706                         (long int) selected,bestyet[maxat].text,
1707                         (long int) bestyet[maxat].popularity,
1708                         (long int) bestyet[maxat].score);
1709                 }
1710
1711                 testtext[0]=bestyet[maxat].text[0];
1712                 testtext[1]=bestyet[maxat].text[1];
1713                 testtext[2]=bestyet[maxat].text[2];
1714                 testtext[3]=0;
1715
1716                 for (i=0; i<no_occs; i++)
1717                     if (strcmp(testtext,tlbtab[i].text)==0)
1718                         break;
1719
1720                 for (j=0; j<tlbtab[i].occurrences; j++)
1721                 {   if (memcmp(bestyet[maxat].text,
1722                                opttext+grandtable[tlbtab[i].intab+j],
1723                                bestyet[maxat].length)==0)
1724                     {   for (j2=0; j2<bestyet[maxat].length; j2++)
1725                             opttext[grandtable[tlbtab[i].intab+j]+j2]='\n';
1726                     }
1727                 }
1728
1729                 for (i=0; i<MAX_BESTYET; i++)
1730                     if ((bestyet[i].score>0)&&
1731                         (any_overlap(bestyet[maxat].text,bestyet[i].text)==1))
1732                     {   bestyet[i].score=0;
1733                        /* printf("Discarding '%s' as overlapping\n",
1734                             bestyet[i].text); */
1735                     }
1736             }
1737         } while ((max>0)&&(available>0)&&(selected<MAX_ABBREVS));
1738     }
1739
1740     printf("\nChosen abbreviations (in Inform syntax):\n\n");
1741     for (i=0; i<selected; i++)
1742         printf("Abbreviate \"%s\";\n", bestyet2[i].text);
1743
1744     text_free_arrays();
1745 }
1746
1747 /* ------------------------------------------------------------------------- */
1748 /*   The dictionary manager begins here.                                     */
1749 /*                                                                           */
1750 /*   Speed is extremely important in these algorithms.  If a linear-time     */
1751 /*   routine were used to search the dictionary words so far built up, then  */
1752 /*   Inform would crawl.                                                     */
1753 /*                                                                           */
1754 /*   Instead, the dictionary is stored as a binary tree, which is kept       */
1755 /*   balanced with the red-black algorithm.                                  */
1756 /* ------------------------------------------------------------------------- */
1757 /*   A dictionary table similar to the Z-machine format is kept: there is a  */
1758 /*   7-byte header (left blank here to be filled in at the                   */
1759 /*   construct_storyfile() stage in "tables.c") and then a sequence of       */
1760 /*   records, one per word, in the form                                      */
1761 /*                                                                           */
1762 /*        <Z-coded text>    <flags>   <verbnumber>     <adjectivenumber>     */
1763 /*        4 or 6 bytes       byte        byte             byte               */
1764 /*                                                                           */
1765 /*   For Glulx, the form is instead: (See below about Unicode-valued         */
1766 /*   dictionaries and DICT_WORD_BYTES.)                                      */
1767 /*                                                                           */
1768 /*        <tag>  <plain text>    <flags>  <verbnumber>   <adjectivenumber>   */
1769 /*         $60    DICT_WORD_BYTES short    short          short              */
1770 /*                                                                           */
1771 /*   These records are stored in "accession order" (i.e. in order of their   */
1772 /*   first being received by these routines) and only alphabetically sorted  */
1773 /*   by construct_storyfile() (using the array below).                       */
1774 /* ------------------------------------------------------------------------- */
1775 /*                                                                           */
1776 /*   Further notes about the data fields...                                  */
1777 /*   The flags are currently:                                                */
1778 /*     bit 0: word is used as a verb (in verb grammar)                       */
1779 /*     bit 1: word is used as a meta verb                                    */
1780 /*     bit 2: word is plural (set by '//p')                                  */
1781 /*     bit 3: word is used as a preposition (in verb grammar)                */
1782 /*     bit 6: set for all verbs, but not used by the parser?                 */
1783 /*     bit 7: word is used as a noun (set for every word that appears in     */
1784 /*       code or in an object property)                                      */
1785 /*                                                                           */
1786 /*   In grammar version 2, the third field (adjectivenumber) is unused (and  */
1787 /*   zero).                                                                  */
1788 /*                                                                           */
1789 /*   The compiler generates special constants #dict_par1, #dict_par2,        */
1790 /*   #dict_par3 to refer to the byte offsets of the three fields. In         */
1791 /*   Z-code v3, these are 4/5/6; in v4+, they are 6/7/8. In Glulx, they      */
1792 /*   are $DICT_WORD_SIZE+2/4/6, referring to the *low* bytes of the three    */
1793 /*   fields. (The high bytes are $DICT_WORD_SIZE+1/3/5.)                     */
1794 /* ------------------------------------------------------------------------- */
1795
1796 uchar *dictionary;                    /* (These two variables are externally
1797                                          used only in "tables.c" when
1798                                          building the story-file)            */
1799 static memory_list dictionary_memlist;
1800 int32 dictionary_top;                 /* Position of the next free record
1801                                          in dictionary (i.e., the current
1802                                          number of bytes)                    */
1803
1804 int dict_entries;                     /* Total number of records entered     */
1805
1806 /* ------------------------------------------------------------------------- */
1807 /*   dict_word was originally a typedef for a struct of 6 unsigned chars.    */
1808 /*   It held the (4 or) 6 bytes of Z-coded text of a word.                   */
1809 /*   Usefully, because the PAD character 5 is < all alphabetic characters,   */
1810 /*   alphabetic order corresponds to numeric order.  For this reason, the    */
1811 /*   dict_word is called the "sort code" of the original text word.          */
1812 /*                                                                           */
1813 /*   In modifying the compiler for Glulx, I found it easier to discard the   */
1814 /*   typedef, and operate directly on uchar arrays of length DICT_WORD_SIZE. */
1815 /*   In Z-code, DICT_WORD_SIZE will be 6, so the Z-code compiler will work   */
1816 /*   as before. In Glulx, it can be any value up to MAX_DICT_WORD_SIZE.      */
1817 /*   (That limit is defined as 40 in the header; it exists only for a few    */
1818 /*   static buffers, and can be increased without using significant memory.) */
1819 /*                                                                           */
1820 /*   ...Well, that certainly bit me on the butt, didn't it. In further       */
1821 /*   modifying the compiler to generate a Unicode dictionary, I have to      */
1822 /*   store four-byte values in the uchar array. This is handled by making    */
1823 /*   the array size DICT_WORD_BYTES (which is DICT_WORD_SIZE*DICT_CHAR_SIZE).*/
1824 /*   Then we store the 32-bit character value big-endian. This lets us       */
1825 /*   continue to compare arrays bytewise, which is a nice simplification.    */
1826 /* ------------------------------------------------------------------------- */
1827
1828 extern int compare_sorts(uchar *d1, uchar *d2)
1829 {   int i;
1830     for (i=0; i<DICT_WORD_BYTES; i++)
1831         if (d1[i]!=d2[i]) return((int)(d1[i]) - (int)(d2[i]));
1832     /* (since memcmp(d1, d2, DICT_WORD_BYTES); runs into a bug on some Unix
1833        libraries) */
1834     return(0);
1835 }
1836
1837 extern void copy_sorts(uchar *d1, uchar *d2)
1838 {   int i;
1839     for (i=0; i<DICT_WORD_BYTES; i++)
1840         d1[i] = d2[i];
1841 }
1842
1843 static uchar prepared_sort[MAX_DICT_WORD_BYTES];     /* Holds the sort code
1844                                                         of current word */
1845
1846 static int number_and_case;
1847
1848 /* Also used by verbs.c */
1849 static void dictionary_prepare_z(char *dword, uchar *optresult)
1850 {   int i, j, k, k2, wd[13]; int32 tot;
1851
1852     /* A rapid text translation algorithm using only the simplified rules
1853        applying to the text of dictionary entries: first produce a sequence
1854        of 6 (v3) or 9 (v4+) Z-characters                                     */
1855
1856     int dictsize = (version_number==3) ? 6 : 9;
1857
1858     number_and_case = 0;
1859
1860     for (i=0, j=0; dword[j]!=0; i++, j++)
1861     {   if ((dword[j] == '/') && (dword[j+1] == '/'))
1862         {   for (j+=2; dword[j] != 0; j++)
1863             {   switch(dword[j])
1864                 {   case 'p': number_and_case |= 4;  break;
1865                     default:
1866                         error_named("Expected 'p' after '//' \
1867 to give number of dictionary word", dword);
1868                         break;
1869                 }
1870             }
1871             break;
1872         }
1873         if (i>=dictsize) break;
1874
1875         k=(int) dword[j];
1876         if (k==(int) '\'')
1877             warning_named("Obsolete usage: use the ^ character for the \
1878 apostrophe in", dword);
1879         if (k==(int) '^') k=(int) '\'';
1880         if (k=='\"') k='~';
1881
1882         if (k==(int) '@' || (character_set_unicode && (k & 0x80)))
1883         {   int unicode = text_to_unicode(dword+j);
1884             if ((unicode < 128) && isupper(unicode)) unicode = tolower(unicode);
1885             k = unicode_to_zscii(unicode);
1886             j += textual_form_length - 1;
1887             if ((k == 5) || (k >= 0x100))
1888             {   unicode_char_error(
1889                    "Character can be printed but not input:", unicode);
1890                 k = '?';
1891             }
1892             k2 = zscii_to_alphabet_grid[(uchar) k];
1893         }
1894         else
1895         {   if (isupper(k)) k = tolower(k);
1896             k2 = iso_to_alphabet_grid[(uchar) k];
1897         }
1898
1899         if (k2 < 0)
1900         {   if ((k2 == -5) || (k2 <= -0x100))
1901                 char_error("Character can be printed but not input:", k);
1902             else
1903             {   /* Use 4 more Z-chars to encode a ZSCII escape sequence      */
1904
1905                 wd[i++] = 5; wd[i++] = 6;
1906                 k2 = -k2;
1907                 wd[i++] = k2/32; wd[i] = k2%32;
1908             }
1909         }
1910         else
1911         {   alphabet_used[k2] = 'Y';
1912             if ((k2/26)!=0)
1913                 wd[i++]=3+(k2/26);            /* Change alphabet for symbols */
1914             wd[i]=6+(k2%26);                  /* Write the Z character       */
1915         }
1916     }
1917
1918     /* Fill up to the end of the dictionary block with PAD characters        */
1919
1920     for (; i<9; i++) wd[i]=5;
1921
1922     /* The array of Z-chars is converted to two or three 2-byte blocks       */
1923
1924     tot = wd[2] + wd[1]*(1<<5) + wd[0]*(1<<10);
1925     prepared_sort[1]=tot%0x100;
1926     prepared_sort[0]=(tot/0x100)%0x100;
1927     tot = wd[5] + wd[4]*(1<<5) + wd[3]*(1<<10);
1928     prepared_sort[3]=tot%0x100;
1929     prepared_sort[2]=(tot/0x100)%0x100;
1930     if (version_number==3)
1931         tot = 0;
1932     else
1933         tot = wd[8] + wd[7]*(1<<5) + wd[6]*(1<<10);
1934     prepared_sort[5]=tot%0x100;
1935     prepared_sort[4]=(tot/0x100)%0x100;
1936
1937     /* Set the "end bit" on the 2nd (in v3) or the 3rd (v4+) 2-byte block    */
1938
1939     if (version_number==3) prepared_sort[2]+=0x80;
1940                       else prepared_sort[4]+=0x80;
1941
1942     if (optresult) copy_sorts(optresult, prepared_sort);
1943 }
1944
1945 /* Also used by verbs.c */
1946 static void dictionary_prepare_g(char *dword, uchar *optresult)
1947 {
1948   int i, j, k;
1949   int32 unicode;
1950
1951   number_and_case = 0;
1952
1953   for (i=0, j=0; (dword[j]!=0); i++, j++) {
1954     if ((dword[j] == '/') && (dword[j+1] == '/')) {
1955       for (j+=2; dword[j] != 0; j++) {
1956         switch(dword[j]) {
1957         case 'p':
1958           number_and_case |= 4;
1959           break;
1960         default:
1961           error_named("Expected 'p' after '//' \
1962 to give gender or number of dictionary word", dword);
1963           break;
1964         }
1965       }
1966       break;
1967     }
1968     if (i>=DICT_WORD_SIZE) break;
1969
1970     k= ((unsigned char *)dword)[j];
1971     if (k=='\'')
1972       warning_named("Obsolete usage: use the ^ character for the \
1973 apostrophe in", dword);
1974     if (k=='^')
1975       k='\'';
1976     if (k=='~') /* as in iso_to_alphabet_grid */
1977       k='\"';
1978
1979     if (k=='@' || (character_set_unicode && (k & 0x80))) {
1980       unicode = text_to_unicode(dword+j);
1981       j += textual_form_length - 1;
1982     }
1983     else {
1984       unicode = iso_to_unicode_grid[k];
1985     }
1986
1987     if (DICT_CHAR_SIZE != 1 || (unicode >= 0 && unicode < 256)) {
1988       k = unicode;
1989     }
1990     else {
1991       error("The dictionary cannot contain Unicode characters beyond Latin-1. \
1992 Define DICT_CHAR_SIZE=4 for a Unicode-compatible dictionary.");
1993       k = '?';
1994     }
1995
1996     if (k >= (unsigned)'A' && k <= (unsigned)'Z')
1997       k += ('a' - 'A');
1998
1999     if (DICT_CHAR_SIZE == 1) {
2000       prepared_sort[i] = k;
2001     }
2002     else {
2003       prepared_sort[4*i]   = (k >> 24) & 0xFF;
2004       prepared_sort[4*i+1] = (k >> 16) & 0xFF;
2005       prepared_sort[4*i+2] = (k >>  8) & 0xFF;
2006       prepared_sort[4*i+3] = (k)       & 0xFF;
2007     }
2008   }
2009
2010   if (DICT_CHAR_SIZE == 1) {
2011     for (; i<DICT_WORD_SIZE; i++)
2012       prepared_sort[i] = 0;
2013   }
2014   else {
2015     for (; i<DICT_WORD_SIZE; i++) {
2016       prepared_sort[4*i]   = 0;
2017       prepared_sort[4*i+1] = 0;
2018       prepared_sort[4*i+2] = 0;
2019       prepared_sort[4*i+3] = 0;
2020     }
2021   }
2022
2023   if (optresult) copy_sorts(optresult, prepared_sort);
2024 }
2025
2026 extern void dictionary_prepare(char *dword, uchar *optresult)
2027 {
2028   if (!glulx_mode)
2029     dictionary_prepare_z(dword, optresult);
2030   else
2031     dictionary_prepare_g(dword, optresult);
2032 }
2033
2034 /* ------------------------------------------------------------------------- */
2035 /*   The arrays below are all concerned with the problem of alphabetically   */
2036 /*   sorting the dictionary during the compilation pass.                     */
2037 /*   Note that it is not enough simply to apply qsort to the dictionary at   */
2038 /*   the end of the pass: we need to ensure that no duplicates are ever      */
2039 /*   created.                                                                */
2040 /*                                                                           */
2041 /*   dict_sort_codes[n]     the sort code of record n: i.e., of the nth      */
2042 /*                          word to be entered into the dictionary, where    */
2043 /*                          n counts upward from 0                           */
2044 /*                          (n is also called the "accession number")        */
2045 /*                                                                           */
2046 /*   The tree structure encodes an ordering.  The special value VACANT means */
2047 /*   "no node here": otherwise, node numbers are the same as accession       */
2048 /*   numbers.  At all times, "root" holds the node number of the top of the  */
2049 /*   tree; each node has up to two branches, such that the subtree of the    */
2050 /*   left branch is always alphabetically before what's at the node, and     */
2051 /*   the subtree to the right is always after; and all branches are coloured */
2052 /*   either "black" or "red".  These colours are used to detect points where */
2053 /*   the tree is growing asymmetrically (and therefore becoming inefficient  */
2054 /*   to search).                                                             */
2055 /* ------------------------------------------------------------------------- */
2056
2057 #define RED    'r'
2058 #define BLACK  'b'
2059 #define VACANT -1
2060
2061 static int root;
2062 typedef struct dict_tree_node_s
2063 {   int  branch[2];               /* Branch 0 is "left", 1 is "right" */
2064     char colour;                  /* The colour of the branch to the parent */
2065 } dict_tree_node;
2066
2067 static dict_tree_node *dtree;     /* Allocated to dict_entries */
2068 static memory_list dtree_memlist;
2069
2070 static uchar *dict_sort_codes;  /* Allocated to dict_entries*DICT_WORD_BYTES */
2071 static memory_list dict_sort_codes_memlist;
2072
2073 int   *final_dict_order;          /* Allocated at sort_dictionary() time */
2074
2075 static void dictionary_begin_pass(void)
2076 {
2077     /*  Leave room for the 7-byte header (added in "tables.c" much later)    */
2078     /*  Glulx has a 4-byte header instead. */
2079
2080     if (!glulx_mode)
2081         dictionary_top = 7;
2082     else
2083         dictionary_top = 4;
2084
2085     ensure_memory_list_available(&dictionary_memlist, dictionary_top);
2086
2087     root = VACANT;
2088     dict_entries = 0;
2089 }
2090
2091 static int fdo_count;
2092 static void recursively_sort(int node)
2093 {   if (dtree[node].branch[0] != VACANT)
2094         recursively_sort(dtree[node].branch[0]);
2095     final_dict_order[node] = fdo_count++;
2096     if (dtree[node].branch[1] != VACANT)
2097         recursively_sort(dtree[node].branch[1]);
2098 }
2099
2100 extern void sort_dictionary(void)
2101 {
2102     final_dict_order = my_calloc(sizeof(int), dict_entries, "final dictionary ordering table");
2103
2104     if (root != VACANT)
2105     {   fdo_count = 0; recursively_sort(root);
2106     }
2107 }
2108
2109 /* ------------------------------------------------------------------------- */
2110 /*   If "dword" is in the dictionary, return its accession number plus 1;    */
2111 /*   If not, return 0.                                                       */
2112 /* ------------------------------------------------------------------------- */
2113
2114 static int dictionary_find(char *dword)
2115 {   int at = root, n;
2116
2117     dictionary_prepare(dword, NULL);
2118
2119     while (at != VACANT)
2120     {   n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
2121         if (n==0) return at + 1;
2122         if (n>0) at = dtree[at].branch[1]; else at = dtree[at].branch[0];
2123     }
2124     return 0;
2125 }
2126
2127 /* ------------------------------------------------------------------------- */
2128 /*  Add "dword" to the dictionary with (x,y,z) as its data fields; unless    */
2129 /*  it already exists, in which case OR the data with (x,y,z)                */
2130 /*                                                                           */
2131 /*  These fields are one byte each in Z-code, two bytes each in Glulx.       */
2132 /*                                                                           */
2133 /*  Returns: the accession number.                                           */
2134 /* ------------------------------------------------------------------------- */
2135
2136 extern int dictionary_add(char *dword, int x, int y, int z)
2137 {   int n; uchar *p;
2138     int ggfr = 0, gfr = 0, fr = 0, r = 0;
2139     int ggf = VACANT, gf = VACANT, f = VACANT, at = root;
2140     int a, b;
2141     int res=((version_number==3)?4:6);
2142
2143     dictionary_prepare(dword, NULL);
2144
2145     if (root == VACANT)
2146     {   root = 0; goto CreateEntry;
2147     }
2148     while (TRUE)
2149     {
2150         n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
2151         if (n==0)
2152         {
2153             if (!glulx_mode) {
2154                 p = dictionary+7 + at*DICT_ENTRY_BYTE_LENGTH + res;
2155                 p[0]=(p[0])|x; p[1]=(p[1])|y;
2156                 if (!ZCODE_LESS_DICT_DATA)
2157                     p[2]=(p[2])|z;
2158                 if (x & 128) p[0] = (p[0])|number_and_case;
2159             }
2160             else {
2161                 p = dictionary+4 + at*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
2162                 p[0]=(p[0])|(x/256); p[1]=(p[1])|(x%256);
2163                 p[2]=(p[2])|(y/256); p[3]=(p[3])|(y%256);
2164                 p[4]=(p[4])|(z/256); p[5]=(p[5])|(z%256);
2165                 if (x & 128) p[1] = (p[1]) | number_and_case;
2166             }
2167             return at;
2168         }
2169         if (n>0) r=1; else r=0;
2170
2171         a = dtree[at].branch[0]; b = dtree[at].branch[1];
2172         if ((a != VACANT) && (dtree[a].colour == RED) &&
2173             (b != VACANT) && (dtree[b].colour == RED))
2174         {   dtree[a].colour = BLACK;
2175             dtree[b].colour = BLACK;
2176
2177             dtree[at].colour = RED;
2178
2179         /* A tree rotation may be needed to avoid two red links in a row:
2180            e.g.
2181              ggf   (or else gf is root)         ggf (or f is root)
2182               |                                  |
2183               gf                                 f
2184              / \(red)                           / \ (both red)
2185                 f            becomes          gf   at
2186                / \(red)                      /  \ /  \
2187                   at
2188                  /  \
2189
2190            In effect we rehang the "gf" subtree from "f".
2191            See the Technical Manual for further details.
2192         */
2193
2194             if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
2195             {
2196               if (fr == gfr)
2197               { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
2198                 dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
2199                 dtree[f].branch[1-fr] = gf;
2200                 dtree[f].colour = BLACK;
2201                 dtree[gf].colour = RED;
2202                 gf = ggf; gfr = ggfr;
2203               }
2204               else
2205               { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
2206                 dtree[at].colour = BLACK;
2207                 dtree[gf].colour = RED;
2208                 dtree[f].branch[fr] = dtree[at].branch[gfr];
2209                 dtree[gf].branch[gfr] = dtree[at].branch[fr];
2210                 dtree[at].branch[gfr] = f;
2211                 dtree[at].branch[fr] = gf;
2212
2213                 r = 1-r; n = at; if (r==fr) at = f; else at = gf;
2214                 f = n; gf = ggf; fr = 1-r; gfr = ggfr;
2215               }
2216             }
2217         }
2218
2219         if (dtree[at].branch[r] == VACANT)
2220         {   dtree[at].colour = RED;
2221
2222             if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
2223             { if (fr == gfr)
2224               { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
2225                 dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
2226                 dtree[f].branch[1-fr] = gf;
2227                 dtree[f].colour = BLACK;
2228                 dtree[gf].colour = RED;
2229               }
2230               else
2231               { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
2232                 dtree[at].colour = BLACK;
2233                 dtree[gf].colour = RED;
2234                 dtree[f].branch[fr] = dtree[at].branch[gfr];
2235                 dtree[gf].branch[gfr] = dtree[at].branch[fr];
2236                 dtree[at].branch[gfr] = f;
2237                 dtree[at].branch[fr] = gf;
2238
2239                 r = 1-r; n = at; if (r==fr) at = f; else at = gf;
2240                 f = n; gf = ggf;
2241               }
2242             }
2243             dtree[at].branch[r] = dict_entries;
2244             goto CreateEntry;
2245         }
2246         ggf = gf; gf = f; f = at; at = dtree[at].branch[r];
2247         ggfr = gfr; gfr = fr; fr = r;
2248     }
2249
2250     CreateEntry:
2251
2252     ensure_memory_list_available(&dtree_memlist, dict_entries+1);
2253     ensure_memory_list_available(&dict_sort_codes_memlist, (dict_entries+1)*DICT_WORD_BYTES);
2254
2255     dtree[dict_entries].branch[0] = VACANT;
2256     dtree[dict_entries].branch[1] = VACANT;
2257     dtree[dict_entries].colour    = BLACK;
2258
2259     /*  Address in Inform's own dictionary table to write the record to      */
2260
2261     if (!glulx_mode) {
2262
2263         ensure_memory_list_available(&dictionary_memlist, dictionary_top + DICT_ENTRY_BYTE_LENGTH);
2264         p = dictionary + DICT_ENTRY_BYTE_LENGTH*dict_entries + 7;
2265
2266         /*  So copy in the 4 (or 6) bytes of Z-coded text and the 3 data
2267             bytes */
2268
2269         p[0]=prepared_sort[0]; p[1]=prepared_sort[1];
2270         p[2]=prepared_sort[2]; p[3]=prepared_sort[3];
2271         if (version_number > 3)
2272           {   p[4]=prepared_sort[4]; p[5]=prepared_sort[5]; }
2273         p[res]=x; p[res+1]=y;
2274         if (!ZCODE_LESS_DICT_DATA) p[res+2]=z;
2275         if (x & 128) p[res] = (p[res])|number_and_case;
2276
2277         dictionary_top += DICT_ENTRY_BYTE_LENGTH;
2278
2279     }
2280     else {
2281         int i;
2282         ensure_memory_list_available(&dictionary_memlist, dictionary_top + DICT_ENTRY_BYTE_LENGTH);
2283         p = dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*dict_entries;
2284         p[0] = 0x60; /* type byte -- dict word */
2285
2286         p += DICT_CHAR_SIZE;
2287         for (i=0; i<DICT_WORD_BYTES; i++)
2288           p[i] = prepared_sort[i];
2289
2290         p += DICT_WORD_BYTES;
2291         p[0] = 0; p[1] = x;
2292         p[2] = y/256; p[3] = y%256;
2293         p[4] = 0; p[5] = z;
2294         if (x & 128)
2295           p[1] |= number_and_case;
2296
2297         dictionary_top += DICT_ENTRY_BYTE_LENGTH;
2298
2299     }
2300
2301     copy_sorts(dict_sort_codes+dict_entries*DICT_WORD_BYTES, prepared_sort);
2302
2303     return dict_entries++;
2304 }
2305
2306 /* ------------------------------------------------------------------------- */
2307 /*   Used in "tables.c" for "Extend ... only", to renumber a verb-word to a  */
2308 /*   new verb syntax of its own.  (Otherwise existing verb-words never       */
2309 /*   change their verb-numbers.)                                             */
2310 /* ------------------------------------------------------------------------- */
2311
2312 extern void dictionary_set_verb_number(char *dword, int to)
2313 {   int i; uchar *p;
2314     int res=((version_number==3)?4:6);
2315     i=dictionary_find(dword);
2316     if (i!=0)
2317     {
2318         if (!glulx_mode) {
2319             p=dictionary+7+(i-1)*DICT_ENTRY_BYTE_LENGTH+res;
2320             p[1]=to;
2321         }
2322         else {
2323             p=dictionary+4 + (i-1)*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
2324             p[2]=to/256; p[3]=to%256;
2325         }
2326     }
2327 }
2328
2329 /* ------------------------------------------------------------------------- */
2330 /*   Tracing code for the dictionary: used by "trace" and text               */
2331 /*   transcription.                                                          */
2332 /* ------------------------------------------------------------------------- */
2333
2334 /* In the dictionary-showing code, if d_show_buf is NULL, the text is
2335    printed directly. (The "Trace dictionary" directive does this.)
2336    If d_show_buf is not NULL, we add words to it (reallocing if necessary)
2337    until it's a page-width.
2338 */
2339 static char *d_show_buf = NULL;
2340 static int d_show_size; /* allocated size */
2341 static int d_show_len;  /* current length */
2342
2343 static void show_char(char c)
2344 {
2345     if (d_show_buf == NULL) {
2346         printf("%c", c);
2347     }
2348     else {
2349         if (d_show_len+2 >= d_show_size) {
2350             int newsize = 2 * d_show_len + 16;
2351             my_realloc(&d_show_buf, d_show_size, newsize, "dictionary display buffer");
2352             d_show_size = newsize;
2353         }
2354         d_show_buf[d_show_len++] = c;
2355         d_show_buf[d_show_len] = '\0';
2356     }
2357 }
2358
2359 /* Display a Unicode character in user-readable form. This uses the same
2360    character encoding as the source code. */
2361 static void show_uchar(uint32 c)
2362 {
2363     char buf[16];
2364     int ix;
2365
2366     if (c < 0x80) {
2367         /* ASCII always works */
2368         show_char(c);
2369         return;
2370     }
2371     if (character_set_unicode) {
2372         /* UTF-8 the character */
2373         if (c < 0x80) {
2374             show_char(c);
2375         }
2376         else if (c < 0x800) {
2377             show_char((0xC0 | ((c & 0x7C0) >> 6)));
2378             show_char((0x80 |  (c & 0x03F)     ));
2379         }
2380         else if (c < 0x10000) {
2381             show_char((0xE0 | ((c & 0xF000) >> 12)));
2382             show_char((0x80 | ((c & 0x0FC0) >>  6)));
2383             show_char((0x80 |  (c & 0x003F)      ));
2384         }
2385         else if (c < 0x200000) {
2386             show_char((0xF0 | ((c & 0x1C0000) >> 18)));
2387             show_char((0x80 | ((c & 0x03F000) >> 12)));
2388             show_char((0x80 | ((c & 0x000FC0) >>  6)));
2389             show_char((0x80 |  (c & 0x00003F)      ));
2390         }
2391         else {
2392             show_char('?');
2393         }
2394         return;
2395     }
2396     if (character_set_setting == 1 && c < 0x100) {
2397         /* Fits in Latin-1 */
2398         show_char(c);
2399         return;
2400     }
2401     /* Supporting other character_set_setting is harder; not currently implemented. */
2402
2403     /* Use the escaped form */
2404     sprintf(buf, "@{%x}", c);
2405     for (ix=0; buf[ix]; ix++)
2406         show_char(buf[ix]);
2407 }
2408
2409 extern void word_to_ascii(uchar *p, char *results)
2410 {   int i, shift, cc, zchar; uchar encoded_word[9];
2411     encoded_word[0] = (((int) p[0])&0x7c)/4;
2412     encoded_word[1] = 8*(((int) p[0])&0x3) + (((int) p[1])&0xe0)/32;
2413     encoded_word[2] = ((int) p[1])&0x1f;
2414     encoded_word[3] = (((int) p[2])&0x7c)/4;
2415     encoded_word[4] = 8*(((int) p[2])&0x3) + (((int) p[3])&0xe0)/32;
2416     encoded_word[5] = ((int) p[3])&0x1f;
2417     if (version_number > 3)
2418     {   encoded_word[6] = (((int) p[4])&0x7c)/4;
2419         encoded_word[7] = 8*(((int) p[4])&0x3) + (((int) p[5])&0xe0)/32;
2420         encoded_word[8] = ((int) p[5])&0x1f;
2421     }
2422     else
2423     {
2424         encoded_word[6] = encoded_word[7] = encoded_word[8] = 0;
2425     }
2426
2427     shift = 0; cc = 0;
2428     for (i=0; i< ((version_number==3)?6:9); i++)
2429     {   zchar = encoded_word[i];
2430
2431         if (zchar == 4) shift = 1;
2432         else
2433         if (zchar == 5) shift = 2;
2434         else
2435         {   if ((shift == 2) && (zchar == 6))
2436             {   zchar = 32*encoded_word[i+1] + encoded_word[i+2];
2437                 i += 2;
2438                 if ((zchar>=32) && (zchar<=126))
2439                     results[cc++] = zchar;
2440                 else
2441                 {   zscii_to_text(results+cc, zchar);
2442                     cc = strlen(results);
2443                 }
2444             }
2445             else
2446             {   zscii_to_text(results+cc, (alphabet[shift])[zchar-6]);
2447                 cc = strlen(results);
2448             }
2449             shift = 0;
2450         }
2451     }
2452     results[cc] = 0;
2453 }
2454
2455 /* Print a dictionary word to stdout.
2456    (This assumes that d_show_buf is null.)
2457  */
2458 void print_dict_word(int node)
2459 {
2460     uchar *p;
2461     int cprinted;
2462
2463     if (!glulx_mode) {
2464         char textual_form[32];
2465         p = (uchar *)dictionary + 7 + DICT_ENTRY_BYTE_LENGTH*node;
2466
2467         word_to_ascii(p, textual_form);
2468
2469         for (cprinted = 0; textual_form[cprinted]!=0; cprinted++)
2470             show_char(textual_form[cprinted]);
2471     }
2472     else {
2473         p = (uchar *)dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*node;
2474
2475         for (cprinted = 0; cprinted<DICT_WORD_SIZE; cprinted++)
2476         {
2477             uint32 ch;
2478             if (DICT_CHAR_SIZE == 1)
2479                 ch = p[1+cprinted];
2480             else
2481                 ch = (p[4*cprinted+4] << 24) + (p[4*cprinted+5] << 16) + (p[4*cprinted+6] << 8) + (p[4*cprinted+7]);
2482             if (!ch)
2483                 break;
2484             show_uchar(ch);
2485         }
2486     }
2487 }
2488
2489 static void recursively_show_z(int node, int level)
2490 {   int i, cprinted, flags; uchar *p;
2491     char textual_form[32];
2492     int res = (version_number == 3)?4:6; /* byte length of encoded text */
2493
2494     if (dtree[node].branch[0] != VACANT)
2495         recursively_show_z(dtree[node].branch[0], level);
2496
2497     p = (uchar *)dictionary + 7 + DICT_ENTRY_BYTE_LENGTH*node;
2498
2499     word_to_ascii(p, textual_form);
2500
2501     for (cprinted = 0; textual_form[cprinted]!=0; cprinted++)
2502         show_char(textual_form[cprinted]);
2503     for (; cprinted < 4 + ((version_number==3)?6:9); cprinted++)
2504         show_char(' ');
2505
2506     /* The level-1 info can only be printfed (d_show_buf must be null). */
2507     if (d_show_buf == NULL && level >= 1)
2508     {
2509         if (level >= 2) {
2510             for (i=0; i<DICT_ENTRY_BYTE_LENGTH; i++) printf("%02x ",p[i]);
2511         }
2512
2513         flags = (int) p[res];
2514         if (flags & 128)
2515         {   printf("noun ");
2516             if (flags & 4)  printf("p"); else printf(" ");
2517             printf(" ");
2518         }
2519         else printf("       ");
2520         if (flags & 8)
2521         {   if (grammar_version_number == 1)
2522                 printf("preposition:%d  ", (int) p[res+2]);
2523             else
2524                 printf("preposition    ");
2525         }
2526         if ((flags & 3) == 3) printf("metaverb:%d  ", (int) p[res+1]);
2527         else if ((flags & 3) == 1) printf("verb:%d  ", (int) p[res+1]);
2528         printf("\n");
2529     }
2530
2531     /* Show five words per line in classic TRANSCRIPT_FORMAT; one per line in the new format. */
2532     if (d_show_buf && (d_show_len >= 64 || TRANSCRIPT_FORMAT == 1))
2533     {
2534         write_to_transcript_file(d_show_buf, STRCTX_DICT);
2535         d_show_len = 0;
2536     }
2537
2538     if (dtree[node].branch[1] != VACANT)
2539         recursively_show_z(dtree[node].branch[1], level);
2540 }
2541
2542 static void recursively_show_g(int node, int level)
2543 {   int i, cprinted;
2544     uchar *p;
2545
2546     if (dtree[node].branch[0] != VACANT)
2547         recursively_show_g(dtree[node].branch[0], level);
2548
2549     p = (uchar *)dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*node;
2550
2551     for (cprinted = 0; cprinted<DICT_WORD_SIZE; cprinted++)
2552     {
2553         uint32 ch;
2554         if (DICT_CHAR_SIZE == 1)
2555             ch = p[1+cprinted];
2556         else
2557             ch = (p[4*cprinted+4] << 24) + (p[4*cprinted+5] << 16) + (p[4*cprinted+6] << 8) + (p[4*cprinted+7]);
2558         if (!ch)
2559             break;
2560         show_uchar(ch);
2561     }
2562     for (; cprinted<DICT_WORD_SIZE+4; cprinted++)
2563         show_char(' ');
2564
2565     /* The level-1 info can only be printfed (d_show_buf must be null). */
2566     if (d_show_buf == NULL && level >= 1)
2567     {   int flagpos = (DICT_CHAR_SIZE == 1) ? (DICT_WORD_SIZE+1) : (DICT_WORD_BYTES+4);
2568         int flags = (p[flagpos+0] << 8) | (p[flagpos+1]);
2569         int verbnum = (p[flagpos+2] << 8) | (p[flagpos+3]);
2570         if (level >= 2) {
2571             for (i=0; i<DICT_ENTRY_BYTE_LENGTH; i++) printf("%02x ",p[i]);
2572         }
2573         if (flags & 128)
2574         {   printf("noun ");
2575             if (flags & 4)  printf("p"); else printf(" ");
2576             printf(" ");
2577         }
2578         else printf("       ");
2579         if (flags & 8)
2580         {   printf("preposition    ");
2581         }
2582         if ((flags & 3) == 3) printf("metaverb:%d  ", verbnum);
2583         else if ((flags & 3) == 1) printf("verb:%d  ", verbnum);
2584         printf("\n");
2585     }
2586
2587     /* Show five words per line in classic TRANSCRIPT_FORMAT; one per line in the new format. */
2588     if (d_show_buf && (d_show_len >= 64 || TRANSCRIPT_FORMAT == 1))
2589     {
2590         write_to_transcript_file(d_show_buf, STRCTX_DICT);
2591         d_show_len = 0;
2592     }
2593
2594     if (dtree[node].branch[1] != VACANT)
2595         recursively_show_g(dtree[node].branch[1], level);
2596 }
2597
2598 static void show_alphabet(int i)
2599 {   int j, c; char chartext[8];
2600
2601     for (j=0; j<26; j++)
2602     {   c = alphabet[i][j];
2603
2604         if (alphabet_used[26*i+j] == 'N') printf("("); else printf(" ");
2605
2606         zscii_to_text(chartext, c);
2607         printf("%s", chartext);
2608
2609         if (alphabet_used[26*i+j] == 'N') printf(")"); else printf(" ");
2610     }
2611     printf("\n");
2612 }
2613
2614 extern void show_dictionary(int level)
2615 {
2616     /* Level 0: show words only. Level 1: show words and flags.
2617        Level 2: also show bytes.*/
2618     printf("Dictionary contains %d entries:\n",dict_entries);
2619     if (dict_entries != 0)
2620     {   d_show_len = 0; d_show_buf = NULL;
2621         if (!glulx_mode)
2622             recursively_show_z(root, level);
2623         else
2624             recursively_show_g(root, level);
2625     }
2626     if (!glulx_mode)
2627     {
2628         printf("\nZ-machine alphabet entries:\n");
2629         show_alphabet(0);
2630         show_alphabet(1);
2631         show_alphabet(2);
2632     }
2633 }
2634
2635 extern void write_dictionary_to_transcript(void)
2636 {
2637     d_show_size = 80; /* initial size */
2638     d_show_buf = my_malloc(d_show_size, "dictionary display buffer");
2639
2640     write_to_transcript_file("", STRCTX_INFO);
2641     sprintf(d_show_buf, "[Dictionary contains %d entries:]", dict_entries);
2642     write_to_transcript_file(d_show_buf, STRCTX_INFO);
2643
2644     d_show_len = 0;
2645
2646     if (dict_entries != 0)
2647     {
2648         if (!glulx_mode)
2649             recursively_show_z(root, 0);
2650         else
2651             recursively_show_g(root, 0);
2652     }
2653     if (d_show_len != 0) write_to_transcript_file(d_show_buf, STRCTX_DICT);
2654
2655     my_free(&d_show_buf, "dictionary display buffer");
2656     d_show_len = 0; d_show_buf = NULL;
2657 }
2658
2659 /* ========================================================================= */
2660 /*   Data structure management routines                                      */
2661 /* ------------------------------------------------------------------------- */
2662
2663 extern void init_text_vars(void)
2664 {   int j;
2665
2666     opttext = NULL;
2667     opttextlen = 0;
2668     bestyet = NULL;
2669     bestyet2 = NULL;
2670     tlbtab = NULL;
2671     grandtable = NULL;
2672     grandflags = NULL;
2673
2674     all_text = NULL;
2675
2676     for (j=0; j<256; j++) abbrevs_lookup[j] = -1;
2677
2678     total_zchars_trans = 0;
2679
2680     dictionary = NULL;
2681     dictionary_top = 0;
2682     dtree = NULL;
2683     final_dict_order = NULL;
2684     dict_sort_codes = NULL;
2685     dict_entries=0;
2686
2687     static_strings_area = NULL;
2688     abbreviations_optimal_parse_schedule = NULL;
2689     abbreviations_optimal_parse_scores = NULL;
2690
2691     compressed_offsets = NULL;
2692     huff_entities = NULL;
2693     hufflist = NULL;
2694     unicode_usage_entries = NULL;
2695 }
2696
2697 extern void text_begin_pass(void)
2698 {   abbrevs_lookup_table_made = FALSE;
2699     no_abbreviations=0;
2700     total_chars_trans=0; total_bytes_trans=0;
2701     all_text_top=0;
2702     dictionary_begin_pass();
2703     low_strings_top = 0;
2704
2705     static_strings_extent = 0;
2706     no_strings = 0;
2707     no_dynamic_strings = 0;
2708     no_unicode_chars = 0;
2709 }
2710
2711 /*  Note: for allocation and deallocation of all_the_text, see inform.c      */
2712
2713 extern void text_allocate_arrays(void)
2714 {
2715     int ix;
2716
2717     initialise_memory_list(&translated_text_memlist,
2718         sizeof(uchar), 8000, (void**)&translated_text,
2719         "translated text holding area");
2720
2721     initialise_memory_list(&all_text_memlist,
2722         sizeof(char), 0, (void**)&all_text,
2723         "transcription text for optimise");
2724
2725     initialise_memory_list(&static_strings_area_memlist,
2726         sizeof(uchar), 128, (void**)&static_strings_area,
2727         "static strings area");
2728
2729     initialise_memory_list(&abbreviations_at_memlist,
2730         MAX_ABBREV_LENGTH, 64, (void**)&abbreviations_at,
2731         "abbreviation text");
2732
2733     initialise_memory_list(&abbreviations_memlist,
2734         sizeof(abbreviation), 64, (void**)&abbreviations,
2735         "abbreviations");
2736
2737     initialise_memory_list(&abbreviations_optimal_parse_schedule_memlist,
2738         sizeof(int), 0, (void**)&abbreviations_optimal_parse_schedule,
2739         "abbreviations optimal parse schedule");
2740     initialise_memory_list(&abbreviations_optimal_parse_scores_memlist,
2741         sizeof(int), 0, (void**)&abbreviations_optimal_parse_scores,
2742         "abbreviations optimal parse scores");
2743
2744     initialise_memory_list(&dtree_memlist,
2745         sizeof(dict_tree_node), 1500, (void**)&dtree,
2746         "red-black tree for dictionary");
2747     initialise_memory_list(&dict_sort_codes_memlist,
2748         sizeof(uchar), 1500*DICT_WORD_BYTES, (void**)&dict_sort_codes,
2749         "dictionary sort codes");
2750
2751     final_dict_order = NULL; /* will be allocated at sort_dictionary() time */
2752
2753     /* The exact size will be 7+7*num for z3, 7+9*num for z4+,
2754        4+DICT_ENTRY_BYTE_LENGTH*num for Glulx. But this is just an initial
2755        allocation; we don't have to be precise. */
2756     initialise_memory_list(&dictionary_memlist,
2757         sizeof(uchar), 1000*DICT_ENTRY_BYTE_LENGTH, (void**)&dictionary,
2758         "dictionary");
2759
2760     initialise_memory_list(&low_strings_memlist,
2761         sizeof(uchar), 1024, (void**)&low_strings,
2762         "low (abbreviation) strings");
2763
2764     d_show_buf = NULL;
2765     d_show_size = 0;
2766     d_show_len = 0;
2767
2768     huff_entities = NULL;
2769     hufflist = NULL;
2770     unicode_usage_entries = NULL;
2771     done_compression = FALSE;
2772     compression_table_size = 0;
2773     compressed_offsets = NULL;
2774
2775     initialise_memory_list(&unicode_usage_entries_memlist,
2776         sizeof(unicode_usage_t), 0, (void**)&unicode_usage_entries,
2777         "unicode entity entries");
2778
2779     /* hufflist and huff_entities will be allocated at compress_game_text() time. */
2780
2781     /* This hash table is only used in Glulx */
2782     for (ix=0; ix<UNICODE_HASH_BUCKETS; ix++)
2783         unicode_usage_hash[ix] = -1;
2784
2785     initialise_memory_list(&compressed_offsets_memlist,
2786         sizeof(int32), 0, (void**)&compressed_offsets,
2787         "static strings index table");
2788 }
2789
2790 extern void extract_all_text()
2791 {
2792     /* optimise_abbreviations() is called after free_arrays(). Therefore,
2793        we need to preserve the text transcript where it will not be
2794        freed up. We do this by copying the pointer to opttext. */
2795     opttext = all_text;
2796     opttextlen = all_text_top;
2797
2798     /* Re-init all_text_memlist. This causes it to forget all about the
2799        old pointer. Deallocating it in text_free_arrays() will be a no-op. */
2800     initialise_memory_list(&all_text_memlist,
2801         sizeof(char), 0, (void**)&all_text,
2802         "dummy transcription text");
2803 }
2804
2805 extern void text_free_arrays(void)
2806 {
2807     deallocate_memory_list(&translated_text_memlist);
2808
2809     deallocate_memory_list(&all_text_memlist);
2810
2811     deallocate_memory_list(&low_strings_memlist);
2812     deallocate_memory_list(&abbreviations_at_memlist);
2813     deallocate_memory_list(&abbreviations_memlist);
2814
2815     deallocate_memory_list(&abbreviations_optimal_parse_schedule_memlist);
2816     deallocate_memory_list(&abbreviations_optimal_parse_scores_memlist);
2817
2818     deallocate_memory_list(&dtree_memlist);
2819     deallocate_memory_list(&dict_sort_codes_memlist);
2820     my_free(&final_dict_order, "final dictionary ordering table");
2821
2822     deallocate_memory_list(&dictionary_memlist);
2823
2824     deallocate_memory_list(&compressed_offsets_memlist);
2825     my_free(&hufflist, "huffman node list");
2826     my_free(&huff_entities, "huffman entities");
2827
2828     deallocate_memory_list(&unicode_usage_entries_memlist);
2829
2830     deallocate_memory_list(&static_strings_area_memlist);
2831 }
2832
2833 extern void ao_free_arrays(void)
2834 {
2835     /* Called only after optimise_abbreviations() runs. */
2836
2837     my_free (&opttext,"stashed transcript for optimisation");
2838     my_free (&bestyet,"bestyet");
2839     my_free (&bestyet2,"bestyet2");
2840     my_free (&grandtable,"grandtable");
2841     my_free (&grandflags,"grandflags");
2842
2843     deallocate_memory_list(&tlbtab_memlist);
2844
2845     /* This was re-inited, so we should re-deallocate it. */
2846     deallocate_memory_list(&all_text_memlist);
2847 }
2848
2849 /* ========================================================================= */