src/text.c

   1 /* ------------------------------------------------------------------------- */
   2 /*   "text" : Text translation, the abbreviations optimiser, the dictionary  */
   3 /*                                                                           */
   4 /*   Part of Inform 6.35                                                     */
   5 /*   copyright (c) Graham Nelson 1993 - 2021                                 */
   6 /*                                                                           */
   7 /* Inform is free software: you can redistribute it and/or modify            */
   8 /* it under the terms of the GNU General Public License as published by      */
   9 /* the Free Software Foundation, either version 3 of the License, or         */
  10 /* (at your option) any later version.                                       */
  11 /*                                                                           */
  12 /* Inform is distributed in the hope that it will be useful,                 */
  13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of            */
  14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the              */
  15 /* GNU General Public License for more details.                              */
  16 /*                                                                           */
  17 /* You should have received a copy of the GNU General Public License         */
  18 /* along with Inform. If not, see https://gnu.org/licenses/                  *
  19 /*                                                                           */
  20 /* ------------------------------------------------------------------------- */
  21
  22 #include "header.h"
  23
  24 uchar *low_strings, *low_strings_top;  /* Start and next free byte in the low
  25                                           strings pool */
  26
  27 int32 static_strings_extent;           /* Number of bytes of static strings
  28                                           made so far */
  29 memory_block static_strings_area;      /* Used if (!temporary_files_switch) to
  30                                           hold the static strings area so far */
  31
  32 static uchar *strings_holding_area;    /* Area holding translated strings
  33                                           until they are moved into either
  34                                           a temporary file, or the
  35                                           static_strings_area below */
  36
  37 char *all_text, *all_text_top;         /* Start and next byte free in (large)
  38                                           text buffer holding the entire text
  39                                           of the game, when it is being
  40                                           recorded                           */
  41 int is_abbreviation,                   /* When TRUE, the string being trans
  42                                           is itself an abbreviation string
  43                                           so can't make use of abbreviations */
  44     abbrevs_lookup_table_made,         /* The abbreviations lookup table is
  45                                           constructed when the first non-
  46                                           abbreviation string is translated:
  47                                           this flag is TRUE after that       */
  48     abbrevs_lookup[256];               /* Once this has been constructed,
  49                                           abbrevs_lookup[n] = the smallest
  50                                           number of any abbreviation beginning
  51                                           with ASCII character n, or -1
  52                                           if none of the abbreviations do    */
  53 int no_abbreviations;                  /* No of abbreviations defined so far */
  54 uchar *abbreviations_at;                 /* Memory to hold the text of any
  55                                           abbreviation strings declared      */
  56 /* ------------------------------------------------------------------------- */
  57 /*   Glulx string compression storage                                        */
  58 /* ------------------------------------------------------------------------- */
  59
  60 int no_strings;                        /* No of strings in static strings
  61                                           area.                              */
  62 int no_dynamic_strings;                /* No. of @.. string escapes used
  63                                           (actually, the highest value used
  64                                           plus one)                          */
  65 int no_unicode_chars;                  /* Number of distinct Unicode chars
  66                                           used. (Beyond 0xFF.)               */
  67
  68 static int MAX_CHARACTER_SET;          /* Number of possible entities */
  69 huffentity_t *huff_entities;           /* The list of entities (characters,
  70                                           abbreviations, @.. escapes, and
  71                                           the terminator)                    */
  72 static huffentity_t **hufflist;        /* Copy of the list, for sorting      */
  73
  74 int no_huff_entities;                  /* The number of entities in the list */
  75 int huff_unicode_start;                /* Position in the list where Unicode
  76                                           chars begin.                       */
  77 int huff_abbrev_start;                 /* Position in the list where string
  78                                           abbreviations begin.               */
  79 int huff_dynam_start;                  /* Position in the list where @..
  80                                           entities begin.                    */
  81 int huff_entity_root;                  /* The position in the list of the root
  82                                           entry (when considering the table
  83                                           as a tree).                        */
  84
  85 int done_compression;                  /* Has the game text been compressed? */
  86 int32 compression_table_size;          /* Length of the Huffman table, in
  87                                           bytes                              */
  88 int32 compression_string_size;         /* Length of the compressed string
  89                                           data, in bytes                     */
  90 int32 *compressed_offsets;             /* The beginning of every string in
  91                                           the game, relative to the beginning
  92                                           of the Huffman table. (So entry 0
  93                                           is equal to compression_table_size)*/
  94
  95 #define UNICODE_HASH_BUCKETS (64)
  96 unicode_usage_t *unicode_usage_entries;
  97 static unicode_usage_t *unicode_usage_hash[UNICODE_HASH_BUCKETS];
  98
  99 static int unicode_entity_index(int32 unicode);
 100
 101 /* ------------------------------------------------------------------------- */
 102 /*   Abbreviation arrays                                                     */
 103 /* ------------------------------------------------------------------------- */
 104
 105 int *abbrev_values;
 106 int *abbrev_quality;
 107 int *abbrev_freqs;
 108
 109 /* ------------------------------------------------------------------------- */
 110
 111 int32 total_chars_trans,               /* Number of ASCII chars of text in   */
 112       total_bytes_trans,               /* Number of bytes of Z-code text out */
 113       zchars_trans_in_last_string;     /* Number of Z-chars in last string:
 114                                           needed only for abbrev efficiency
 115                                           calculation in "directs.c"         */
 116 static int32 total_zchars_trans,       /* Number of Z-chars of text out
 117                                           (only used to calculate the above) */
 118       no_chars_transcribed;            /* Number of ASCII chars written to
 119                                           the text transcription area (used
 120                                           for the -r and -u switches)        */
 121
 122 static int zchars_out_buffer[3],       /* During text translation, a buffer of
 123                                           3 Z-chars at a time: when it's full
 124                                           these are written as a 2-byte word */
 125            zob_index;                  /* Index (0 to 2) into it             */
 126
 127 static unsigned char *text_out_pc;     /* The "program counter" during text
 128                                           translation: the next address to
 129                                           write Z-coded text output to       */
 130
 131 static unsigned char *text_out_limit;  /* The upper limit of text_out_pc
 132                                           during text translation            */
 133
 134 static int text_out_overflow;          /* During text translation, becomes
 135                                           true if text_out_pc tries to pass
 136                                           text_out_limit                     */
 137
 138 /* ------------------------------------------------------------------------- */
 139 /*   For variables/arrays used by the dictionary manager, see below          */
 140 /* ------------------------------------------------------------------------- */
 141
 142 /* ------------------------------------------------------------------------- */
 143 /*   Prepare the abbreviations lookup table (used to speed up abbreviation   */
 144 /*   detection in text translation).  We first bubble-sort the abbrevs into  */
 145 /*   alphabetical order (this is necessary for the detection algorithm to    */
 146 /*   to work).  Since the table is only prepared once, and for a table       */
 147 /*   of size at most 96, there's no point using an efficient sort algorithm. */
 148 /* ------------------------------------------------------------------------- */
 149
 150 static void make_abbrevs_lookup(void)
 151 {   int bubble_sort, j, k, l; char p[MAX_ABBREV_LENGTH]; char *p1, *p2;
 152     do
 153     {   bubble_sort = FALSE;
 154         for (j=0; j<no_abbreviations; j++)
 155             for (k=j+1; k<no_abbreviations; k++)
 156             {   p1=(char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
 157                 p2=(char *)abbreviations_at+k*MAX_ABBREV_LENGTH;
 158                 if (strcmp(p1,p2)<0)
 159                 {   strcpy(p,p1); strcpy(p1,p2); strcpy(p2,p);
 160                     l=abbrev_values[j]; abbrev_values[j]=abbrev_values[k];
 161                     abbrev_values[k]=l;
 162                     l=abbrev_quality[j]; abbrev_quality[j]=abbrev_quality[k];
 163                     abbrev_quality[k]=l;
 164                     bubble_sort = TRUE;
 165                 }
 166             }
 167     } while (bubble_sort);
 168
 169     for (j=no_abbreviations-1; j>=0; j--)
 170     {   p1=(char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
 171         abbrevs_lookup[(uchar)p1[0]]=j;
 172         abbrev_freqs[j]=0;
 173     }
 174     abbrevs_lookup_table_made = TRUE;
 175 }
 176
 177 /* ------------------------------------------------------------------------- */
 178 /*   Search the abbreviations lookup table (a routine which must be fast).   */
 179 /*   The source text to compare is text[i], text[i+1], ... and this routine  */
 180 /*   is only called if text[i] is indeed the first character of at least one */
 181 /*   abbreviation, "from" begin the least index into the abbreviations table */
 182 /*   of an abbreviation for which text[i] is the first character.  Recall    */
 183 /*   that the abbrevs table is in alphabetical order.                        */
 184 /*                                                                           */
 185 /*   The return value is -1 if there is no match.  If there is a match, the  */
 186 /*   text to be abbreviated out is over-written by a string of null chars    */
 187 /*   with "ASCII" value 1, and the abbreviation number is returned.          */
 188 /*                                                                           */
 189 /*   In Glulx, we *do not* do this overwriting with 1's.                     */
 190 /* ------------------------------------------------------------------------- */
 191
 192 static int try_abbreviations_from(unsigned char *text, int i, int from)
 193 {   int j, k; uchar *p, c;
 194     c=text[i];
 195     for (j=from, p=(uchar *)abbreviations_at+from*MAX_ABBREV_LENGTH;
 196          (j<no_abbreviations)&&(c==p[0]); j++, p+=MAX_ABBREV_LENGTH)
 197     {   if (text[i+1]==p[1])
 198         {   for (k=2; p[k]!=0; k++)
 199                 if (text[i+k]!=p[k]) goto NotMatched;
 200             if (!glulx_mode) {
 201                 for (k=0; p[k]!=0; k++) text[i+k]=1;
 202             }
 203             abbrev_freqs[j]++;
 204             return(j);
 205             NotMatched: ;
 206         }
 207     }
 208     return(-1);
 209 }
 210
 211 extern void make_abbreviation(char *text)
 212 {
 213     strcpy((char *)abbreviations_at
 214             + no_abbreviations*MAX_ABBREV_LENGTH, text);
 215
 216     is_abbreviation = TRUE;
 217     abbrev_values[no_abbreviations] = compile_string(text, TRUE, TRUE);
 218     is_abbreviation = FALSE;
 219
 220     /*   The quality is the number of Z-chars saved by using this            */
 221     /*   abbreviation: note that it takes 2 Z-chars to print it.             */
 222
 223     abbrev_quality[no_abbreviations++] = zchars_trans_in_last_string - 2;
 224 }
 225
 226 /* ------------------------------------------------------------------------- */
 227 /*   The front end routine for text translation                              */
 228 /* ------------------------------------------------------------------------- */
 229
 230 extern int32 compile_string(char *b, int in_low_memory, int is_abbrev)
 231 {   int i, j; uchar *c;
 232
 233     is_abbreviation = is_abbrev;
 234
 235     /* Put into the low memory pool (at 0x100 in the Z-machine) of strings   */
 236     /* which may be wanted as possible entries in the abbreviations table    */
 237
 238     if (!glulx_mode && in_low_memory)
 239     {   j=subtract_pointers(low_strings_top,low_strings);
 240         low_strings_top=translate_text(low_strings_top, low_strings+MAX_LOW_STRINGS, b);
 241         if (!low_strings_top)
 242             memoryerror("MAX_LOW_STRINGS", MAX_LOW_STRINGS);
 243         is_abbreviation = FALSE;
 244         return(0x21+(j/2));
 245     }
 246
 247     if (glulx_mode && done_compression)
 248         compiler_error("Tried to add a string after compression was done.");
 249
 250     c = translate_text(strings_holding_area, strings_holding_area+MAX_STATIC_STRINGS, b);
 251     if (!c)
 252         memoryerror("MAX_STATIC_STRINGS",MAX_STATIC_STRINGS);
 253
 254     i = subtract_pointers(c, strings_holding_area);
 255
 256     /* Insert null bytes as needed to ensure that the next static string */
 257     /* also occurs at an address expressible as a packed address         */
 258
 259     if (!glulx_mode) {
 260         int textalign;
 261         if (oddeven_packing_switch)
 262             textalign = scale_factor*2;
 263         else
 264             textalign = scale_factor;
 265         while ((i%textalign)!=0)
 266         {
 267             if (i+2 > MAX_STATIC_STRINGS)
 268                 memoryerror("MAX_STATIC_STRINGS",MAX_STATIC_STRINGS);
 269             i+=2; *c++ = 0; *c++ = 0;
 270         }
 271     }
 272
 273     j = static_strings_extent;
 274
 275     if (temporary_files_switch)
 276         for (c=strings_holding_area; c<strings_holding_area+i;
 277              c++, static_strings_extent++)
 278             fputc(*c,Temp1_fp);
 279     else
 280         for (c=strings_holding_area; c<strings_holding_area+i;
 281              c++, static_strings_extent++)
 282             write_byte_to_memory_block(&static_strings_area,
 283                 static_strings_extent, *c);
 284
 285     is_abbreviation = FALSE;
 286
 287     if (!glulx_mode) {
 288         return(j/scale_factor);
 289     }
 290     else {
 291         /* The marker value is a one-based string number. (We reserve zero
 292            to mean "not a string at all". */
 293         return (++no_strings);
 294     }
 295 }
 296
 297 /* ------------------------------------------------------------------------- */
 298 /*   Output a single Z-character into the buffer, and flush it if full       */
 299 /* ------------------------------------------------------------------------- */
 300
 301 static void write_z_char_z(int i)
 302 {   uint32 j;
 303     ASSERT_ZCODE();
 304     total_zchars_trans++;
 305     zchars_out_buffer[zob_index++]=(i%32);
 306     if (zob_index!=3) return;
 307     zob_index=0;
 308     j= zchars_out_buffer[0]*0x0400 + zchars_out_buffer[1]*0x0020
 309        + zchars_out_buffer[2];
 310     if (text_out_pc+2 > text_out_limit) {
 311         text_out_overflow = TRUE;
 312         return;
 313     }
 314     text_out_pc[0] = j/256; text_out_pc[1] = j%256; text_out_pc+=2;
 315     total_bytes_trans+=2;
 316 }
 317
 318 static void write_zscii(int zsc)
 319 {
 320     int lookup_value, in_alphabet;
 321
 322     if (zsc==' ')
 323     {   write_z_char_z(0);
 324         return;
 325     }
 326
 327     if (zsc < 0x100) lookup_value = zscii_to_alphabet_grid[zsc];
 328
 329     else lookup_value = -1;
 330
 331     if (lookup_value >= 0)
 332     {   alphabet_used[lookup_value] = 'Y';
 333         in_alphabet = lookup_value/26;
 334         if (in_alphabet==1) write_z_char_z(4);  /* SHIFT to A1 */
 335         if (in_alphabet==2) write_z_char_z(5);  /* SHIFT to A2 */
 336         write_z_char_z(lookup_value%26 + 6);
 337     }
 338     else
 339     {   write_z_char_z(5); write_z_char_z(6);
 340         write_z_char_z(zsc/32); write_z_char_z(zsc%32);
 341     }
 342 }
 343
 344 /* ------------------------------------------------------------------------- */
 345 /*   Finish a Z-coded string, padding out with Z-char 5s if necessary and    */
 346 /*   setting the "end" bit on the final 2-byte word                          */
 347 /* ------------------------------------------------------------------------- */
 348
 349 static void end_z_chars(void)
 350 {   unsigned char *p;
 351     zchars_trans_in_last_string=total_zchars_trans-zchars_trans_in_last_string;
 352     while (zob_index!=0) write_z_char_z(5);
 353     p=(unsigned char *) text_out_pc;
 354     *(p-2)= *(p-2)+128;
 355 }
 356
 357 /* Glulx handles this much more simply -- compression is done elsewhere. */
 358 static void write_z_char_g(int i)
 359 {
 360   ASSERT_GLULX();
 361   if (text_out_pc+1 > text_out_limit) {
 362       text_out_overflow = TRUE;
 363       return;
 364   }
 365   total_zchars_trans++;
 366   text_out_pc[0] = i;
 367   text_out_pc++;
 368   total_bytes_trans++;
 369 }
 370
 371 /* ------------------------------------------------------------------------- */
 372 /*   The main routine "text.c" provides to the rest of Inform: the text      */
 373 /*   translator. p is the address to write output to, s_text the source text */
 374 /*   and the return value is the next free address to write output to.       */
 375 /*   The return value will not exceed p_limit. If the translation tries to   */
 376 /*   overflow this boundary, the return value will be NULL (and you should   */
 377 /*   display an error).                                                      */
 378 /*   Note that the source text may be corrupted by this routine.             */
 379 /* ------------------------------------------------------------------------- */
 380
 381 extern uchar *translate_text(uchar *p, uchar *p_limit, char *s_text)
 382 {   int i, j, k, in_alphabet, lookup_value;
 383     int32 unicode; int zscii;
 384     unsigned char *text_in;
 385
 386     /*  Cast the input and output streams to unsigned char: text_out_pc will
 387         advance as bytes of Z-coded text are written, but text_in doesn't    */
 388
 389     text_in     = (unsigned char *) s_text;
 390     text_out_pc = (unsigned char *) p;
 391     text_out_limit = (unsigned char *) p_limit;
 392     text_out_overflow = FALSE;
 393
 394     /*  Remember the Z-chars total so that later we can subtract to find the
 395         number of Z-chars translated on this string                          */
 396
 397     zchars_trans_in_last_string = total_zchars_trans;
 398
 399     /*  Start with the Z-characters output buffer empty                      */
 400
 401     zob_index=0;
 402
 403     /*  If this is the first text translated since the abbreviations were
 404         declared, and if some were declared, then it's time to make the
 405         lookup table for abbreviations
 406
 407         (Except: we don't if the text being translated is itself
 408         the text of an abbreviation currently being defined)                 */
 409
 410     if ((!abbrevs_lookup_table_made) && (no_abbreviations > 0)
 411         && (!is_abbreviation))
 412         make_abbrevs_lookup();
 413
 414     /*  If we're storing the whole game text to memory, then add this text   */
 415
 416     if ((!is_abbreviation) && (store_the_text))
 417     {   no_chars_transcribed += strlen(s_text)+2;
 418         if (no_chars_transcribed >= MAX_TRANSCRIPT_SIZE)
 419             memoryerror("MAX_TRANSCRIPT_SIZE", MAX_TRANSCRIPT_SIZE);
 420         sprintf(all_text_top, "%s\n\n", s_text);
 421         all_text_top += strlen(all_text_top);
 422     }
 423
 424     if (transcript_switch && (!veneer_mode))
 425         write_to_transcript_file(s_text);
 426
 427   if (!glulx_mode) {
 428
 429     /*  The empty string of Z-text is illegal, since it can't carry an end
 430         bit: so we translate an empty string of ASCII text to just the
 431         pad character 5.  Printing this causes nothing to appear on screen.  */
 432
 433     if (text_in[0]==0) write_z_char_z(5);
 434
 435     /*  Loop through the characters of the null-terminated input text: note
 436         that if 1 is written over a character in the input text, it is
 437         afterwards ignored                                                   */
 438
 439     for (i=0; text_in[i]!=0; i++)
 440     {   total_chars_trans++;
 441
 442         /*  Contract ".  " into ". " if double-space-removing switch set:
 443             likewise "?  " and "!  " if the setting is high enough           */
 444
 445         if ((double_space_setting >= 1)
 446             && (text_in[i+1]==' ') && (text_in[i+2]==' '))
 447         {   if (text_in[i]=='.') text_in[i+2]=1;
 448             if (double_space_setting >= 2)
 449             {   if (text_in[i]=='?') text_in[i+2]=1;
 450                 if (text_in[i]=='!') text_in[i+2]=1;
 451             }
 452         }
 453
 454         /*  Try abbreviations if the economy switch set                      */
 455
 456         if ((economy_switch) && (!is_abbreviation)
 457             && ((k=abbrevs_lookup[text_in[i]])!=-1))
 458         {   if ((j=try_abbreviations_from(text_in, i, k))!=-1)
 459             {   /* abbreviations run from MAX_DYNAMIC_STRINGS to 96 */
 460                 j += MAX_DYNAMIC_STRINGS;
 461                 write_z_char_z(j/32+1); write_z_char_z(j%32);
 462             }
 463         }
 464
 465         /* If Unicode switch set, use text_to_unicode to perform UTF-8
 466            decoding */
 467         if (character_set_unicode && (text_in[i] & 0x80))
 468         {   unicode = text_to_unicode((char *) (text_in+i));
 469             zscii = unicode_to_zscii(unicode);
 470             if (zscii != 5) write_zscii(zscii);
 471             else
 472             {   unicode_char_error(
 473                     "Character can only be used if declared in \
 474 advance as part of 'Zcharacter table':", unicode);
 475             }
 476             i += textual_form_length - 1;
 477             continue;
 478         }
 479
 480         /*  '@' is the escape character in Inform string notation: the various
 481             possibilities are:
 482
 483                 (printing only)
 484                 @@decimalnumber  :  write this ZSCII char (0 to 1023)
 485                 @twodigits       :  write the abbreviation string with this
 486                                     decimal number
 487
 488                 (any string context)
 489                 @accentcode      :  this accented character: e.g.,
 490                                         for @'e write an E-acute
 491                 @{...}           :  this Unicode char (in hex)              */
 492
 493         if (text_in[i]=='@')
 494         {   if (text_in[i+1]=='@')
 495             {
 496                 /*   @@...   */
 497
 498                 i+=2; j=atoi((char *) (text_in+i));
 499                 switch(j)
 500                 {   /* Prevent ~ and ^ from being translated to double-quote
 501                        and new-line, as they ordinarily would be */
 502
 503                     case 94:   write_z_char_z(5); write_z_char_z(6);
 504                                write_z_char_z(94/32); write_z_char_z(94%32);
 505                                break;
 506                     case 126:  write_z_char_z(5); write_z_char_z(6);
 507                                write_z_char_z(126/32); write_z_char_z(126%32);
 508                                break;
 509
 510                     default:   write_zscii(j); break;
 511                 }
 512                 while (isdigit(text_in[i])) i++; i--;
 513             }
 514             else if (isdigit(text_in[i+1])!=0)
 515             {   int d1, d2;
 516
 517                 /*   @.. (dynamic string)   */
 518
 519                 d1 = character_digit_value[text_in[i+1]];
 520                 d2 = character_digit_value[text_in[i+2]];
 521                 if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10))
 522                     error("'@..' must have two decimal digits");
 523                 else
 524                 {
 525                     j = d1*10 + d2;
 526                     if (!glulx_mode && j >= 96)
 527                     {   error("Z-machine dynamic strings are limited to 96");
 528                         j = 0;
 529                     }
 530                     if (j >= MAX_DYNAMIC_STRINGS) {
 531                         memoryerror("MAX_DYNAMIC_STRINGS", MAX_DYNAMIC_STRINGS);
 532                         j = 0;
 533                     }
 534                     i+=2;
 535                     write_z_char_z(j/32+1); write_z_char_z(j%32);
 536                 }
 537             }
 538             else
 539             {
 540                 /*   A string escape specifying an unusual character   */
 541
 542                 unicode = text_to_unicode((char *) (text_in+i));
 543                 zscii = unicode_to_zscii(unicode);
 544                 if (zscii != 5) write_zscii(zscii);
 545                 else
 546                 {   unicode_char_error(
 547                        "Character can only be used if declared in \
 548 advance as part of 'Zcharacter table':", unicode);
 549                 }
 550                 i += textual_form_length - 1;
 551             }
 552         }
 553         else
 554         {   /*  Skip a character which has been over-written with the null
 555                 value 1 earlier on                                           */
 556
 557             if (text_in[i]!=1)
 558             {   if (text_in[i]==' ') write_z_char_z(0);
 559                 else
 560                 {   j = (int) text_in[i];
 561                     lookup_value = iso_to_alphabet_grid[j];
 562                     if (lookup_value < 0)
 563                     {   /*  The character isn't in the standard alphabets, so
 564                             we have to use the ZSCII 4-Z-char sequence */
 565
 566                         if (lookup_value == -5)
 567                         {   /*  Character isn't in the ZSCII set at all */
 568
 569                             unicode = iso_to_unicode(j);
 570                             unicode_char_error(
 571                                 "Character can only be used if declared in \
 572 advance as part of 'Zcharacter table':", unicode);
 573                             write_zscii(0x200 + unicode/0x100);
 574                             write_zscii(0x300 + unicode%0x100);
 575                         }
 576                         else write_zscii(-lookup_value);
 577                     }
 578                     else
 579                     {   /*  The character is in one of the standard alphabets:
 580                             write a SHIFT to temporarily change alphabet if
 581                             it isn't in alphabet 0, then write the Z-char    */
 582
 583                         alphabet_used[lookup_value] = 'Y';
 584                         in_alphabet = lookup_value/26;
 585                         if (in_alphabet==1) write_z_char_z(4);  /* SHIFT to A1 */
 586                         if (in_alphabet==2) write_z_char_z(5);  /* SHIFT to A2 */
 587                         write_z_char_z(lookup_value%26 + 6);
 588                     }
 589                 }
 590             }
 591         }
 592     }
 593
 594     /*  Flush the Z-characters output buffer and set the "end" bit           */
 595
 596     end_z_chars();
 597
 598   }
 599   else {
 600
 601     /* The text storage here is, of course, temporary. Compression
 602        will occur when we're finished compiling, so that all the
 603        clever Huffman stuff will work.
 604        In the stored text, we use "@@" to indicate @,
 605        "@0" to indicate a zero byte,
 606        "@ANNNN" to indicate an abbreviation,
 607        "@DNNNN" to indicate a dynamic string thing.
 608        "@UNNNN" to indicate a four-byte Unicode value (0x100 or higher).
 609        (NNNN is a four-digit hex number using the letters A-P... an
 610        ugly representation but a convenient one.)
 611     */
 612
 613     for (i=0; text_in[i]!=0; i++) {
 614
 615       /*  Contract ".  " into ". " if double-space-removing switch set:
 616           likewise "?  " and "!  " if the setting is high enough. */
 617       if ((double_space_setting >= 1)
 618         && (text_in[i+1]==' ') && (text_in[i+2]==' ')) {
 619         if (text_in[i]=='.'
 620           || (double_space_setting >= 2
 621             && (text_in[i]=='?' || text_in[i]=='!'))) {
 622           text_in[i+1] = text_in[i];
 623           i++;
 624         }
 625       }
 626
 627       total_chars_trans++;
 628
 629       /*  Try abbreviations if the economy switch set. We have to be in
 630           compression mode too, since the abbreviation mechanism is part
 631           of string decompression. */
 632
 633       if ((economy_switch) && (compression_switch) && (!is_abbreviation)
 634         && ((k=abbrevs_lookup[text_in[i]])!=-1)
 635         && ((j=try_abbreviations_from(text_in, i, k)) != -1)) {
 636         char *cx = (char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
 637         i += (strlen(cx)-1);
 638         write_z_char_g('@');
 639         write_z_char_g('A');
 640         write_z_char_g('A' + ((j >>12) & 0x0F));
 641         write_z_char_g('A' + ((j >> 8) & 0x0F));
 642         write_z_char_g('A' + ((j >> 4) & 0x0F));
 643         write_z_char_g('A' + ((j     ) & 0x0F));
 644       }
 645       else if (text_in[i] == '@') {
 646         if (text_in[i+1]=='@') {
 647           /* An ASCII code */
 648           i+=2; j=atoi((char *) (text_in+i));
 649           if (j == '@' || j == '\0') {
 650             write_z_char_g('@');
 651             if (j == 0) {
 652               j = '0';
 653               if (!compression_switch)
 654                 warning("Ascii @@0 will prematurely terminate non-compressed \
 655 string.");
 656             }
 657           }
 658           write_z_char_g(j);
 659           while (isdigit(text_in[i])) i++; i--;
 660         }
 661         else if (isdigit(text_in[i+1])) {
 662           int d1, d2;
 663           d1 = character_digit_value[text_in[i+1]];
 664           d2 = character_digit_value[text_in[i+2]];
 665           if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10)) {
 666             error("'@..' must have two decimal digits");
 667           }
 668           else {
 669             if (!compression_switch)
 670               warning("'@..' print variable will not work in non-compressed \
 671 string; substituting '   '.");
 672             i += 2;
 673             j = d1*10 + d2;
 674             if (j >= MAX_DYNAMIC_STRINGS) {
 675               memoryerror("MAX_DYNAMIC_STRINGS", MAX_DYNAMIC_STRINGS);
 676               j = 0;
 677             }
 678             if (j+1 >= no_dynamic_strings)
 679               no_dynamic_strings = j+1;
 680             write_z_char_g('@');
 681             write_z_char_g('D');
 682             write_z_char_g('A' + ((j >>12) & 0x0F));
 683             write_z_char_g('A' + ((j >> 8) & 0x0F));
 684             write_z_char_g('A' + ((j >> 4) & 0x0F));
 685             write_z_char_g('A' + ((j     ) & 0x0F));
 686           }
 687         }
 688         else {
 689           unicode = text_to_unicode((char *) (text_in+i));
 690           i += textual_form_length - 1;
 691           if (unicode == '@' || unicode == '\0') {
 692             write_z_char_g('@');
 693             write_z_char_g(unicode ? '@' : '0');
 694           }
 695           else if (unicode >= 0 && unicode < 256) {
 696             write_z_char_g(unicode);
 697           }
 698           else {
 699             if (!compression_switch) {
 700               warning("Unicode characters will not work in non-compressed \
 701 string; substituting '?'.");
 702               write_z_char_g('?');
 703             }
 704             else {
 705               j = unicode_entity_index(unicode);
 706               write_z_char_g('@');
 707               write_z_char_g('U');
 708               write_z_char_g('A' + ((j >>12) & 0x0F));
 709               write_z_char_g('A' + ((j >> 8) & 0x0F));
 710               write_z_char_g('A' + ((j >> 4) & 0x0F));
 711               write_z_char_g('A' + ((j     ) & 0x0F));
 712             }
 713           }
 714         }
 715       }
 716       else if (text_in[i] == '^')
 717         write_z_char_g(0x0A);
 718       else if (text_in[i] == '~')
 719         write_z_char_g('"');
 720       else if (character_set_unicode) {
 721         if (text_in[i] & 0x80) {
 722           unicode = text_to_unicode((char *) (text_in+i));
 723           i += textual_form_length - 1;
 724           if (unicode >= 0 && unicode < 256) {
 725             write_z_char_g(unicode);
 726           }
 727           else {
 728             if (!compression_switch) {
 729               warning("Unicode characters will not work in non-compressed \
 730 string; substituting '?'.");
 731               write_z_char_g('?');
 732             }
 733             else {
 734               j = unicode_entity_index(unicode);
 735               write_z_char_g('@');
 736               write_z_char_g('U');
 737               write_z_char_g('A' + ((j >>12) & 0x0F));
 738               write_z_char_g('A' + ((j >> 8) & 0x0F));
 739               write_z_char_g('A' + ((j >> 4) & 0x0F));
 740               write_z_char_g('A' + ((j     ) & 0x0F));
 741             }
 742           }
 743         }
 744         else {
 745           write_z_char_g(text_in[i]);
 746         }
 747       }
 748       else {
 749         unicode = iso_to_unicode_grid[text_in[i]];
 750         if (unicode >= 0 && unicode < 256) {
 751           write_z_char_g(unicode);
 752         }
 753         else {
 754           if (!compression_switch) {
 755             warning("Unicode characters will not work in non-compressed \
 756 string; substituting '?'.");
 757             write_z_char_g('?');
 758           }
 759           else {
 760             j = unicode_entity_index(unicode);
 761             write_z_char_g('@');
 762             write_z_char_g('U');
 763             write_z_char_g('A' + ((j >>12) & 0x0F));
 764             write_z_char_g('A' + ((j >> 8) & 0x0F));
 765             write_z_char_g('A' + ((j >> 4) & 0x0F));
 766             write_z_char_g('A' + ((j     ) & 0x0F));
 767           }
 768         }
 769       }
 770     }
 771     write_z_char_g(0);
 772
 773   }
 774
 775   if (text_out_overflow)
 776       return NULL;
 777   else
 778       return((uchar *) text_out_pc);
 779 }
 780
 781 static int unicode_entity_index(int32 unicode)
 782 {
 783   unicode_usage_t *uptr;
 784   int j;
 785   int buck = unicode % UNICODE_HASH_BUCKETS;
 786
 787   for (uptr = unicode_usage_hash[buck]; uptr; uptr=uptr->next) {
 788     if (uptr->ch == unicode)
 789       break;
 790   }
 791   if (uptr) {
 792     j = (uptr - unicode_usage_entries);
 793   }
 794   else {
 795     if (no_unicode_chars >= MAX_UNICODE_CHARS) {
 796       memoryerror("MAX_UNICODE_CHARS", MAX_UNICODE_CHARS);
 797       j = 0;
 798     }
 799     else {
 800       j = no_unicode_chars;
 801       no_unicode_chars++;
 802       uptr = unicode_usage_entries + j;
 803       uptr->ch = unicode;
 804       uptr->next = unicode_usage_hash[buck];
 805       unicode_usage_hash[buck] = uptr;
 806     }
 807   }
 808
 809   return j;
 810 }
 811
 812 /* ------------------------------------------------------------------------- */
 813 /*   Glulx compression code                                                  */
 814 /* ------------------------------------------------------------------------- */
 815
 816
 817 static void compress_makebits(int entnum, int depth, int prevbit,
 818   huffbitlist_t *bits);
 819
 820 /*   The compressor. This uses the usual Huffman compression algorithm. */
 821 void compress_game_text()
 822 {
 823   int entities=0, branchstart, branches;
 824   int numlive;
 825   int32 lx;
 826   int jx;
 827   int ch;
 828   int32 ix;
 829   huffbitlist_t bits;
 830
 831   if (compression_switch) {
 832
 833     /* How many entities have we currently got? Well, 256 plus the
 834        string-terminator plus Unicode chars plus abbrevations plus
 835        dynamic strings. */
 836     entities = 256+1;
 837     huff_unicode_start = entities;
 838     entities += no_unicode_chars;
 839     huff_abbrev_start = entities;
 840     if (economy_switch)
 841       entities += no_abbreviations;
 842     huff_dynam_start = entities;
 843     entities += no_dynamic_strings;
 844
 845     if (entities > MAX_CHARACTER_SET)
 846       memoryerror("MAX_CHARACTER_SET",MAX_CHARACTER_SET);
 847
 848     /* Characters */
 849     for (jx=0; jx<256; jx++) {
 850       huff_entities[jx].type = 2;
 851       huff_entities[jx].count = 0;
 852       huff_entities[jx].u.ch = jx;
 853     }
 854     /* Terminator */
 855     huff_entities[256].type = 1;
 856     huff_entities[256].count = 0;
 857     for (jx=0; jx<no_unicode_chars; jx++) {
 858       huff_entities[huff_unicode_start+jx].type = 4;
 859       huff_entities[huff_unicode_start+jx].count = 0;
 860       huff_entities[huff_unicode_start+jx].u.val = jx;
 861     }
 862     if (economy_switch) {
 863       for (jx=0; jx<no_abbreviations; jx++) {
 864         huff_entities[huff_abbrev_start+jx].type = 3;
 865         huff_entities[huff_abbrev_start+jx].count = 0;
 866         huff_entities[huff_abbrev_start+jx].u.val = jx;
 867       }
 868     }
 869     for (jx=0; jx<no_dynamic_strings; jx++) {
 870       huff_entities[huff_dynam_start+jx].type = 9;
 871       huff_entities[huff_dynam_start+jx].count = 0;
 872       huff_entities[huff_dynam_start+jx].u.val = jx;
 873     }
 874   }
 875   else {
 876     /* No compression; use defaults that will make it easy to check
 877        for errors. */
 878     no_huff_entities = 257;
 879     huff_unicode_start = 257;
 880     huff_abbrev_start = 257;
 881     huff_dynam_start = 257+MAX_ABBREVS;
 882     compression_table_size = 0;
 883   }
 884
 885   if (temporary_files_switch) {
 886     fclose(Temp1_fp);
 887     Temp1_fp=fopen(Temp1_Name,"rb");
 888     if (Temp1_fp==NULL)
 889       fatalerror("I/O failure: couldn't reopen temporary file 1");
 890   }
 891
 892   if (compression_switch) {
 893
 894     for (lx=0, ix=0; lx<no_strings; lx++) {
 895       int escapelen=0, escapetype=0;
 896       int done=FALSE;
 897       int32 escapeval=0;
 898       while (!done) {
 899         if (temporary_files_switch)
 900           ch = fgetc(Temp1_fp);
 901         else
 902           ch = read_byte_from_memory_block(&static_strings_area, ix);
 903         ix++;
 904         if (ix > static_strings_extent || ch < 0)
 905           compiler_error("Read too much not-yet-compressed text.");
 906         if (escapelen == -1) {
 907           escapelen = 0;
 908           if (ch == '@') {
 909             ch = '@';
 910           }
 911           else if (ch == '0') {
 912             ch = '\0';
 913           }
 914           else if (ch == 'A' || ch == 'D' || ch == 'U') {
 915             escapelen = 4;
 916             escapetype = ch;
 917             escapeval = 0;
 918             continue;
 919           }
 920           else {
 921             compiler_error("Strange @ escape in processed text.");
 922           }
 923         }
 924         else if (escapelen) {
 925           escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
 926           escapelen--;
 927           if (escapelen == 0) {
 928             if (escapetype == 'A') {
 929               ch = huff_abbrev_start+escapeval;
 930             }
 931             else if (escapetype == 'D') {
 932               ch = huff_dynam_start+escapeval;
 933             }
 934             else if (escapetype == 'U') {
 935               ch = huff_unicode_start+escapeval;
 936             }
 937             else {
 938               compiler_error("Strange @ escape in processed text.");
 939             }
 940           }
 941           else
 942             continue;
 943         }
 944         else {
 945           if (ch == '@') {
 946             escapelen = -1;
 947             continue;
 948           }
 949           if (ch == 0) {
 950             ch = 256;
 951             done = TRUE;
 952           }
 953         }
 954         huff_entities[ch].count++;
 955       }
 956     }
 957
 958     numlive = 0;
 959     for (jx=0; jx<entities; jx++) {
 960       if (huff_entities[jx].count) {
 961         hufflist[numlive] = &(huff_entities[jx]);
 962         numlive++;
 963       }
 964     }
 965
 966     branchstart = entities;
 967     branches = 0;
 968
 969     while (numlive > 1) {
 970       int best1, best2;
 971       int best1num, best2num;
 972       huffentity_t *bran;
 973
 974       if (hufflist[0]->count < hufflist[1]->count) {
 975         best1 = 0;
 976         best2 = 1;
 977       }
 978       else {
 979         best2 = 0;
 980         best1 = 1;
 981       }
 982
 983       best1num = hufflist[best1]->count;
 984       best2num = hufflist[best2]->count;
 985
 986       for (jx=2; jx<numlive; jx++) {
 987         if (hufflist[jx]->count < best1num) {
 988           best2 = best1;
 989           best2num = best1num;
 990           best1 = jx;
 991           best1num = hufflist[best1]->count;
 992         }
 993         else if (hufflist[jx]->count < best2num) {
 994           best2 = jx;
 995           best2num = hufflist[best2]->count;
 996         }
 997       }
 998
 999       bran = &(huff_entities[branchstart+branches]);
1000       branches++;
1001       bran->type = 0;
1002       bran->count = hufflist[best1]->count + hufflist[best2]->count;
1003       bran->u.branch[0] = (hufflist[best1] - huff_entities);
1004       bran->u.branch[1] = (hufflist[best2] - huff_entities);
1005       hufflist[best1] = bran;
1006       if (best2 < numlive-1) {
1007         memmove(&(hufflist[best2]), &(hufflist[best2+1]),
1008           ((numlive-1) - best2) * sizeof(huffentity_t *));
1009       }
1010       numlive--;
1011     }
1012
1013     huff_entity_root = (hufflist[0] - huff_entities);
1014
1015     for (ix=0; ix<MAXHUFFBYTES; ix++)
1016       bits.b[ix] = 0;
1017     compression_table_size = 12;
1018
1019     no_huff_entities = 0; /* compress_makebits will total this up */
1020     compress_makebits(huff_entity_root, 0, -1, &bits);
1021   }
1022
1023   /* Now, sadly, we have to compute the size of the string section,
1024      without actually doing the compression. */
1025   compression_string_size = 0;
1026
1027   if (temporary_files_switch) {
1028     fseek(Temp1_fp, 0, SEEK_SET);
1029   }
1030
1031   if (no_strings >= MAX_NUM_STATIC_STRINGS)
1032     memoryerror("MAX_NUM_STATIC_STRINGS", MAX_NUM_STATIC_STRINGS);
1033
1034   for (lx=0, ix=0; lx<no_strings; lx++) {
1035     int escapelen=0, escapetype=0;
1036     int done=FALSE;
1037     int32 escapeval=0;
1038     jx = 0;
1039     compressed_offsets[lx] = compression_table_size + compression_string_size;
1040     compression_string_size++; /* for the type byte */
1041     while (!done) {
1042       if (temporary_files_switch)
1043         ch = fgetc(Temp1_fp);
1044       else
1045         ch = read_byte_from_memory_block(&static_strings_area, ix);
1046       ix++;
1047       if (ix > static_strings_extent || ch < 0)
1048         compiler_error("Read too much not-yet-compressed text.");
1049       if (escapelen == -1) {
1050         escapelen = 0;
1051         if (ch == '@') {
1052           ch = '@';
1053         }
1054         else if (ch == '0') {
1055           ch = '\0';
1056         }
1057         else if (ch == 'A' || ch == 'D' || ch == 'U') {
1058           escapelen = 4;
1059           escapetype = ch;
1060           escapeval = 0;
1061           continue;
1062         }
1063         else {
1064           compiler_error("Strange @ escape in processed text.");
1065         }
1066       }
1067       else if (escapelen) {
1068         escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
1069         escapelen--;
1070         if (escapelen == 0) {
1071           if (escapetype == 'A') {
1072             ch = huff_abbrev_start+escapeval;
1073           }
1074           else if (escapetype == 'D') {
1075             ch = huff_dynam_start+escapeval;
1076           }
1077           else if (escapetype == 'U') {
1078             ch = huff_unicode_start+escapeval;
1079           }
1080           else {
1081             compiler_error("Strange @ escape in processed text.");
1082           }
1083         }
1084         else
1085           continue;
1086       }
1087       else {
1088         if (ch == '@') {
1089           escapelen = -1;
1090           continue;
1091         }
1092         if (ch == 0) {
1093           ch = 256;
1094           done = TRUE;
1095         }
1096       }
1097
1098       if (compression_switch) {
1099         jx += huff_entities[ch].depth;
1100         compression_string_size += (jx/8);
1101         jx = (jx % 8);
1102       }
1103       else {
1104         if (ch >= huff_dynam_start) {
1105           compression_string_size += 3;
1106         }
1107         else if (ch >= huff_unicode_start) {
1108           compiler_error("Abbreviation/Unicode in non-compressed string \
1109 should be impossible.");
1110         }
1111         else
1112           compression_string_size += 1;
1113       }
1114     }
1115     if (compression_switch && jx)
1116       compression_string_size++;
1117   }
1118
1119   done_compression = TRUE;
1120 }
1121
1122 static void compress_makebits(int entnum, int depth, int prevbit,
1123   huffbitlist_t *bits)
1124 {
1125   huffentity_t *ent = &(huff_entities[entnum]);
1126   char *cx;
1127
1128   no_huff_entities++;
1129   ent->addr = compression_table_size;
1130   ent->depth = depth;
1131   ent->bits = *bits;
1132   if (depth > 0) {
1133     if (prevbit)
1134       ent->bits.b[(depth-1) / 8] |= (1 << ((depth-1) % 8));
1135   }
1136
1137   switch (ent->type) {
1138   case 0:
1139     compression_table_size += 9;
1140     compress_makebits(ent->u.branch[0], depth+1, 0, &ent->bits);
1141     compress_makebits(ent->u.branch[1], depth+1, 1, &ent->bits);
1142     break;
1143   case 1:
1144     compression_table_size += 1;
1145     break;
1146   case 2:
1147     compression_table_size += 2;
1148     break;
1149   case 3:
1150     cx = (char *)abbreviations_at + ent->u.val*MAX_ABBREV_LENGTH;
1151     compression_table_size += (1 + 1 + strlen(cx));
1152     break;
1153   case 4:
1154   case 9:
1155     compression_table_size += 5;
1156     break;
1157   }
1158 }
1159
1160 /* ------------------------------------------------------------------------- */
1161 /*   The abbreviations optimiser                                             */
1162 /*                                                                           */
1163 /*   This is a very complex, memory and time expensive algorithm to          */
1164 /*   approximately solve the problem of which abbreviation strings would     */
1165 /*   minimise the total number of Z-chars to which the game text translates. */
1166 /*   It is in some ways a quite separate program but remains inside Inform   */
1167 /*   for compatibility with previous releases.                               */
1168 /* ------------------------------------------------------------------------- */
1169
1170 typedef struct tlb_s
1171 {   char text[4];
1172     int32 intab, occurrences;
1173 } tlb;
1174 static tlb *tlbtab;
1175 static int32 no_occs;
1176
1177 static int32 *grandtable;
1178 static int32 *grandflags;
1179 typedef struct optab_s
1180 {   int32  length;
1181     int32  popularity;
1182     int32  score;
1183     int32  location;
1184     char text[MAX_ABBREV_LENGTH];
1185 } optab;
1186 static optab *bestyet, *bestyet2;
1187
1188 static int pass_no;
1189
1190 static char *sub_buffer;
1191
1192 static void optimise_pass(void)
1193 {   int32 i; int t1, t2;
1194     int32 j, j2, k, nl, matches, noflags, score, min, minat=0, x, scrabble, c;
1195     for (i=0; i<256; i++) bestyet[i].length=0;
1196     for (i=0; i<no_occs; i++)
1197     {   if ((*(tlbtab[i].text)!=(int) '\n')&&(tlbtab[i].occurrences!=0))
1198         {
1199 #ifdef MAC_FACE
1200             if (i%((**g_pm_hndl).linespercheck) == 0)
1201             {   ProcessEvents (&g_proc);
1202                 if (g_proc != true)
1203                 {   free_arrays();
1204                     if (store_the_text)
1205                         my_free(&all_text,"transcription text");
1206                     longjmp (g_fallback, 1);
1207                 }
1208             }
1209 #endif
1210             printf("Pass %d, %4ld/%ld '%s' (%ld occurrences) ",
1211                 pass_no, (long int) i, (long int) no_occs, tlbtab[i].text,
1212                 (long int) tlbtab[i].occurrences);
1213             t1=(int) (time(0));
1214             for (j=0; j<tlbtab[i].occurrences; j++)
1215             {   for (j2=0; j2<tlbtab[i].occurrences; j2++) grandflags[j2]=1;
1216                 nl=2; noflags=tlbtab[i].occurrences;
1217                 while ((noflags>=2)&&(nl<=62))
1218                 {   nl++;
1219                     for (j2=0; j2<nl; j2++)
1220                         if (all_text[grandtable[tlbtab[i].intab+j]+j2]=='\n')
1221                             goto FinishEarly;
1222                     matches=0;
1223                     for (j2=j; j2<tlbtab[i].occurrences; j2++)
1224                     {   if (grandflags[j2]==1)
1225                         {   x=grandtable[tlbtab[i].intab+j2]
1226                               - grandtable[tlbtab[i].intab+j];
1227                          if (((x>-nl)&&(x<nl))
1228                             || (memcmp(all_text+grandtable[tlbtab[i].intab+j],
1229                                        all_text+grandtable[tlbtab[i].intab+j2],
1230                                        nl)!=0))
1231                             {   grandflags[j2]=0; noflags--; }
1232                             else matches++;
1233                         }
1234                     }
1235                     scrabble=0;
1236                     for (k=0; k<nl; k++)
1237                     {   scrabble++;
1238                         c=all_text[grandtable[tlbtab[i].intab+j+k]];
1239                         if (c!=(int) ' ')
1240                         {   if (iso_to_alphabet_grid[c]<0)
1241                                 scrabble+=2;
1242                             else
1243                                 if (iso_to_alphabet_grid[c]>=26)
1244                                     scrabble++;
1245                         }
1246                     }
1247                     score=(matches-1)*(scrabble-2);
1248                     min=score;
1249                     for (j2=0; j2<256; j2++)
1250                     {   if ((nl==bestyet[j2].length)
1251                                 && (memcmp(all_text+bestyet[j2].location,
1252                                        all_text+grandtable[tlbtab[i].intab+j],
1253                                        nl)==0))
1254                         {   j2=256; min=score; }
1255                         else
1256                         {   if (bestyet[j2].score<min)
1257                             {   min=bestyet[j2].score; minat=j2;
1258                             }
1259                         }
1260                     }
1261                     if (min!=score)
1262                     {   bestyet[minat].score=score;
1263                         bestyet[minat].length=nl;
1264                         bestyet[minat].location=grandtable[tlbtab[i].intab+j];
1265                         bestyet[minat].popularity=matches;
1266                         for (j2=0; j2<nl; j2++) sub_buffer[j2]=
1267                             all_text[bestyet[minat].location+j2];
1268                         sub_buffer[nl]=0;
1269                     }
1270                 }
1271                 FinishEarly: ;
1272             }
1273             t2=((int) time(0)) - t1;
1274             printf(" (%d seconds)\n",t2);
1275         }
1276     }
1277 }
1278
1279 static int any_overlap(char *s1, char *s2)
1280 {   int a, b, i, j, flag;
1281     a=strlen(s1); b=strlen(s2);
1282     for (i=1-b; i<a; i++)
1283     {   flag=0;
1284         for (j=0; j<b; j++)
1285             if ((0<=i+j)&&(i+j<=a-1))
1286                 if (s1[i+j]!=s2[j]) flag=1;
1287         if (flag==0) return(1);
1288     }
1289     return(0);
1290 }
1291
1292 #define MAX_TLBS 8000
1293
1294 extern void optimise_abbreviations(void)
1295 {   int32 i, j, t, max=0, MAX_GTABLE;
1296     int32 j2, selected, available, maxat=0, nl;
1297     tlb test;
1298
1299     printf("Beginning calculation of optimal abbreviations...\n");
1300
1301     pass_no = 0;
1302     tlbtab=my_calloc(sizeof(tlb), MAX_TLBS, "tlb table"); no_occs=0;
1303     sub_buffer=my_calloc(sizeof(char), 4000, "sub_buffer");
1304     for (i=0; i<MAX_TLBS; i++) tlbtab[i].occurrences=0;
1305
1306     bestyet=my_calloc(sizeof(optab), 256, "bestyet");
1307     bestyet2=my_calloc(sizeof(optab), 64, "bestyet2");
1308
1309     bestyet2[0].text[0]='.';
1310     bestyet2[0].text[1]=' ';
1311     bestyet2[0].text[2]=0;
1312
1313     bestyet2[1].text[0]=',';
1314     bestyet2[1].text[1]=' ';
1315     bestyet2[1].text[2]=0;
1316
1317     for (i=0; all_text+i<all_text_top; i++)
1318     {
1319         if ((all_text[i]=='.') && (all_text[i+1]==' ') && (all_text[i+2]==' '))
1320         {   all_text[i]='\n'; all_text[i+1]='\n'; all_text[i+2]='\n';
1321             bestyet2[0].popularity++;
1322         }
1323
1324         if ((all_text[i]=='.') && (all_text[i+1]==' '))
1325         {   all_text[i]='\n'; all_text[i+1]='\n';
1326             bestyet2[0].popularity++;
1327         }
1328
1329         if ((all_text[i]==',') && (all_text[i+1]==' '))
1330         {   all_text[i]='\n'; all_text[i+1]='\n';
1331             bestyet2[1].popularity++;
1332         }
1333     }
1334
1335     MAX_GTABLE=subtract_pointers(all_text_top,all_text)+1;
1336     grandtable=my_calloc(4*sizeof(int32), MAX_GTABLE/4, "grandtable");
1337
1338     for (i=0, t=0; all_text+i<all_text_top; i++)
1339     {   test.text[0]=all_text[i];
1340         test.text[1]=all_text[i+1];
1341         test.text[2]=all_text[i+2];
1342         test.text[3]=0;
1343         if ((test.text[0]=='\n')||(test.text[1]=='\n')||(test.text[2]=='\n'))
1344             goto DontKeep;
1345         for (j=0; j<no_occs; j++)
1346             if (strcmp(test.text,tlbtab[j].text)==0)
1347                 goto DontKeep;
1348         test.occurrences=0;
1349         for (j=i+3; all_text+j<all_text_top; j++)
1350         {
1351 #ifdef MAC_FACE
1352             if (j%((**g_pm_hndl).linespercheck) == 0)
1353             {   ProcessEvents (&g_proc);
1354                 if (g_proc != true)
1355                 {   free_arrays();
1356                     if (store_the_text)
1357                         my_free(&all_text,"transcription text");
1358                     longjmp (g_fallback, 1);
1359                 }
1360             }
1361 #endif
1362             if ((all_text[i]==all_text[j])
1363                  && (all_text[i+1]==all_text[j+1])
1364                  && (all_text[i+2]==all_text[j+2]))
1365                  {   grandtable[t+test.occurrences]=j;
1366                      test.occurrences++;
1367                      if (t+test.occurrences==MAX_GTABLE)
1368                      {   printf("All %ld cross-references used\n",
1369                              (long int) MAX_GTABLE);
1370                          goto Built;
1371                      }
1372                  }
1373         }
1374         if (test.occurrences>=2)
1375         {   tlbtab[no_occs]=test;
1376             tlbtab[no_occs].intab=t; t+=tlbtab[no_occs].occurrences;
1377             if (max<tlbtab[no_occs].occurrences)
1378                 max=tlbtab[no_occs].occurrences;
1379             no_occs++;
1380             if (no_occs==MAX_TLBS)
1381             {   printf("All %d three-letter-blocks used\n",
1382                     MAX_TLBS);
1383                 goto Built;
1384             }
1385         }
1386         DontKeep: ;
1387     }
1388
1389     Built:
1390     grandflags=my_calloc(sizeof(int), max, "grandflags");
1391
1392
1393     printf("Cross-reference table (%ld entries) built...\n",
1394         (long int) no_occs);
1395     /*  for (i=0; i<no_occs; i++)
1396             printf("%4d %4d '%s' %d\n",i,tlbtab[i].intab,tlbtab[i].text,
1397                 tlbtab[i].occurrences);
1398     */
1399
1400     for (i=0; i<64; i++) bestyet2[i].length=0; selected=2;
1401     available=256;
1402     while ((available>0)&&(selected<64))
1403     {   printf("Pass %d\n", ++pass_no);
1404
1405         optimise_pass();
1406         available=0;
1407         for (i=0; i<256; i++)
1408             if (bestyet[i].score!=0)
1409             {   available++;
1410                 nl=bestyet[i].length;
1411                 for (j2=0; j2<nl; j2++) bestyet[i].text[j2]=
1412                     all_text[bestyet[i].location+j2];
1413                 bestyet[i].text[nl]=0;
1414             }
1415
1416     /*  printf("End of pass results:\n");
1417         printf("\nno   score  freq   string\n");
1418         for (i=0; i<256; i++)
1419             if (bestyet[i].score>0)
1420                 printf("%02d:  %4d   %4d   '%s'\n", i, bestyet[i].score,
1421                     bestyet[i].popularity, bestyet[i].text);
1422     */
1423
1424         do
1425         {   max=0;
1426             for (i=0; i<256; i++)
1427                 if (max<bestyet[i].score)
1428                 {   max=bestyet[i].score;
1429                     maxat=i;
1430                 }
1431
1432             if (max>0)
1433             {   bestyet2[selected++]=bestyet[maxat];
1434
1435                 printf(
1436                     "Selection %2ld: '%s' (repeated %ld times, scoring %ld)\n",
1437                     (long int) selected,bestyet[maxat].text,
1438                     (long int) bestyet[maxat].popularity,
1439                     (long int) bestyet[maxat].score);
1440
1441                 test.text[0]=bestyet[maxat].text[0];
1442                 test.text[1]=bestyet[maxat].text[1];
1443                 test.text[2]=bestyet[maxat].text[2];
1444                 test.text[3]=0;
1445
1446                 for (i=0; i<no_occs; i++)
1447                     if (strcmp(test.text,tlbtab[i].text)==0)
1448                         break;
1449
1450                 for (j=0; j<tlbtab[i].occurrences; j++)
1451                 {   if (memcmp(bestyet[maxat].text,
1452                                all_text+grandtable[tlbtab[i].intab+j],
1453                                bestyet[maxat].length)==0)
1454                     {   for (j2=0; j2<bestyet[maxat].length; j2++)
1455                             all_text[grandtable[tlbtab[i].intab+j]+j2]='\n';
1456                     }
1457                 }
1458
1459                 for (i=0; i<256; i++)
1460                     if ((bestyet[i].score>0)&&
1461                         (any_overlap(bestyet[maxat].text,bestyet[i].text)==1))
1462                     {   bestyet[i].score=0;
1463                        /* printf("Discarding '%s' as overlapping\n",
1464                             bestyet[i].text); */
1465                     }
1466             }
1467         } while ((max>0)&&(available>0)&&(selected<64));
1468     }
1469
1470     printf("\nChosen abbreviations (in Inform syntax):\n\n");
1471     for (i=0; i<selected; i++)
1472         printf("Abbreviate \"%s\";\n", bestyet2[i].text);
1473
1474     text_free_arrays();
1475 }
1476
1477 /* ------------------------------------------------------------------------- */
1478 /*   The dictionary manager begins here.                                     */
1479 /*                                                                           */
1480 /*   Speed is extremely important in these algorithms.  If a linear-time     */
1481 /*   routine were used to search the dictionary words so far built up, then  */
1482 /*   Inform would crawl.                                                     */
1483 /*                                                                           */
1484 /*   Instead, the dictionary is stored as a binary tree, which is kept       */
1485 /*   balanced with the red-black algorithm.                                  */
1486 /* ------------------------------------------------------------------------- */
1487 /*   A dictionary table similar to the Z-machine format is kept: there is a  */
1488 /*   7-byte header (left blank here to be filled in at the                   */
1489 /*   construct_storyfile() stage in "tables.c") and then a sequence of       */
1490 /*   records, one per word, in the form                                      */
1491 /*                                                                           */
1492 /*        <Z-coded text>    <flags>   <verbnumber>     <adjectivenumber>     */
1493 /*        4 or 6 bytes       byte        byte             byte               */
1494 /*                                                                           */
1495 /*   For Glulx, the form is instead: (But see below about Unicode-valued     */
1496 /*   dictionaries and my heinie.)                                            */
1497 /*                                                                           */
1498 /*        <plain text>      <flags>   <verbnumber>     <adjectivenumber>     */
1499 /*        DICT_WORD_SIZE     short       short            short              */
1500 /*                                                                           */
1501 /*   These records are stored in "accession order" (i.e. in order of their   */
1502 /*   first being received by these routines) and only alphabetically sorted  */
1503 /*   by construct_storyfile() (using the array below).                       */
1504 /* ------------------------------------------------------------------------- */
1505 /*                                                                           */
1506 /*   Further notes about the data fields...                                  */
1507 /*   The flags are currently:                                                */
1508 /*     bit 0: word is used as a verb (in verb grammar)                       */
1509 /*     bit 1: word is used as a meta verb                                    */
1510 /*     bit 2: word is plural (set by '//p')                                  */
1511 /*     bit 3: word is used as a preposition (in verb grammar)                */
1512 /*     bit 6: set for all verbs, but not used by the parser?                 */
1513 /*     bit 7: word is used as a noun (set for every word that appears in     */
1514 /*       code or in an object property)                                      */
1515 /*                                                                           */
1516 /*   In grammar version 2, the third field (adjectivenumber) is unused (and  */
1517 /*   zero).                                                                  */
1518 /*                                                                           */
1519 /*   The compiler generates special constants #dict_par1, #dict_par2,        */
1520 /*   #dict_par3 to refer to the byte offsets of the three fields. In         */
1521 /*   Z-code v3, these are 4/5/6; in v4+, they are 6/7/8. In Glulx, they      */
1522 /*   are $DICT_WORD_SIZE+2/4/6, referring to the *low* bytes of the three    */
1523 /*   fields. (The high bytes are $DICT_WORD_SIZE+1/3/5.)                     */
1524 /* ------------------------------------------------------------------------- */
1525
1526 uchar *dictionary,                    /* (These two pointers are externally
1527                                          used only in "tables.c" when
1528                                          building the story-file)            */
1529     *dictionary_top;                  /* Pointer to next free record         */
1530
1531 int dict_entries;                     /* Total number of records entered     */
1532
1533 /* ------------------------------------------------------------------------- */
1534 /*   dict_word is a typedef for a struct of 6 unsigned chars (defined in     */
1535 /*   "header.h"): it holds the (4 or) 6 bytes of Z-coded text of a word.     */
1536 /*   Usefully, because the PAD character 5 is < all alphabetic characters,   */
1537 /*   alphabetic order corresponds to numeric order.  For this reason, the    */
1538 /*   dict_word is called the "sort code" of the original text word.          */
1539 /*                                                                           */
1540 /*   ###- In modifying the compiler, I've found it easier to discard the     */
1541 /*   typedef, and operate directly on uchar arrays of length DICT_WORD_SIZE. */
1542 /*   In Z-code, DICT_WORD_SIZE will be 6, so the Z-code compiler will work   */
1543 /*   as before. In Glulx, it can be any value up to MAX_DICT_WORD_SIZE.      */
1544 /*   (That limit is defined as 40 in the header; it exists only for a few    */
1545 /*   static buffers, and can be increased without using significant memory.) */
1546 /*                                                                           */
1547 /*   ###- Well, that certainly bit me on the butt, didn't it. In further     */
1548 /*   modifying the compiler to generate a Unicode dictionary, I have to      */
1549 /*   store four-byte values in the uchar array. This is handled by making    */
1550 /*   the array size DICT_WORD_BYTES (which is DICT_WORD_SIZE*DICT_CHAR_SIZE).*/
1551 /*   Then we store the 32-bit character value big-endian. This lets us       */
1552 /*   continue to compare arrays bytewise, which is a nice simplification.    */
1553 /* ------------------------------------------------------------------------- */
1554
1555 extern int compare_sorts(uchar *d1, uchar *d2)
1556 {   int i;
1557     for (i=0; i<DICT_WORD_BYTES; i++)
1558         if (d1[i]!=d2[i]) return((int)(d1[i]) - (int)(d2[i]));
1559     /* (since memcmp(d1, d2, DICT_WORD_BYTES); runs into a bug on some Unix
1560        libraries) */
1561     return(0);
1562 }
1563
1564 extern void copy_sorts(uchar *d1, uchar *d2)
1565 {   int i;
1566     for (i=0; i<DICT_WORD_BYTES; i++)
1567         d1[i] = d2[i];
1568 }
1569
1570 static uchar prepared_sort[MAX_DICT_WORD_BYTES];     /* Holds the sort code
1571                                                         of current word */
1572
1573 static int number_and_case;
1574
1575 /* Also used by verbs.c */
1576 static void dictionary_prepare_z(char *dword, uchar *optresult)
1577 {   int i, j, k, k2, wd[13]; int32 tot;
1578
1579     /* A rapid text translation algorithm using only the simplified rules
1580        applying to the text of dictionary entries: first produce a sequence
1581        of 6 (v3) or 9 (v4+) Z-characters                                     */
1582
1583     int dictsize = (version_number==3) ? 6 : 9;
1584
1585     number_and_case = 0;
1586
1587     for (i=0, j=0; dword[j]!=0; i++, j++)
1588     {   if ((dword[j] == '/') && (dword[j+1] == '/'))
1589         {   for (j+=2; dword[j] != 0; j++)
1590             {   switch(dword[j])
1591                 {   case 'p': number_and_case |= 4;  break;
1592                     default:
1593                         error_named("Expected 'p' after '//' \
1594 to give number of dictionary word", dword);
1595                         break;
1596                 }
1597             }
1598             break;
1599         }
1600         if (i>=dictsize) break;
1601
1602         k=(int) dword[j];
1603         if (k==(int) '\'')
1604             warning_named("Obsolete usage: use the ^ character for the \
1605 apostrophe in", dword);
1606         if (k==(int) '^') k=(int) '\'';
1607         if (k=='\"') k='~';
1608
1609         if (k==(int) '@' || (character_set_unicode && (k & 0x80)))
1610         {   int unicode = text_to_unicode(dword+j);
1611             if ((unicode < 128) && isupper(unicode)) unicode = tolower(unicode);
1612             k = unicode_to_zscii(unicode);
1613             j += textual_form_length - 1;
1614             if ((k == 5) || (k >= 0x100))
1615             {   unicode_char_error(
1616                    "Character can be printed but not input:", unicode);
1617                 k = '?';
1618             }
1619             k2 = zscii_to_alphabet_grid[(uchar) k];
1620         }
1621         else
1622         {   if (isupper(k)) k = tolower(k);
1623             k2 = iso_to_alphabet_grid[(uchar) k];
1624         }
1625
1626         if (k2 < 0)
1627         {   if ((k2 == -5) || (k2 <= -0x100))
1628                 char_error("Character can be printed but not input:", k);
1629             else
1630             {   /* Use 4 more Z-chars to encode a ZSCII escape sequence      */
1631
1632                 wd[i++] = 5; wd[i++] = 6;
1633                 k2 = -k2;
1634                 wd[i++] = k2/32; wd[i] = k2%32;
1635             }
1636         }
1637         else
1638         {   alphabet_used[k2] = 'Y';
1639             if ((k2/26)!=0)
1640                 wd[i++]=3+(k2/26);            /* Change alphabet for symbols */
1641             wd[i]=6+(k2%26);                  /* Write the Z character       */
1642         }
1643     }
1644
1645     /* Fill up to the end of the dictionary block with PAD characters        */
1646
1647     for (; i<9; i++) wd[i]=5;
1648
1649     /* The array of Z-chars is converted to three 2-byte blocks              */
1650
1651     tot = wd[2] + wd[1]*(1<<5) + wd[0]*(1<<10);
1652     prepared_sort[1]=tot%0x100;
1653     prepared_sort[0]=(tot/0x100)%0x100;
1654     tot = wd[5] + wd[4]*(1<<5) + wd[3]*(1<<10);
1655     prepared_sort[3]=tot%0x100;
1656     prepared_sort[2]=(tot/0x100)%0x100;
1657     tot = wd[8] + wd[7]*(1<<5) + wd[6]*(1<<10);
1658     prepared_sort[5]=tot%0x100;
1659     prepared_sort[4]=(tot/0x100)%0x100;
1660
1661     /* Set the "end bit" on the 2nd (in v3) or the 3rd (v4+) 2-byte block    */
1662
1663     if (version_number==3) prepared_sort[2]+=0x80;
1664                       else prepared_sort[4]+=0x80;
1665
1666     if (optresult) copy_sorts(optresult, prepared_sort);
1667 }
1668
1669 /* Also used by verbs.c */
1670 static void dictionary_prepare_g(char *dword, uchar *optresult)
1671 {
1672   int i, j, k;
1673   int32 unicode;
1674
1675   number_and_case = 0;
1676
1677   for (i=0, j=0; (dword[j]!=0); i++, j++) {
1678     if ((dword[j] == '/') && (dword[j+1] == '/')) {
1679       for (j+=2; dword[j] != 0; j++) {
1680         switch(dword[j]) {
1681         case 'p':
1682           number_and_case |= 4;
1683           break;
1684         default:
1685           error_named("Expected 'p' after '//' \
1686 to give gender or number of dictionary word", dword);
1687           break;
1688         }
1689       }
1690       break;
1691     }
1692     if (i>=DICT_WORD_SIZE) break;
1693
1694     k= ((unsigned char *)dword)[j];
1695     if (k=='\'')
1696       warning_named("Obsolete usage: use the ^ character for the \
1697 apostrophe in", dword);
1698     if (k=='^')
1699       k='\'';
1700     if (k=='~') /* as in iso_to_alphabet_grid */
1701       k='\"';
1702
1703     if (k=='@' || (character_set_unicode && (k & 0x80))) {
1704       unicode = text_to_unicode(dword+j);
1705       j += textual_form_length - 1;
1706     }
1707     else {
1708       unicode = iso_to_unicode_grid[k];
1709     }
1710
1711     if (DICT_CHAR_SIZE != 1 || (unicode >= 0 && unicode < 256)) {
1712       k = unicode;
1713     }
1714     else {
1715       error("The dictionary cannot contain Unicode characters beyond Latin-1. \
1716 Define DICT_CHAR_SIZE=4 for a Unicode-compatible dictionary.");
1717       k = '?';
1718     }
1719
1720     if (k >= (unsigned)'A' && k <= (unsigned)'Z')
1721       k += ('a' - 'A');
1722
1723     if (DICT_CHAR_SIZE == 1) {
1724       prepared_sort[i] = k;
1725     }
1726     else {
1727       prepared_sort[4*i]   = (k >> 24) & 0xFF;
1728       prepared_sort[4*i+1] = (k >> 16) & 0xFF;
1729       prepared_sort[4*i+2] = (k >>  8) & 0xFF;
1730       prepared_sort[4*i+3] = (k)       & 0xFF;
1731     }
1732   }
1733
1734   if (DICT_CHAR_SIZE == 1) {
1735     for (; i<DICT_WORD_SIZE; i++)
1736       prepared_sort[i] = 0;
1737   }
1738   else {
1739     for (; i<DICT_WORD_SIZE; i++) {
1740       prepared_sort[4*i]   = 0;
1741       prepared_sort[4*i+1] = 0;
1742       prepared_sort[4*i+2] = 0;
1743       prepared_sort[4*i+3] = 0;
1744     }
1745   }
1746
1747   if (optresult) copy_sorts(optresult, prepared_sort);
1748 }
1749
1750 extern void dictionary_prepare(char *dword, uchar *optresult)
1751 {
1752   if (!glulx_mode)
1753     dictionary_prepare_z(dword, optresult);
1754   else
1755     dictionary_prepare_g(dword, optresult);
1756 }
1757
1758 /* ------------------------------------------------------------------------- */
1759 /*   The arrays below are all concerned with the problem of alphabetically   */
1760 /*   sorting the dictionary during the compilation pass.                     */
1761 /*   Note that it is not enough simply to apply qsort to the dictionary at   */
1762 /*   the end of the pass: we need to ensure that no duplicates are ever      */
1763 /*   created.                                                                */
1764 /*                                                                           */
1765 /*   dict_sort_codes[n]     the sort code of record n: i.e., of the nth      */
1766 /*                          word to be entered into the dictionary, where    */
1767 /*                          n counts upward from 0                           */
1768 /*                          (n is also called the "accession number")        */
1769 /*                                                                           */
1770 /*   The tree structure encodes an ordering.  The special value VACANT means */
1771 /*   "no node here": otherwise, node numbers are the same as accession       */
1772 /*   numbers.  At all times, "root" holds the node number of the top of the  */
1773 /*   tree; each node has up to two branches, such that the subtree of the    */
1774 /*   left branch is always alphabetically before what's at the node, and     */
1775 /*   the subtree to the right is always after; and all branches are coloured */
1776 /*   either "black" or "red".  These colours are used to detect points where */
1777 /*   the tree is growing asymmetrically (and therefore becoming inefficient  */
1778 /*   to search).                                                             */
1779 /* ------------------------------------------------------------------------- */
1780
1781 #define RED    'r'
1782 #define BLACK  'b'
1783 #define VACANT -1
1784
1785 static int root;
1786 typedef struct dict_tree_node_s
1787 {   int  branch[2];               /* Branch 0 is "left", 1 is "right" */
1788     char colour;                  /* The colour of the branch to the parent */
1789 } dict_tree_node;
1790
1791 static dict_tree_node *dtree;
1792
1793 int   *final_dict_order;
1794 static uchar *dict_sort_codes;
1795
1796 static void dictionary_begin_pass(void)
1797 {
1798     /*  Leave room for the 7-byte header (added in "tables.c" much later)    */
1799     /*  Glulx has a 4-byte header instead. */
1800
1801     if (!glulx_mode)
1802         dictionary_top=dictionary+7;
1803     else
1804         dictionary_top=dictionary+4;
1805
1806     root = VACANT;
1807     dict_entries = 0;
1808 }
1809
1810 static int fdo_count;
1811 static void recursively_sort(int node)
1812 {   if (dtree[node].branch[0] != VACANT)
1813         recursively_sort(dtree[node].branch[0]);
1814     final_dict_order[node] = fdo_count++;
1815     if (dtree[node].branch[1] != VACANT)
1816         recursively_sort(dtree[node].branch[1]);
1817 }
1818
1819 extern void sort_dictionary(void)
1820 {   int i;
1821     if (module_switch)
1822     {   for (i=0; i<dict_entries; i++)
1823             final_dict_order[i] = i;
1824         return;
1825     }
1826
1827     if (root != VACANT)
1828     {   fdo_count = 0; recursively_sort(root);
1829     }
1830 }
1831
1832 /* ------------------------------------------------------------------------- */
1833 /*   If "dword" is in the dictionary, return its accession number plus 1;    */
1834 /*   If not, return 0.                                                       */
1835 /* ------------------------------------------------------------------------- */
1836
1837 static int dictionary_find(char *dword)
1838 {   int at = root, n;
1839
1840     dictionary_prepare(dword, NULL);
1841
1842     while (at != VACANT)
1843     {   n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
1844         if (n==0) return at + 1;
1845         if (n>0) at = dtree[at].branch[1]; else at = dtree[at].branch[0];
1846     }
1847     return 0;
1848 }
1849
1850 /* ------------------------------------------------------------------------- */
1851 /*  Add "dword" to the dictionary with (x,y,z) as its data fields; unless    */
1852 /*  it already exists, in which case OR the data with (x,y,z)                */
1853 /*                                                                           */
1854 /*  These fields are one byte each in Z-code, two bytes each in Glulx.       */
1855 /*                                                                           */
1856 /*  Returns: the accession number.                                           */
1857 /* ------------------------------------------------------------------------- */
1858
1859 extern int dictionary_add(char *dword, int x, int y, int z)
1860 {   int n; uchar *p;
1861     int ggfr = 0, gfr = 0, fr = 0, r = 0;
1862     int ggf = VACANT, gf = VACANT, f = VACANT, at = root;
1863     int a, b;
1864     int res=((version_number==3)?4:6);
1865
1866     dictionary_prepare(dword, NULL);
1867
1868     if (root == VACANT)
1869     {   root = 0; goto CreateEntry;
1870     }
1871     while (TRUE)
1872     {
1873         n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
1874         if (n==0)
1875         {
1876             if (!glulx_mode) {
1877                 p = dictionary+7 + at*(3+res) + res;
1878                 p[0]=(p[0])|x; p[1]=(p[1])|y; p[2]=(p[2])|z;
1879                 if (x & 128) p[0] = (p[0])|number_and_case;
1880             }
1881             else {
1882                 p = dictionary+4 + at*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
1883                 p[0]=(p[0])|(x/256); p[1]=(p[1])|(x%256);
1884                 p[2]=(p[2])|(y/256); p[3]=(p[3])|(y%256);
1885                 p[4]=(p[4])|(z/256); p[5]=(p[5])|(z%256);
1886                 if (x & 128) p[1] = (p[1]) | number_and_case;
1887             }
1888             return at;
1889         }
1890         if (n>0) r=1; else r=0;
1891
1892         a = dtree[at].branch[0]; b = dtree[at].branch[1];
1893         if ((a != VACANT) && (dtree[a].colour == RED) &&
1894             (b != VACANT) && (dtree[b].colour == RED))
1895         {   dtree[a].colour = BLACK;
1896             dtree[b].colour = BLACK;
1897
1898             dtree[at].colour = RED;
1899
1900         /* A tree rotation may be needed to avoid two red links in a row:
1901            e.g.
1902              ggf   (or else gf is root)         ggf (or f is root)
1903               |                                  |
1904               gf                                 f
1905              / \(red)                           / \ (both red)
1906                 f            becomes          gf   at
1907                / \(red)                      /  \ /  \
1908                   at
1909                  /  \
1910
1911            In effect we rehang the "gf" subtree from "f".
1912            See the Technical Manual for further details.
1913         */
1914
1915             if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
1916             {
1917               if (fr == gfr)
1918               { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
1919                 dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
1920                 dtree[f].branch[1-fr] = gf;
1921                 dtree[f].colour = BLACK;
1922                 dtree[gf].colour = RED;
1923                 gf = ggf; gfr = ggfr;
1924               }
1925               else
1926               { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
1927                 dtree[at].colour = BLACK;
1928                 dtree[gf].colour = RED;
1929                 dtree[f].branch[fr] = dtree[at].branch[gfr];
1930                 dtree[gf].branch[gfr] = dtree[at].branch[fr];
1931                 dtree[at].branch[gfr] = f;
1932                 dtree[at].branch[fr] = gf;
1933
1934                 r = 1-r; n = at; if (r==fr) at = f; else at = gf;
1935                 f = n; gf = ggf; fr = 1-r; gfr = ggfr;
1936               }
1937             }
1938         }
1939
1940         if (dtree[at].branch[r] == VACANT)
1941         {   dtree[at].colour = RED;
1942
1943             if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
1944             { if (fr == gfr)
1945               { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
1946                 dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
1947                 dtree[f].branch[1-fr] = gf;
1948                 dtree[f].colour = BLACK;
1949                 dtree[gf].colour = RED;
1950               }
1951               else
1952               { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
1953                 dtree[at].colour = BLACK;
1954                 dtree[gf].colour = RED;
1955                 dtree[f].branch[fr] = dtree[at].branch[gfr];
1956                 dtree[gf].branch[gfr] = dtree[at].branch[fr];
1957                 dtree[at].branch[gfr] = f;
1958                 dtree[at].branch[fr] = gf;
1959
1960                 r = 1-r; n = at; if (r==fr) at = f; else at = gf;
1961                 f = n; gf = ggf;
1962               }
1963             }
1964             dtree[at].branch[r] = dict_entries;
1965             goto CreateEntry;
1966         }
1967         ggf = gf; gf = f; f = at; at = dtree[at].branch[r];
1968         ggfr = gfr; gfr = fr; fr = r;
1969     }
1970
1971     CreateEntry:
1972
1973     if (dict_entries==MAX_DICT_ENTRIES)
1974         memoryerror("MAX_DICT_ENTRIES",MAX_DICT_ENTRIES);
1975
1976     dtree[dict_entries].branch[0] = VACANT;
1977     dtree[dict_entries].branch[1] = VACANT;
1978     dtree[dict_entries].colour    = BLACK;
1979
1980     /*  Address in Inform's own dictionary table to write the record to      */
1981
1982     if (!glulx_mode) {
1983
1984         p = dictionary + (3+res)*dict_entries + 7;
1985
1986         /*  So copy in the 4 (or 6) bytes of Z-coded text and the 3 data
1987             bytes */
1988
1989         p[0]=prepared_sort[0]; p[1]=prepared_sort[1];
1990         p[2]=prepared_sort[2]; p[3]=prepared_sort[3];
1991         if (version_number > 3)
1992           {   p[4]=prepared_sort[4]; p[5]=prepared_sort[5]; }
1993         p[res]=x; p[res+1]=y; p[res+2]=z;
1994         if (x & 128) p[res] = (p[res])|number_and_case;
1995
1996         dictionary_top += res+3;
1997
1998     }
1999     else {
2000         int i;
2001         p = dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*dict_entries;
2002         p[0] = 0x60; /* type byte -- dict word */
2003
2004         p += DICT_CHAR_SIZE;
2005         for (i=0; i<DICT_WORD_BYTES; i++)
2006           p[i] = prepared_sort[i];
2007
2008         p += DICT_WORD_BYTES;
2009         p[0] = 0; p[1] = x;
2010         p[2] = y/256; p[3] = y%256;
2011         p[4] = 0; p[5] = z;
2012         if (x & 128)
2013           p[1] |= number_and_case;
2014
2015         dictionary_top += DICT_ENTRY_BYTE_LENGTH;
2016
2017     }
2018
2019     copy_sorts(dict_sort_codes+dict_entries*DICT_WORD_BYTES, prepared_sort);
2020
2021     return dict_entries++;
2022 }
2023
2024 /* ------------------------------------------------------------------------- */
2025 /*   Used in "tables.c" for "Extend ... only", to renumber a verb-word to a  */
2026 /*   new verb syntax of its own.  (Otherwise existing verb-words never       */
2027 /*   change their verb-numbers.)                                             */
2028 /* ------------------------------------------------------------------------- */
2029
2030 extern void dictionary_set_verb_number(char *dword, int to)
2031 {   int i; uchar *p;
2032     int res=((version_number==3)?4:6);
2033     i=dictionary_find(dword);
2034     if (i!=0)
2035     {
2036         if (!glulx_mode) {
2037             p=dictionary+7+(i-1)*(3+res)+res;
2038             p[1]=to;
2039         }
2040         else {
2041             p=dictionary+4 + (i-1)*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
2042             p[2]=to/256; p[3]=to%256;
2043         }
2044     }
2045 }
2046
2047 /* ------------------------------------------------------------------------- */
2048 /*   Tracing code for the dictionary: used not only by "trace" and text      */
2049 /*   transcription, but also (in the case of "word_to_ascii") in a vital     */
2050 /*   by the linker.                                                          */
2051 /* ------------------------------------------------------------------------- */
2052
2053 static char *d_show_to;
2054 static int d_show_total;
2055
2056 static void show_char(char c)
2057 {   if (d_show_to == NULL) printf("%c", c);
2058     else
2059     {   int i = strlen(d_show_to);
2060         d_show_to[i] = c; d_show_to[i+1] = 0;
2061     }
2062 }
2063
2064 extern void word_to_ascii(uchar *p, char *results)
2065 {   int i, shift, cc, zchar; uchar encoded_word[9];
2066     encoded_word[0] = (((int) p[0])&0x7c)/4;
2067     encoded_word[1] = 8*(((int) p[0])&0x3) + (((int) p[1])&0xe0)/32;
2068     encoded_word[2] = ((int) p[1])&0x1f;
2069     encoded_word[3] = (((int) p[2])&0x7c)/4;
2070     encoded_word[4] = 8*(((int) p[2])&0x3) + (((int) p[3])&0xe0)/32;
2071     encoded_word[5] = ((int) p[3])&0x1f;
2072     if (version_number > 3)
2073     {   encoded_word[6] = (((int) p[4])&0x7c)/4;
2074         encoded_word[7] = 8*(((int) p[4])&0x3) + (((int) p[5])&0xe0)/32;
2075         encoded_word[8] = ((int) p[5])&0x1f;
2076     }
2077
2078     shift = 0; cc = 0;
2079     for (i=0; i< ((version_number==3)?6:9); i++)
2080     {   zchar = encoded_word[i];
2081
2082         if (zchar == 4) shift = 1;
2083         else
2084         if (zchar == 5) shift = 2;
2085         else
2086         {   if ((shift == 2) && (zchar == 6))
2087             {   zchar = 32*encoded_word[i+1] + encoded_word[i+2];
2088                 i += 2;
2089                 if ((zchar>=32) && (zchar<=126))
2090                     results[cc++] = zchar;
2091                 else
2092                 {   zscii_to_text(results+cc, zchar);
2093                     cc = strlen(results);
2094                 }
2095             }
2096             else
2097             {   zscii_to_text(results+cc, (alphabet[shift])[zchar-6]);
2098                 cc = strlen(results);
2099             }
2100             shift = 0;
2101         }
2102     }
2103     results[cc] = 0;
2104 }
2105
2106 static void recursively_show_z(int node)
2107 {   int i, cprinted, flags; uchar *p;
2108     char textual_form[32];
2109     int res = (version_number == 3)?4:6;
2110
2111     if (dtree[node].branch[0] != VACANT)
2112         recursively_show_z(dtree[node].branch[0]);
2113
2114     p = (uchar *)dictionary + 7 + (3+res)*node;
2115
2116     word_to_ascii(p, textual_form);
2117
2118     for (cprinted = 0; textual_form[cprinted]!=0; cprinted++)
2119         show_char(textual_form[cprinted]);
2120     for (; cprinted < 4 + ((version_number==3)?6:9); cprinted++)
2121         show_char(' ');
2122
2123     if (d_show_to == NULL)
2124     {   for (i=0; i<3+res; i++) printf("%02x ",p[i]);
2125
2126         flags = (int) p[res];
2127         if (flags & 128)
2128         {   printf("noun ");
2129             if (flags & 4)  printf("p"); else printf(" ");
2130             printf(" ");
2131         }
2132         else printf("       ");
2133         if (flags & 8)
2134         {   if (grammar_version_number == 1)
2135                 printf("preposition:%d  ", (int) p[res+2]);
2136             else
2137                 printf("preposition    ");
2138         }
2139         if ((flags & 3) == 3) printf("metaverb:%d  ", (int) p[res+1]);
2140         else if ((flags & 3) == 1) printf("verb:%d  ", (int) p[res+1]);
2141         printf("\n");
2142     }
2143
2144     if (d_show_total++ == 5)
2145     {   d_show_total = 0;
2146         if (d_show_to != NULL)
2147         {   write_to_transcript_file(d_show_to);
2148             d_show_to[0] = 0;
2149         }
2150     }
2151
2152     if (dtree[node].branch[1] != VACANT)
2153         recursively_show_z(dtree[node].branch[1]);
2154 }
2155
2156 static void recursively_show_g(int node)
2157 {
2158   warning("### Glulx dictionary-show not yet implemented.\n");
2159 }
2160
2161 static void show_alphabet(int i)
2162 {   int j, c; char chartext[8];
2163
2164     for (j=0; j<26; j++)
2165     {   c = alphabet[i][j];
2166
2167         if (alphabet_used[26*i+j] == 'N') printf("("); else printf(" ");
2168
2169         zscii_to_text(chartext, c);
2170         printf("%s", chartext);
2171
2172         if (alphabet_used[26*i+j] == 'N') printf(")"); else printf(" ");
2173     }
2174     printf("\n");
2175 }
2176
2177 extern void show_dictionary(void)
2178 {   printf("Dictionary contains %d entries:\n",dict_entries);
2179     if (dict_entries != 0)
2180     {   d_show_total = 0; d_show_to = NULL;
2181         if (!glulx_mode)
2182             recursively_show_z(root);
2183         else
2184             recursively_show_g(root);
2185     }
2186     printf("\nZ-machine alphabet entries:\n");
2187     show_alphabet(0);
2188     show_alphabet(1);
2189     show_alphabet(2);
2190 }
2191
2192 extern void write_dictionary_to_transcript(void)
2193 {   char d_buffer[81];
2194
2195     sprintf(d_buffer, "\n[Dictionary contains %d entries:]\n", dict_entries);
2196
2197     d_buffer[0] = 0; write_to_transcript_file(d_buffer);
2198
2199     if (dict_entries != 0)
2200     {   d_show_total = 0; d_show_to = d_buffer;
2201         if (!glulx_mode)
2202             recursively_show_z(root);
2203         else
2204             recursively_show_g(root);
2205     }
2206     if (d_show_total != 0) write_to_transcript_file(d_buffer);
2207 }
2208
2209 /* ========================================================================= */
2210 /*   Data structure management routines                                      */
2211 /* ------------------------------------------------------------------------- */
2212
2213 extern void init_text_vars(void)
2214 {   int j;
2215     bestyet = NULL;
2216     bestyet2 = NULL;
2217     tlbtab = NULL;
2218     grandtable = NULL;
2219     grandflags = NULL;
2220     no_chars_transcribed = 0;
2221     is_abbreviation = FALSE;
2222
2223     for (j=0; j<256; j++) abbrevs_lookup[j] = -1;
2224
2225     total_zchars_trans = 0;
2226
2227     dtree = NULL;
2228     final_dict_order = NULL;
2229     dict_sort_codes = NULL;
2230     dict_entries=0;
2231
2232     initialise_memory_block(&static_strings_area);
2233 }
2234
2235 extern void text_begin_pass(void)
2236 {   abbrevs_lookup_table_made = FALSE;
2237     no_abbreviations=0;
2238     total_chars_trans=0; total_bytes_trans=0;
2239     if (store_the_text) all_text_top=all_text;
2240     dictionary_begin_pass();
2241     low_strings_top = low_strings;
2242
2243     static_strings_extent = 0;
2244     no_strings = 0;
2245     no_dynamic_strings = 0;
2246     no_unicode_chars = 0;
2247 }
2248
2249 /*  Note: for allocation and deallocation of all_the_text, see inform.c      */
2250
2251 extern void text_allocate_arrays(void)
2252 {   abbreviations_at = my_malloc(MAX_ABBREVS*MAX_ABBREV_LENGTH,
2253         "abbreviations");
2254     abbrev_values    = my_calloc(sizeof(int), MAX_ABBREVS, "abbrev values");
2255     abbrev_quality   = my_calloc(sizeof(int), MAX_ABBREVS, "abbrev quality");
2256     abbrev_freqs     = my_calloc(sizeof(int),   MAX_ABBREVS, "abbrev freqs");
2257
2258     dtree            = my_calloc(sizeof(dict_tree_node), MAX_DICT_ENTRIES,
2259                                  "red-black tree for dictionary");
2260     final_dict_order = my_calloc(sizeof(int),  MAX_DICT_ENTRIES,
2261                                  "final dictionary ordering table");
2262     dict_sort_codes  = my_calloc(DICT_WORD_BYTES, MAX_DICT_ENTRIES,
2263                                  "dictionary sort codes");
2264
2265     if (!glulx_mode)
2266         dictionary = my_malloc(9*MAX_DICT_ENTRIES+7,
2267             "dictionary");
2268     else
2269         dictionary = my_malloc(DICT_ENTRY_BYTE_LENGTH*MAX_DICT_ENTRIES+4,
2270             "dictionary");
2271
2272     strings_holding_area
2273          = my_malloc(MAX_STATIC_STRINGS,"static strings holding area");
2274     low_strings = my_malloc(MAX_LOW_STRINGS,"low (abbreviation) strings");
2275
2276     huff_entities = NULL;
2277     hufflist = NULL;
2278     unicode_usage_entries = NULL;
2279     done_compression = FALSE;
2280     compression_table_size = 0;
2281     compressed_offsets = NULL;
2282
2283     MAX_CHARACTER_SET = 0;
2284
2285     if (glulx_mode) {
2286       if (compression_switch) {
2287         int ix;
2288         MAX_CHARACTER_SET = 257 + MAX_ABBREVS + MAX_DYNAMIC_STRINGS
2289           + MAX_UNICODE_CHARS;
2290         huff_entities = my_calloc(sizeof(huffentity_t), MAX_CHARACTER_SET*2+1,
2291           "huffman entities");
2292         hufflist = my_calloc(sizeof(huffentity_t *), MAX_CHARACTER_SET,
2293           "huffman node list");
2294         unicode_usage_entries = my_calloc(sizeof(unicode_usage_t),
2295           MAX_UNICODE_CHARS, "unicode entity entries");
2296         for (ix=0; ix<UNICODE_HASH_BUCKETS; ix++)
2297           unicode_usage_hash[ix] = NULL;
2298       }
2299       compressed_offsets = my_calloc(sizeof(int32), MAX_NUM_STATIC_STRINGS,
2300         "static strings index table");
2301     }
2302 }
2303
2304 extern void text_free_arrays(void)
2305 {
2306     my_free(&strings_holding_area, "static strings holding area");
2307     my_free(&low_strings, "low (abbreviation) strings");
2308     my_free(&abbreviations_at, "abbreviations");
2309     my_free(&abbrev_values,    "abbrev values");
2310     my_free(&abbrev_quality,   "abbrev quality");
2311     my_free(&abbrev_freqs,     "abbrev freqs");
2312
2313     my_free(&dtree,            "red-black tree for dictionary");
2314     my_free(&final_dict_order, "final dictionary ordering table");
2315     my_free(&dict_sort_codes,  "dictionary sort codes");
2316
2317     my_free(&dictionary,"dictionary");
2318
2319     my_free(&compressed_offsets, "static strings index table");
2320     my_free(&hufflist, "huffman node list");
2321     my_free(&huff_entities, "huffman entities");
2322     my_free(&unicode_usage_entries, "unicode entity entities");
2323
2324     deallocate_memory_block(&static_strings_area);
2325 }
2326
2327 extern void ao_free_arrays(void)
2328 {   my_free (&tlbtab,"tlb table");
2329     my_free (&sub_buffer,"sub_buffer");
2330     my_free (&bestyet,"bestyet");
2331     my_free (&bestyet2,"bestyet2");
2332     my_free (&grandtable,"grandtable");
2333     my_free (&grandflags,"grandflags");
2334 }
2335
2336 /* ========================================================================= */