src/text.c

   1 /* ------------------------------------------------------------------------- */
   2 /*   "text" : Text translation, the abbreviations optimiser, the dictionary  */
   3 /*                                                                           */
   4 /*   Part of Inform 6.35                                                     */
   5 /*   copyright (c) Graham Nelson 1993 - 2020                                 */
   6 /*                                                                           */
   7 /* Inform is free software: you can redistribute it and/or modify            */
   8 /* it under the terms of the GNU General Public License as published by      */
   9 /* the Free Software Foundation, either version 3 of the License, or         */
  10 /* (at your option) any later version.                                       */
  11 /*                                                                           */
  12 /* Inform is distributed in the hope that it will be useful,                 */
  13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of            */
  14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the              */
  15 /* GNU General Public License for more details.                              */
  16 /*                                                                           */
  17 /* You should have received a copy of the GNU General Public License         */
  18 /* along with Inform. If not, see https://gnu.org/licenses/                  *
  19 /*                                                                           */
  20 /* ------------------------------------------------------------------------- */
  21
  22 #include "header.h"
  23
  24 uchar *low_strings, *low_strings_top;  /* Start and next free byte in the low
  25                                           strings pool */
  26
  27 int32 static_strings_extent;           /* Number of bytes of static strings
  28                                           made so far */
  29 memory_block static_strings_area;      /* Used if (!temporary_files_switch) to
  30                                           hold the static strings area so far */
  31
  32 static uchar *strings_holding_area;    /* Area holding translated strings
  33                                           until they are moved into either
  34                                           a temporary file, or the
  35                                           static_strings_area below */
  36
  37 char *all_text, *all_text_top;         /* Start and next byte free in (large)
  38                                           text buffer holding the entire text
  39                                           of the game, when it is being
  40                                           recorded                           */
  41 int put_strings_in_low_memory,         /* When TRUE, put static strings in
  42                                           the low strings pool at 0x100 rather
  43                                           than in the static strings area    */
  44     is_abbreviation,                   /* When TRUE, the string being trans
  45                                           is itself an abbreviation string
  46                                           so can't make use of abbreviations */
  47     abbrevs_lookup_table_made,         /* The abbreviations lookup table is
  48                                           constructed when the first non-
  49                                           abbreviation string is translated:
  50                                           this flag is TRUE after that       */
  51     abbrevs_lookup[256];               /* Once this has been constructed,
  52                                           abbrevs_lookup[n] = the smallest
  53                                           number of any abbreviation beginning
  54                                           with ASCII character n, or -1
  55                                           if none of the abbreviations do    */
  56 int no_abbreviations;                  /* No of abbreviations defined so far */
  57 uchar *abbreviations_at;                 /* Memory to hold the text of any
  58                                           abbreviation strings declared      */
  59 /* ------------------------------------------------------------------------- */
  60 /*   Glulx string compression storage                                        */
  61 /* ------------------------------------------------------------------------- */
  62
  63 int no_strings;                        /* No of strings in static strings
  64                                           area.                              */
  65 int no_dynamic_strings;                /* No. of @.. string escapes used
  66                                           (actually, the highest value used
  67                                           plus one)                          */
  68 int no_unicode_chars;                  /* Number of distinct Unicode chars
  69                                           used. (Beyond 0xFF.)               */
  70
  71 static int MAX_CHARACTER_SET;          /* Number of possible entities */
  72 huffentity_t *huff_entities;           /* The list of entities (characters,
  73                                           abbreviations, @.. escapes, and
  74                                           the terminator)                    */
  75 static huffentity_t **hufflist;        /* Copy of the list, for sorting      */
  76
  77 int no_huff_entities;                  /* The number of entities in the list */
  78 int huff_unicode_start;                /* Position in the list where Unicode
  79                                           chars begin.                       */
  80 int huff_abbrev_start;                 /* Position in the list where string
  81                                           abbreviations begin.               */
  82 int huff_dynam_start;                  /* Position in the list where @..
  83                                           entities begin.                    */
  84 int huff_entity_root;                  /* The position in the list of the root
  85                                           entry (when considering the table
  86                                           as a tree).                        */
  87
  88 int done_compression;                  /* Has the game text been compressed? */
  89 int32 compression_table_size;          /* Length of the Huffman table, in
  90                                           bytes                              */
  91 int32 compression_string_size;         /* Length of the compressed string
  92                                           data, in bytes                     */
  93 int32 *compressed_offsets;             /* The beginning of every string in
  94                                           the game, relative to the beginning
  95                                           of the Huffman table. (So entry 0
  96                                           is equal to compression_table_size)*/
  97
  98 #define UNICODE_HASH_BUCKETS (64)
  99 unicode_usage_t *unicode_usage_entries;
 100 static unicode_usage_t *unicode_usage_hash[UNICODE_HASH_BUCKETS];
 101
 102 static int unicode_entity_index(int32 unicode);
 103
 104 /* ------------------------------------------------------------------------- */
 105 /*   Abbreviation arrays                                                     */
 106 /* ------------------------------------------------------------------------- */
 107
 108 int *abbrev_values;
 109 int *abbrev_quality;
 110 int *abbrev_freqs;
 111
 112 /* ------------------------------------------------------------------------- */
 113
 114 int32 total_chars_trans,               /* Number of ASCII chars of text in   */
 115       total_bytes_trans,               /* Number of bytes of Z-code text out */
 116       zchars_trans_in_last_string;     /* Number of Z-chars in last string:
 117                                           needed only for abbrev efficiency
 118                                           calculation in "directs.c"         */
 119 static int32 total_zchars_trans,       /* Number of Z-chars of text out
 120                                           (only used to calculate the above) */
 121       no_chars_transcribed;            /* Number of ASCII chars written to
 122                                           the text transcription area (used
 123                                           for the -r and -u switches)        */
 124
 125 static int zchars_out_buffer[3],       /* During text translation, a buffer of
 126                                           3 Z-chars at a time: when it's full
 127                                           these are written as a 2-byte word */
 128            zob_index;                  /* Index (0 to 2) into it             */
 129
 130 static unsigned char *text_out_pc;     /* The "program counter" during text
 131                                           translation: the next address to
 132                                           write Z-coded text output to       */
 133
 134 static unsigned char *text_out_limit;  /* The upper limit of text_out_pc
 135                                           during text translation            */
 136
 137 static int text_out_overflow;          /* During text translation, becomes
 138                                           true if text_out_pc tries to pass
 139                                           text_out_limit                     */
 140
 141 /* ------------------------------------------------------------------------- */
 142 /*   For variables/arrays used by the dictionary manager, see below          */
 143 /* ------------------------------------------------------------------------- */
 144
 145 /* ------------------------------------------------------------------------- */
 146 /*   Prepare the abbreviations lookup table (used to speed up abbreviation   */
 147 /*   detection in text translation).  We first bubble-sort the abbrevs into  */
 148 /*   alphabetical order (this is necessary for the detection algorithm to    */
 149 /*   to work).  Since the table is only prepared once, and for a table       */
 150 /*   of size at most 96, there's no point using an efficient sort algorithm. */
 151 /* ------------------------------------------------------------------------- */
 152
 153 static void make_abbrevs_lookup(void)
 154 {   int bubble_sort, j, k, l; char p[MAX_ABBREV_LENGTH]; char *p1, *p2;
 155     do
 156     {   bubble_sort = FALSE;
 157         for (j=0; j<no_abbreviations; j++)
 158             for (k=j+1; k<no_abbreviations; k++)
 159             {   p1=(char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
 160                 p2=(char *)abbreviations_at+k*MAX_ABBREV_LENGTH;
 161                 if (strcmp(p1,p2)<0)
 162                 {   strcpy(p,p1); strcpy(p1,p2); strcpy(p2,p);
 163                     l=abbrev_values[j]; abbrev_values[j]=abbrev_values[k];
 164                     abbrev_values[k]=l;
 165                     l=abbrev_quality[j]; abbrev_quality[j]=abbrev_quality[k];
 166                     abbrev_quality[k]=l;
 167                     bubble_sort = TRUE;
 168                 }
 169             }
 170     } while (bubble_sort);
 171
 172     for (j=no_abbreviations-1; j>=0; j--)
 173     {   p1=(char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
 174         abbrevs_lookup[(uchar)p1[0]]=j;
 175         abbrev_freqs[j]=0;
 176     }
 177     abbrevs_lookup_table_made = TRUE;
 178 }
 179
 180 /* ------------------------------------------------------------------------- */
 181 /*   Search the abbreviations lookup table (a routine which must be fast).   */
 182 /*   The source text to compare is text[i], text[i+1], ... and this routine  */
 183 /*   is only called if text[i] is indeed the first character of at least one */
 184 /*   abbreviation, "from" begin the least index into the abbreviations table */
 185 /*   of an abbreviation for which text[i] is the first character.  Recall    */
 186 /*   that the abbrevs table is in alphabetical order.                        */
 187 /*                                                                           */
 188 /*   The return value is -1 if there is no match.  If there is a match, the  */
 189 /*   text to be abbreviated out is over-written by a string of null chars    */
 190 /*   with "ASCII" value 1, and the abbreviation number is returned.          */
 191 /*                                                                           */
 192 /*   In Glulx, we *do not* do this overwriting with 1's.                     */
 193 /* ------------------------------------------------------------------------- */
 194
 195 static int try_abbreviations_from(unsigned char *text, int i, int from)
 196 {   int j, k; uchar *p, c;
 197     c=text[i];
 198     for (j=from, p=(uchar *)abbreviations_at+from*MAX_ABBREV_LENGTH;
 199          (j<no_abbreviations)&&(c==p[0]); j++, p+=MAX_ABBREV_LENGTH)
 200     {   if (text[i+1]==p[1])
 201         {   for (k=2; p[k]!=0; k++)
 202                 if (text[i+k]!=p[k]) goto NotMatched;
 203             if (!glulx_mode) {
 204                 for (k=0; p[k]!=0; k++) text[i+k]=1;
 205             }
 206             abbrev_freqs[j]++;
 207             return(j);
 208             NotMatched: ;
 209         }
 210     }
 211     return(-1);
 212 }
 213
 214 extern void make_abbreviation(char *text)
 215 {
 216     strcpy((char *)abbreviations_at
 217             + no_abbreviations*MAX_ABBREV_LENGTH, text);
 218
 219     is_abbreviation = TRUE;
 220     abbrev_values[no_abbreviations] = compile_string(text, TRUE, TRUE);
 221     is_abbreviation = FALSE;
 222
 223     /*   The quality is the number of Z-chars saved by using this            */
 224     /*   abbreviation: note that it takes 2 Z-chars to print it.             */
 225
 226     abbrev_quality[no_abbreviations++] = zchars_trans_in_last_string - 2;
 227 }
 228
 229 /* ------------------------------------------------------------------------- */
 230 /*   The front end routine for text translation                              */
 231 /* ------------------------------------------------------------------------- */
 232
 233 extern int32 compile_string(char *b, int in_low_memory, int is_abbrev)
 234 {   int i, j; uchar *c;
 235
 236     is_abbreviation = is_abbrev;
 237
 238     /* Put into the low memory pool (at 0x100 in the Z-machine) of strings   */
 239     /* which may be wanted as possible entries in the abbreviations table    */
 240
 241     if (!glulx_mode && in_low_memory)
 242     {   j=subtract_pointers(low_strings_top,low_strings);
 243         low_strings_top=translate_text(low_strings_top, low_strings+MAX_LOW_STRINGS, b);
 244         if (!low_strings_top)
 245             memoryerror("MAX_LOW_STRINGS", MAX_LOW_STRINGS);
 246         is_abbreviation = FALSE;
 247         return(0x21+(j/2));
 248     }
 249
 250     if (glulx_mode && done_compression)
 251         compiler_error("Tried to add a string after compression was done.");
 252
 253     c = translate_text(strings_holding_area, strings_holding_area+MAX_STATIC_STRINGS, b);
 254     if (!c)
 255         memoryerror("MAX_STATIC_STRINGS",MAX_STATIC_STRINGS);
 256
 257     i = subtract_pointers(c, strings_holding_area);
 258
 259     /* Insert null bytes as needed to ensure that the next static string */
 260     /* also occurs at an address expressible as a packed address         */
 261
 262     if (!glulx_mode) {
 263         int textalign;
 264         if (oddeven_packing_switch)
 265             textalign = scale_factor*2;
 266         else
 267             textalign = scale_factor;
 268         while ((i%textalign)!=0)
 269         {
 270             if (i+2 > MAX_STATIC_STRINGS)
 271                 memoryerror("MAX_STATIC_STRINGS",MAX_STATIC_STRINGS);
 272             i+=2; *c++ = 0; *c++ = 0;
 273         }
 274     }
 275
 276     j = static_strings_extent;
 277
 278     if (temporary_files_switch)
 279         for (c=strings_holding_area; c<strings_holding_area+i;
 280              c++, static_strings_extent++)
 281             fputc(*c,Temp1_fp);
 282     else
 283         for (c=strings_holding_area; c<strings_holding_area+i;
 284              c++, static_strings_extent++)
 285             write_byte_to_memory_block(&static_strings_area,
 286                 static_strings_extent, *c);
 287
 288     is_abbreviation = FALSE;
 289
 290     if (!glulx_mode) {
 291         return(j/scale_factor);
 292     }
 293     else {
 294         /* The marker value is a one-based string number. (We reserve zero
 295            to mean "not a string at all". */
 296         return (++no_strings);
 297     }
 298 }
 299
 300 /* ------------------------------------------------------------------------- */
 301 /*   Output a single Z-character into the buffer, and flush it if full       */
 302 /* ------------------------------------------------------------------------- */
 303
 304 static void write_z_char_z(int i)
 305 {   uint32 j;
 306     ASSERT_ZCODE();
 307     total_zchars_trans++;
 308     zchars_out_buffer[zob_index++]=(i%32);
 309     if (zob_index!=3) return;
 310     zob_index=0;
 311     j= zchars_out_buffer[0]*0x0400 + zchars_out_buffer[1]*0x0020
 312        + zchars_out_buffer[2];
 313     if (text_out_pc+2 > text_out_limit) {
 314         text_out_overflow = TRUE;
 315         return;
 316     }
 317     text_out_pc[0] = j/256; text_out_pc[1] = j%256; text_out_pc+=2;
 318     total_bytes_trans+=2;
 319 }
 320
 321 static void write_zscii(int zsc)
 322 {
 323     int lookup_value, in_alphabet;
 324
 325     if (zsc==' ')
 326     {   write_z_char_z(0);
 327         return;
 328     }
 329
 330     if (zsc < 0x100) lookup_value = zscii_to_alphabet_grid[zsc];
 331
 332     else lookup_value = -1;
 333
 334     if (lookup_value >= 0)
 335     {   alphabet_used[lookup_value] = 'Y';
 336         in_alphabet = lookup_value/26;
 337         if (in_alphabet==1) write_z_char_z(4);  /* SHIFT to A1 */
 338         if (in_alphabet==2) write_z_char_z(5);  /* SHIFT to A2 */
 339         write_z_char_z(lookup_value%26 + 6);
 340     }
 341     else
 342     {   write_z_char_z(5); write_z_char_z(6);
 343         write_z_char_z(zsc/32); write_z_char_z(zsc%32);
 344     }
 345 }
 346
 347 /* ------------------------------------------------------------------------- */
 348 /*   Finish a Z-coded string, padding out with Z-char 5s if necessary and    */
 349 /*   setting the "end" bit on the final 2-byte word                          */
 350 /* ------------------------------------------------------------------------- */
 351
 352 static void end_z_chars(void)
 353 {   unsigned char *p;
 354     zchars_trans_in_last_string=total_zchars_trans-zchars_trans_in_last_string;
 355     while (zob_index!=0) write_z_char_z(5);
 356     p=(unsigned char *) text_out_pc;
 357     *(p-2)= *(p-2)+128;
 358 }
 359
 360 /* Glulx handles this much more simply -- compression is done elsewhere. */
 361 static void write_z_char_g(int i)
 362 {
 363   ASSERT_GLULX();
 364   if (text_out_pc+1 > text_out_limit) {
 365       text_out_overflow = TRUE;
 366       return;
 367   }
 368   total_zchars_trans++;
 369   text_out_pc[0] = i;
 370   text_out_pc++;
 371   total_bytes_trans++;
 372 }
 373
 374 /* ------------------------------------------------------------------------- */
 375 /*   The main routine "text.c" provides to the rest of Inform: the text      */
 376 /*   translator. p is the address to write output to, s_text the source text */
 377 /*   and the return value is the next free address to write output to.       */
 378 /*   The return value will not exceed p_limit. If the translation tries to   */
 379 /*   overflow this boundary, the return value will be NULL (and you should   */
 380 /*   display an error).                                                      */
 381 /*   Note that the source text may be corrupted by this routine.             */
 382 /* ------------------------------------------------------------------------- */
 383
 384 extern uchar *translate_text(uchar *p, uchar *p_limit, char *s_text)
 385 {   int i, j, k, in_alphabet, lookup_value;
 386     int32 unicode; int zscii;
 387     unsigned char *text_in;
 388
 389     /*  Cast the input and output streams to unsigned char: text_out_pc will
 390         advance as bytes of Z-coded text are written, but text_in doesn't    */
 391
 392     text_in     = (unsigned char *) s_text;
 393     text_out_pc = (unsigned char *) p;
 394     text_out_limit = (unsigned char *) p_limit;
 395     text_out_overflow = FALSE;
 396
 397     /*  Remember the Z-chars total so that later we can subtract to find the
 398         number of Z-chars translated on this string                          */
 399
 400     zchars_trans_in_last_string = total_zchars_trans;
 401
 402     /*  Start with the Z-characters output buffer empty                      */
 403
 404     zob_index=0;
 405
 406     /*  If this is the first text translated since the abbreviations were
 407         declared, and if some were declared, then it's time to make the
 408         lookup table for abbreviations
 409
 410         (Except: we don't if the text being translated is itself
 411         the text of an abbreviation currently being defined)                 */
 412
 413     if ((!abbrevs_lookup_table_made) && (no_abbreviations > 0)
 414         && (!is_abbreviation))
 415         make_abbrevs_lookup();
 416
 417     /*  If we're storing the whole game text to memory, then add this text   */
 418
 419     if ((!is_abbreviation) && (store_the_text))
 420     {   no_chars_transcribed += strlen(s_text)+2;
 421         if (no_chars_transcribed >= MAX_TRANSCRIPT_SIZE)
 422             memoryerror("MAX_TRANSCRIPT_SIZE", MAX_TRANSCRIPT_SIZE);
 423         sprintf(all_text_top, "%s\n\n", s_text);
 424         all_text_top += strlen(all_text_top);
 425     }
 426
 427     if (transcript_switch && (!veneer_mode))
 428         write_to_transcript_file(s_text);
 429
 430   if (!glulx_mode) {
 431
 432     /*  The empty string of Z-text is illegal, since it can't carry an end
 433         bit: so we translate an empty string of ASCII text to just the
 434         pad character 5.  Printing this causes nothing to appear on screen.  */
 435
 436     if (text_in[0]==0) write_z_char_z(5);
 437
 438     /*  Loop through the characters of the null-terminated input text: note
 439         that if 1 is written over a character in the input text, it is
 440         afterwards ignored                                                   */
 441
 442     for (i=0; text_in[i]!=0; i++)
 443     {   total_chars_trans++;
 444
 445         /*  Contract ".  " into ". " if double-space-removing switch set:
 446             likewise "?  " and "!  " if the setting is high enough           */
 447
 448         if ((double_space_setting >= 1)
 449             && (text_in[i+1]==' ') && (text_in[i+2]==' '))
 450         {   if (text_in[i]=='.') text_in[i+2]=1;
 451             if (double_space_setting >= 2)
 452             {   if (text_in[i]=='?') text_in[i+2]=1;
 453                 if (text_in[i]=='!') text_in[i+2]=1;
 454             }
 455         }
 456
 457         /*  Try abbreviations if the economy switch set                      */
 458
 459         if ((economy_switch) && (!is_abbreviation)
 460             && ((k=abbrevs_lookup[text_in[i]])!=-1))
 461         {   if ((j=try_abbreviations_from(text_in, i, k))!=-1)
 462             {   if (j<32) { write_z_char_z(2); write_z_char_z(j); }
 463                 else { write_z_char_z(3); write_z_char_z(j-32); }
 464             }
 465         }
 466
 467         /* If Unicode switch set, use text_to_unicode to perform UTF-8
 468            decoding */
 469         if (character_set_unicode && (text_in[i] & 0x80))
 470         {   unicode = text_to_unicode((char *) (text_in+i));
 471             zscii = unicode_to_zscii(unicode);
 472             if (zscii != 5) write_zscii(zscii);
 473             else
 474             {   unicode_char_error(
 475                     "Character can only be used if declared in \
 476 advance as part of 'Zcharacter table':", unicode);
 477             }
 478             i += textual_form_length - 1;
 479             continue;
 480         }
 481
 482         /*  '@' is the escape character in Inform string notation: the various
 483             possibilities are:
 484
 485                 (printing only)
 486                 @@decimalnumber  :  write this ZSCII char (0 to 1023)
 487                 @twodigits       :  write the abbreviation string with this
 488                                     decimal number
 489
 490                 (any string context)
 491                 @accentcode      :  this accented character: e.g.,
 492                                         for @'e write an E-acute
 493                 @{...}           :  this Unicode char (in hex)              */
 494
 495         if (text_in[i]=='@')
 496         {   if (text_in[i+1]=='@')
 497             {
 498                 /*   @@...   */
 499
 500                 i+=2; j=atoi((char *) (text_in+i));
 501                 switch(j)
 502                 {   /* Prevent ~ and ^ from being translated to double-quote
 503                        and new-line, as they ordinarily would be */
 504
 505                     case 94:   write_z_char_z(5); write_z_char_z(6);
 506                                write_z_char_z(94/32); write_z_char_z(94%32);
 507                                break;
 508                     case 126:  write_z_char_z(5); write_z_char_z(6);
 509                                write_z_char_z(126/32); write_z_char_z(126%32);
 510                                break;
 511
 512                     default:   write_zscii(j); break;
 513                 }
 514                 while (isdigit(text_in[i])) i++; i--;
 515             }
 516             else if (isdigit(text_in[i+1])!=0)
 517             {   int d1, d2;
 518
 519                 /*   @..   */
 520
 521                 d1 = character_digit_value[text_in[i+1]];
 522                 d2 = character_digit_value[text_in[i+2]];
 523                 if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10))
 524                     error("'@..' must have two decimal digits");
 525                 else
 526                 {   i+=2;
 527                     write_z_char_z(1); write_z_char_z(d1*10 + d2);
 528                 }
 529             }
 530             else
 531             {
 532                 /*   A string escape specifying an unusual character   */
 533
 534                 unicode = text_to_unicode((char *) (text_in+i));
 535                 zscii = unicode_to_zscii(unicode);
 536                 if (zscii != 5) write_zscii(zscii);
 537                 else
 538                 {   unicode_char_error(
 539                        "Character can only be used if declared in \
 540 advance as part of 'Zcharacter table':", unicode);
 541                 }
 542                 i += textual_form_length - 1;
 543             }
 544         }
 545         else
 546         {   /*  Skip a character which has been over-written with the null
 547                 value 1 earlier on                                           */
 548
 549             if (text_in[i]!=1)
 550             {   if (text_in[i]==' ') write_z_char_z(0);
 551                 else
 552                 {   j = (int) text_in[i];
 553                     lookup_value = iso_to_alphabet_grid[j];
 554                     if (lookup_value < 0)
 555                     {   /*  The character isn't in the standard alphabets, so
 556                             we have to use the ZSCII 4-Z-char sequence */
 557
 558                         if (lookup_value == -5)
 559                         {   /*  Character isn't in the ZSCII set at all */
 560
 561                             unicode = iso_to_unicode(j);
 562                             unicode_char_error(
 563                                 "Character can only be used if declared in \
 564 advance as part of 'Zcharacter table':", unicode);
 565                             write_zscii(0x200 + unicode/0x100);
 566                             write_zscii(0x300 + unicode%0x100);
 567                         }
 568                         else write_zscii(-lookup_value);
 569                     }
 570                     else
 571                     {   /*  The character is in one of the standard alphabets:
 572                             write a SHIFT to temporarily change alphabet if
 573                             it isn't in alphabet 0, then write the Z-char    */
 574
 575                         alphabet_used[lookup_value] = 'Y';
 576                         in_alphabet = lookup_value/26;
 577                         if (in_alphabet==1) write_z_char_z(4);  /* SHIFT to A1 */
 578                         if (in_alphabet==2) write_z_char_z(5);  /* SHIFT to A2 */
 579                         write_z_char_z(lookup_value%26 + 6);
 580                     }
 581                 }
 582             }
 583         }
 584     }
 585
 586     /*  Flush the Z-characters output buffer and set the "end" bit           */
 587
 588     end_z_chars();
 589
 590   }
 591   else {
 592
 593     /* The text storage here is, of course, temporary. Compression
 594        will occur when we're finished compiling, so that all the
 595        clever Huffman stuff will work.
 596        In the stored text, we use "@@" to indicate @,
 597        "@0" to indicate a zero byte,
 598        "@ANNNN" to indicate an abbreviation,
 599        "@DNNNN" to indicate a dynamic string thing.
 600        "@UNNNN" to indicate a four-byte Unicode value (0x100 or higher).
 601        (NNNN is a four-digit hex number using the letters A-P... an
 602        ugly representation but a convenient one.)
 603     */
 604
 605     for (i=0; text_in[i]!=0; i++) {
 606
 607       /*  Contract ".  " into ". " if double-space-removing switch set:
 608           likewise "?  " and "!  " if the setting is high enough. */
 609       if ((double_space_setting >= 1)
 610         && (text_in[i+1]==' ') && (text_in[i+2]==' ')) {
 611         if (text_in[i]=='.'
 612           || (double_space_setting >= 2
 613             && (text_in[i]=='?' || text_in[i]=='!'))) {
 614           text_in[i+1] = text_in[i];
 615           i++;
 616         }
 617       }
 618
 619       total_chars_trans++;
 620
 621       /*  Try abbreviations if the economy switch set. We have to be in
 622           compression mode too, since the abbreviation mechanism is part
 623           of string decompression. */
 624
 625       if ((economy_switch) && (compression_switch) && (!is_abbreviation)
 626         && ((k=abbrevs_lookup[text_in[i]])!=-1)
 627         && ((j=try_abbreviations_from(text_in, i, k)) != -1)) {
 628         char *cx = (char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
 629         i += (strlen(cx)-1);
 630         write_z_char_g('@');
 631         write_z_char_g('A');
 632         write_z_char_g('A' + ((j >>12) & 0x0F));
 633         write_z_char_g('A' + ((j >> 8) & 0x0F));
 634         write_z_char_g('A' + ((j >> 4) & 0x0F));
 635         write_z_char_g('A' + ((j     ) & 0x0F));
 636       }
 637       else if (text_in[i] == '@') {
 638         if (text_in[i+1]=='@') {
 639           /* An ASCII code */
 640           i+=2; j=atoi((char *) (text_in+i));
 641           if (j == '@' || j == '\0') {
 642             write_z_char_g('@');
 643             if (j == 0) {
 644               j = '0';
 645               if (!compression_switch)
 646                 warning("Ascii @@0 will prematurely terminate non-compressed \
 647 string.");
 648             }
 649           }
 650           write_z_char_g(j);
 651           while (isdigit(text_in[i])) i++; i--;
 652         }
 653         else if (isdigit(text_in[i+1])) {
 654           int d1, d2;
 655           d1 = character_digit_value[text_in[i+1]];
 656           d2 = character_digit_value[text_in[i+2]];
 657           if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10)) {
 658             error("'@..' must have two decimal digits");
 659           }
 660           else {
 661             if (!compression_switch)
 662               warning("'@..' print variable will not work in non-compressed \
 663 string; substituting '   '.");
 664             i += 2;
 665             j = d1*10 + d2;
 666             if (j >= MAX_DYNAMIC_STRINGS) {
 667               memoryerror("MAX_DYNAMIC_STRINGS", MAX_DYNAMIC_STRINGS);
 668               j = 0;
 669             }
 670             if (j+1 >= no_dynamic_strings)
 671               no_dynamic_strings = j+1;
 672             write_z_char_g('@');
 673             write_z_char_g('D');
 674             write_z_char_g('A' + ((j >>12) & 0x0F));
 675             write_z_char_g('A' + ((j >> 8) & 0x0F));
 676             write_z_char_g('A' + ((j >> 4) & 0x0F));
 677             write_z_char_g('A' + ((j     ) & 0x0F));
 678           }
 679         }
 680         else {
 681           unicode = text_to_unicode((char *) (text_in+i));
 682           i += textual_form_length - 1;
 683           if (unicode == '@' || unicode == '\0') {
 684             write_z_char_g('@');
 685             write_z_char_g(unicode ? '@' : '0');
 686           }
 687           else if (unicode >= 0 && unicode < 256) {
 688             write_z_char_g(unicode);
 689           }
 690           else {
 691             if (!compression_switch) {
 692               warning("Unicode characters will not work in non-compressed \
 693 string; substituting '?'.");
 694               write_z_char_g('?');
 695             }
 696             else {
 697               j = unicode_entity_index(unicode);
 698               write_z_char_g('@');
 699               write_z_char_g('U');
 700               write_z_char_g('A' + ((j >>12) & 0x0F));
 701               write_z_char_g('A' + ((j >> 8) & 0x0F));
 702               write_z_char_g('A' + ((j >> 4) & 0x0F));
 703               write_z_char_g('A' + ((j     ) & 0x0F));
 704             }
 705           }
 706         }
 707       }
 708       else if (text_in[i] == '^')
 709         write_z_char_g(0x0A);
 710       else if (text_in[i] == '~')
 711         write_z_char_g('"');
 712       else if (character_set_unicode) {
 713         if (text_in[i] & 0x80) {
 714           unicode = text_to_unicode((char *) (text_in+i));
 715           i += textual_form_length - 1;
 716           if (unicode >= 0 && unicode < 256) {
 717             write_z_char_g(unicode);
 718           }
 719           else {
 720             if (!compression_switch) {
 721               warning("Unicode characters will not work in non-compressed \
 722 string; substituting '?'.");
 723               write_z_char_g('?');
 724             }
 725             else {
 726               j = unicode_entity_index(unicode);
 727               write_z_char_g('@');
 728               write_z_char_g('U');
 729               write_z_char_g('A' + ((j >>12) & 0x0F));
 730               write_z_char_g('A' + ((j >> 8) & 0x0F));
 731               write_z_char_g('A' + ((j >> 4) & 0x0F));
 732               write_z_char_g('A' + ((j     ) & 0x0F));
 733             }
 734           }
 735         }
 736         else {
 737           write_z_char_g(text_in[i]);
 738         }
 739       }
 740       else {
 741         unicode = iso_to_unicode_grid[text_in[i]];
 742         if (unicode >= 0 && unicode < 256) {
 743           write_z_char_g(unicode);
 744         }
 745         else {
 746           if (!compression_switch) {
 747             warning("Unicode characters will not work in non-compressed \
 748 string; substituting '?'.");
 749             write_z_char_g('?');
 750           }
 751           else {
 752             j = unicode_entity_index(unicode);
 753             write_z_char_g('@');
 754             write_z_char_g('U');
 755             write_z_char_g('A' + ((j >>12) & 0x0F));
 756             write_z_char_g('A' + ((j >> 8) & 0x0F));
 757             write_z_char_g('A' + ((j >> 4) & 0x0F));
 758             write_z_char_g('A' + ((j     ) & 0x0F));
 759           }
 760         }
 761       }
 762     }
 763     write_z_char_g(0);
 764
 765   }
 766
 767   if (text_out_overflow)
 768       return NULL;
 769   else
 770       return((uchar *) text_out_pc);
 771 }
 772
 773 static int unicode_entity_index(int32 unicode)
 774 {
 775   unicode_usage_t *uptr;
 776   int j;
 777   int buck = unicode % UNICODE_HASH_BUCKETS;
 778
 779   for (uptr = unicode_usage_hash[buck]; uptr; uptr=uptr->next) {
 780     if (uptr->ch == unicode)
 781       break;
 782   }
 783   if (uptr) {
 784     j = (uptr - unicode_usage_entries);
 785   }
 786   else {
 787     if (no_unicode_chars >= MAX_UNICODE_CHARS) {
 788       memoryerror("MAX_UNICODE_CHARS", MAX_UNICODE_CHARS);
 789       j = 0;
 790     }
 791     else {
 792       j = no_unicode_chars;
 793       no_unicode_chars++;
 794       uptr = unicode_usage_entries + j;
 795       uptr->ch = unicode;
 796       uptr->next = unicode_usage_hash[buck];
 797       unicode_usage_hash[buck] = uptr;
 798     }
 799   }
 800
 801   return j;
 802 }
 803
 804 /* ------------------------------------------------------------------------- */
 805 /*   Glulx compression code                                                  */
 806 /* ------------------------------------------------------------------------- */
 807
 808
 809 static void compress_makebits(int entnum, int depth, int prevbit,
 810   huffbitlist_t *bits);
 811
 812 /*   The compressor. This uses the usual Huffman compression algorithm. */
 813 void compress_game_text()
 814 {
 815   int entities=0, branchstart, branches;
 816   int numlive;
 817   int32 lx;
 818   int jx;
 819   int ch;
 820   int32 ix;
 821   huffbitlist_t bits;
 822
 823   if (compression_switch) {
 824
 825     /* How many entities have we currently got? Well, 256 plus the
 826        string-terminator plus Unicode chars plus abbrevations plus
 827        dynamic strings. */
 828     entities = 256+1;
 829     huff_unicode_start = entities;
 830     entities += no_unicode_chars;
 831     huff_abbrev_start = entities;
 832     if (economy_switch)
 833       entities += no_abbreviations;
 834     huff_dynam_start = entities;
 835     entities += no_dynamic_strings;
 836
 837     if (entities > MAX_CHARACTER_SET)
 838       memoryerror("MAX_CHARACTER_SET",MAX_CHARACTER_SET);
 839
 840     /* Characters */
 841     for (jx=0; jx<256; jx++) {
 842       huff_entities[jx].type = 2;
 843       huff_entities[jx].count = 0;
 844       huff_entities[jx].u.ch = jx;
 845     }
 846     /* Terminator */
 847     huff_entities[256].type = 1;
 848     huff_entities[256].count = 0;
 849     for (jx=0; jx<no_unicode_chars; jx++) {
 850       huff_entities[huff_unicode_start+jx].type = 4;
 851       huff_entities[huff_unicode_start+jx].count = 0;
 852       huff_entities[huff_unicode_start+jx].u.val = jx;
 853     }
 854     if (economy_switch) {
 855       for (jx=0; jx<no_abbreviations; jx++) {
 856         huff_entities[huff_abbrev_start+jx].type = 3;
 857         huff_entities[huff_abbrev_start+jx].count = 0;
 858         huff_entities[huff_abbrev_start+jx].u.val = jx;
 859       }
 860     }
 861     for (jx=0; jx<no_dynamic_strings; jx++) {
 862       huff_entities[huff_dynam_start+jx].type = 9;
 863       huff_entities[huff_dynam_start+jx].count = 0;
 864       huff_entities[huff_dynam_start+jx].u.val = jx;
 865     }
 866   }
 867   else {
 868     /* No compression; use defaults that will make it easy to check
 869        for errors. */
 870     no_huff_entities = 257;
 871     huff_unicode_start = 257;
 872     huff_abbrev_start = 257;
 873     huff_dynam_start = 257+MAX_ABBREVS;
 874     compression_table_size = 0;
 875   }
 876
 877   if (temporary_files_switch) {
 878     fclose(Temp1_fp);
 879     Temp1_fp=fopen(Temp1_Name,"rb");
 880     if (Temp1_fp==NULL)
 881       fatalerror("I/O failure: couldn't reopen temporary file 1");
 882   }
 883
 884   if (compression_switch) {
 885
 886     for (lx=0, ix=0; lx<no_strings; lx++) {
 887       int escapelen=0, escapetype=0;
 888       int done=FALSE;
 889       int32 escapeval=0;
 890       while (!done) {
 891         if (temporary_files_switch)
 892           ch = fgetc(Temp1_fp);
 893         else
 894           ch = read_byte_from_memory_block(&static_strings_area, ix);
 895         ix++;
 896         if (ix > static_strings_extent || ch < 0)
 897           compiler_error("Read too much not-yet-compressed text.");
 898         if (escapelen == -1) {
 899           escapelen = 0;
 900           if (ch == '@') {
 901             ch = '@';
 902           }
 903           else if (ch == '0') {
 904             ch = '\0';
 905           }
 906           else if (ch == 'A' || ch == 'D' || ch == 'U') {
 907             escapelen = 4;
 908             escapetype = ch;
 909             escapeval = 0;
 910             continue;
 911           }
 912           else {
 913             compiler_error("Strange @ escape in processed text.");
 914           }
 915         }
 916         else if (escapelen) {
 917           escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
 918           escapelen--;
 919           if (escapelen == 0) {
 920             if (escapetype == 'A') {
 921               ch = huff_abbrev_start+escapeval;
 922             }
 923             else if (escapetype == 'D') {
 924               ch = huff_dynam_start+escapeval;
 925             }
 926             else if (escapetype == 'U') {
 927               ch = huff_unicode_start+escapeval;
 928             }
 929             else {
 930               compiler_error("Strange @ escape in processed text.");
 931             }
 932           }
 933           else
 934             continue;
 935         }
 936         else {
 937           if (ch == '@') {
 938             escapelen = -1;
 939             continue;
 940           }
 941           if (ch == 0) {
 942             ch = 256;
 943             done = TRUE;
 944           }
 945         }
 946         huff_entities[ch].count++;
 947       }
 948     }
 949
 950     numlive = 0;
 951     for (jx=0; jx<entities; jx++) {
 952       if (huff_entities[jx].count) {
 953         hufflist[numlive] = &(huff_entities[jx]);
 954         numlive++;
 955       }
 956     }
 957
 958     branchstart = entities;
 959     branches = 0;
 960
 961     while (numlive > 1) {
 962       int best1, best2;
 963       int best1num, best2num;
 964       huffentity_t *bran;
 965
 966       if (hufflist[0]->count < hufflist[1]->count) {
 967         best1 = 0;
 968         best2 = 1;
 969       }
 970       else {
 971         best2 = 0;
 972         best1 = 1;
 973       }
 974
 975       best1num = hufflist[best1]->count;
 976       best2num = hufflist[best2]->count;
 977
 978       for (jx=2; jx<numlive; jx++) {
 979         if (hufflist[jx]->count < best1num) {
 980           best2 = best1;
 981           best2num = best1num;
 982           best1 = jx;
 983           best1num = hufflist[best1]->count;
 984         }
 985         else if (hufflist[jx]->count < best2num) {
 986           best2 = jx;
 987           best2num = hufflist[best2]->count;
 988         }
 989       }
 990
 991       bran = &(huff_entities[branchstart+branches]);
 992       branches++;
 993       bran->type = 0;
 994       bran->count = hufflist[best1]->count + hufflist[best2]->count;
 995       bran->u.branch[0] = (hufflist[best1] - huff_entities);
 996       bran->u.branch[1] = (hufflist[best2] - huff_entities);
 997       hufflist[best1] = bran;
 998       if (best2 < numlive-1) {
 999         memmove(&(hufflist[best2]), &(hufflist[best2+1]),
1000           ((numlive-1) - best2) * sizeof(huffentity_t *));
1001       }
1002       numlive--;
1003     }
1004
1005     huff_entity_root = (hufflist[0] - huff_entities);
1006
1007     for (ix=0; ix<MAXHUFFBYTES; ix++)
1008       bits.b[ix] = 0;
1009     compression_table_size = 12;
1010
1011     no_huff_entities = 0; /* compress_makebits will total this up */
1012     compress_makebits(huff_entity_root, 0, -1, &bits);
1013   }
1014
1015   /* Now, sadly, we have to compute the size of the string section,
1016      without actually doing the compression. */
1017   compression_string_size = 0;
1018
1019   if (temporary_files_switch) {
1020     fseek(Temp1_fp, 0, SEEK_SET);
1021   }
1022
1023   if (no_strings >= MAX_NUM_STATIC_STRINGS)
1024     memoryerror("MAX_NUM_STATIC_STRINGS", MAX_NUM_STATIC_STRINGS);
1025
1026   for (lx=0, ix=0; lx<no_strings; lx++) {
1027     int escapelen=0, escapetype=0;
1028     int done=FALSE;
1029     int32 escapeval=0;
1030     jx = 0;
1031     compressed_offsets[lx] = compression_table_size + compression_string_size;
1032     compression_string_size++; /* for the type byte */
1033     while (!done) {
1034       if (temporary_files_switch)
1035         ch = fgetc(Temp1_fp);
1036       else
1037         ch = read_byte_from_memory_block(&static_strings_area, ix);
1038       ix++;
1039       if (ix > static_strings_extent || ch < 0)
1040         compiler_error("Read too much not-yet-compressed text.");
1041       if (escapelen == -1) {
1042         escapelen = 0;
1043         if (ch == '@') {
1044           ch = '@';
1045         }
1046         else if (ch == '0') {
1047           ch = '\0';
1048         }
1049         else if (ch == 'A' || ch == 'D' || ch == 'U') {
1050           escapelen = 4;
1051           escapetype = ch;
1052           escapeval = 0;
1053           continue;
1054         }
1055         else {
1056           compiler_error("Strange @ escape in processed text.");
1057         }
1058       }
1059       else if (escapelen) {
1060         escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
1061         escapelen--;
1062         if (escapelen == 0) {
1063           if (escapetype == 'A') {
1064             ch = huff_abbrev_start+escapeval;
1065           }
1066           else if (escapetype == 'D') {
1067             ch = huff_dynam_start+escapeval;
1068           }
1069           else if (escapetype == 'U') {
1070             ch = huff_unicode_start+escapeval;
1071           }
1072           else {
1073             compiler_error("Strange @ escape in processed text.");
1074           }
1075         }
1076         else
1077           continue;
1078       }
1079       else {
1080         if (ch == '@') {
1081           escapelen = -1;
1082           continue;
1083         }
1084         if (ch == 0) {
1085           ch = 256;
1086           done = TRUE;
1087         }
1088       }
1089
1090       if (compression_switch) {
1091         jx += huff_entities[ch].depth;
1092         compression_string_size += (jx/8);
1093         jx = (jx % 8);
1094       }
1095       else {
1096         if (ch >= huff_dynam_start) {
1097           compression_string_size += 3;
1098         }
1099         else if (ch >= huff_unicode_start) {
1100           compiler_error("Abbreviation/Unicode in non-compressed string \
1101 should be impossible.");
1102         }
1103         else
1104           compression_string_size += 1;
1105       }
1106     }
1107     if (compression_switch && jx)
1108       compression_string_size++;
1109   }
1110
1111   done_compression = TRUE;
1112 }
1113
1114 static void compress_makebits(int entnum, int depth, int prevbit,
1115   huffbitlist_t *bits)
1116 {
1117   huffentity_t *ent = &(huff_entities[entnum]);
1118   char *cx;
1119
1120   no_huff_entities++;
1121   ent->addr = compression_table_size;
1122   ent->depth = depth;
1123   ent->bits = *bits;
1124   if (depth > 0) {
1125     if (prevbit)
1126       ent->bits.b[(depth-1) / 8] |= (1 << ((depth-1) % 8));
1127   }
1128
1129   switch (ent->type) {
1130   case 0:
1131     compression_table_size += 9;
1132     compress_makebits(ent->u.branch[0], depth+1, 0, &ent->bits);
1133     compress_makebits(ent->u.branch[1], depth+1, 1, &ent->bits);
1134     break;
1135   case 1:
1136     compression_table_size += 1;
1137     break;
1138   case 2:
1139     compression_table_size += 2;
1140     break;
1141   case 3:
1142     cx = (char *)abbreviations_at + ent->u.val*MAX_ABBREV_LENGTH;
1143     compression_table_size += (1 + 1 + strlen(cx));
1144     break;
1145   case 4:
1146   case 9:
1147     compression_table_size += 5;
1148     break;
1149   }
1150 }
1151
1152 /* ------------------------------------------------------------------------- */
1153 /*   The abbreviations optimiser                                             */
1154 /*                                                                           */
1155 /*   This is a very complex, memory and time expensive algorithm to          */
1156 /*   approximately solve the problem of which abbreviation strings would     */
1157 /*   minimise the total number of Z-chars to which the game text translates. */
1158 /*   It is in some ways a quite separate program but remains inside Inform   */
1159 /*   for compatibility with previous releases.                               */
1160 /* ------------------------------------------------------------------------- */
1161
1162 typedef struct tlb_s
1163 {   char text[4];
1164     int32 intab, occurrences;
1165 } tlb;
1166 static tlb *tlbtab;
1167 static int32 no_occs;
1168
1169 static int32 *grandtable;
1170 static int32 *grandflags;
1171 typedef struct optab_s
1172 {   int32  length;
1173     int32  popularity;
1174     int32  score;
1175     int32  location;
1176     char text[MAX_ABBREV_LENGTH];
1177 } optab;
1178 static optab *bestyet, *bestyet2;
1179
1180 static int pass_no;
1181
1182 static char *sub_buffer;
1183
1184 static void optimise_pass(void)
1185 {   int32 i; int t1, t2;
1186     int32 j, j2, k, nl, matches, noflags, score, min, minat=0, x, scrabble, c;
1187     for (i=0; i<256; i++) bestyet[i].length=0;
1188     for (i=0; i<no_occs; i++)
1189     {   if ((*(tlbtab[i].text)!=(int) '\n')&&(tlbtab[i].occurrences!=0))
1190         {
1191 #ifdef MAC_FACE
1192             if (i%((**g_pm_hndl).linespercheck) == 0)
1193             {   ProcessEvents (&g_proc);
1194                 if (g_proc != true)
1195                 {   free_arrays();
1196                     if (store_the_text)
1197                         my_free(&all_text,"transcription text");
1198                     longjmp (g_fallback, 1);
1199                 }
1200             }
1201 #endif
1202             printf("Pass %d, %4ld/%ld '%s' (%ld occurrences) ",
1203                 pass_no, (long int) i, (long int) no_occs, tlbtab[i].text,
1204                 (long int) tlbtab[i].occurrences);
1205             t1=(int) (time(0));
1206             for (j=0; j<tlbtab[i].occurrences; j++)
1207             {   for (j2=0; j2<tlbtab[i].occurrences; j2++) grandflags[j2]=1;
1208                 nl=2; noflags=tlbtab[i].occurrences;
1209                 while ((noflags>=2)&&(nl<=62))
1210                 {   nl++;
1211                     for (j2=0; j2<nl; j2++)
1212                         if (all_text[grandtable[tlbtab[i].intab+j]+j2]=='\n')
1213                             goto FinishEarly;
1214                     matches=0;
1215                     for (j2=j; j2<tlbtab[i].occurrences; j2++)
1216                     {   if (grandflags[j2]==1)
1217                         {   x=grandtable[tlbtab[i].intab+j2]
1218                               - grandtable[tlbtab[i].intab+j];
1219                          if (((x>-nl)&&(x<nl))
1220                             || (memcmp(all_text+grandtable[tlbtab[i].intab+j],
1221                                        all_text+grandtable[tlbtab[i].intab+j2],
1222                                        nl)!=0))
1223                             {   grandflags[j2]=0; noflags--; }
1224                             else matches++;
1225                         }
1226                     }
1227                     scrabble=0;
1228                     for (k=0; k<nl; k++)
1229                     {   scrabble++;
1230                         c=all_text[grandtable[tlbtab[i].intab+j+k]];
1231                         if (c!=(int) ' ')
1232                         {   if (iso_to_alphabet_grid[c]<0)
1233                                 scrabble+=2;
1234                             else
1235                                 if (iso_to_alphabet_grid[c]>=26)
1236                                     scrabble++;
1237                         }
1238                     }
1239                     score=(matches-1)*(scrabble-2);
1240                     min=score;
1241                     for (j2=0; j2<256; j2++)
1242                     {   if ((nl==bestyet[j2].length)
1243                                 && (memcmp(all_text+bestyet[j2].location,
1244                                        all_text+grandtable[tlbtab[i].intab+j],
1245                                        nl)==0))
1246                         {   j2=256; min=score; }
1247                         else
1248                         {   if (bestyet[j2].score<min)
1249                             {   min=bestyet[j2].score; minat=j2;
1250                             }
1251                         }
1252                     }
1253                     if (min!=score)
1254                     {   bestyet[minat].score=score;
1255                         bestyet[minat].length=nl;
1256                         bestyet[minat].location=grandtable[tlbtab[i].intab+j];
1257                         bestyet[minat].popularity=matches;
1258                         for (j2=0; j2<nl; j2++) sub_buffer[j2]=
1259                             all_text[bestyet[minat].location+j2];
1260                         sub_buffer[nl]=0;
1261                     }
1262                 }
1263                 FinishEarly: ;
1264             }
1265             t2=((int) time(0)) - t1;
1266             printf(" (%d seconds)\n",t2);
1267         }
1268     }
1269 }
1270
1271 static int any_overlap(char *s1, char *s2)
1272 {   int a, b, i, j, flag;
1273     a=strlen(s1); b=strlen(s2);
1274     for (i=1-b; i<a; i++)
1275     {   flag=0;
1276         for (j=0; j<b; j++)
1277             if ((0<=i+j)&&(i+j<=a-1))
1278                 if (s1[i+j]!=s2[j]) flag=1;
1279         if (flag==0) return(1);
1280     }
1281     return(0);
1282 }
1283
1284 #define MAX_TLBS 8000
1285
1286 extern void optimise_abbreviations(void)
1287 {   int32 i, j, t, max=0, MAX_GTABLE;
1288     int32 j2, selected, available, maxat=0, nl;
1289     tlb test;
1290
1291     printf("Beginning calculation of optimal abbreviations...\n");
1292
1293     pass_no = 0;
1294     tlbtab=my_calloc(sizeof(tlb), MAX_TLBS, "tlb table"); no_occs=0;
1295     sub_buffer=my_calloc(sizeof(char), 4000, "sub_buffer");
1296     for (i=0; i<MAX_TLBS; i++) tlbtab[i].occurrences=0;
1297
1298     bestyet=my_calloc(sizeof(optab), 256, "bestyet");
1299     bestyet2=my_calloc(sizeof(optab), 64, "bestyet2");
1300
1301     bestyet2[0].text[0]='.';
1302     bestyet2[0].text[1]=' ';
1303     bestyet2[0].text[2]=0;
1304
1305     bestyet2[1].text[0]=',';
1306     bestyet2[1].text[1]=' ';
1307     bestyet2[1].text[2]=0;
1308
1309     for (i=0; all_text+i<all_text_top; i++)
1310     {
1311         if ((all_text[i]=='.') && (all_text[i+1]==' ') && (all_text[i+2]==' '))
1312         {   all_text[i]='\n'; all_text[i+1]='\n'; all_text[i+2]='\n';
1313             bestyet2[0].popularity++;
1314         }
1315
1316         if ((all_text[i]=='.') && (all_text[i+1]==' '))
1317         {   all_text[i]='\n'; all_text[i+1]='\n';
1318             bestyet2[0].popularity++;
1319         }
1320
1321         if ((all_text[i]==',') && (all_text[i+1]==' '))
1322         {   all_text[i]='\n'; all_text[i+1]='\n';
1323             bestyet2[1].popularity++;
1324         }
1325     }
1326
1327     MAX_GTABLE=subtract_pointers(all_text_top,all_text)+1;
1328     grandtable=my_calloc(4*sizeof(int32), MAX_GTABLE/4, "grandtable");
1329
1330     for (i=0, t=0; all_text+i<all_text_top; i++)
1331     {   test.text[0]=all_text[i];
1332         test.text[1]=all_text[i+1];
1333         test.text[2]=all_text[i+2];
1334         test.text[3]=0;
1335         if ((test.text[0]=='\n')||(test.text[1]=='\n')||(test.text[2]=='\n'))
1336             goto DontKeep;
1337         for (j=0; j<no_occs; j++)
1338             if (strcmp(test.text,tlbtab[j].text)==0)
1339                 goto DontKeep;
1340         test.occurrences=0;
1341         for (j=i+3; all_text+j<all_text_top; j++)
1342         {
1343 #ifdef MAC_FACE
1344             if (j%((**g_pm_hndl).linespercheck) == 0)
1345             {   ProcessEvents (&g_proc);
1346                 if (g_proc != true)
1347                 {   free_arrays();
1348                     if (store_the_text)
1349                         my_free(&all_text,"transcription text");
1350                     longjmp (g_fallback, 1);
1351                 }
1352             }
1353 #endif
1354             if ((all_text[i]==all_text[j])
1355                  && (all_text[i+1]==all_text[j+1])
1356                  && (all_text[i+2]==all_text[j+2]))
1357                  {   grandtable[t+test.occurrences]=j;
1358                      test.occurrences++;
1359                      if (t+test.occurrences==MAX_GTABLE)
1360                      {   printf("All %ld cross-references used\n",
1361                              (long int) MAX_GTABLE);
1362                          goto Built;
1363                      }
1364                  }
1365         }
1366         if (test.occurrences>=2)
1367         {   tlbtab[no_occs]=test;
1368             tlbtab[no_occs].intab=t; t+=tlbtab[no_occs].occurrences;
1369             if (max<tlbtab[no_occs].occurrences)
1370                 max=tlbtab[no_occs].occurrences;
1371             no_occs++;
1372             if (no_occs==MAX_TLBS)
1373             {   printf("All %d three-letter-blocks used\n",
1374                     MAX_TLBS);
1375                 goto Built;
1376             }
1377         }
1378         DontKeep: ;
1379     }
1380
1381     Built:
1382     grandflags=my_calloc(sizeof(int), max, "grandflags");
1383
1384
1385     printf("Cross-reference table (%ld entries) built...\n",
1386         (long int) no_occs);
1387     /*  for (i=0; i<no_occs; i++)
1388             printf("%4d %4d '%s' %d\n",i,tlbtab[i].intab,tlbtab[i].text,
1389                 tlbtab[i].occurrences);
1390     */
1391
1392     for (i=0; i<64; i++) bestyet2[i].length=0; selected=2;
1393     available=256;
1394     while ((available>0)&&(selected<64))
1395     {   printf("Pass %d\n", ++pass_no);
1396
1397         optimise_pass();
1398         available=0;
1399         for (i=0; i<256; i++)
1400             if (bestyet[i].score!=0)
1401             {   available++;
1402                 nl=bestyet[i].length;
1403                 for (j2=0; j2<nl; j2++) bestyet[i].text[j2]=
1404                     all_text[bestyet[i].location+j2];
1405                 bestyet[i].text[nl]=0;
1406             }
1407
1408     /*  printf("End of pass results:\n");
1409         printf("\nno   score  freq   string\n");
1410         for (i=0; i<256; i++)
1411             if (bestyet[i].score>0)
1412                 printf("%02d:  %4d   %4d   '%s'\n", i, bestyet[i].score,
1413                     bestyet[i].popularity, bestyet[i].text);
1414     */
1415
1416         do
1417         {   max=0;
1418             for (i=0; i<256; i++)
1419                 if (max<bestyet[i].score)
1420                 {   max=bestyet[i].score;
1421                     maxat=i;
1422                 }
1423
1424             if (max>0)
1425             {   bestyet2[selected++]=bestyet[maxat];
1426
1427                 printf(
1428                     "Selection %2ld: '%s' (repeated %ld times, scoring %ld)\n",
1429                     (long int) selected,bestyet[maxat].text,
1430                     (long int) bestyet[maxat].popularity,
1431                     (long int) bestyet[maxat].score);
1432
1433                 test.text[0]=bestyet[maxat].text[0];
1434                 test.text[1]=bestyet[maxat].text[1];
1435                 test.text[2]=bestyet[maxat].text[2];
1436                 test.text[3]=0;
1437
1438                 for (i=0; i<no_occs; i++)
1439                     if (strcmp(test.text,tlbtab[i].text)==0)
1440                         break;
1441
1442                 for (j=0; j<tlbtab[i].occurrences; j++)
1443                 {   if (memcmp(bestyet[maxat].text,
1444                                all_text+grandtable[tlbtab[i].intab+j],
1445                                bestyet[maxat].length)==0)
1446                     {   for (j2=0; j2<bestyet[maxat].length; j2++)
1447                             all_text[grandtable[tlbtab[i].intab+j]+j2]='\n';
1448                     }
1449                 }
1450
1451                 for (i=0; i<256; i++)
1452                     if ((bestyet[i].score>0)&&
1453                         (any_overlap(bestyet[maxat].text,bestyet[i].text)==1))
1454                     {   bestyet[i].score=0;
1455                        /* printf("Discarding '%s' as overlapping\n",
1456                             bestyet[i].text); */
1457                     }
1458             }
1459         } while ((max>0)&&(available>0)&&(selected<64));
1460     }
1461
1462     printf("\nChosen abbreviations (in Inform syntax):\n\n");
1463     for (i=0; i<selected; i++)
1464         printf("Abbreviate \"%s\";\n", bestyet2[i].text);
1465
1466     text_free_arrays();
1467 }
1468
1469 /* ------------------------------------------------------------------------- */
1470 /*   The dictionary manager begins here.                                     */
1471 /*                                                                           */
1472 /*   Speed is extremely important in these algorithms.  If a linear-time     */
1473 /*   routine were used to search the dictionary words so far built up, then  */
1474 /*   Inform would crawl.                                                     */
1475 /*                                                                           */
1476 /*   Instead, the dictionary is stored as a binary tree, which is kept       */
1477 /*   balanced with the red-black algorithm.                                  */
1478 /* ------------------------------------------------------------------------- */
1479 /*   A dictionary table similar to the Z-machine format is kept: there is a  */
1480 /*   7-byte header (left blank here to be filled in at the                   */
1481 /*   construct_storyfile() stage in "tables.c") and then a sequence of       */
1482 /*   records, one per word, in the form                                      */
1483 /*                                                                           */
1484 /*        <Z-coded text>    <flags>   <verbnumber>     <adjectivenumber>     */
1485 /*        4 or 6 bytes       byte        byte             byte               */
1486 /*                                                                           */
1487 /*   For Glulx, the form is instead: (But see below about Unicode-valued     */
1488 /*   dictionaries and my heinie.)                                            */
1489 /*                                                                           */
1490 /*        <plain text>      <flags>   <verbnumber>     <adjectivenumber>     */
1491 /*        DICT_WORD_SIZE     short       short            short              */
1492 /*                                                                           */
1493 /*   These records are stored in "accession order" (i.e. in order of their   */
1494 /*   first being received by these routines) and only alphabetically sorted  */
1495 /*   by construct_storyfile() (using the array below).                       */
1496 /* ------------------------------------------------------------------------- */
1497 /*                                                                           */
1498 /*   Further notes about the data fields...                                  */
1499 /*   The flags are currently:                                                */
1500 /*     bit 0: word is used as a verb (in verb grammar)                       */
1501 /*     bit 1: word is used as a meta verb                                    */
1502 /*     bit 2: word is plural (set by '//p')                                  */
1503 /*     bit 3: word is used as a preposition (in verb grammar)                */
1504 /*     bit 6: set for all verbs, but not used by the parser?                 */
1505 /*     bit 7: word is used as a noun (set for every word that appears in     */
1506 /*       code or in an object property)                                      */
1507 /*                                                                           */
1508 /*   In grammar version 2, the third field (adjectivenumber) is unused (and  */
1509 /*   zero).                                                                  */
1510 /*                                                                           */
1511 /*   The compiler generates special constants #dict_par1, #dict_par2,        */
1512 /*   #dict_par3 to refer to the byte offsets of the three fields. In         */
1513 /*   Z-code v3, these are 4/5/6; in v4+, they are 6/7/8. In Glulx, they      */
1514 /*   are $DICT_WORD_SIZE+2/4/6, referring to the *low* bytes of the three    */
1515 /*   fields. (The high bytes are $DICT_WORD_SIZE+1/3/5.)                     */
1516 /* ------------------------------------------------------------------------- */
1517
1518 uchar *dictionary,                    /* (These two pointers are externally
1519                                          used only in "tables.c" when
1520                                          building the story-file)            */
1521     *dictionary_top;                  /* Pointer to next free record         */
1522
1523 int dict_entries;                     /* Total number of records entered     */
1524
1525 /* ------------------------------------------------------------------------- */
1526 /*   dict_word is a typedef for a struct of 6 unsigned chars (defined in     */
1527 /*   "header.h"): it holds the (4 or) 6 bytes of Z-coded text of a word.     */
1528 /*   Usefully, because the PAD character 5 is < all alphabetic characters,   */
1529 /*   alphabetic order corresponds to numeric order.  For this reason, the    */
1530 /*   dict_word is called the "sort code" of the original text word.          */
1531 /*                                                                           */
1532 /*   ###- In modifying the compiler, I've found it easier to discard the     */
1533 /*   typedef, and operate directly on uchar arrays of length DICT_WORD_SIZE. */
1534 /*   In Z-code, DICT_WORD_SIZE will be 6, so the Z-code compiler will work   */
1535 /*   as before. In Glulx, it can be any value up to MAX_DICT_WORD_SIZE.      */
1536 /*   (That limit is defined as 40 in the header; it exists only for a few    */
1537 /*   static buffers, and can be increased without using significant memory.) */
1538 /*                                                                           */
1539 /*   ###- Well, that certainly bit me on the butt, didn't it. In further     */
1540 /*   modifying the compiler to generate a Unicode dictionary, I have to      */
1541 /*   store four-byte values in the uchar array. This is handled by making    */
1542 /*   the array size DICT_WORD_BYTES (which is DICT_WORD_SIZE*DICT_CHAR_SIZE).*/
1543 /*   Then we store the 32-bit character value big-endian. This lets us       */
1544 /*   continue to compare arrays bytewise, which is a nice simplification.    */
1545 /* ------------------------------------------------------------------------- */
1546
1547 extern int compare_sorts(uchar *d1, uchar *d2)
1548 {   int i;
1549     for (i=0; i<DICT_WORD_BYTES; i++)
1550         if (d1[i]!=d2[i]) return((int)(d1[i]) - (int)(d2[i]));
1551     /* (since memcmp(d1, d2, DICT_WORD_BYTES); runs into a bug on some Unix
1552        libraries) */
1553     return(0);
1554 }
1555
1556 extern void copy_sorts(uchar *d1, uchar *d2)
1557 {   int i;
1558     for (i=0; i<DICT_WORD_BYTES; i++)
1559         d1[i] = d2[i];
1560 }
1561
1562 static uchar prepared_sort[MAX_DICT_WORD_BYTES];     /* Holds the sort code
1563                                                         of current word */
1564
1565 static int number_and_case;
1566
1567 /* Also used by verbs.c */
1568 static void dictionary_prepare_z(char *dword, uchar *optresult)
1569 {   int i, j, k, k2, wd[13]; int32 tot;
1570
1571     /* A rapid text translation algorithm using only the simplified rules
1572        applying to the text of dictionary entries: first produce a sequence
1573        of 6 (v3) or 9 (v4+) Z-characters                                     */
1574
1575     int dictsize = (version_number==3) ? 6 : 9;
1576
1577     number_and_case = 0;
1578
1579     for (i=0, j=0; dword[j]!=0; i++, j++)
1580     {   if ((dword[j] == '/') && (dword[j+1] == '/'))
1581         {   for (j+=2; dword[j] != 0; j++)
1582             {   switch(dword[j])
1583                 {   case 'p': number_and_case |= 4;  break;
1584                     default:
1585                         error_named("Expected 'p' after '//' \
1586 to give number of dictionary word", dword);
1587                         break;
1588                 }
1589             }
1590             break;
1591         }
1592         if (i>=dictsize) break;
1593
1594         k=(int) dword[j];
1595         if (k==(int) '\'')
1596             warning_named("Obsolete usage: use the ^ character for the \
1597 apostrophe in", dword);
1598         if (k==(int) '^') k=(int) '\'';
1599         if (k=='\"') k='~';
1600
1601         if (k==(int) '@' || (character_set_unicode && (k & 0x80)))
1602         {   int unicode = text_to_unicode(dword+j);
1603             if ((unicode < 128) && isupper(unicode)) unicode = tolower(unicode);
1604             k = unicode_to_zscii(unicode);
1605             j += textual_form_length - 1;
1606             if ((k == 5) || (k >= 0x100))
1607             {   unicode_char_error(
1608                    "Character can be printed but not input:", unicode);
1609                 k = '?';
1610             }
1611             k2 = zscii_to_alphabet_grid[(uchar) k];
1612         }
1613         else
1614         {   if (isupper(k)) k = tolower(k);
1615             k2 = iso_to_alphabet_grid[(uchar) k];
1616         }
1617
1618         if (k2 < 0)
1619         {   if ((k2 == -5) || (k2 <= -0x100))
1620                 char_error("Character can be printed but not input:", k);
1621             else
1622             {   /* Use 4 more Z-chars to encode a ZSCII escape sequence      */
1623
1624                 wd[i++] = 5; wd[i++] = 6;
1625                 k2 = -k2;
1626                 wd[i++] = k2/32; wd[i] = k2%32;
1627             }
1628         }
1629         else
1630         {   alphabet_used[k2] = 'Y';
1631             if ((k2/26)!=0)
1632                 wd[i++]=3+(k2/26);            /* Change alphabet for symbols */
1633             wd[i]=6+(k2%26);                  /* Write the Z character       */
1634         }
1635     }
1636
1637     /* Fill up to the end of the dictionary block with PAD characters        */
1638
1639     for (; i<9; i++) wd[i]=5;
1640
1641     /* The array of Z-chars is converted to three 2-byte blocks              */
1642
1643     tot = wd[2] + wd[1]*(1<<5) + wd[0]*(1<<10);
1644     prepared_sort[1]=tot%0x100;
1645     prepared_sort[0]=(tot/0x100)%0x100;
1646     tot = wd[5] + wd[4]*(1<<5) + wd[3]*(1<<10);
1647     prepared_sort[3]=tot%0x100;
1648     prepared_sort[2]=(tot/0x100)%0x100;
1649     tot = wd[8] + wd[7]*(1<<5) + wd[6]*(1<<10);
1650     prepared_sort[5]=tot%0x100;
1651     prepared_sort[4]=(tot/0x100)%0x100;
1652
1653     /* Set the "end bit" on the 2nd (in v3) or the 3rd (v4+) 2-byte block    */
1654
1655     if (version_number==3) prepared_sort[2]+=0x80;
1656                       else prepared_sort[4]+=0x80;
1657
1658     if (optresult) copy_sorts(optresult, prepared_sort);
1659 }
1660
1661 /* Also used by verbs.c */
1662 static void dictionary_prepare_g(char *dword, uchar *optresult)
1663 {
1664   int i, j, k;
1665   int32 unicode;
1666
1667   number_and_case = 0;
1668
1669   for (i=0, j=0; (dword[j]!=0); i++, j++) {
1670     if ((dword[j] == '/') && (dword[j+1] == '/')) {
1671       for (j+=2; dword[j] != 0; j++) {
1672         switch(dword[j]) {
1673         case 'p':
1674           number_and_case |= 4;
1675           break;
1676         default:
1677           error_named("Expected 'p' after '//' \
1678 to give gender or number of dictionary word", dword);
1679           break;
1680         }
1681       }
1682       break;
1683     }
1684     if (i>=DICT_WORD_SIZE) break;
1685
1686     k= ((unsigned char *)dword)[j];
1687     if (k=='\'')
1688       warning_named("Obsolete usage: use the ^ character for the \
1689 apostrophe in", dword);
1690     if (k=='^')
1691       k='\'';
1692     if (k=='~') /* as in iso_to_alphabet_grid */
1693       k='\"';
1694
1695     if (k=='@' || (character_set_unicode && (k & 0x80))) {
1696       unicode = text_to_unicode(dword+j);
1697       j += textual_form_length - 1;
1698     }
1699     else {
1700       unicode = iso_to_unicode_grid[k];
1701     }
1702
1703     if (DICT_CHAR_SIZE != 1 || (unicode >= 0 && unicode < 256)) {
1704       k = unicode;
1705     }
1706     else {
1707       error("The dictionary cannot contain Unicode characters beyond Latin-1. \
1708 Define DICT_CHAR_SIZE=4 for a Unicode-compatible dictionary.");
1709       k = '?';
1710     }
1711
1712     if (k >= (unsigned)'A' && k <= (unsigned)'Z')
1713       k += ('a' - 'A');
1714
1715     if (DICT_CHAR_SIZE == 1) {
1716       prepared_sort[i] = k;
1717     }
1718     else {
1719       prepared_sort[4*i]   = (k >> 24) & 0xFF;
1720       prepared_sort[4*i+1] = (k >> 16) & 0xFF;
1721       prepared_sort[4*i+2] = (k >>  8) & 0xFF;
1722       prepared_sort[4*i+3] = (k)       & 0xFF;
1723     }
1724   }
1725
1726   if (DICT_CHAR_SIZE == 1) {
1727     for (; i<DICT_WORD_SIZE; i++)
1728       prepared_sort[i] = 0;
1729   }
1730   else {
1731     for (; i<DICT_WORD_SIZE; i++) {
1732       prepared_sort[4*i]   = 0;
1733       prepared_sort[4*i+1] = 0;
1734       prepared_sort[4*i+2] = 0;
1735       prepared_sort[4*i+3] = 0;
1736     }
1737   }
1738
1739   if (optresult) copy_sorts(optresult, prepared_sort);
1740 }
1741
1742 extern void dictionary_prepare(char *dword, uchar *optresult)
1743 {
1744   if (!glulx_mode)
1745     dictionary_prepare_z(dword, optresult);
1746   else
1747     dictionary_prepare_g(dword, optresult);
1748 }
1749
1750 /* ------------------------------------------------------------------------- */
1751 /*   The arrays below are all concerned with the problem of alphabetically   */
1752 /*   sorting the dictionary during the compilation pass.                     */
1753 /*   Note that it is not enough simply to apply qsort to the dictionary at   */
1754 /*   the end of the pass: we need to ensure that no duplicates are ever      */
1755 /*   created.                                                                */
1756 /*                                                                           */
1757 /*   dict_sort_codes[n]     the sort code of record n: i.e., of the nth      */
1758 /*                          word to be entered into the dictionary, where    */
1759 /*                          n counts upward from 0                           */
1760 /*                          (n is also called the "accession number")        */
1761 /*                                                                           */
1762 /*   The tree structure encodes an ordering.  The special value VACANT means */
1763 /*   "no node here": otherwise, node numbers are the same as accession       */
1764 /*   numbers.  At all times, "root" holds the node number of the top of the  */
1765 /*   tree; each node has up to two branches, such that the subtree of the    */
1766 /*   left branch is always alphabetically before what's at the node, and     */
1767 /*   the subtree to the right is always after; and all branches are coloured */
1768 /*   either "black" or "red".  These colours are used to detect points where */
1769 /*   the tree is growing asymmetrically (and therefore becoming inefficient  */
1770 /*   to search).                                                             */
1771 /* ------------------------------------------------------------------------- */
1772
1773 #define RED    'r'
1774 #define BLACK  'b'
1775 #define VACANT -1
1776
1777 static int root;
1778 typedef struct dict_tree_node_s
1779 {   int  branch[2];               /* Branch 0 is "left", 1 is "right" */
1780     char colour;                  /* The colour of the branch to the parent */
1781 } dict_tree_node;
1782
1783 static dict_tree_node *dtree;
1784
1785 int   *final_dict_order;
1786 static uchar *dict_sort_codes;
1787
1788 static void dictionary_begin_pass(void)
1789 {
1790     /*  Leave room for the 7-byte header (added in "tables.c" much later)    */
1791     /*  Glulx has a 4-byte header instead. */
1792
1793     if (!glulx_mode)
1794         dictionary_top=dictionary+7;
1795     else
1796         dictionary_top=dictionary+4;
1797
1798     root = VACANT;
1799     dict_entries = 0;
1800 }
1801
1802 static int fdo_count;
1803 static void recursively_sort(int node)
1804 {   if (dtree[node].branch[0] != VACANT)
1805         recursively_sort(dtree[node].branch[0]);
1806     final_dict_order[node] = fdo_count++;
1807     if (dtree[node].branch[1] != VACANT)
1808         recursively_sort(dtree[node].branch[1]);
1809 }
1810
1811 extern void sort_dictionary(void)
1812 {   int i;
1813     if (module_switch)
1814     {   for (i=0; i<dict_entries; i++)
1815             final_dict_order[i] = i;
1816         return;
1817     }
1818
1819     if (root != VACANT)
1820     {   fdo_count = 0; recursively_sort(root);
1821     }
1822 }
1823
1824 /* ------------------------------------------------------------------------- */
1825 /*   If "dword" is in the dictionary, return its accession number plus 1;    */
1826 /*   If not, return 0.                                                       */
1827 /* ------------------------------------------------------------------------- */
1828
1829 static int dictionary_find(char *dword)
1830 {   int at = root, n;
1831
1832     dictionary_prepare(dword, NULL);
1833
1834     while (at != VACANT)
1835     {   n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
1836         if (n==0) return at + 1;
1837         if (n>0) at = dtree[at].branch[1]; else at = dtree[at].branch[0];
1838     }
1839     return 0;
1840 }
1841
1842 /* ------------------------------------------------------------------------- */
1843 /*  Add "dword" to the dictionary with (x,y,z) as its data fields; unless    */
1844 /*  it already exists, in which case OR the data with (x,y,z)                */
1845 /*                                                                           */
1846 /*  These fields are one byte each in Z-code, two bytes each in Glulx.       */
1847 /*                                                                           */
1848 /*  Returns: the accession number.                                           */
1849 /* ------------------------------------------------------------------------- */
1850
1851 extern int dictionary_add(char *dword, int x, int y, int z)
1852 {   int n; uchar *p;
1853     int ggfr = 0, gfr = 0, fr = 0, r = 0;
1854     int ggf = VACANT, gf = VACANT, f = VACANT, at = root;
1855     int a, b;
1856     int res=((version_number==3)?4:6);
1857
1858     dictionary_prepare(dword, NULL);
1859
1860     if (root == VACANT)
1861     {   root = 0; goto CreateEntry;
1862     }
1863     while (TRUE)
1864     {
1865         n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
1866         if (n==0)
1867         {
1868             if (!glulx_mode) {
1869                 p = dictionary+7 + at*(3+res) + res;
1870                 p[0]=(p[0])|x; p[1]=(p[1])|y; p[2]=(p[2])|z;
1871                 if (x & 128) p[0] = (p[0])|number_and_case;
1872             }
1873             else {
1874                 p = dictionary+4 + at*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
1875                 p[0]=(p[0])|(x/256); p[1]=(p[1])|(x%256);
1876                 p[2]=(p[2])|(y/256); p[3]=(p[3])|(y%256);
1877                 p[4]=(p[4])|(z/256); p[5]=(p[5])|(z%256);
1878                 if (x & 128) p[1] = (p[1]) | number_and_case;
1879             }
1880             return at;
1881         }
1882         if (n>0) r=1; else r=0;
1883
1884         a = dtree[at].branch[0]; b = dtree[at].branch[1];
1885         if ((a != VACANT) && (dtree[a].colour == RED) &&
1886             (b != VACANT) && (dtree[b].colour == RED))
1887         {   dtree[a].colour = BLACK;
1888             dtree[b].colour = BLACK;
1889
1890             dtree[at].colour = RED;
1891
1892         /* A tree rotation may be needed to avoid two red links in a row:
1893            e.g.
1894              ggf   (or else gf is root)         ggf (or f is root)
1895               |                                  |
1896               gf                                 f
1897              / \(red)                           / \ (both red)
1898                 f            becomes          gf   at
1899                / \(red)                      /  \ /  \
1900                   at
1901                  /  \
1902
1903            In effect we rehang the "gf" subtree from "f".
1904            See the Technical Manual for further details.
1905         */
1906
1907             if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
1908             {
1909               if (fr == gfr)
1910               { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
1911                 dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
1912                 dtree[f].branch[1-fr] = gf;
1913                 dtree[f].colour = BLACK;
1914                 dtree[gf].colour = RED;
1915                 gf = ggf; gfr = ggfr;
1916               }
1917               else
1918               { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
1919                 dtree[at].colour = BLACK;
1920                 dtree[gf].colour = RED;
1921                 dtree[f].branch[fr] = dtree[at].branch[gfr];
1922                 dtree[gf].branch[gfr] = dtree[at].branch[fr];
1923                 dtree[at].branch[gfr] = f;
1924                 dtree[at].branch[fr] = gf;
1925
1926                 r = 1-r; n = at; if (r==fr) at = f; else at = gf;
1927                 f = n; gf = ggf; fr = 1-r; gfr = ggfr;
1928               }
1929             }
1930         }
1931
1932         if (dtree[at].branch[r] == VACANT)
1933         {   dtree[at].colour = RED;
1934
1935             if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
1936             { if (fr == gfr)
1937               { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
1938                 dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
1939                 dtree[f].branch[1-fr] = gf;
1940                 dtree[f].colour = BLACK;
1941                 dtree[gf].colour = RED;
1942               }
1943               else
1944               { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
1945                 dtree[at].colour = BLACK;
1946                 dtree[gf].colour = RED;
1947                 dtree[f].branch[fr] = dtree[at].branch[gfr];
1948                 dtree[gf].branch[gfr] = dtree[at].branch[fr];
1949                 dtree[at].branch[gfr] = f;
1950                 dtree[at].branch[fr] = gf;
1951
1952                 r = 1-r; n = at; if (r==fr) at = f; else at = gf;
1953                 f = n; gf = ggf;
1954               }
1955             }
1956             dtree[at].branch[r] = dict_entries;
1957             goto CreateEntry;
1958         }
1959         ggf = gf; gf = f; f = at; at = dtree[at].branch[r];
1960         ggfr = gfr; gfr = fr; fr = r;
1961     }
1962
1963     CreateEntry:
1964
1965     if (dict_entries==MAX_DICT_ENTRIES)
1966         memoryerror("MAX_DICT_ENTRIES",MAX_DICT_ENTRIES);
1967
1968     dtree[dict_entries].branch[0] = VACANT;
1969     dtree[dict_entries].branch[1] = VACANT;
1970     dtree[dict_entries].colour    = BLACK;
1971
1972     /*  Address in Inform's own dictionary table to write the record to      */
1973
1974     if (!glulx_mode) {
1975
1976         p = dictionary + (3+res)*dict_entries + 7;
1977
1978         /*  So copy in the 4 (or 6) bytes of Z-coded text and the 3 data
1979             bytes */
1980
1981         p[0]=prepared_sort[0]; p[1]=prepared_sort[1];
1982         p[2]=prepared_sort[2]; p[3]=prepared_sort[3];
1983         if (version_number > 3)
1984           {   p[4]=prepared_sort[4]; p[5]=prepared_sort[5]; }
1985         p[res]=x; p[res+1]=y; p[res+2]=z;
1986         if (x & 128) p[res] = (p[res])|number_and_case;
1987
1988         dictionary_top += res+3;
1989
1990     }
1991     else {
1992         int i;
1993         p = dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*dict_entries;
1994         p[0] = 0x60; /* type byte -- dict word */
1995
1996         p += DICT_CHAR_SIZE;
1997         for (i=0; i<DICT_WORD_BYTES; i++)
1998           p[i] = prepared_sort[i];
1999
2000         p += DICT_WORD_BYTES;
2001         p[0] = 0; p[1] = x;
2002         p[2] = y/256; p[3] = y%256;
2003         p[4] = 0; p[5] = z;
2004         if (x & 128)
2005           p[1] |= number_and_case;
2006
2007         dictionary_top += DICT_ENTRY_BYTE_LENGTH;
2008
2009     }
2010
2011     copy_sorts(dict_sort_codes+dict_entries*DICT_WORD_BYTES, prepared_sort);
2012
2013     return dict_entries++;
2014 }
2015
2016 /* ------------------------------------------------------------------------- */
2017 /*   Used in "tables.c" for "Extend ... only", to renumber a verb-word to a  */
2018 /*   new verb syntax of its own.  (Otherwise existing verb-words never       */
2019 /*   change their verb-numbers.)                                             */
2020 /* ------------------------------------------------------------------------- */
2021
2022 extern void dictionary_set_verb_number(char *dword, int to)
2023 {   int i; uchar *p;
2024     int res=((version_number==3)?4:6);
2025     i=dictionary_find(dword);
2026     if (i!=0)
2027     {
2028         if (!glulx_mode) {
2029             p=dictionary+7+(i-1)*(3+res)+res;
2030             p[1]=to;
2031         }
2032         else {
2033             p=dictionary+4 + (i-1)*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
2034             p[2]=to/256; p[3]=to%256;
2035         }
2036     }
2037 }
2038
2039 /* ------------------------------------------------------------------------- */
2040 /*   Tracing code for the dictionary: used not only by "trace" and text      */
2041 /*   transcription, but also (in the case of "word_to_ascii") in a vital     */
2042 /*   by the linker.                                                          */
2043 /* ------------------------------------------------------------------------- */
2044
2045 static char *d_show_to;
2046 static int d_show_total;
2047
2048 static void show_char(char c)
2049 {   if (d_show_to == NULL) printf("%c", c);
2050     else
2051     {   int i = strlen(d_show_to);
2052         d_show_to[i] = c; d_show_to[i+1] = 0;
2053     }
2054 }
2055
2056 extern void word_to_ascii(uchar *p, char *results)
2057 {   int i, shift, cc, zchar; uchar encoded_word[9];
2058     encoded_word[0] = (((int) p[0])&0x7c)/4;
2059     encoded_word[1] = 8*(((int) p[0])&0x3) + (((int) p[1])&0xe0)/32;
2060     encoded_word[2] = ((int) p[1])&0x1f;
2061     encoded_word[3] = (((int) p[2])&0x7c)/4;
2062     encoded_word[4] = 8*(((int) p[2])&0x3) + (((int) p[3])&0xe0)/32;
2063     encoded_word[5] = ((int) p[3])&0x1f;
2064     if (version_number > 3)
2065     {   encoded_word[6] = (((int) p[4])&0x7c)/4;
2066         encoded_word[7] = 8*(((int) p[4])&0x3) + (((int) p[5])&0xe0)/32;
2067         encoded_word[8] = ((int) p[5])&0x1f;
2068     }
2069
2070     shift = 0; cc = 0;
2071     for (i=0; i< ((version_number==3)?6:9); i++)
2072     {   zchar = encoded_word[i];
2073
2074         if (zchar == 4) shift = 1;
2075         else
2076         if (zchar == 5) shift = 2;
2077         else
2078         {   if ((shift == 2) && (zchar == 6))
2079             {   zchar = 32*encoded_word[i+1] + encoded_word[i+2];
2080                 i += 2;
2081                 if ((zchar>=32) && (zchar<=126))
2082                     results[cc++] = zchar;
2083                 else
2084                 {   zscii_to_text(results+cc, zchar);
2085                     cc = strlen(results);
2086                 }
2087             }
2088             else
2089             {   zscii_to_text(results+cc, (alphabet[shift])[zchar-6]);
2090                 cc = strlen(results);
2091             }
2092             shift = 0;
2093         }
2094     }
2095     results[cc] = 0;
2096 }
2097
2098 static void recursively_show_z(int node)
2099 {   int i, cprinted, flags; uchar *p;
2100     char textual_form[32];
2101     int res = (version_number == 3)?4:6;
2102
2103     if (dtree[node].branch[0] != VACANT)
2104         recursively_show_z(dtree[node].branch[0]);
2105
2106     p = (uchar *)dictionary + 7 + (3+res)*node;
2107
2108     word_to_ascii(p, textual_form);
2109
2110     for (cprinted = 0; textual_form[cprinted]!=0; cprinted++)
2111         show_char(textual_form[cprinted]);
2112     for (; cprinted < 4 + ((version_number==3)?6:9); cprinted++)
2113         show_char(' ');
2114
2115     if (d_show_to == NULL)
2116     {   for (i=0; i<3+res; i++) printf("%02x ",p[i]);
2117
2118         flags = (int) p[res];
2119         if (flags & 128)
2120         {   printf("noun ");
2121             if (flags & 4)  printf("p"); else printf(" ");
2122             printf(" ");
2123         }
2124         else printf("       ");
2125         if (flags & 8)
2126         {   if (grammar_version_number == 1)
2127                 printf("preposition:%d  ", (int) p[res+2]);
2128             else
2129                 printf("preposition    ");
2130         }
2131         if ((flags & 3) == 3) printf("metaverb:%d  ", (int) p[res+1]);
2132         else if ((flags & 3) == 1) printf("verb:%d  ", (int) p[res+1]);
2133         printf("\n");
2134     }
2135
2136     if (d_show_total++ == 5)
2137     {   d_show_total = 0;
2138         if (d_show_to != NULL)
2139         {   write_to_transcript_file(d_show_to);
2140             d_show_to[0] = 0;
2141         }
2142     }
2143
2144     if (dtree[node].branch[1] != VACANT)
2145         recursively_show_z(dtree[node].branch[1]);
2146 }
2147
2148 static void recursively_show_g(int node)
2149 {
2150   warning("### Glulx dictionary-show not yet implemented.\n");
2151 }
2152
2153 static void show_alphabet(int i)
2154 {   int j, c; char chartext[8];
2155
2156     for (j=0; j<26; j++)
2157     {   c = alphabet[i][j];
2158
2159         if (alphabet_used[26*i+j] == 'N') printf("("); else printf(" ");
2160
2161         zscii_to_text(chartext, c);
2162         printf("%s", chartext);
2163
2164         if (alphabet_used[26*i+j] == 'N') printf(")"); else printf(" ");
2165     }
2166     printf("\n");
2167 }
2168
2169 extern void show_dictionary(void)
2170 {   printf("Dictionary contains %d entries:\n",dict_entries);
2171     if (dict_entries != 0)
2172     {   d_show_total = 0; d_show_to = NULL;
2173         if (!glulx_mode)
2174             recursively_show_z(root);
2175         else
2176             recursively_show_g(root);
2177     }
2178     printf("\nZ-machine alphabet entries:\n");
2179     show_alphabet(0);
2180     show_alphabet(1);
2181     show_alphabet(2);
2182 }
2183
2184 extern void write_dictionary_to_transcript(void)
2185 {   char d_buffer[81];
2186
2187     sprintf(d_buffer, "\n[Dictionary contains %d entries:]\n", dict_entries);
2188
2189     d_buffer[0] = 0; write_to_transcript_file(d_buffer);
2190
2191     if (dict_entries != 0)
2192     {   d_show_total = 0; d_show_to = d_buffer;
2193         if (!glulx_mode)
2194             recursively_show_z(root);
2195         else
2196             recursively_show_g(root);
2197     }
2198     if (d_show_total != 0) write_to_transcript_file(d_buffer);
2199 }
2200
2201 /* ========================================================================= */
2202 /*   Data structure management routines                                      */
2203 /* ------------------------------------------------------------------------- */
2204
2205 extern void init_text_vars(void)
2206 {   int j;
2207     bestyet = NULL;
2208     bestyet2 = NULL;
2209     tlbtab = NULL;
2210     grandtable = NULL;
2211     grandflags = NULL;
2212     no_chars_transcribed = 0;
2213     is_abbreviation = FALSE;
2214     put_strings_in_low_memory = FALSE;
2215
2216     for (j=0; j<256; j++) abbrevs_lookup[j] = -1;
2217
2218     total_zchars_trans = 0;
2219
2220     dtree = NULL;
2221     final_dict_order = NULL;
2222     dict_sort_codes = NULL;
2223     dict_entries=0;
2224
2225     initialise_memory_block(&static_strings_area);
2226 }
2227
2228 extern void text_begin_pass(void)
2229 {   abbrevs_lookup_table_made = FALSE;
2230     no_abbreviations=0;
2231     total_chars_trans=0; total_bytes_trans=0;
2232     if (store_the_text) all_text_top=all_text;
2233     dictionary_begin_pass();
2234     low_strings_top = low_strings;
2235
2236     static_strings_extent = 0;
2237     no_strings = 0;
2238     no_dynamic_strings = 0;
2239     no_unicode_chars = 0;
2240 }
2241
2242 /*  Note: for allocation and deallocation of all_the_text, see inform.c      */
2243
2244 extern void text_allocate_arrays(void)
2245 {   abbreviations_at = my_malloc(MAX_ABBREVS*MAX_ABBREV_LENGTH,
2246         "abbreviations");
2247     abbrev_values    = my_calloc(sizeof(int), MAX_ABBREVS, "abbrev values");
2248     abbrev_quality   = my_calloc(sizeof(int), MAX_ABBREVS, "abbrev quality");
2249     abbrev_freqs     = my_calloc(sizeof(int),   MAX_ABBREVS, "abbrev freqs");
2250
2251     dtree            = my_calloc(sizeof(dict_tree_node), MAX_DICT_ENTRIES,
2252                                  "red-black tree for dictionary");
2253     final_dict_order = my_calloc(sizeof(int),  MAX_DICT_ENTRIES,
2254                                  "final dictionary ordering table");
2255     dict_sort_codes  = my_calloc(DICT_WORD_BYTES, MAX_DICT_ENTRIES,
2256                                  "dictionary sort codes");
2257
2258     if (!glulx_mode)
2259         dictionary = my_malloc(9*MAX_DICT_ENTRIES+7,
2260             "dictionary");
2261     else
2262         dictionary = my_malloc(DICT_ENTRY_BYTE_LENGTH*MAX_DICT_ENTRIES+4,
2263             "dictionary");
2264
2265     strings_holding_area
2266          = my_malloc(MAX_STATIC_STRINGS,"static strings holding area");
2267     low_strings = my_malloc(MAX_LOW_STRINGS,"low (abbreviation) strings");
2268
2269     huff_entities = NULL;
2270     hufflist = NULL;
2271     unicode_usage_entries = NULL;
2272     done_compression = FALSE;
2273     compression_table_size = 0;
2274     compressed_offsets = NULL;
2275
2276     MAX_CHARACTER_SET = 0;
2277
2278     if (glulx_mode) {
2279       if (compression_switch) {
2280         int ix;
2281         MAX_CHARACTER_SET = 257 + MAX_ABBREVS + MAX_DYNAMIC_STRINGS
2282           + MAX_UNICODE_CHARS;
2283         huff_entities = my_calloc(sizeof(huffentity_t), MAX_CHARACTER_SET*2+1,
2284           "huffman entities");
2285         hufflist = my_calloc(sizeof(huffentity_t *), MAX_CHARACTER_SET,
2286           "huffman node list");
2287         unicode_usage_entries = my_calloc(sizeof(unicode_usage_t),
2288           MAX_UNICODE_CHARS, "unicode entity entries");
2289         for (ix=0; ix<UNICODE_HASH_BUCKETS; ix++)
2290           unicode_usage_hash[ix] = NULL;
2291       }
2292       compressed_offsets = my_calloc(sizeof(int32), MAX_NUM_STATIC_STRINGS,
2293         "static strings index table");
2294     }
2295 }
2296
2297 extern void text_free_arrays(void)
2298 {
2299     my_free(&strings_holding_area, "static strings holding area");
2300     my_free(&low_strings, "low (abbreviation) strings");
2301     my_free(&abbreviations_at, "abbreviations");
2302     my_free(&abbrev_values,    "abbrev values");
2303     my_free(&abbrev_quality,   "abbrev quality");
2304     my_free(&abbrev_freqs,     "abbrev freqs");
2305
2306     my_free(&dtree,            "red-black tree for dictionary");
2307     my_free(&final_dict_order, "final dictionary ordering table");
2308     my_free(&dict_sort_codes,  "dictionary sort codes");
2309
2310     my_free(&dictionary,"dictionary");
2311
2312     my_free(&compressed_offsets, "static strings index table");
2313     my_free(&hufflist, "huffman node list");
2314     my_free(&huff_entities, "huffman entities");
2315     my_free(&unicode_usage_entries, "unicode entity entities");
2316
2317     deallocate_memory_block(&static_strings_area);
2318 }
2319
2320 extern void ao_free_arrays(void)
2321 {   my_free (&tlbtab,"tlb table");
2322     my_free (&sub_buffer,"sub_buffer");
2323     my_free (&bestyet,"bestyet");
2324     my_free (&bestyet2,"bestyet2");
2325     my_free (&grandtable,"grandtable");
2326     my_free (&grandflags,"grandflags");
2327 }
2328
2329 /* ========================================================================= */