text.c

   1 /* ------------------------------------------------------------------------- */
   2 /*   "text" : Text translation, the abbreviations optimiser, the dictionary  */
   3 /*                                                                           */
   4 /* Copyright (c) Graham Nelson 1993 - 2016                                   */
   5 /*                                                                           */
   6 /* This file is part of Inform.                                              */
   7 /*                                                                           */
   8 /* Inform is free software: you can redistribute it and/or modify            */
   9 /* it under the terms of the GNU General Public License as published by      */
  10 /* the Free Software Foundation, either version 3 of the License, or         */
  11 /* (at your option) any later version.                                       */
  12 /*                                                                           */
  13 /* Inform is distributed in the hope that it will be useful,                 */
  14 /* but WITHOUT ANY WARRANTY; without even the implied warranty of            */
  15 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the              */
  16 /* GNU General Public License for more details.                              */
  17 /*                                                                           */
  18 /* You should have received a copy of the GNU General Public License         */
  19 /* along with Inform. If not, see https://gnu.org/licenses/                  */
  20 /*                                                                           */
  21 /* ------------------------------------------------------------------------- */
  22
  23 #include "header.h"
  24
  25 uchar *low_strings, *low_strings_top;  /* Start and next free byte in the low
  26                                           strings pool */
  27
  28 int32 static_strings_extent;           /* Number of bytes of static strings
  29                                           made so far */
  30 memory_block static_strings_area;      /* Used if (!temporary_files_switch) to
  31                                           hold the static strings area so far */
  32
  33 static uchar *strings_holding_area;    /* Area holding translated strings
  34                                           until they are moved into either
  35                                           a temporary file, or the
  36                                           static_strings_area below */
  37
  38 char *all_text, *all_text_top;         /* Start and next byte free in (large)
  39                                           text buffer holding the entire text
  40                                           of the game, when it is being
  41                                           recorded                           */
  42 int put_strings_in_low_memory,         /* When TRUE, put static strings in
  43                                           the low strings pool at 0x100 rather
  44                                           than in the static strings area    */
  45     is_abbreviation,                   /* When TRUE, the string being trans
  46                                           is itself an abbreviation string
  47                                           so can't make use of abbreviations */
  48     abbrevs_lookup_table_made,         /* The abbreviations lookup table is
  49                                           constructed when the first non-
  50                                           abbreviation string is translated:
  51                                           this flag is TRUE after that       */
  52     abbrevs_lookup[256];               /* Once this has been constructed,
  53                                           abbrevs_lookup[n] = the smallest
  54                                           number of any abbreviation beginning
  55                                           with ASCII character n, or -1
  56                                           if none of the abbreviations do    */
  57 int no_abbreviations;                  /* No of abbreviations defined so far */
  58 uchar *abbreviations_at;                 /* Memory to hold the text of any
  59                                           abbreviation strings declared      */
  60 /* ------------------------------------------------------------------------- */
  61 /*   Glulx string compression storage                                        */
  62 /* ------------------------------------------------------------------------- */
  63
  64 int no_strings;                        /* No of strings in static strings
  65                                           area.                              */
  66 int no_dynamic_strings;                /* No. of @.. string escapes used
  67                                           (actually, the highest value used
  68                                           plus one)                          */
  69 int no_unicode_chars;                  /* Number of distinct Unicode chars
  70                                           used. (Beyond 0xFF.)               */
  71
  72 static int MAX_CHARACTER_SET;          /* Number of possible entities */
  73 huffentity_t *huff_entities;           /* The list of entities (characters,
  74                                           abbreviations, @.. escapes, and
  75                                           the terminator)                    */
  76 static huffentity_t **hufflist;        /* Copy of the list, for sorting      */
  77
  78 int no_huff_entities;                  /* The number of entities in the list */
  79 int huff_unicode_start;                /* Position in the list where Unicode
  80                                           chars begin.                       */
  81 int huff_abbrev_start;                 /* Position in the list where string
  82                                           abbreviations begin.               */
  83 int huff_dynam_start;                  /* Position in the list where @..
  84                                           entities begin.                    */
  85 int huff_entity_root;                  /* The position in the list of the root
  86                                           entry (when considering the table
  87                                           as a tree).                        */
  88
  89 int done_compression;                  /* Has the game text been compressed? */
  90 int32 compression_table_size;          /* Length of the Huffman table, in
  91                                           bytes                              */
  92 int32 compression_string_size;         /* Length of the compressed string
  93                                           data, in bytes                     */
  94 int32 *compressed_offsets;             /* The beginning of every string in
  95                                           the game, relative to the beginning
  96                                           of the Huffman table. (So entry 0
  97                                           is equal to compression_table_size)*/
  98
  99 #define UNICODE_HASH_BUCKETS (64)
 100 unicode_usage_t *unicode_usage_entries;
 101 static unicode_usage_t *unicode_usage_hash[UNICODE_HASH_BUCKETS];
 102
 103 static int unicode_entity_index(int32 unicode);
 104
 105 /* ------------------------------------------------------------------------- */
 106 /*   Abbreviation arrays                                                     */
 107 /* ------------------------------------------------------------------------- */
 108
 109 int *abbrev_values;
 110 int *abbrev_quality;
 111 int *abbrev_freqs;
 112
 113 /* ------------------------------------------------------------------------- */
 114
 115 int32 total_chars_trans,               /* Number of ASCII chars of text in   */
 116       total_bytes_trans,               /* Number of bytes of Z-code text out */
 117       zchars_trans_in_last_string;     /* Number of Z-chars in last string:
 118                                           needed only for abbrev efficiency
 119                                           calculation in "directs.c"         */
 120 static int32 total_zchars_trans,       /* Number of Z-chars of text out
 121                                           (only used to calculate the above) */
 122       no_chars_transcribed;            /* Number of ASCII chars written to
 123                                           the text transcription area (used
 124                                           for the -r and -u switches)        */
 125
 126 static int zchars_out_buffer[3],       /* During text translation, a buffer of
 127                                           3 Z-chars at a time: when it's full
 128                                           these are written as a 2-byte word */
 129            zob_index;                  /* Index (0 to 2) into it             */
 130
 131 static unsigned char *text_out_pc;     /* The "program counter" during text
 132                                           translation: the next address to
 133                                           write Z-coded text output to       */
 134
 135 static unsigned char *text_out_limit;  /* The upper limit of text_out_pc
 136                                           during text translation            */
 137
 138 static int text_out_overflow;          /* During text translation, becomes
 139                                           true if text_out_pc tries to pass
 140                                           text_out_limit                     */
 141
 142 /* ------------------------------------------------------------------------- */
 143 /*   For variables/arrays used by the dictionary manager, see below          */
 144 /* ------------------------------------------------------------------------- */
 145
 146 /* ------------------------------------------------------------------------- */
 147 /*   Prepare the abbreviations lookup table (used to speed up abbreviation   */
 148 /*   detection in text translation).  We first bubble-sort the abbrevs into  */
 149 /*   alphabetical order (this is necessary for the detection algorithm to    */
 150 /*   to work).  Since the table is only prepared once, and for a table       */
 151 /*   of size at most 96, there's no point using an efficient sort algorithm. */
 152 /* ------------------------------------------------------------------------- */
 153
 154 static void make_abbrevs_lookup(void)
 155 {   int bubble_sort, j, k, l; char p[MAX_ABBREV_LENGTH]; char *p1, *p2;
 156     do
 157     {   bubble_sort = FALSE;
 158         for (j=0; j<no_abbreviations; j++)
 159             for (k=j+1; k<no_abbreviations; k++)
 160             {   p1=(char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
 161                 p2=(char *)abbreviations_at+k*MAX_ABBREV_LENGTH;
 162                 if (strcmp(p1,p2)<0)
 163                 {   strcpy(p,p1); strcpy(p1,p2); strcpy(p2,p);
 164                     l=abbrev_values[j]; abbrev_values[j]=abbrev_values[k];
 165                     abbrev_values[k]=l;
 166                     l=abbrev_quality[j]; abbrev_quality[j]=abbrev_quality[k];
 167                     abbrev_quality[k]=l;
 168                     bubble_sort = TRUE;
 169                 }
 170             }
 171     } while (bubble_sort);
 172
 173     for (j=no_abbreviations-1; j>=0; j--)
 174     {   p1=(char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
 175         abbrevs_lookup[(uchar)p1[0]]=j;
 176         abbrev_freqs[j]=0;
 177     }
 178     abbrevs_lookup_table_made = TRUE;
 179 }
 180
 181 /* ------------------------------------------------------------------------- */
 182 /*   Search the abbreviations lookup table (a routine which must be fast).   */
 183 /*   The source text to compare is text[i], text[i+1], ... and this routine  */
 184 /*   is only called if text[i] is indeed the first character of at least one */
 185 /*   abbreviation, "from" begin the least index into the abbreviations table */
 186 /*   of an abbreviation for which text[i] is the first character.  Recall    */
 187 /*   that the abbrevs table is in alphabetical order.                        */
 188 /*                                                                           */
 189 /*   The return value is -1 if there is no match.  If there is a match, the  */
 190 /*   text to be abbreviated out is over-written by a string of null chars    */
 191 /*   with "ASCII" value 1, and the abbreviation number is returned.          */
 192 /*                                                                           */
 193 /*   In Glulx, we *do not* do this overwriting with 1's.                     */
 194 /* ------------------------------------------------------------------------- */
 195
 196 static int try_abbreviations_from(unsigned char *text, int i, int from)
 197 {   int j, k; uchar *p, c;
 198     c=text[i];
 199     for (j=from, p=(uchar *)abbreviations_at+from*MAX_ABBREV_LENGTH;
 200          (j<no_abbreviations)&&(c==p[0]); j++, p+=MAX_ABBREV_LENGTH)
 201     {   if (text[i+1]==p[1])
 202         {   for (k=2; p[k]!=0; k++)
 203                 if (text[i+k]!=p[k]) goto NotMatched;
 204             if (!glulx_mode) {
 205                 for (k=0; p[k]!=0; k++) text[i+k]=1;
 206             }
 207             abbrev_freqs[j]++;
 208             return(j);
 209             NotMatched: ;
 210         }
 211     }
 212     return(-1);
 213 }
 214
 215 extern void make_abbreviation(char *text)
 216 {
 217     strcpy((char *)abbreviations_at
 218             + no_abbreviations*MAX_ABBREV_LENGTH, text);
 219
 220     is_abbreviation = TRUE;
 221     abbrev_values[no_abbreviations] = compile_string(text, TRUE, TRUE);
 222     is_abbreviation = FALSE;
 223
 224     /*   The quality is the number of Z-chars saved by using this            */
 225     /*   abbreviation: note that it takes 2 Z-chars to print it.             */
 226
 227     abbrev_quality[no_abbreviations++] = zchars_trans_in_last_string - 2;
 228 }
 229
 230 /* ------------------------------------------------------------------------- */
 231 /*   The front end routine for text translation                              */
 232 /* ------------------------------------------------------------------------- */
 233
 234 extern int32 compile_string(char *b, int in_low_memory, int is_abbrev)
 235 {   int i, j; uchar *c;
 236
 237     is_abbreviation = is_abbrev;
 238
 239     /* Put into the low memory pool (at 0x100 in the Z-machine) of strings   */
 240     /* which may be wanted as possible entries in the abbreviations table    */
 241
 242     if (!glulx_mode && in_low_memory)
 243     {   j=subtract_pointers(low_strings_top,low_strings);
 244         low_strings_top=translate_text(low_strings_top, low_strings+MAX_LOW_STRINGS, b);
 245         if (!low_strings_top)
 246             memoryerror("MAX_LOW_STRINGS", MAX_LOW_STRINGS);
 247         is_abbreviation = FALSE;
 248         return(0x21+(j/2));
 249     }
 250
 251     if (glulx_mode && done_compression)
 252         compiler_error("Tried to add a string after compression was done.");
 253
 254     c = translate_text(strings_holding_area, strings_holding_area+MAX_STATIC_STRINGS, b);
 255     if (!c)
 256         memoryerror("MAX_STATIC_STRINGS",MAX_STATIC_STRINGS);
 257
 258     i = subtract_pointers(c, strings_holding_area);
 259
 260     /* Insert null bytes as needed to ensure that the next static string */
 261     /* also occurs at an address expressible as a packed address         */
 262
 263     if (!glulx_mode) {
 264         int textalign;
 265         if (oddeven_packing_switch)
 266             textalign = scale_factor*2;
 267         else
 268             textalign = scale_factor;
 269         while ((i%textalign)!=0)
 270         {
 271             if (i+2 > MAX_STATIC_STRINGS)
 272                 memoryerror("MAX_STATIC_STRINGS",MAX_STATIC_STRINGS);
 273             i+=2; *c++ = 0; *c++ = 0;
 274         }
 275     }
 276
 277     j = static_strings_extent;
 278
 279     if (temporary_files_switch)
 280         for (c=strings_holding_area; c<strings_holding_area+i;
 281              c++, static_strings_extent++)
 282             fputc(*c,Temp1_fp);
 283     else
 284         for (c=strings_holding_area; c<strings_holding_area+i;
 285              c++, static_strings_extent++)
 286             write_byte_to_memory_block(&static_strings_area,
 287                 static_strings_extent, *c);
 288
 289     is_abbreviation = FALSE;
 290
 291     if (!glulx_mode) {
 292         return(j/scale_factor);
 293     }
 294     else {
 295         /* The marker value is a one-based string number. (We reserve zero
 296            to mean "not a string at all". */
 297         return (++no_strings);
 298     }
 299 }
 300
 301 /* ------------------------------------------------------------------------- */
 302 /*   Output a single Z-character into the buffer, and flush it if full       */
 303 /* ------------------------------------------------------------------------- */
 304
 305 static void write_z_char_z(int i)
 306 {   uint32 j;
 307     ASSERT_ZCODE();
 308     total_zchars_trans++;
 309     zchars_out_buffer[zob_index++]=(i%32);
 310     if (zob_index!=3) return;
 311     zob_index=0;
 312     j= zchars_out_buffer[0]*0x0400 + zchars_out_buffer[1]*0x0020
 313        + zchars_out_buffer[2];
 314     if (text_out_pc+2 > text_out_limit) {
 315         text_out_overflow = TRUE;
 316         return;
 317     }
 318     text_out_pc[0] = j/256; text_out_pc[1] = j%256; text_out_pc+=2;
 319     total_bytes_trans+=2;
 320 }
 321
 322 static void write_zscii(int zsc)
 323 {
 324     int lookup_value, in_alphabet;
 325
 326     if (zsc==' ')
 327     {   write_z_char_z(0);
 328         return;
 329     }
 330
 331     if (zsc < 0x100) lookup_value = zscii_to_alphabet_grid[zsc];
 332
 333     else lookup_value = -1;
 334
 335     if (lookup_value >= 0)
 336     {   alphabet_used[lookup_value] = 'Y';
 337         in_alphabet = lookup_value/26;
 338         if (in_alphabet==1) write_z_char_z(4);  /* SHIFT to A1 */
 339         if (in_alphabet==2) write_z_char_z(5);  /* SHIFT to A2 */
 340         write_z_char_z(lookup_value%26 + 6);
 341     }
 342     else
 343     {   write_z_char_z(5); write_z_char_z(6);
 344         write_z_char_z(zsc/32); write_z_char_z(zsc%32);
 345     }
 346 }
 347
 348 /* ------------------------------------------------------------------------- */
 349 /*   Finish a Z-coded string, padding out with Z-char 5s if necessary and    */
 350 /*   setting the "end" bit on the final 2-byte word                          */
 351 /* ------------------------------------------------------------------------- */
 352
 353 static void end_z_chars(void)
 354 {   unsigned char *p;
 355     zchars_trans_in_last_string=total_zchars_trans-zchars_trans_in_last_string;
 356     while (zob_index!=0) write_z_char_z(5);
 357     p=(unsigned char *) text_out_pc;
 358     *(p-2)= *(p-2)+128;
 359 }
 360
 361 /* Glulx handles this much more simply -- compression is done elsewhere. */
 362 static void write_z_char_g(int i)
 363 {
 364   ASSERT_GLULX();
 365   if (text_out_pc+1 > text_out_limit) {
 366       text_out_overflow = TRUE;
 367       return;
 368   }
 369   total_zchars_trans++;
 370   text_out_pc[0] = i;
 371   text_out_pc++;
 372   total_bytes_trans++;
 373 }
 374
 375 /* ------------------------------------------------------------------------- */
 376 /*   The main routine "text.c" provides to the rest of Inform: the text      */
 377 /*   translator. p is the address to write output to, s_text the source text */
 378 /*   and the return value is the next free address to write output to.       */
 379 /*   The return value will not exceed p_limit. If the translation tries to   */
 380 /*   overflow this boundary, the return value will be NULL (and you should   */
 381 /*   display an error).                                                      */
 382 /*   Note that the source text may be corrupted by this routine.             */
 383 /* ------------------------------------------------------------------------- */
 384
 385 extern uchar *translate_text(uchar *p, uchar *p_limit, char *s_text)
 386 {   int i, j, k, in_alphabet, lookup_value;
 387     int32 unicode; int zscii;
 388     unsigned char *text_in;
 389
 390     /*  Cast the input and output streams to unsigned char: text_out_pc will
 391         advance as bytes of Z-coded text are written, but text_in doesn't    */
 392
 393     text_in     = (unsigned char *) s_text;
 394     text_out_pc = (unsigned char *) p;
 395     text_out_limit = (unsigned char *) p_limit;
 396     text_out_overflow = FALSE;
 397
 398     /*  Remember the Z-chars total so that later we can subtract to find the
 399         number of Z-chars translated on this string                          */
 400
 401     zchars_trans_in_last_string = total_zchars_trans;
 402
 403     /*  Start with the Z-characters output buffer empty                      */
 404
 405     zob_index=0;
 406
 407     /*  If this is the first text translated since the abbreviations were
 408         declared, and if some were declared, then it's time to make the
 409         lookup table for abbreviations
 410
 411         (Except: we don't if the text being translated is itself
 412         the text of an abbreviation currently being defined)                 */
 413
 414     if ((!abbrevs_lookup_table_made) && (no_abbreviations > 0)
 415         && (!is_abbreviation))
 416         make_abbrevs_lookup();
 417
 418     /*  If we're storing the whole game text to memory, then add this text   */
 419
 420     if ((!is_abbreviation) && (store_the_text))
 421     {   no_chars_transcribed += strlen(s_text)+2;
 422         if (no_chars_transcribed >= MAX_TRANSCRIPT_SIZE)
 423             memoryerror("MAX_TRANSCRIPT_SIZE", MAX_TRANSCRIPT_SIZE);
 424         sprintf(all_text_top, "%s\n\n", s_text);
 425         all_text_top += strlen(all_text_top);
 426     }
 427
 428     if (transcript_switch && (!veneer_mode))
 429         write_to_transcript_file(s_text);
 430
 431   if (!glulx_mode) {
 432
 433     /*  The empty string of Z-text is illegal, since it can't carry an end
 434         bit: so we translate an empty string of ASCII text to just the
 435         pad character 5.  Printing this causes nothing to appear on screen.  */
 436
 437     if (text_in[0]==0) write_z_char_z(5);
 438
 439     /*  Loop through the characters of the null-terminated input text: note
 440         that if 1 is written over a character in the input text, it is
 441         afterwards ignored                                                   */
 442
 443     for (i=0; text_in[i]!=0; i++)
 444     {   total_chars_trans++;
 445
 446         /*  Contract ".  " into ". " if double-space-removing switch set:
 447             likewise "?  " and "!  " if the setting is high enough           */
 448
 449         if ((double_space_setting >= 1)
 450             && (text_in[i+1]==' ') && (text_in[i+2]==' '))
 451         {   if (text_in[i]=='.') text_in[i+2]=1;
 452             if (double_space_setting >= 2)
 453             {   if (text_in[i]=='?') text_in[i+2]=1;
 454                 if (text_in[i]=='!') text_in[i+2]=1;
 455             }
 456         }
 457
 458         /*  Try abbreviations if the economy switch set                      */
 459
 460         if ((economy_switch) && (!is_abbreviation)
 461             && ((k=abbrevs_lookup[text_in[i]])!=-1))
 462         {   if ((j=try_abbreviations_from(text_in, i, k))!=-1)
 463             {   if (j<32) { write_z_char_z(2); write_z_char_z(j); }
 464                 else { write_z_char_z(3); write_z_char_z(j-32); }
 465             }
 466         }
 467
 468         /* If Unicode switch set, use text_to_unicode to perform UTF-8
 469            decoding */
 470         if (character_set_unicode && (text_in[i] & 0x80))
 471         {   unicode = text_to_unicode((char *) (text_in+i));
 472             zscii = unicode_to_zscii(unicode);
 473             if (zscii != 5) write_zscii(zscii);
 474             else
 475             {   unicode_char_error(
 476                     "Character can only be used if declared in \
 477 advance as part of 'Zcharacter table':", unicode);
 478             }
 479             i += textual_form_length - 1;
 480             continue;
 481         }
 482
 483         /*  '@' is the escape character in Inform string notation: the various
 484             possibilities are:
 485
 486                 (printing only)
 487                 @@decimalnumber  :  write this ZSCII char (0 to 1023)
 488                 @twodigits       :  write the abbreviation string with this
 489                                     decimal number
 490
 491                 (any string context)
 492                 @accentcode      :  this accented character: e.g.,
 493                                         for @'e write an E-acute
 494                 @{...}           :  this Unicode char (in hex)              */
 495
 496         if (text_in[i]=='@')
 497         {   if (text_in[i+1]=='@')
 498             {
 499                 /*   @@...   */
 500
 501                 i+=2; j=atoi((char *) (text_in+i));
 502                 switch(j)
 503                 {   /* Prevent ~ and ^ from being translated to double-quote
 504                        and new-line, as they ordinarily would be */
 505
 506                     case 94:   write_z_char_z(5); write_z_char_z(6);
 507                                write_z_char_z(94/32); write_z_char_z(94%32);
 508                                break;
 509                     case 126:  write_z_char_z(5); write_z_char_z(6);
 510                                write_z_char_z(126/32); write_z_char_z(126%32);
 511                                break;
 512
 513                     default:   write_zscii(j); break;
 514                 }
 515                 while (isdigit(text_in[i])) i++; i--;
 516             }
 517             else if (isdigit(text_in[i+1])!=0)
 518             {   int d1, d2;
 519
 520                 /*   @..   */
 521
 522                 d1 = character_digit_value[text_in[i+1]];
 523                 d2 = character_digit_value[text_in[i+2]];
 524                 if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10))
 525                     error("'@..' must have two decimal digits");
 526                 else
 527                 {   i+=2;
 528                     write_z_char_z(1); write_z_char_z(d1*10 + d2);
 529                 }
 530             }
 531             else
 532             {
 533                 /*   A string escape specifying an unusual character   */
 534
 535                 unicode = text_to_unicode((char *) (text_in+i));
 536                 zscii = unicode_to_zscii(unicode);
 537                 if (zscii != 5) write_zscii(zscii);
 538                 else
 539                 {   unicode_char_error(
 540                        "Character can only be used if declared in \
 541 advance as part of 'Zcharacter table':", unicode);
 542                 }
 543                 i += textual_form_length - 1;
 544             }
 545         }
 546         else
 547         {   /*  Skip a character which has been over-written with the null
 548                 value 1 earlier on                                           */
 549
 550             if (text_in[i]!=1)
 551             {   if (text_in[i]==' ') write_z_char_z(0);
 552                 else
 553                 {   j = (int) text_in[i];
 554                     lookup_value = iso_to_alphabet_grid[j];
 555                     if (lookup_value < 0)
 556                     {   /*  The character isn't in the standard alphabets, so
 557                             we have to use the ZSCII 4-Z-char sequence */
 558
 559                         if (lookup_value == -5)
 560                         {   /*  Character isn't in the ZSCII set at all */
 561
 562                             unicode = iso_to_unicode(j);
 563                             unicode_char_error(
 564                                 "Character can only be used if declared in \
 565 advance as part of 'Zcharacter table':", unicode);
 566                             write_zscii(0x200 + unicode/0x100);
 567                             write_zscii(0x300 + unicode%0x100);
 568                         }
 569                         else write_zscii(-lookup_value);
 570                     }
 571                     else
 572                     {   /*  The character is in one of the standard alphabets:
 573                             write a SHIFT to temporarily change alphabet if
 574                             it isn't in alphabet 0, then write the Z-char    */
 575
 576                         alphabet_used[lookup_value] = 'Y';
 577                         in_alphabet = lookup_value/26;
 578                         if (in_alphabet==1) write_z_char_z(4);  /* SHIFT to A1 */
 579                         if (in_alphabet==2) write_z_char_z(5);  /* SHIFT to A2 */
 580                         write_z_char_z(lookup_value%26 + 6);
 581                     }
 582                 }
 583             }
 584         }
 585     }
 586
 587     /*  Flush the Z-characters output buffer and set the "end" bit           */
 588
 589     end_z_chars();
 590
 591   }
 592   else {
 593
 594     /* The text storage here is, of course, temporary. Compression
 595        will occur when we're finished compiling, so that all the
 596        clever Huffman stuff will work.
 597        In the stored text, we use "@@" to indicate @,
 598        "@0" to indicate a zero byte,
 599        "@ANNNN" to indicate an abbreviation,
 600        "@DNNNN" to indicate a dynamic string thing.
 601        "@UNNNN" to indicate a four-byte Unicode value (0x100 or higher).
 602        (NNNN is a four-digit hex number using the letters A-P... an
 603        ugly representation but a convenient one.)
 604     */
 605
 606     for (i=0; text_in[i]!=0; i++) {
 607
 608       /*  Contract ".  " into ". " if double-space-removing switch set:
 609           likewise "?  " and "!  " if the setting is high enough. */
 610       if ((double_space_setting >= 1)
 611         && (text_in[i+1]==' ') && (text_in[i+2]==' ')) {
 612         if (text_in[i]=='.'
 613           || (double_space_setting >= 2
 614             && (text_in[i]=='?' || text_in[i]=='!'))) {
 615           text_in[i+1] = text_in[i];
 616           i++;
 617         }
 618       }
 619
 620       total_chars_trans++;
 621
 622       /*  Try abbreviations if the economy switch set. We have to be in
 623           compression mode too, since the abbreviation mechanism is part
 624           of string decompression. */
 625
 626       if ((economy_switch) && (compression_switch) && (!is_abbreviation)
 627         && ((k=abbrevs_lookup[text_in[i]])!=-1)
 628         && ((j=try_abbreviations_from(text_in, i, k)) != -1)) {
 629         char *cx = (char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
 630         i += (strlen(cx)-1);
 631         write_z_char_g('@');
 632         write_z_char_g('A');
 633         write_z_char_g('A' + ((j >>12) & 0x0F));
 634         write_z_char_g('A' + ((j >> 8) & 0x0F));
 635         write_z_char_g('A' + ((j >> 4) & 0x0F));
 636         write_z_char_g('A' + ((j     ) & 0x0F));
 637       }
 638       else if (text_in[i] == '@') {
 639         if (text_in[i+1]=='@') {
 640           /* An ASCII code */
 641           i+=2; j=atoi((char *) (text_in+i));
 642           if (j == '@' || j == '\0') {
 643             write_z_char_g('@');
 644             if (j == 0) {
 645               j = '0';
 646               if (!compression_switch)
 647                 warning("Ascii @@0 will prematurely terminate non-compressed \
 648 string.");
 649             }
 650           }
 651           write_z_char_g(j);
 652           while (isdigit(text_in[i])) i++; i--;
 653         }
 654         else if (isdigit(text_in[i+1])) {
 655           int d1, d2;
 656           d1 = character_digit_value[text_in[i+1]];
 657           d2 = character_digit_value[text_in[i+2]];
 658           if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10)) {
 659             error("'@..' must have two decimal digits");
 660           }
 661           else {
 662             if (!compression_switch)
 663               warning("'@..' print variable will not work in non-compressed \
 664 string; substituting '   '.");
 665             i += 2;
 666             j = d1*10 + d2;
 667             if (j >= MAX_DYNAMIC_STRINGS) {
 668               memoryerror("MAX_DYNAMIC_STRINGS", MAX_DYNAMIC_STRINGS);
 669               j = 0;
 670             }
 671             if (j+1 >= no_dynamic_strings)
 672               no_dynamic_strings = j+1;
 673             write_z_char_g('@');
 674             write_z_char_g('D');
 675             write_z_char_g('A' + ((j >>12) & 0x0F));
 676             write_z_char_g('A' + ((j >> 8) & 0x0F));
 677             write_z_char_g('A' + ((j >> 4) & 0x0F));
 678             write_z_char_g('A' + ((j     ) & 0x0F));
 679           }
 680         }
 681         else {
 682           unicode = text_to_unicode((char *) (text_in+i));
 683           i += textual_form_length - 1;
 684           if (unicode == '@' || unicode == '\0') {
 685             write_z_char_g('@');
 686             write_z_char_g(unicode ? '@' : '0');
 687           }
 688           else if (unicode >= 0 && unicode < 256) {
 689             write_z_char_g(unicode);
 690           }
 691           else {
 692             if (!compression_switch) {
 693               warning("Unicode characters will not work in non-compressed \
 694 string; substituting '?'.");
 695               write_z_char_g('?');
 696             }
 697             else {
 698               j = unicode_entity_index(unicode);
 699               write_z_char_g('@');
 700               write_z_char_g('U');
 701               write_z_char_g('A' + ((j >>12) & 0x0F));
 702               write_z_char_g('A' + ((j >> 8) & 0x0F));
 703               write_z_char_g('A' + ((j >> 4) & 0x0F));
 704               write_z_char_g('A' + ((j     ) & 0x0F));
 705             }
 706           }
 707         }
 708       }
 709       else if (text_in[i] == '^')
 710         write_z_char_g(0x0A);
 711       else if (text_in[i] == '~')
 712         write_z_char_g('"');
 713       else if (character_set_unicode) {
 714         if (text_in[i] & 0x80) {
 715           unicode = text_to_unicode((char *) (text_in+i));
 716           i += textual_form_length - 1;
 717           if (unicode >= 0 && unicode < 256) {
 718             write_z_char_g(unicode);
 719           }
 720           else {
 721             if (!compression_switch) {
 722               warning("Unicode characters will not work in non-compressed \
 723 string; substituting '?'.");
 724               write_z_char_g('?');
 725             }
 726             else {
 727               j = unicode_entity_index(unicode);
 728               write_z_char_g('@');
 729               write_z_char_g('U');
 730               write_z_char_g('A' + ((j >>12) & 0x0F));
 731               write_z_char_g('A' + ((j >> 8) & 0x0F));
 732               write_z_char_g('A' + ((j >> 4) & 0x0F));
 733               write_z_char_g('A' + ((j     ) & 0x0F));
 734             }
 735           }
 736         }
 737         else {
 738           write_z_char_g(text_in[i]);
 739         }
 740       }
 741       else {
 742         unicode = iso_to_unicode_grid[text_in[i]];
 743         if (unicode >= 0 && unicode < 256) {
 744           write_z_char_g(unicode);
 745         }
 746         else {
 747           if (!compression_switch) {
 748             warning("Unicode characters will not work in non-compressed \
 749 string; substituting '?'.");
 750             write_z_char_g('?');
 751           }
 752           else {
 753             j = unicode_entity_index(unicode);
 754             write_z_char_g('@');
 755             write_z_char_g('U');
 756             write_z_char_g('A' + ((j >>12) & 0x0F));
 757             write_z_char_g('A' + ((j >> 8) & 0x0F));
 758             write_z_char_g('A' + ((j >> 4) & 0x0F));
 759             write_z_char_g('A' + ((j     ) & 0x0F));
 760           }
 761         }
 762       }
 763     }
 764     write_z_char_g(0);
 765
 766   }
 767
 768   if (text_out_overflow)
 769       return NULL;
 770   else
 771       return((uchar *) text_out_pc);
 772 }
 773
 774 static int unicode_entity_index(int32 unicode)
 775 {
 776   unicode_usage_t *uptr;
 777   int j;
 778   int buck = unicode % UNICODE_HASH_BUCKETS;
 779
 780   for (uptr = unicode_usage_hash[buck]; uptr; uptr=uptr->next) {
 781     if (uptr->ch == unicode)
 782       break;
 783   }
 784   if (uptr) {
 785     j = (uptr - unicode_usage_entries);
 786   }
 787   else {
 788     if (no_unicode_chars >= MAX_UNICODE_CHARS) {
 789       memoryerror("MAX_UNICODE_CHARS", MAX_UNICODE_CHARS);
 790       j = 0;
 791     }
 792     else {
 793       j = no_unicode_chars;
 794       no_unicode_chars++;
 795       uptr = unicode_usage_entries + j;
 796       uptr->ch = unicode;
 797       uptr->next = unicode_usage_hash[buck];
 798       unicode_usage_hash[buck] = uptr;
 799     }
 800   }
 801
 802   return j;
 803 }
 804
 805 /* ------------------------------------------------------------------------- */
 806 /*   Glulx compression code                                                  */
 807 /* ------------------------------------------------------------------------- */
 808
 809
 810 static void compress_makebits(int entnum, int depth, int prevbit,
 811   huffbitlist_t *bits);
 812
 813 /*   The compressor. This uses the usual Huffman compression algorithm. */
 814 void compress_game_text()
 815 {
 816   int entities=0, branchstart, branches;
 817   int numlive;
 818   int32 lx;
 819   int jx;
 820   int ch;
 821   int32 ix;
 822   huffbitlist_t bits;
 823
 824   if (compression_switch) {
 825
 826     /* How many entities have we currently got? Well, 256 plus the
 827        string-terminator plus Unicode chars plus abbrevations plus
 828        dynamic strings. */
 829     entities = 256+1;
 830     huff_unicode_start = entities;
 831     entities += no_unicode_chars;
 832     huff_abbrev_start = entities;
 833     if (economy_switch)
 834       entities += no_abbreviations;
 835     huff_dynam_start = entities;
 836     entities += no_dynamic_strings;
 837
 838     if (entities > MAX_CHARACTER_SET)
 839       memoryerror("MAX_CHARACTER_SET",MAX_CHARACTER_SET);
 840
 841     /* Characters */
 842     for (jx=0; jx<256; jx++) {
 843       huff_entities[jx].type = 2;
 844       huff_entities[jx].count = 0;
 845       huff_entities[jx].u.ch = jx;
 846     }
 847     /* Terminator */
 848     huff_entities[256].type = 1;
 849     huff_entities[256].count = 0;
 850     for (jx=0; jx<no_unicode_chars; jx++) {
 851       huff_entities[huff_unicode_start+jx].type = 4;
 852       huff_entities[huff_unicode_start+jx].count = 0;
 853       huff_entities[huff_unicode_start+jx].u.val = jx;
 854     }
 855     if (economy_switch) {
 856       for (jx=0; jx<no_abbreviations; jx++) {
 857         huff_entities[huff_abbrev_start+jx].type = 3;
 858         huff_entities[huff_abbrev_start+jx].count = 0;
 859         huff_entities[huff_abbrev_start+jx].u.val = jx;
 860       }
 861     }
 862     for (jx=0; jx<no_dynamic_strings; jx++) {
 863       huff_entities[huff_dynam_start+jx].type = 9;
 864       huff_entities[huff_dynam_start+jx].count = 0;
 865       huff_entities[huff_dynam_start+jx].u.val = jx;
 866     }
 867   }
 868   else {
 869     /* No compression; use defaults that will make it easy to check
 870        for errors. */
 871     no_huff_entities = 257;
 872     huff_unicode_start = 257;
 873     huff_abbrev_start = 257;
 874     huff_dynam_start = 257+MAX_ABBREVS;
 875     compression_table_size = 0;
 876   }
 877
 878   if (temporary_files_switch) {
 879     fclose(Temp1_fp);
 880     Temp1_fp=fopen(Temp1_Name,"rb");
 881     if (Temp1_fp==NULL)
 882       fatalerror("I/O failure: couldn't reopen temporary file 1");
 883   }
 884
 885   if (compression_switch) {
 886
 887     for (lx=0, ix=0; lx<no_strings; lx++) {
 888       int escapelen=0, escapetype=0;
 889       int done=FALSE;
 890       int32 escapeval=0;
 891       while (!done) {
 892         if (temporary_files_switch)
 893           ch = fgetc(Temp1_fp);
 894         else
 895           ch = read_byte_from_memory_block(&static_strings_area, ix);
 896         ix++;
 897         if (ix > static_strings_extent || ch < 0)
 898           compiler_error("Read too much not-yet-compressed text.");
 899         if (escapelen == -1) {
 900           escapelen = 0;
 901           if (ch == '@') {
 902             ch = '@';
 903           }
 904           else if (ch == '0') {
 905             ch = '\0';
 906           }
 907           else if (ch == 'A' || ch == 'D' || ch == 'U') {
 908             escapelen = 4;
 909             escapetype = ch;
 910             escapeval = 0;
 911             continue;
 912           }
 913           else {
 914             compiler_error("Strange @ escape in processed text.");
 915           }
 916         }
 917         else if (escapelen) {
 918           escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
 919           escapelen--;
 920           if (escapelen == 0) {
 921             if (escapetype == 'A') {
 922               ch = huff_abbrev_start+escapeval;
 923             }
 924             else if (escapetype == 'D') {
 925               ch = huff_dynam_start+escapeval;
 926             }
 927             else if (escapetype == 'U') {
 928               ch = huff_unicode_start+escapeval;
 929             }
 930             else {
 931               compiler_error("Strange @ escape in processed text.");
 932             }
 933           }
 934           else
 935             continue;
 936         }
 937         else {
 938           if (ch == '@') {
 939             escapelen = -1;
 940             continue;
 941           }
 942           if (ch == 0) {
 943             ch = 256;
 944             done = TRUE;
 945           }
 946         }
 947         huff_entities[ch].count++;
 948       }
 949     }
 950
 951     numlive = 0;
 952     for (jx=0; jx<entities; jx++) {
 953       if (huff_entities[jx].count) {
 954         hufflist[numlive] = &(huff_entities[jx]);
 955         numlive++;
 956       }
 957     }
 958
 959     branchstart = entities;
 960     branches = 0;
 961
 962     while (numlive > 1) {
 963       int best1, best2;
 964       int best1num, best2num;
 965       huffentity_t *bran;
 966
 967       if (hufflist[0]->count < hufflist[1]->count) {
 968         best1 = 0;
 969         best2 = 1;
 970       }
 971       else {
 972         best2 = 0;
 973         best1 = 1;
 974       }
 975
 976       best1num = hufflist[best1]->count;
 977       best2num = hufflist[best2]->count;
 978
 979       for (jx=2; jx<numlive; jx++) {
 980         if (hufflist[jx]->count < best1num) {
 981           best2 = best1;
 982           best2num = best1num;
 983           best1 = jx;
 984           best1num = hufflist[best1]->count;
 985         }
 986         else if (hufflist[jx]->count < best2num) {
 987           best2 = jx;
 988           best2num = hufflist[best2]->count;
 989         }
 990       }
 991
 992       bran = &(huff_entities[branchstart+branches]);
 993       branches++;
 994       bran->type = 0;
 995       bran->count = hufflist[best1]->count + hufflist[best2]->count;
 996       bran->u.branch[0] = (hufflist[best1] - huff_entities);
 997       bran->u.branch[1] = (hufflist[best2] - huff_entities);
 998       hufflist[best1] = bran;
 999       if (best2 < numlive-1) {
1000         memmove(&(hufflist[best2]), &(hufflist[best2+1]),
1001           ((numlive-1) - best2) * sizeof(huffentity_t *));
1002       }
1003       numlive--;
1004     }
1005
1006     huff_entity_root = (hufflist[0] - huff_entities);
1007
1008     for (ix=0; ix<MAXHUFFBYTES; ix++)
1009       bits.b[ix] = 0;
1010     compression_table_size = 12;
1011
1012     no_huff_entities = 0; /* compress_makebits will total this up */
1013     compress_makebits(huff_entity_root, 0, -1, &bits);
1014   }
1015
1016   /* Now, sadly, we have to compute the size of the string section,
1017      without actually doing the compression. */
1018   compression_string_size = 0;
1019
1020   if (temporary_files_switch) {
1021     fseek(Temp1_fp, 0, SEEK_SET);
1022   }
1023
1024   if (no_strings >= MAX_NUM_STATIC_STRINGS)
1025     memoryerror("MAX_NUM_STATIC_STRINGS", MAX_NUM_STATIC_STRINGS);
1026
1027   for (lx=0, ix=0; lx<no_strings; lx++) {
1028     int escapelen=0, escapetype=0;
1029     int done=FALSE;
1030     int32 escapeval=0;
1031     jx = 0;
1032     compressed_offsets[lx] = compression_table_size + compression_string_size;
1033     compression_string_size++; /* for the type byte */
1034     while (!done) {
1035       if (temporary_files_switch)
1036         ch = fgetc(Temp1_fp);
1037       else
1038         ch = read_byte_from_memory_block(&static_strings_area, ix);
1039       ix++;
1040       if (ix > static_strings_extent || ch < 0)
1041         compiler_error("Read too much not-yet-compressed text.");
1042       if (escapelen == -1) {
1043         escapelen = 0;
1044         if (ch == '@') {
1045           ch = '@';
1046         }
1047         else if (ch == '0') {
1048           ch = '\0';
1049         }
1050         else if (ch == 'A' || ch == 'D' || ch == 'U') {
1051           escapelen = 4;
1052           escapetype = ch;
1053           escapeval = 0;
1054           continue;
1055         }
1056         else {
1057           compiler_error("Strange @ escape in processed text.");
1058         }
1059       }
1060       else if (escapelen) {
1061         escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
1062         escapelen--;
1063         if (escapelen == 0) {
1064           if (escapetype == 'A') {
1065             ch = huff_abbrev_start+escapeval;
1066           }
1067           else if (escapetype == 'D') {
1068             ch = huff_dynam_start+escapeval;
1069           }
1070           else if (escapetype == 'U') {
1071             ch = huff_unicode_start+escapeval;
1072           }
1073           else {
1074             compiler_error("Strange @ escape in processed text.");
1075           }
1076         }
1077         else
1078           continue;
1079       }
1080       else {
1081         if (ch == '@') {
1082           escapelen = -1;
1083           continue;
1084         }
1085         if (ch == 0) {
1086           ch = 256;
1087           done = TRUE;
1088         }
1089       }
1090
1091       if (compression_switch) {
1092         jx += huff_entities[ch].depth;
1093         compression_string_size += (jx/8);
1094         jx = (jx % 8);
1095       }
1096       else {
1097         if (ch >= huff_dynam_start) {
1098           compression_string_size += 3;
1099         }
1100         else if (ch >= huff_unicode_start) {
1101           compiler_error("Abbreviation/Unicode in non-compressed string \
1102 should be impossible.");
1103         }
1104         else
1105           compression_string_size += 1;
1106       }
1107     }
1108     if (compression_switch && jx)
1109       compression_string_size++;
1110   }
1111
1112   done_compression = TRUE;
1113 }
1114
1115 static void compress_makebits(int entnum, int depth, int prevbit,
1116   huffbitlist_t *bits)
1117 {
1118   huffentity_t *ent = &(huff_entities[entnum]);
1119   char *cx;
1120
1121   no_huff_entities++;
1122   ent->addr = compression_table_size;
1123   ent->depth = depth;
1124   ent->bits = *bits;
1125   if (depth > 0) {
1126     if (prevbit)
1127       ent->bits.b[(depth-1) / 8] |= (1 << ((depth-1) % 8));
1128   }
1129
1130   switch (ent->type) {
1131   case 0:
1132     compression_table_size += 9;
1133     compress_makebits(ent->u.branch[0], depth+1, 0, &ent->bits);
1134     compress_makebits(ent->u.branch[1], depth+1, 1, &ent->bits);
1135     break;
1136   case 1:
1137     compression_table_size += 1;
1138     break;
1139   case 2:
1140     compression_table_size += 2;
1141     break;
1142   case 3:
1143     cx = (char *)abbreviations_at + ent->u.val*MAX_ABBREV_LENGTH;
1144     compression_table_size += (1 + 1 + strlen(cx));
1145     break;
1146   case 4:
1147   case 9:
1148     compression_table_size += 5;
1149     break;
1150   }
1151 }
1152
1153 /* ------------------------------------------------------------------------- */
1154 /*   The abbreviations optimiser                                             */
1155 /*                                                                           */
1156 /*   This is a very complex, memory and time expensive algorithm to          */
1157 /*   approximately solve the problem of which abbreviation strings would     */
1158 /*   minimise the total number of Z-chars to which the game text translates. */
1159 /*   It is in some ways a quite separate program but remains inside Inform   */
1160 /*   for compatibility with previous releases.                               */
1161 /* ------------------------------------------------------------------------- */
1162
1163 typedef struct tlb_s
1164 {   char text[4];
1165     int32 intab, occurrences;
1166 } tlb;
1167 static tlb *tlbtab;
1168 static int32 no_occs;
1169
1170 static int32 *grandtable;
1171 static int32 *grandflags;
1172 typedef struct optab_s
1173 {   int32  length;
1174     int32  popularity;
1175     int32  score;
1176     int32  location;
1177     char text[MAX_ABBREV_LENGTH];
1178 } optab;
1179 static optab *bestyet, *bestyet2;
1180
1181 static int pass_no;
1182
1183 static char *sub_buffer;
1184
1185 static void optimise_pass(void)
1186 {   int32 i; int t1, t2;
1187     int32 j, j2, k, nl, matches, noflags, score, min, minat=0, x, scrabble, c;
1188     for (i=0; i<256; i++) bestyet[i].length=0;
1189     for (i=0; i<no_occs; i++)
1190     {   if ((*(tlbtab[i].text)!=(int) '\n')&&(tlbtab[i].occurrences!=0))
1191         {
1192 #ifdef MAC_FACE
1193             if (i%((**g_pm_hndl).linespercheck) == 0)
1194             {   ProcessEvents (&g_proc);
1195                 if (g_proc != true)
1196                 {   free_arrays();
1197                     if (store_the_text)
1198                         my_free(&all_text,"transcription text");
1199                     longjmp (g_fallback, 1);
1200                 }
1201             }
1202 #endif
1203             printf("Pass %d, %4ld/%ld '%s' (%ld occurrences) ",
1204                 pass_no, (long int) i, (long int) no_occs, tlbtab[i].text,
1205                 (long int) tlbtab[i].occurrences);
1206             t1=(int) (time(0));
1207             for (j=0; j<tlbtab[i].occurrences; j++)
1208             {   for (j2=0; j2<tlbtab[i].occurrences; j2++) grandflags[j2]=1;
1209                 nl=2; noflags=tlbtab[i].occurrences;
1210                 while ((noflags>=2)&&(nl<=62))
1211                 {   nl++;
1212                     for (j2=0; j2<nl; j2++)
1213                         if (all_text[grandtable[tlbtab[i].intab+j]+j2]=='\n')
1214                             goto FinishEarly;
1215                     matches=0;
1216                     for (j2=j; j2<tlbtab[i].occurrences; j2++)
1217                     {   if (grandflags[j2]==1)
1218                         {   x=grandtable[tlbtab[i].intab+j2]
1219                               - grandtable[tlbtab[i].intab+j];
1220                          if (((x>-nl)&&(x<nl))
1221                             || (memcmp(all_text+grandtable[tlbtab[i].intab+j],
1222                                        all_text+grandtable[tlbtab[i].intab+j2],
1223                                        nl)!=0))
1224                             {   grandflags[j2]=0; noflags--; }
1225                             else matches++;
1226                         }
1227                     }
1228                     scrabble=0;
1229                     for (k=0; k<nl; k++)
1230                     {   scrabble++;
1231                         c=all_text[grandtable[tlbtab[i].intab+j+k]];
1232                         if (c!=(int) ' ')
1233                         {   if (iso_to_alphabet_grid[c]<0)
1234                                 scrabble+=2;
1235                             else
1236                                 if (iso_to_alphabet_grid[c]>=26)
1237                                     scrabble++;
1238                         }
1239                     }
1240                     score=(matches-1)*(scrabble-2);
1241                     min=score;
1242                     for (j2=0; j2<256; j2++)
1243                     {   if ((nl==bestyet[j2].length)
1244                                 && (memcmp(all_text+bestyet[j2].location,
1245                                        all_text+grandtable[tlbtab[i].intab+j],
1246                                        nl)==0))
1247                         {   j2=256; min=score; }
1248                         else
1249                         {   if (bestyet[j2].score<min)
1250                             {   min=bestyet[j2].score; minat=j2;
1251                             }
1252                         }
1253                     }
1254                     if (min!=score)
1255                     {   bestyet[minat].score=score;
1256                         bestyet[minat].length=nl;
1257                         bestyet[minat].location=grandtable[tlbtab[i].intab+j];
1258                         bestyet[minat].popularity=matches;
1259                         for (j2=0; j2<nl; j2++) sub_buffer[j2]=
1260                             all_text[bestyet[minat].location+j2];
1261                         sub_buffer[nl]=0;
1262                     }
1263                 }
1264                 FinishEarly: ;
1265             }
1266             t2=((int) time(0)) - t1;
1267             printf(" (%d seconds)\n",t2);
1268         }
1269     }
1270 }
1271
1272 static int any_overlap(char *s1, char *s2)
1273 {   int a, b, i, j, flag;
1274     a=strlen(s1); b=strlen(s2);
1275     for (i=1-b; i<a; i++)
1276     {   flag=0;
1277         for (j=0; j<b; j++)
1278             if ((0<=i+j)&&(i+j<=a-1))
1279                 if (s1[i+j]!=s2[j]) flag=1;
1280         if (flag==0) return(1);
1281     }
1282     return(0);
1283 }
1284
1285 #define MAX_TLBS 8000
1286
1287 extern void optimise_abbreviations(void)
1288 {   int32 i, j, t, max=0, MAX_GTABLE;
1289     int32 j2, selected, available, maxat=0, nl;
1290     tlb test;
1291
1292     printf("Beginning calculation of optimal abbreviations...\n");
1293
1294     pass_no = 0;
1295     tlbtab=my_calloc(sizeof(tlb), MAX_TLBS, "tlb table"); no_occs=0;
1296     sub_buffer=my_calloc(sizeof(char), 4000, "sub_buffer");
1297     for (i=0; i<MAX_TLBS; i++) tlbtab[i].occurrences=0;
1298
1299     bestyet=my_calloc(sizeof(optab), 256, "bestyet");
1300     bestyet2=my_calloc(sizeof(optab), 64, "bestyet2");
1301
1302     bestyet2[0].text[0]='.';
1303     bestyet2[0].text[1]=' ';
1304     bestyet2[0].text[2]=0;
1305
1306     bestyet2[1].text[0]=',';
1307     bestyet2[1].text[1]=' ';
1308     bestyet2[1].text[2]=0;
1309
1310     for (i=0; all_text+i<all_text_top; i++)
1311     {
1312         if ((all_text[i]=='.') && (all_text[i+1]==' ') && (all_text[i+2]==' '))
1313         {   all_text[i]='\n'; all_text[i+1]='\n'; all_text[i+2]='\n';
1314             bestyet2[0].popularity++;
1315         }
1316
1317         if ((all_text[i]=='.') && (all_text[i+1]==' '))
1318         {   all_text[i]='\n'; all_text[i+1]='\n';
1319             bestyet2[0].popularity++;
1320         }
1321
1322         if ((all_text[i]==',') && (all_text[i+1]==' '))
1323         {   all_text[i]='\n'; all_text[i+1]='\n';
1324             bestyet2[1].popularity++;
1325         }
1326     }
1327
1328     MAX_GTABLE=subtract_pointers(all_text_top,all_text)+1;
1329     grandtable=my_calloc(4*sizeof(int32), MAX_GTABLE/4, "grandtable");
1330
1331     for (i=0, t=0; all_text+i<all_text_top; i++)
1332     {   test.text[0]=all_text[i];
1333         test.text[1]=all_text[i+1];
1334         test.text[2]=all_text[i+2];
1335         test.text[3]=0;
1336         if ((test.text[0]=='\n')||(test.text[1]=='\n')||(test.text[2]=='\n'))
1337             goto DontKeep;
1338         for (j=0; j<no_occs; j++)
1339             if (strcmp(test.text,tlbtab[j].text)==0)
1340                 goto DontKeep;
1341         test.occurrences=0;
1342         for (j=i+3; all_text+j<all_text_top; j++)
1343         {
1344 #ifdef MAC_FACE
1345             if (j%((**g_pm_hndl).linespercheck) == 0)
1346             {   ProcessEvents (&g_proc);
1347                 if (g_proc != true)
1348                 {   free_arrays();
1349                     if (store_the_text)
1350                         my_free(&all_text,"transcription text");
1351                     longjmp (g_fallback, 1);
1352                 }
1353             }
1354 #endif
1355             if ((all_text[i]==all_text[j])
1356                  && (all_text[i+1]==all_text[j+1])
1357                  && (all_text[i+2]==all_text[j+2]))
1358                  {   grandtable[t+test.occurrences]=j;
1359                      test.occurrences++;
1360                      if (t+test.occurrences==MAX_GTABLE)
1361                      {   printf("All %ld cross-references used\n",
1362                              (long int) MAX_GTABLE);
1363                          goto Built;
1364                      }
1365                  }
1366         }
1367         if (test.occurrences>=2)
1368         {   tlbtab[no_occs]=test;
1369             tlbtab[no_occs].intab=t; t+=tlbtab[no_occs].occurrences;
1370             if (max<tlbtab[no_occs].occurrences)
1371                 max=tlbtab[no_occs].occurrences;
1372             no_occs++;
1373             if (no_occs==MAX_TLBS)
1374             {   printf("All %d three-letter-blocks used\n",
1375                     MAX_TLBS);
1376                 goto Built;
1377             }
1378         }
1379         DontKeep: ;
1380     }
1381
1382     Built:
1383     grandflags=my_calloc(sizeof(int), max, "grandflags");
1384
1385
1386     printf("Cross-reference table (%ld entries) built...\n",
1387         (long int) no_occs);
1388     /*  for (i=0; i<no_occs; i++)
1389             printf("%4d %4d '%s' %d\n",i,tlbtab[i].intab,tlbtab[i].text,
1390                 tlbtab[i].occurrences);
1391     */
1392
1393     for (i=0; i<64; i++) bestyet2[i].length=0; selected=2;
1394     available=256;
1395     while ((available>0)&&(selected<64))
1396     {   printf("Pass %d\n", ++pass_no);
1397
1398         optimise_pass();
1399         available=0;
1400         for (i=0; i<256; i++)
1401             if (bestyet[i].score!=0)
1402             {   available++;
1403                 nl=bestyet[i].length;
1404                 for (j2=0; j2<nl; j2++) bestyet[i].text[j2]=
1405                     all_text[bestyet[i].location+j2];
1406                 bestyet[i].text[nl]=0;
1407             }
1408
1409     /*  printf("End of pass results:\n");
1410         printf("\nno   score  freq   string\n");
1411         for (i=0; i<256; i++)
1412             if (bestyet[i].score>0)
1413                 printf("%02d:  %4d   %4d   '%s'\n", i, bestyet[i].score,
1414                     bestyet[i].popularity, bestyet[i].text);
1415     */
1416
1417         do
1418         {   max=0;
1419             for (i=0; i<256; i++)
1420                 if (max<bestyet[i].score)
1421                 {   max=bestyet[i].score;
1422                     maxat=i;
1423                 }
1424
1425             if (max>0)
1426             {   bestyet2[selected++]=bestyet[maxat];
1427
1428                 printf(
1429                     "Selection %2ld: '%s' (repeated %ld times, scoring %ld)\n",
1430                     (long int) selected,bestyet[maxat].text,
1431                     (long int) bestyet[maxat].popularity,
1432                     (long int) bestyet[maxat].score);
1433
1434                 test.text[0]=bestyet[maxat].text[0];
1435                 test.text[1]=bestyet[maxat].text[1];
1436                 test.text[2]=bestyet[maxat].text[2];
1437                 test.text[3]=0;
1438
1439                 for (i=0; i<no_occs; i++)
1440                     if (strcmp(test.text,tlbtab[i].text)==0)
1441                         break;
1442
1443                 for (j=0; j<tlbtab[i].occurrences; j++)
1444                 {   if (memcmp(bestyet[maxat].text,
1445                                all_text+grandtable[tlbtab[i].intab+j],
1446                                bestyet[maxat].length)==0)
1447                     {   for (j2=0; j2<bestyet[maxat].length; j2++)
1448                             all_text[grandtable[tlbtab[i].intab+j]+j2]='\n';
1449                     }
1450                 }
1451
1452                 for (i=0; i<256; i++)
1453                     if ((bestyet[i].score>0)&&
1454                         (any_overlap(bestyet[maxat].text,bestyet[i].text)==1))
1455                     {   bestyet[i].score=0;
1456                        /* printf("Discarding '%s' as overlapping\n",
1457                             bestyet[i].text); */
1458                     }
1459             }
1460         } while ((max>0)&&(available>0)&&(selected<64));
1461     }
1462
1463     printf("\nChosen abbreviations (in Inform syntax):\n\n");
1464     for (i=0; i<selected; i++)
1465         printf("Abbreviate \"%s\";\n", bestyet2[i].text);
1466
1467     text_free_arrays();
1468 }
1469
1470 /* ------------------------------------------------------------------------- */
1471 /*   The dictionary manager begins here.                                     */
1472 /*                                                                           */
1473 /*   Speed is extremely important in these algorithms.  If a linear-time     */
1474 /*   routine were used to search the dictionary words so far built up, then  */
1475 /*   Inform would crawl.                                                     */
1476 /*                                                                           */
1477 /*   Instead, the dictionary is stored as a binary tree, which is kept       */
1478 /*   balanced with the red-black algorithm.                                  */
1479 /* ------------------------------------------------------------------------- */
1480 /*   A dictionary table similar to the Z-machine format is kept: there is a  */
1481 /*   7-byte header (left blank here to be filled in at the                   */
1482 /*   construct_storyfile() stage in "tables.c") and then a sequence of       */
1483 /*   records, one per word, in the form                                      */
1484 /*                                                                           */
1485 /*        <Z-coded text>    <flags>   <verbnumber>     <adjectivenumber>     */
1486 /*        4 or 6 bytes       byte        byte             byte               */
1487 /*                                                                           */
1488 /*   For Glulx, the form is instead: (But see below about Unicode-valued     */
1489 /*   dictionaries and my heinie.)                                            */
1490 /*                                                                           */
1491 /*        <plain text>      <flags>   <verbnumber>     <adjectivenumber>     */
1492 /*        DICT_WORD_SIZE     short       short            short              */
1493 /*                                                                           */
1494 /*   These records are stored in "accession order" (i.e. in order of their   */
1495 /*   first being received by these routines) and only alphabetically sorted  */
1496 /*   by construct_storyfile() (using the array below).                       */
1497 /* ------------------------------------------------------------------------- */
1498 /*                                                                           */
1499 /*   Further notes about the data fields...                                  */
1500 /*   The flags are currently:                                                */
1501 /*     bit 0: word is used as a verb (in verb grammar)                       */
1502 /*     bit 1: word is used as a meta verb                                    */
1503 /*     bit 2: word is plural (set by '//p')                                  */
1504 /*     bit 3: word is used as a preposition (in verb grammar)                */
1505 /*     bit 6: set for all verbs, but not used by the parser?                 */
1506 /*     bit 7: word is used as a noun (set for every word that appears in     */
1507 /*       code or in an object property)                                      */
1508 /*                                                                           */
1509 /*   In grammar version 2, the third field (adjectivenumber) is unused (and  */
1510 /*   zero).                                                                  */
1511 /*                                                                           */
1512 /*   The compiler generates special constants #dict_par1, #dict_par2,        */
1513 /*   #dict_par3 to refer to the byte offsets of the three fields. In         */
1514 /*   Z-code v3, these are 4/5/6; in v4+, they are 6/7/8. In Glulx, they      */
1515 /*   are $DICT_WORD_SIZE+2/4/6, referring to the *low* bytes of the three    */
1516 /*   fields. (The high bytes are $DICT_WORD_SIZE+1/3/5.)                     */
1517 /* ------------------------------------------------------------------------- */
1518
1519 uchar *dictionary,                    /* (These two pointers are externally
1520                                          used only in "tables.c" when
1521                                          building the story-file)            */
1522     *dictionary_top;                  /* Pointer to next free record         */
1523
1524 int dict_entries;                     /* Total number of records entered     */
1525
1526 /* ------------------------------------------------------------------------- */
1527 /*   dict_word is a typedef for a struct of 6 unsigned chars (defined in     */
1528 /*   "header.h"): it holds the (4 or) 6 bytes of Z-coded text of a word.     */
1529 /*   Usefully, because the PAD character 5 is < all alphabetic characters,   */
1530 /*   alphabetic order corresponds to numeric order.  For this reason, the    */
1531 /*   dict_word is called the "sort code" of the original text word.          */
1532 /*                                                                           */
1533 /*   ###- In modifying the compiler, I've found it easier to discard the     */
1534 /*   typedef, and operate directly on uchar arrays of length DICT_WORD_SIZE. */
1535 /*   In Z-code, DICT_WORD_SIZE will be 6, so the Z-code compiler will work   */
1536 /*   as before. In Glulx, it can be any value up to MAX_DICT_WORD_SIZE.      */
1537 /*   (That limit is defined as 40 in the header; it exists only for a few    */
1538 /*   static buffers, and can be increased without using significant memory.) */
1539 /*                                                                           */
1540 /*   ###- Well, that certainly bit me on the butt, didn't it. In further     */
1541 /*   modifying the compiler to generate a Unicode dictionary, I have to      */
1542 /*   store four-byte values in the uchar array. This is handled by making    */
1543 /*   the array size DICT_WORD_BYTES (which is DICT_WORD_SIZE*DICT_CHAR_SIZE).*/
1544 /*   Then we store the 32-bit character value big-endian. This lets us       */
1545 /*   continue to compare arrays bytewise, which is a nice simplification.    */
1546 /* ------------------------------------------------------------------------- */
1547
1548 extern int compare_sorts(uchar *d1, uchar *d2)
1549 {   int i;
1550     for (i=0; i<DICT_WORD_BYTES; i++)
1551         if (d1[i]!=d2[i]) return((int)(d1[i]) - (int)(d2[i]));
1552     /* (since memcmp(d1, d2, DICT_WORD_BYTES); runs into a bug on some Unix
1553        libraries) */
1554     return(0);
1555 }
1556
1557 extern void copy_sorts(uchar *d1, uchar *d2)
1558 {   int i;
1559     for (i=0; i<DICT_WORD_BYTES; i++)
1560         d1[i] = d2[i];
1561 }
1562
1563 static uchar prepared_sort[MAX_DICT_WORD_BYTES];     /* Holds the sort code
1564                                                         of current word */
1565
1566 static int number_and_case;
1567
1568 /* Also used by verbs.c */
1569 static void dictionary_prepare_z(char *dword, uchar *optresult)
1570 {   int i, j, k, k2, wd[13]; int32 tot;
1571
1572     /* A rapid text translation algorithm using only the simplified rules
1573        applying to the text of dictionary entries: first produce a sequence
1574        of 6 (v3) or 9 (v4+) Z-characters                                     */
1575
1576     number_and_case = 0;
1577
1578     for (i=0, j=0; dword[j]!=0; i++, j++)
1579     {   if ((dword[j] == '/') && (dword[j+1] == '/'))
1580         {   for (j+=2; dword[j] != 0; j++)
1581             {   switch(dword[j])
1582                 {   case 'p': number_and_case |= 4;  break;
1583                     default:
1584                         error_named("Expected 'p' after '//' \
1585 to give number of dictionary word", dword);
1586                         break;
1587                 }
1588             }
1589             break;
1590         }
1591         if (i>=9) break;
1592
1593         k=(int) dword[j];
1594         if (k==(int) '\'')
1595             warning_named("Obsolete usage: use the ^ character for the \
1596 apostrophe in", dword);
1597         if (k==(int) '^') k=(int) '\'';
1598         if (k=='\"') k='~';
1599
1600         if (k==(int) '@' || (character_set_unicode && (k & 0x80)))
1601         {   int unicode = text_to_unicode(dword+j);
1602             if ((unicode < 128) && isupper(unicode)) unicode = tolower(unicode);
1603             k = unicode_to_zscii(unicode);
1604             j += textual_form_length - 1;
1605             if ((k == 5) || (k >= 0x100))
1606             {   unicode_char_error(
1607                    "Character can be printed but not input:", unicode);
1608                 k = '?';
1609             }
1610             k2 = zscii_to_alphabet_grid[(uchar) k];
1611         }
1612         else
1613         {   if (isupper(k)) k = tolower(k);
1614             k2 = iso_to_alphabet_grid[(uchar) k];
1615         }
1616
1617         if (k2 < 0)
1618         {   if ((k2 == -5) || (k2 <= -0x100))
1619                 char_error("Character can be printed but not input:", k);
1620             else
1621             {   /* Use 4 more Z-chars to encode a ZSCII escape sequence      */
1622
1623                 wd[i++] = 5; wd[i++] = 6;
1624                 k2 = -k2;
1625                 wd[i++] = k2/32; wd[i] = k2%32;
1626             }
1627         }
1628         else
1629         {   alphabet_used[k2] = 'Y';
1630             if ((k2/26)!=0)
1631                 wd[i++]=3+(k2/26);            /* Change alphabet for symbols */
1632             wd[i]=6+(k2%26);                  /* Write the Z character       */
1633         }
1634     }
1635
1636     /* Fill up to the end of the dictionary block with PAD characters        */
1637
1638     for (; i<9; i++) wd[i]=5;
1639
1640     /* The array of Z-chars is converted to three 2-byte blocks              */
1641
1642     tot = wd[2] + wd[1]*(1<<5) + wd[0]*(1<<10);
1643     prepared_sort[1]=tot%0x100;
1644     prepared_sort[0]=(tot/0x100)%0x100;
1645     tot = wd[5] + wd[4]*(1<<5) + wd[3]*(1<<10);
1646     prepared_sort[3]=tot%0x100;
1647     prepared_sort[2]=(tot/0x100)%0x100;
1648     tot = wd[8] + wd[7]*(1<<5) + wd[6]*(1<<10);
1649     prepared_sort[5]=tot%0x100;
1650     prepared_sort[4]=(tot/0x100)%0x100;
1651
1652     /* Set the "end bit" on the 2nd (in v3) or the 3rd (v4+) 2-byte block    */
1653
1654     if (version_number==3) prepared_sort[2]+=0x80;
1655                       else prepared_sort[4]+=0x80;
1656
1657     if (optresult) copy_sorts(optresult, prepared_sort);
1658 }
1659
1660 /* Also used by verbs.c */
1661 static void dictionary_prepare_g(char *dword, uchar *optresult)
1662 {
1663   int i, j, k;
1664   int32 unicode;
1665
1666   number_and_case = 0;
1667
1668   for (i=0, j=0; (dword[j]!=0); i++, j++) {
1669     if ((dword[j] == '/') && (dword[j+1] == '/')) {
1670       for (j+=2; dword[j] != 0; j++) {
1671         switch(dword[j]) {
1672         case 'p':
1673           number_and_case |= 4;
1674           break;
1675         default:
1676           error_named("Expected 'p' after '//' \
1677 to give gender or number of dictionary word", dword);
1678           break;
1679         }
1680       }
1681       break;
1682     }
1683     if (i>=DICT_WORD_SIZE) break;
1684
1685     k= ((unsigned char *)dword)[j];
1686     if (k=='\'')
1687       warning_named("Obsolete usage: use the ^ character for the \
1688 apostrophe in", dword);
1689     if (k=='^')
1690       k='\'';
1691     if (k=='~') /* as in iso_to_alphabet_grid */
1692       k='\"';
1693
1694     if (k=='@' || (character_set_unicode && (k & 0x80))) {
1695       unicode = text_to_unicode(dword+j);
1696       j += textual_form_length - 1;
1697     }
1698     else {
1699       unicode = iso_to_unicode_grid[k];
1700     }
1701
1702     if (DICT_CHAR_SIZE != 1 || (unicode >= 0 && unicode < 256)) {
1703       k = unicode;
1704     }
1705     else {
1706       error("The dictionary cannot contain Unicode characters beyond Latin-1. \
1707 Define DICT_CHAR_SIZE=4 for a Unicode-compatible dictionary.");
1708       k = '?';
1709     }
1710
1711     if (k >= (unsigned)'A' && k <= (unsigned)'Z')
1712       k += ('a' - 'A');
1713
1714     if (DICT_CHAR_SIZE == 1) {
1715       prepared_sort[i] = k;
1716     }
1717     else {
1718       prepared_sort[4*i]   = (k >> 24) & 0xFF;
1719       prepared_sort[4*i+1] = (k >> 16) & 0xFF;
1720       prepared_sort[4*i+2] = (k >>  8) & 0xFF;
1721       prepared_sort[4*i+3] = (k)       & 0xFF;
1722     }
1723   }
1724
1725   if (DICT_CHAR_SIZE == 1) {
1726     for (; i<DICT_WORD_SIZE; i++)
1727       prepared_sort[i] = 0;
1728   }
1729   else {
1730     for (; i<DICT_WORD_SIZE; i++) {
1731       prepared_sort[4*i]   = 0;
1732       prepared_sort[4*i+1] = 0;
1733       prepared_sort[4*i+2] = 0;
1734       prepared_sort[4*i+3] = 0;
1735     }
1736   }
1737
1738   if (optresult) copy_sorts(optresult, prepared_sort);
1739 }
1740
1741 extern void dictionary_prepare(char *dword, uchar *optresult)
1742 {
1743   if (!glulx_mode)
1744     dictionary_prepare_z(dword, optresult);
1745   else
1746     dictionary_prepare_g(dword, optresult);
1747 }
1748
1749 /* ------------------------------------------------------------------------- */
1750 /*   The arrays below are all concerned with the problem of alphabetically   */
1751 /*   sorting the dictionary during the compilation pass.                     */
1752 /*   Note that it is not enough simply to apply qsort to the dictionary at   */
1753 /*   the end of the pass: we need to ensure that no duplicates are ever      */
1754 /*   created.                                                                */
1755 /*                                                                           */
1756 /*   dict_sort_codes[n]     the sort code of record n: i.e., of the nth      */
1757 /*                          word to be entered into the dictionary, where    */
1758 /*                          n counts upward from 0                           */
1759 /*                          (n is also called the "accession number")        */
1760 /*                                                                           */
1761 /*   The tree structure encodes an ordering.  The special value VACANT means */
1762 /*   "no node here": otherwise, node numbers are the same as accession       */
1763 /*   numbers.  At all times, "root" holds the node number of the top of the  */
1764 /*   tree; each node has up to two branches, such that the subtree of the    */
1765 /*   left branch is always alphabetically before what's at the node, and     */
1766 /*   the subtree to the right is always after; and all branches are coloured */
1767 /*   either "black" or "red".  These colours are used to detect points where */
1768 /*   the tree is growing asymmetrically (and therefore becoming inefficient  */
1769 /*   to search).                                                             */
1770 /* ------------------------------------------------------------------------- */
1771
1772 #define RED    'r'
1773 #define BLACK  'b'
1774 #define VACANT -1
1775
1776 static int root;
1777 typedef struct dict_tree_node_s
1778 {   int  branch[2];               /* Branch 0 is "left", 1 is "right" */
1779     char colour;                  /* The colour of the branch to the parent */
1780 } dict_tree_node;
1781
1782 static dict_tree_node *dtree;
1783
1784 int   *final_dict_order;
1785 static uchar *dict_sort_codes;
1786
1787 static void dictionary_begin_pass(void)
1788 {
1789     /*  Leave room for the 7-byte header (added in "tables.c" much later)    */
1790     /*  Glulx has a 4-byte header instead. */
1791
1792     if (!glulx_mode)
1793         dictionary_top=dictionary+7;
1794     else
1795         dictionary_top=dictionary+4;
1796
1797     root = VACANT;
1798     dict_entries = 0;
1799 }
1800
1801 static int fdo_count;
1802 static void recursively_sort(int node)
1803 {   if (dtree[node].branch[0] != VACANT)
1804         recursively_sort(dtree[node].branch[0]);
1805     final_dict_order[node] = fdo_count++;
1806     if (dtree[node].branch[1] != VACANT)
1807         recursively_sort(dtree[node].branch[1]);
1808 }
1809
1810 extern void sort_dictionary(void)
1811 {   int i;
1812     if (module_switch)
1813     {   for (i=0; i<dict_entries; i++)
1814             final_dict_order[i] = i;
1815         return;
1816     }
1817
1818     if (root != VACANT)
1819     {   fdo_count = 0; recursively_sort(root);
1820     }
1821 }
1822
1823 /* ------------------------------------------------------------------------- */
1824 /*   If "dword" is in the dictionary, return its accession number plus 1;    */
1825 /*   If not, return 0.                                                       */
1826 /* ------------------------------------------------------------------------- */
1827
1828 static int dictionary_find(char *dword)
1829 {   int at = root, n;
1830
1831     dictionary_prepare(dword, NULL);
1832
1833     while (at != VACANT)
1834     {   n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
1835         if (n==0) return at + 1;
1836         if (n>0) at = dtree[at].branch[1]; else at = dtree[at].branch[0];
1837     }
1838     return 0;
1839 }
1840
1841 /* ------------------------------------------------------------------------- */
1842 /*  Add "dword" to the dictionary with (x,y,z) as its data fields; unless    */
1843 /*  it already exists, in which case OR the data with (x,y,z)                */
1844 /*                                                                           */
1845 /*  These fields are one byte each in Z-code, two bytes each in Glulx.       */
1846 /*                                                                           */
1847 /*  Returns: the accession number.                                           */
1848 /* ------------------------------------------------------------------------- */
1849
1850 extern int dictionary_add(char *dword, int x, int y, int z)
1851 {   int n; uchar *p;
1852     int ggfr = 0, gfr = 0, fr = 0, r = 0;
1853     int ggf = VACANT, gf = VACANT, f = VACANT, at = root;
1854     int a, b;
1855     int res=((version_number==3)?4:6);
1856
1857     dictionary_prepare(dword, NULL);
1858
1859     if (root == VACANT)
1860     {   root = 0; goto CreateEntry;
1861     }
1862     while (TRUE)
1863     {
1864         n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
1865         if (n==0)
1866         {
1867             if (!glulx_mode) {
1868                 p = dictionary+7 + at*(3+res) + res;
1869                 p[0]=(p[0])|x; p[1]=(p[1])|y; p[2]=(p[2])|z;
1870                 if (x & 128) p[0] = (p[0])|number_and_case;
1871             }
1872             else {
1873                 p = dictionary+4 + at*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
1874                 p[0]=(p[0])|(x/256); p[1]=(p[1])|(x%256);
1875                 p[2]=(p[2])|(y/256); p[3]=(p[3])|(y%256);
1876                 p[4]=(p[4])|(z/256); p[5]=(p[5])|(z%256);
1877                 if (x & 128) p[1] = (p[1]) | number_and_case;
1878             }
1879             return at;
1880         }
1881         if (n>0) r=1; else r=0;
1882
1883         a = dtree[at].branch[0]; b = dtree[at].branch[1];
1884         if ((a != VACANT) && (dtree[a].colour == RED) &&
1885             (b != VACANT) && (dtree[b].colour == RED))
1886         {   dtree[a].colour = BLACK;
1887             dtree[b].colour = BLACK;
1888
1889             dtree[at].colour = RED;
1890
1891         /* A tree rotation may be needed to avoid two red links in a row:
1892            e.g.
1893              ggf   (or else gf is root)         ggf (or f is root)
1894               |                                  |
1895               gf                                 f
1896              / \(red)                           / \ (both red)
1897                 f            becomes          gf   at
1898                / \(red)                      /  \ /  \
1899                   at
1900                  /  \
1901
1902            In effect we rehang the "gf" subtree from "f".
1903            See the Technical Manual for further details.
1904         */
1905
1906             if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
1907             {
1908               if (fr == gfr)
1909               { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
1910                 dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
1911                 dtree[f].branch[1-fr] = gf;
1912                 dtree[f].colour = BLACK;
1913                 dtree[gf].colour = RED;
1914                 gf = ggf; gfr = ggfr;
1915               }
1916               else
1917               { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
1918                 dtree[at].colour = BLACK;
1919                 dtree[gf].colour = RED;
1920                 dtree[f].branch[fr] = dtree[at].branch[gfr];
1921                 dtree[gf].branch[gfr] = dtree[at].branch[fr];
1922                 dtree[at].branch[gfr] = f;
1923                 dtree[at].branch[fr] = gf;
1924
1925                 r = 1-r; n = at; if (r==fr) at = f; else at = gf;
1926                 f = n; gf = ggf; fr = 1-r; gfr = ggfr;
1927               }
1928             }
1929         }
1930
1931         if (dtree[at].branch[r] == VACANT)
1932         {   dtree[at].colour = RED;
1933
1934             if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
1935             { if (fr == gfr)
1936               { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
1937                 dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
1938                 dtree[f].branch[1-fr] = gf;
1939                 dtree[f].colour = BLACK;
1940                 dtree[gf].colour = RED;
1941               }
1942               else
1943               { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
1944                 dtree[at].colour = BLACK;
1945                 dtree[gf].colour = RED;
1946                 dtree[f].branch[fr] = dtree[at].branch[gfr];
1947                 dtree[gf].branch[gfr] = dtree[at].branch[fr];
1948                 dtree[at].branch[gfr] = f;
1949                 dtree[at].branch[fr] = gf;
1950
1951                 r = 1-r; n = at; if (r==fr) at = f; else at = gf;
1952                 f = n; gf = ggf;
1953               }
1954             }
1955             dtree[at].branch[r] = dict_entries;
1956             goto CreateEntry;
1957         }
1958         ggf = gf; gf = f; f = at; at = dtree[at].branch[r];
1959         ggfr = gfr; gfr = fr; fr = r;
1960     }
1961
1962     CreateEntry:
1963
1964     if (dict_entries==MAX_DICT_ENTRIES)
1965         memoryerror("MAX_DICT_ENTRIES",MAX_DICT_ENTRIES);
1966
1967     dtree[dict_entries].branch[0] = VACANT;
1968     dtree[dict_entries].branch[1] = VACANT;
1969     dtree[dict_entries].colour    = BLACK;
1970
1971     /*  Address in Inform's own dictionary table to write the record to      */
1972
1973     if (!glulx_mode) {
1974
1975         p = dictionary + (3+res)*dict_entries + 7;
1976
1977         /*  So copy in the 4 (or 6) bytes of Z-coded text and the 3 data
1978             bytes */
1979
1980         p[0]=prepared_sort[0]; p[1]=prepared_sort[1];
1981         p[2]=prepared_sort[2]; p[3]=prepared_sort[3];
1982         if (version_number > 3)
1983           {   p[4]=prepared_sort[4]; p[5]=prepared_sort[5]; }
1984         p[res]=x; p[res+1]=y; p[res+2]=z;
1985         if (x & 128) p[res] = (p[res])|number_and_case;
1986
1987         dictionary_top += res+3;
1988
1989     }
1990     else {
1991         int i;
1992         p = dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*dict_entries;
1993         p[0] = 0x60; /* type byte -- dict word */
1994
1995         p += DICT_CHAR_SIZE;
1996         for (i=0; i<DICT_WORD_BYTES; i++)
1997           p[i] = prepared_sort[i];
1998
1999         p += DICT_WORD_BYTES;
2000         p[0] = 0; p[1] = x;
2001         p[2] = y/256; p[3] = y%256;
2002         p[4] = 0; p[5] = z;
2003         if (x & 128)
2004           p[1] |= number_and_case;
2005
2006         dictionary_top += DICT_ENTRY_BYTE_LENGTH;
2007
2008     }
2009
2010     copy_sorts(dict_sort_codes+dict_entries*DICT_WORD_BYTES, prepared_sort);
2011
2012     return dict_entries++;
2013 }
2014
2015 /* ------------------------------------------------------------------------- */
2016 /*   Used in "tables.c" for "Extend ... only", to renumber a verb-word to a  */
2017 /*   new verb syntax of its own.  (Otherwise existing verb-words never       */
2018 /*   change their verb-numbers.)                                             */
2019 /* ------------------------------------------------------------------------- */
2020
2021 extern void dictionary_set_verb_number(char *dword, int to)
2022 {   int i; uchar *p;
2023     int res=((version_number==3)?4:6);
2024     i=dictionary_find(dword);
2025     if (i!=0)
2026     {
2027         if (!glulx_mode) {
2028             p=dictionary+7+(i-1)*(3+res)+res;
2029             p[1]=to;
2030         }
2031         else {
2032             p=dictionary+4 + (i-1)*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
2033             p[2]=to/256; p[3]=to%256;
2034         }
2035     }
2036 }
2037
2038 /* ------------------------------------------------------------------------- */
2039 /*   Tracing code for the dictionary: used not only by "trace" and text      */
2040 /*   transcription, but also (in the case of "word_to_ascii") in a vital     */
2041 /*   by the linker.                                                          */
2042 /* ------------------------------------------------------------------------- */
2043
2044 static char *d_show_to;
2045 static int d_show_total;
2046
2047 static void show_char(char c)
2048 {   if (d_show_to == NULL) printf("%c", c);
2049     else
2050     {   int i = strlen(d_show_to);
2051         d_show_to[i] = c; d_show_to[i+1] = 0;
2052     }
2053 }
2054
2055 extern void word_to_ascii(uchar *p, char *results)
2056 {   int i, shift, cc, zchar; uchar encoded_word[9];
2057     encoded_word[0] = (((int) p[0])&0x7c)/4;
2058     encoded_word[1] = 8*(((int) p[0])&0x3) + (((int) p[1])&0xe0)/32;
2059     encoded_word[2] = ((int) p[1])&0x1f;
2060     encoded_word[3] = (((int) p[2])&0x7c)/4;
2061     encoded_word[4] = 8*(((int) p[2])&0x3) + (((int) p[3])&0xe0)/32;
2062     encoded_word[5] = ((int) p[3])&0x1f;
2063     if (version_number > 3)
2064     {   encoded_word[6] = (((int) p[4])&0x7c)/4;
2065         encoded_word[7] = 8*(((int) p[4])&0x3) + (((int) p[5])&0xe0)/32;
2066         encoded_word[8] = ((int) p[5])&0x1f;
2067     }
2068
2069     shift = 0; cc = 0;
2070     for (i=0; i< ((version_number==3)?6:9); i++)
2071     {   zchar = encoded_word[i];
2072
2073         if (zchar == 4) shift = 1;
2074         else
2075         if (zchar == 5) shift = 2;
2076         else
2077         {   if ((shift == 2) && (zchar == 6))
2078             {   zchar = 32*encoded_word[i+1] + encoded_word[i+2];
2079                 i += 2;
2080                 if ((zchar>=32) && (zchar<=126))
2081                     results[cc++] = zchar;
2082                 else
2083                 {   zscii_to_text(results+cc, zchar);
2084                     cc = strlen(results);
2085                 }
2086             }
2087             else
2088             {   zscii_to_text(results+cc, (alphabet[shift])[zchar-6]);
2089                 cc = strlen(results);
2090             }
2091             shift = 0;
2092         }
2093     }
2094     results[cc] = 0;
2095 }
2096
2097 static void recursively_show_z(int node)
2098 {   int i, cprinted, flags; uchar *p;
2099     char textual_form[32];
2100     int res = (version_number == 3)?4:6;
2101
2102     if (dtree[node].branch[0] != VACANT)
2103         recursively_show_z(dtree[node].branch[0]);
2104
2105     p = (uchar *)dictionary + 7 + (3+res)*node;
2106
2107     word_to_ascii(p, textual_form);
2108
2109     for (cprinted = 0; textual_form[cprinted]!=0; cprinted++)
2110         show_char(textual_form[cprinted]);
2111     for (; cprinted < 4 + ((version_number==3)?6:9); cprinted++)
2112         show_char(' ');
2113
2114     if (d_show_to == NULL)
2115     {   for (i=0; i<3+res; i++) printf("%02x ",p[i]);
2116
2117         flags = (int) p[res];
2118         if (flags & 128)
2119         {   printf("noun ");
2120             if (flags & 4)  printf("p"); else printf(" ");
2121             printf(" ");
2122         }
2123         else printf("       ");
2124         if (flags & 8)
2125         {   if (grammar_version_number == 1)
2126                 printf("preposition:%d  ", (int) p[res+2]);
2127             else
2128                 printf("preposition    ");
2129         }
2130         if ((flags & 3) == 3) printf("metaverb:%d  ", (int) p[res+1]);
2131         else if ((flags & 3) == 1) printf("verb:%d  ", (int) p[res+1]);
2132         printf("\n");
2133     }
2134
2135     if (d_show_total++ == 5)
2136     {   d_show_total = 0;
2137         if (d_show_to != NULL)
2138         {   write_to_transcript_file(d_show_to);
2139             d_show_to[0] = 0;
2140         }
2141     }
2142
2143     if (dtree[node].branch[1] != VACANT)
2144         recursively_show_z(dtree[node].branch[1]);
2145 }
2146
2147 static void recursively_show_g(int node)
2148 {
2149   warning("### Glulx dictionary-show not yet implemented.\n");
2150 }
2151
2152 static void show_alphabet(int i)
2153 {   int j, c; char chartext[8];
2154
2155     for (j=0; j<26; j++)
2156     {   c = alphabet[i][j];
2157
2158         if (alphabet_used[26*i+j] == 'N') printf("("); else printf(" ");
2159
2160         zscii_to_text(chartext, c);
2161         printf("%s", chartext);
2162
2163         if (alphabet_used[26*i+j] == 'N') printf(")"); else printf(" ");
2164     }
2165     printf("\n");
2166 }
2167
2168 extern void show_dictionary(void)
2169 {   printf("Dictionary contains %d entries:\n",dict_entries);
2170     if (dict_entries != 0)
2171     {   d_show_total = 0; d_show_to = NULL;
2172         if (!glulx_mode)
2173             recursively_show_z(root);
2174         else
2175             recursively_show_g(root);
2176     }
2177     printf("\nZ-machine alphabet entries:\n");
2178     show_alphabet(0);
2179     show_alphabet(1);
2180     show_alphabet(2);
2181 }
2182
2183 extern void write_dictionary_to_transcript(void)
2184 {   char d_buffer[81];
2185
2186     sprintf(d_buffer, "\n[Dictionary contains %d entries:]\n", dict_entries);
2187
2188     d_buffer[0] = 0; write_to_transcript_file(d_buffer);
2189
2190     if (dict_entries != 0)
2191     {   d_show_total = 0; d_show_to = d_buffer;
2192         if (!glulx_mode)
2193             recursively_show_z(root);
2194         else
2195             recursively_show_g(root);
2196     }
2197     if (d_show_total != 0) write_to_transcript_file(d_buffer);
2198 }
2199
2200 /* ========================================================================= */
2201 /*   Data structure management routines                                      */
2202 /* ------------------------------------------------------------------------- */
2203
2204 extern void init_text_vars(void)
2205 {   int j;
2206     bestyet = NULL;
2207     bestyet2 = NULL;
2208     tlbtab = NULL;
2209     grandtable = NULL;
2210     grandflags = NULL;
2211     no_chars_transcribed = 0;
2212     is_abbreviation = FALSE;
2213     put_strings_in_low_memory = FALSE;
2214
2215     for (j=0; j<256; j++) abbrevs_lookup[j] = -1;
2216
2217     total_zchars_trans = 0;
2218
2219     dtree = NULL;
2220     final_dict_order = NULL;
2221     dict_sort_codes = NULL;
2222     dict_entries=0;
2223
2224     initialise_memory_block(&static_strings_area);
2225 }
2226
2227 extern void text_begin_pass(void)
2228 {   abbrevs_lookup_table_made = FALSE;
2229     no_abbreviations=0;
2230     total_chars_trans=0; total_bytes_trans=0;
2231     if (store_the_text) all_text_top=all_text;
2232     dictionary_begin_pass();
2233     low_strings_top = low_strings;
2234
2235     static_strings_extent = 0;
2236     no_strings = 0;
2237     no_dynamic_strings = 0;
2238     no_unicode_chars = 0;
2239 }
2240
2241 /*  Note: for allocation and deallocation of all_the_text, see inform.c      */
2242
2243 extern void text_allocate_arrays(void)
2244 {   abbreviations_at = my_malloc(MAX_ABBREVS*MAX_ABBREV_LENGTH,
2245         "abbreviations");
2246     abbrev_values    = my_calloc(sizeof(int), MAX_ABBREVS, "abbrev values");
2247     abbrev_quality   = my_calloc(sizeof(int), MAX_ABBREVS, "abbrev quality");
2248     abbrev_freqs     = my_calloc(sizeof(int),   MAX_ABBREVS, "abbrev freqs");
2249
2250     dtree            = my_calloc(sizeof(dict_tree_node), MAX_DICT_ENTRIES,
2251                                  "red-black tree for dictionary");
2252     final_dict_order = my_calloc(sizeof(int),  MAX_DICT_ENTRIES,
2253                                  "final dictionary ordering table");
2254     dict_sort_codes  = my_calloc(DICT_WORD_BYTES, MAX_DICT_ENTRIES,
2255                                  "dictionary sort codes");
2256
2257     if (!glulx_mode)
2258         dictionary = my_malloc(9*MAX_DICT_ENTRIES+7,
2259             "dictionary");
2260     else
2261         dictionary = my_malloc(DICT_ENTRY_BYTE_LENGTH*MAX_DICT_ENTRIES+4,
2262             "dictionary");
2263
2264     strings_holding_area
2265          = my_malloc(MAX_STATIC_STRINGS,"static strings holding area");
2266     low_strings = my_malloc(MAX_LOW_STRINGS,"low (abbreviation) strings");
2267
2268     huff_entities = NULL;
2269     hufflist = NULL;
2270     unicode_usage_entries = NULL;
2271     done_compression = FALSE;
2272     compression_table_size = 0;
2273     compressed_offsets = NULL;
2274
2275     MAX_CHARACTER_SET = 0;
2276
2277     if (glulx_mode) {
2278       if (compression_switch) {
2279         int ix;
2280         MAX_CHARACTER_SET = 257 + MAX_ABBREVS + MAX_DYNAMIC_STRINGS
2281           + MAX_UNICODE_CHARS;
2282         huff_entities = my_calloc(sizeof(huffentity_t), MAX_CHARACTER_SET*2+1,
2283           "huffman entities");
2284         hufflist = my_calloc(sizeof(huffentity_t *), MAX_CHARACTER_SET,
2285           "huffman node list");
2286         unicode_usage_entries = my_calloc(sizeof(unicode_usage_t),
2287           MAX_UNICODE_CHARS, "unicode entity entries");
2288         for (ix=0; ix<UNICODE_HASH_BUCKETS; ix++)
2289           unicode_usage_hash[ix] = NULL;
2290       }
2291       compressed_offsets = my_calloc(sizeof(int32), MAX_NUM_STATIC_STRINGS,
2292         "static strings index table");
2293     }
2294 }
2295
2296 extern void text_free_arrays(void)
2297 {
2298     my_free(&strings_holding_area, "static strings holding area");
2299     my_free(&low_strings, "low (abbreviation) strings");
2300     my_free(&abbreviations_at, "abbreviations");
2301     my_free(&abbrev_values,    "abbrev values");
2302     my_free(&abbrev_quality,   "abbrev quality");
2303     my_free(&abbrev_freqs,     "abbrev freqs");
2304
2305     my_free(&dtree,            "red-black tree for dictionary");
2306     my_free(&final_dict_order, "final dictionary ordering table");
2307     my_free(&dict_sort_codes,  "dictionary sort codes");
2308
2309     my_free(&dictionary,"dictionary");
2310
2311     my_free(&compressed_offsets, "static strings index table");
2312     my_free(&hufflist, "huffman node list");
2313     my_free(&huff_entities, "huffman entities");
2314     my_free(&unicode_usage_entries, "unicode entity entities");
2315
2316     deallocate_memory_block(&static_strings_area);
2317 }
2318
2319 extern void ao_free_arrays(void)
2320 {   my_free (&tlbtab,"tlb table");
2321     my_free (&sub_buffer,"sub_buffer");
2322     my_free (&bestyet,"bestyet");
2323     my_free (&bestyet2,"bestyet2");
2324     my_free (&grandtable,"grandtable");
2325     my_free (&grandflags,"grandflags");
2326 }
2327
2328 /* ========================================================================= */