src/text.c

   1 /* ------------------------------------------------------------------------- */
   2 /*   "text" : Text translation, the abbreviations optimiser, the dictionary  */
   3 /*                                                                           */
   4 /*   Part of Inform 6.42                                                     */
   5 /*   copyright (c) Graham Nelson 1993 - 2024                                 */
   6 /*                                                                           */
   7 /* Inform is free software: you can redistribute it and/or modify            */
   8 /* it under the terms of the GNU General Public License as published by      */
   9 /* the Free Software Foundation, either version 3 of the License, or         */
  10 /* (at your option) any later version.                                       */
  11 /*                                                                           */
  12 /* Inform is distributed in the hope that it will be useful,                 */
  13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of            */
  14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the              */
  15 /* GNU General Public License for more details.                              */
  16 /*                                                                           */
  17 /* You should have received a copy of the GNU General Public License         */
  18 /* along with Inform. If not, see https://gnu.org/licenses/                  */
  19 /*                                                                           */
  20 /* ------------------------------------------------------------------------- */
  21
  22 #include "header.h"
  23
  24 uchar *low_strings;                    /* Allocated to low_strings_top       */
  25 int32 low_strings_top;
  26 static memory_list low_strings_memlist;
  27
  28 int32 static_strings_extent;           /* Number of bytes of static strings
  29                                           made so far */
  30 uchar *static_strings_area;            /* Used to hold the static strings
  31                                           area so far
  32                                           Allocated to static_strings_extent */
  33 memory_list static_strings_area_memlist;
  34
  35 static char *all_text;                 /* Text buffer holding the entire text
  36                                           of the game, when it is being
  37                                           recorded
  38                                           (Allocated to all_text_top) */
  39 static memory_list all_text_memlist;
  40 static int32 all_text_top;
  41
  42 int abbrevs_lookup_table_made,         /* The abbreviations lookup table is
  43                                           constructed when the first non-
  44                                           abbreviation string is translated:
  45                                           this flag is TRUE after that       */
  46     abbrevs_lookup[256];               /* Once this has been constructed,
  47                                           abbrevs_lookup[n] = the smallest
  48                                           number of any abbreviation beginning
  49                                           with ASCII character n, or -1
  50                                           if none of the abbreviations do    */
  51 int no_abbreviations;                  /* No of abbreviations defined so far */
  52 /* ------------------------------------------------------------------------- */
  53 /*   Glulx string compression storage                                        */
  54 /* ------------------------------------------------------------------------- */
  55
  56 int no_strings;                        /* No of strings in static strings
  57                                           area.                              */
  58 int no_dynamic_strings;                /* No. of @.. string escapes used
  59                                           (actually, the highest value used
  60                                           plus one)                          */
  61 int no_unicode_chars;                  /* Number of distinct Unicode chars
  62                                           used. (Beyond 0xFF.)               */
  63
  64 huffentity_t *huff_entities;           /* The list of entities (characters,
  65                                           abbreviations, @.. escapes, and
  66                                           the terminator)                    */
  67 static huffentity_t **hufflist;        /* Copy of the list, for sorting      */
  68
  69 int no_huff_entities;                  /* The number of entities in the list */
  70 int huff_unicode_start;                /* Position in the list where Unicode
  71                                           chars begin.                       */
  72 int huff_abbrev_start;                 /* Position in the list where string
  73                                           abbreviations begin.               */
  74 int huff_dynam_start;                  /* Position in the list where @..
  75                                           entities begin.                    */
  76 int huff_entity_root;                  /* The position in the list of the root
  77                                           entry (when considering the table
  78                                           as a tree).                        */
  79
  80 int done_compression;                  /* Has the game text been compressed? */
  81 int32 compression_table_size;          /* Length of the Huffman table, in
  82                                           bytes                              */
  83 int32 compression_string_size;         /* Length of the compressed string
  84                                           data, in bytes                     */
  85 int32 *compressed_offsets;             /* The beginning of every string in
  86                                           the game, relative to the beginning
  87                                           of the Huffman table. (So entry 0
  88                                           is equal to compression_table_size).
  89                                           Allocated to no_strings at
  90                                           compress_game_text() time.         */
  91 static memory_list compressed_offsets_memlist;
  92
  93 unicode_usage_t *unicode_usage_entries; /* Allocated to no_unicode_chars     */
  94 static memory_list unicode_usage_entries_memlist;
  95
  96 #define UNICODE_HASH_BUCKETS (64)
  97 static int unicode_usage_hash[UNICODE_HASH_BUCKETS];
  98
  99 static int unicode_entity_index(int32 unicode);
 100
 101 /* ------------------------------------------------------------------------- */
 102 /*   Abbreviation arrays                                                     */
 103 /* ------------------------------------------------------------------------- */
 104
 105 abbreviation *abbreviations;             /* Allocated up to no_abbreviations */
 106 static memory_list abbreviations_memlist;
 107
 108 /* Memory to hold the text of any abbreviation strings declared.             */
 109 static int32 abbreviations_totaltext;
 110 static char *abbreviations_text;  /* Allocated up to abbreviations_totaltext */
 111 static memory_list abbreviations_text_memlist;
 112
 113 static int *abbreviations_optimal_parse_schedule;
 114 static memory_list abbreviations_optimal_parse_schedule_memlist;
 115
 116 static int *abbreviations_optimal_parse_scores;
 117 static memory_list abbreviations_optimal_parse_scores_memlist;
 118
 119 /* ------------------------------------------------------------------------- */
 120
 121 int32 total_chars_trans,               /* Number of ASCII chars of text in   */
 122       total_bytes_trans,               /* Number of bytes of Z-code text out */
 123       zchars_trans_in_last_string;     /* Number of Z-chars in last string:
 124                                           needed only for abbrev efficiency
 125                                           calculation in "directs.c"         */
 126 static int32 total_zchars_trans;       /* Number of Z-chars of text out
 127                                           (only used to calculate the above) */
 128
 129 static int zchars_out_buffer[3],       /* During text translation, a buffer of
 130                                           3 Z-chars at a time: when it's full
 131                                           these are written as a 2-byte word */
 132            zob_index;                  /* Index (0 to 2) into it             */
 133
 134 uchar *translated_text;                /* Area holding translated strings
 135                                           until they are moved into the
 136                                           static_strings_area below */
 137 static memory_list translated_text_memlist;
 138
 139 static char *temp_symbol;              /* Temporary symbol name used while
 140                                           processing "@(...)".               */
 141 static memory_list temp_symbol_memlist;
 142
 143
 144 static int32 text_out_pos;             /* The "program counter" during text
 145                                           translation: the next position to
 146                                           write Z-coded text output to       */
 147
 148 static int32 text_out_limit;           /* The upper limit of text_out_pos
 149                                           during text translation (or -1
 150                                           for no limit)                      */
 151
 152 static int text_out_overflow;          /* During text translation, becomes
 153                                           true if text_out_pos tries to pass
 154                                           text_out_limit                     */
 155
 156 /* ------------------------------------------------------------------------- */
 157 /*   For variables/arrays used by the dictionary manager, see below          */
 158 /* ------------------------------------------------------------------------- */
 159
 160 /* ------------------------------------------------------------------------- */
 161 /*   Prepare the abbreviations lookup table (used to speed up abbreviation   */
 162 /*   detection in text translation).  We first bubble-sort the abbrevs into  */
 163 /*   alphabetical order (this is necessary for the detection algorithm to    */
 164 /*   to work).  Since the table is only prepared once, and for a table       */
 165 /*   of size at most 96, there's no point using an efficient sort algorithm. */
 166 /* ------------------------------------------------------------------------- */
 167
 168 static void make_abbrevs_lookup(void)
 169 {   int bubble_sort, j, k;
 170     char *p1, *p2;
 171     do
 172     {   bubble_sort = FALSE;
 173         for (j=0; j<no_abbreviations; j++)
 174             for (k=j+1; k<no_abbreviations; k++)
 175             {   p1=abbreviation_text(j);
 176                 p2=abbreviation_text(k);
 177                 if (strcmp(p1,p2)<0)
 178                 {
 179                     abbreviation temp = abbreviations[j];
 180                     abbreviations[j] = abbreviations[k];
 181                     abbreviations[k] = temp;
 182                     bubble_sort = TRUE;
 183                 }
 184             }
 185     } while (bubble_sort);
 186
 187     for (j=no_abbreviations-1; j>=0; j--)
 188     {   p1=abbreviation_text(j);
 189         abbrevs_lookup[(uchar)p1[0]]=j;
 190         abbreviations[j].freq=0;
 191     }
 192     abbrevs_lookup_table_made = TRUE;
 193 }
 194
 195 /* ------------------------------------------------------------------------- */
 196 /*   Search the abbreviations lookup table (a routine which must be fast).   */
 197 /*   The source text to compare is text[i], text[i+1], ... and this routine  */
 198 /*   is only called if text[i] is indeed the first character of at least one */
 199 /*   abbreviation, "from" begin the least index into the abbreviations table */
 200 /*   of an abbreviation for which text[i] is the first character.  Recall    */
 201 /*   that the abbrevs table is in alphabetical order.                        */
 202 /*                                                                           */
 203 /*   The return value is -1 if there is no match.  If there is a match, the  */
 204 /*   text to be abbreviated out is over-written by a string of null chars    */
 205 /*   with "ASCII" value 1, and the abbreviation number is returned.          */
 206 /*                                                                           */
 207 /*   In Glulx, we *do not* do this overwriting with 1's.                     */
 208 /* ------------------------------------------------------------------------- */
 209
 210 static int try_abbreviations_from(unsigned char *text, int i, int from)
 211 {   int j, k; uchar *p, c;
 212     c=text[i];
 213     for (j=from;
 214          j<no_abbreviations;
 215          j++)
 216     {
 217         p=(uchar *)abbreviations_text+abbreviations[j].textpos;
 218         if (c != p[0]) break;
 219         if (text[i+1]==p[1])
 220         {   for (k=2; p[k]!=0; k++)
 221                 if (text[i+k]!=p[k]) goto NotMatched;
 222             if (!glulx_mode) {
 223                 for (k=0; p[k]!=0; k++) text[i+k]=1;
 224             }
 225             abbreviations[j].freq++;
 226             return(j);
 227             NotMatched: ;
 228         }
 229     }
 230     return(-1);
 231 }
 232
 233 /* Create an abbreviation. */
 234 extern void make_abbreviation(char *text)
 235 {
 236     int alen;
 237     int32 pos;
 238
 239     /* If -e mode is off, we won't waste space creating an abbreviation entry. */
 240     if (!economy_switch)
 241         return;
 242
 243     alen = strlen(text);
 244     pos = abbreviations_totaltext;
 245
 246     ensure_memory_list_available(&abbreviations_memlist, no_abbreviations+1);
 247     ensure_memory_list_available(&abbreviations_text_memlist, pos+alen+1);
 248
 249     strcpy(abbreviations_text+pos, text);
 250     abbreviations_totaltext += (alen+1);
 251
 252     abbreviations[no_abbreviations].textpos = pos;
 253     abbreviations[no_abbreviations].textlen = alen;
 254     abbreviations[no_abbreviations].value = compile_string(text, STRCTX_ABBREV);
 255     abbreviations[no_abbreviations].freq = 0;
 256
 257     /*   The quality is the number of Z-chars saved by using this            */
 258     /*   abbreviation: note that it takes 2 Z-chars to print it.             */
 259
 260     abbreviations[no_abbreviations].quality = zchars_trans_in_last_string - 2;
 261
 262     if (abbreviations[no_abbreviations].quality <= 0) {
 263         warning_named("Abbreviation does not save any characters:", text);
 264     }
 265
 266     no_abbreviations++;
 267 }
 268
 269 /* Return a pointer to the (uncompressed) abbreviation text.
 270    This should be treated as temporary; it is only valid until the next
 271    make_abbreviation() call. */
 272 extern char *abbreviation_text(int num)
 273 {
 274     if (num < 0 || num >= no_abbreviations) {
 275         compiler_error("Invalid abbrev for abbreviation_text()");
 276         return "";
 277     }
 278
 279     return abbreviations_text + abbreviations[num].textpos;
 280 }
 281
 282 /* ------------------------------------------------------------------------- */
 283 /*   The front end routine for text translation.                             */
 284 /*   strctx indicates the purpose of the string. This is mostly used for     */
 285 /*   informational output (gametext.txt), but we treat some string contexts  */
 286 /*   specially during compilation.                                           */
 287 /* ------------------------------------------------------------------------- */
 288
 289 /* TODO: When called from a print statement (parse_print()), it would be
 290    nice to detect if the generated string is exactly one character. In that
 291    case, we could return the character value and a flag to indicate the
 292    caller could use @print_char/@streamchar/@new_line/@streamunichar
 293    instead of printing a compiled string.
 294
 295    We'd need a new STRCTX value or two to distinguish direct-printed strings
 296    from referenceable strings.
 297
 298    Currently, parse_print() checks for the "^" case manually, which is a
 299    bit icky. */
 300
 301 extern int32 compile_string(char *b, int strctx)
 302 {   int32 i, j, k;
 303     uchar *c;
 304     int in_low_memory;
 305
 306     if (execution_never_reaches_here) {
 307         /* No need to put strings into gametext.txt or the static/low
 308            strings areas. */
 309         if (strctx == STRCTX_GAME || strctx == STRCTX_GAMEOPC || strctx == STRCTX_LOWSTRING || strctx == STRCTX_INFIX) {
 310             /* VENEER and VENEEROPC are only used at the translate_text level,
 311                so we don't have to catch them here. */
 312             return 0;
 313         }
 314     }
 315
 316     /* In Z-code, abbreviations go in the low memory pool (0x100). So
 317        do strings explicitly defined with the Lowstring directive.
 318        (In Glulx, the in_low_memory flag is ignored.) */
 319     in_low_memory = (strctx == STRCTX_ABBREV || strctx == STRCTX_LOWSTRING);
 320
 321     if (!glulx_mode && in_low_memory)
 322     {
 323         k = translate_text(-1, b, strctx);
 324         if (k<0) {
 325             error("text translation failed");
 326             k = 0;
 327         }
 328         ensure_memory_list_available(&low_strings_memlist, low_strings_top+k);
 329         memcpy(low_strings+low_strings_top, translated_text, k);
 330         j = low_strings_top;
 331         low_strings_top += k;
 332         return(0x21+(j/2));
 333     }
 334
 335     if (glulx_mode && done_compression)
 336         compiler_error("Tried to add a string after compression was done.");
 337
 338     i = translate_text(-1, b, strctx);
 339     if (i < 0) {
 340         error("text translation failed");
 341         i = 0;
 342     }
 343
 344     /* Insert null bytes as needed to ensure that the next static string */
 345     /* also occurs at an address expressible as a packed address         */
 346
 347     if (!glulx_mode) {
 348         int textalign;
 349         if (oddeven_packing_switch)
 350             textalign = scale_factor*2;
 351         else
 352             textalign = scale_factor;
 353         while ((i%textalign)!=0)
 354         {
 355             ensure_memory_list_available(&translated_text_memlist, i+2);
 356             translated_text[i++] = 0;
 357             translated_text[i++] = 0;
 358         }
 359     }
 360
 361     j = static_strings_extent;
 362
 363     ensure_memory_list_available(&static_strings_area_memlist, static_strings_extent+i);
 364     for (c=translated_text; c<translated_text+i;
 365          c++, static_strings_extent++)
 366         static_strings_area[static_strings_extent] = *c;
 367
 368     if (!glulx_mode) {
 369         return(j/scale_factor);
 370     }
 371     else {
 372         /* The marker value is a one-based string number. (We reserve zero
 373            to mean "not a string at all". */
 374         return (++no_strings);
 375     }
 376 }
 377
 378 /* ------------------------------------------------------------------------- */
 379 /*   Output a single Z-character into the buffer, and flush it if full       */
 380 /* ------------------------------------------------------------------------- */
 381
 382 static void write_z_char_z(int i)
 383 {   uint32 j;
 384     ASSERT_ZCODE();
 385     total_zchars_trans++;
 386     zchars_out_buffer[zob_index++]=(i%32);
 387     if (zob_index!=3) return;
 388     zob_index=0;
 389     j= zchars_out_buffer[0]*0x0400 + zchars_out_buffer[1]*0x0020
 390        + zchars_out_buffer[2];
 391
 392     if (text_out_limit >= 0) {
 393         if (text_out_pos+2 > text_out_limit) {
 394             text_out_overflow = TRUE;
 395             return;
 396         }
 397     }
 398     else {
 399         ensure_memory_list_available(&translated_text_memlist, text_out_pos+2);
 400     }
 401
 402     translated_text[text_out_pos++] = j/256; translated_text[text_out_pos++] = j%256;
 403     total_bytes_trans+=2;
 404 }
 405
 406 static void write_zscii(int zsc)
 407 {
 408     int lookup_value, in_alphabet;
 409
 410     if (zsc==' ')
 411     {   write_z_char_z(0);
 412         return;
 413     }
 414
 415     if (zsc < 0x100) lookup_value = zscii_to_alphabet_grid[zsc];
 416
 417     else lookup_value = -1;
 418
 419     if (lookup_value >= 0)
 420     {   alphabet_used[lookup_value] = 'Y';
 421         in_alphabet = lookup_value/26;
 422         if (in_alphabet==1) write_z_char_z(4);  /* SHIFT to A1 */
 423         if (in_alphabet==2) write_z_char_z(5);  /* SHIFT to A2 */
 424         write_z_char_z(lookup_value%26 + 6);
 425     }
 426     else
 427     {   write_z_char_z(5); write_z_char_z(6);
 428         write_z_char_z(zsc/32); write_z_char_z(zsc%32);
 429     }
 430 }
 431
 432 /* ------------------------------------------------------------------------- */
 433 /*   Finish a Z-coded string, padding out with Z-char 5s if necessary and    */
 434 /*   setting the "end" bit on the final 2-byte word                          */
 435 /* ------------------------------------------------------------------------- */
 436
 437 static void end_z_chars(void)
 438 {
 439     zchars_trans_in_last_string=total_zchars_trans-zchars_trans_in_last_string;
 440     while (zob_index!=0) write_z_char_z(5);
 441     if (text_out_pos < 2) {
 442         /* Something went wrong. */
 443         text_out_overflow = TRUE;
 444         return;
 445     }
 446     translated_text[text_out_pos-2] += 128;
 447 }
 448
 449 /* Glulx handles this much more simply -- compression is done elsewhere. */
 450 static void write_z_char_g(int i)
 451 {
 452     ASSERT_GLULX();
 453     if (text_out_limit >= 0) {
 454         if (text_out_pos+1 > text_out_limit) {
 455             text_out_overflow = TRUE;
 456             return;
 457         }
 458     }
 459     else {
 460         ensure_memory_list_available(&translated_text_memlist, text_out_pos+1);
 461     }
 462     total_zchars_trans++;
 463     translated_text[text_out_pos++] = i;
 464     total_bytes_trans++;
 465 }
 466
 467 /* Helper routine to compute the weight, in units, of a character handled by the Z-Machine */
 468 static int zchar_weight(int c)
 469 {
 470     int lookup;
 471     if (c == ' ') return 1;
 472     lookup = iso_to_alphabet_grid[c];
 473     if (lookup < 0) return 4;
 474     if (lookup < 26) return 1;
 475     return 2;
 476 }
 477
 478 /* ------------------------------------------------------------------------- */
 479 /*   The main routine "text.c" provides to the rest of Inform: the text      */
 480 /*   translator. s_text is the source text and the return value is the       */
 481 /*   number of bytes translated.                                             */
 482 /*   The translated text will be stored in translated_text.                  */
 483 /*                                                                           */
 484 /*   If p_limit is >= 0, the text length will not exceed that many bytes.    */
 485 /*   If the translation tries to overflow this boundary, the return value    */
 486 /*   will be -1. (You should display an error and not read translated_text.) */
 487 /*                                                                           */
 488 /*   If p_limit is negative, any amount of text is accepted (up to int32     */
 489 /*   anyway).                                                                */
 490 /*                                                                           */
 491 /*   Note that the source text may be corrupted by this routine.             */
 492 /* ------------------------------------------------------------------------- */
 493
 494 extern int32 translate_text(int32 p_limit, char *s_text, int strctx)
 495 {   int i, j, k, in_alphabet, lookup_value, is_abbreviation;
 496     int32 unicode; int zscii;
 497     unsigned char *text_in;
 498
 499     if (p_limit >= 0) {
 500         ensure_memory_list_available(&translated_text_memlist, p_limit);
 501     }
 502
 503     /* For STRCTX_ABBREV, the string being translated is itself an
 504        abbreviation string, so it can't make use of abbreviations. Set
 505        the is_abbreviation flag to indicate this.
 506        The compiler has historically set this flag for the Lowstring
 507        directive as well -- the in_low_memory and is_abbreviation flag were
 508        always the same. I am preserving that convention. */
 509     is_abbreviation = (strctx == STRCTX_ABBREV || strctx == STRCTX_LOWSTRING);
 510
 511
 512     /*  Cast the input and output streams to unsigned char: text_out_pos will
 513         advance as bytes of Z-coded text are written, but text_in doesn't    */
 514
 515     text_in     = (unsigned char *) s_text;
 516     text_out_pos = 0;
 517     text_out_limit = p_limit;
 518     text_out_overflow = FALSE;
 519
 520     /*  Remember the Z-chars total so that later we can subtract to find the
 521         number of Z-chars translated on this string                          */
 522
 523     zchars_trans_in_last_string = total_zchars_trans;
 524
 525     /*  Start with the Z-characters output buffer empty                      */
 526
 527     zob_index=0;
 528
 529     /*  If this is the first text translated since the abbreviations were
 530         declared, and if some were declared, then it's time to make the
 531         lookup table for abbreviations
 532
 533         (Except: we don't if the text being translated is itself
 534         the text of an abbreviation currently being defined)                 */
 535
 536     if ((!abbrevs_lookup_table_made) && (no_abbreviations > 0)
 537         && (!is_abbreviation))
 538         make_abbrevs_lookup();
 539
 540     /*  If we're storing the whole game text to memory, then add this text.
 541         We will put two newlines between each text and four at the very end.
 542         (The optimise code does a lot of sloppy text[i+2], so the extra
 543         two newlines past all_text_top are necessary.) */
 544
 545     if ((!is_abbreviation) && (store_the_text))
 546     {   int addlen = strlen(s_text);
 547         ensure_memory_list_available(&all_text_memlist, all_text_top+addlen+5);
 548         sprintf(all_text+all_text_top, "%s\n\n\n\n", s_text);
 549         /* Advance past two newlines. */
 550         all_text_top += (addlen+2);
 551     }
 552
 553     if (transcript_switch) {
 554         /* Omit veneer strings, unless we're using the new transcript format, which includes everything. */
 555         if ((!veneer_mode) || TRANSCRIPT_FORMAT == 1) {
 556             int label = strctx;
 557             if (veneer_mode) {
 558                 if (label == STRCTX_GAME)
 559                     label = STRCTX_VENEER;
 560                 else if (label == STRCTX_GAMEOPC)
 561                     label = STRCTX_VENEEROPC;
 562             }
 563             write_to_transcript_file(s_text, label);
 564         }
 565     }
 566
 567     /* Computing the optimal way to parse strings to insert abbreviations with dynamic programming */
 568     /*  (ref: R.A. Wagner , "Common phrases and minimum-space text storage", Commun. ACM, 16 (3) (1973)) */
 569     /* We compute this optimal way here; it's stored in abbreviations_optimal_parse_schedule */
 570     if (economy_switch)
 571     {
 572         uchar *q, c;
 573         int l, min_score, from;
 574         int text_in_length;
 575
 576         text_in_length = strlen( (char*) text_in);
 577         ensure_memory_list_available(&abbreviations_optimal_parse_schedule_memlist, text_in_length);
 578         ensure_memory_list_available(&abbreviations_optimal_parse_scores_memlist, text_in_length+1);
 579
 580         abbreviations_optimal_parse_scores[text_in_length] = 0;
 581         for(j=text_in_length-1; j>=0; j--)
 582         {   /* Initial values: empty schedule, score = just write the letter without abbreviating. */
 583             abbreviations_optimal_parse_schedule[j] = -1;
 584             min_score = zchar_weight(text_in[j]) + abbreviations_optimal_parse_scores[j+1];
 585             /* If there's an abbreviation starting with that letter... */
 586             if ( (from = abbrevs_lookup[text_in[j]]) != -1)
 587             {
 588                 c = text_in[j];
 589                 /* Loop on all abbreviations starting with what is in c. */
 590                 for (k=from;
 591                      k<no_abbreviations;
 592                      k++)
 593                 {
 594                     q=(uchar *)abbreviations_text+abbreviations[k].textpos;
 595                     if (c!=q[0]) break;
 596                     /* Let's compare; we also keep track of the length of the abbreviation. */
 597                     for (l=1; q[l]!=0; l++)
 598                     {    if (text_in[j+l]!=q[l]) {goto NotMatched;}
 599                     }
 600                     /* We have a match (length l), but is it smaller in size? */
 601                     if (min_score > 2 + abbreviations_optimal_parse_scores[j+l])
 602                     {   /* It is indeed smaller, so let's write it down in our schedule. */
 603                         min_score = 2 + abbreviations_optimal_parse_scores[j+l];
 604                         abbreviations_optimal_parse_schedule[j] = k;
 605                     }
 606                     NotMatched: ;
 607                 }
 608             }
 609             /* We gave it our best, this is the smallest we got. */
 610             abbreviations_optimal_parse_scores[j] = min_score;
 611         }
 612     }
 613
 614
 615
 616   if (!glulx_mode) {
 617
 618     /*  The empty string of Z-text is illegal, since it can't carry an end
 619         bit: so we translate an empty string of ASCII text to just the
 620         pad character 5.  Printing this causes nothing to appear on screen.  */
 621
 622     if (text_in[0]==0) write_z_char_z(5);
 623
 624     /*  Loop through the characters of the null-terminated input text: note
 625         that if 1 is written over a character in the input text, it is
 626         afterwards ignored                                                   */
 627
 628     for (i=0; text_in[i]!=0; i++)
 629     {   total_chars_trans++;
 630
 631         /*  Contract ".  " into ". " if double-space-removing switch set:
 632             likewise "?  " and "!  " if the setting is high enough           */
 633
 634         if ((double_space_setting >= 1)
 635             && (text_in[i+1]==' ') && (text_in[i+2]==' '))
 636         {   if (text_in[i]=='.') text_in[i+2]=1;
 637             if (double_space_setting >= 2)
 638             {   if (text_in[i]=='?') text_in[i+2]=1;
 639                 if (text_in[i]=='!') text_in[i+2]=1;
 640             }
 641         }
 642
 643         /*  Try abbreviations if the economy switch set. */
 644         /*  Look at the abbreviation schedule to see if we should abbreviate here. */
 645         /*  Note: Just because the schedule has something doesn't mean we should abbreviate there; */
 646         /*  sometimes you abbreviate before because it's better. If we have already replaced the */
 647         /*  char by a '1', it means we're in the middle of an abbreviation; don't try to abbreviate then. */
 648         if ((economy_switch) && (!is_abbreviation) && text_in[i] != 1 &&
 649             ((j = abbreviations_optimal_parse_schedule[i]) != -1))
 650         {
 651             /* Fill with 1s, which will get ignored by everyone else. */
 652             uchar *p = (uchar *)abbreviation_text(j);
 653             for (k=0; p[k]!=0; k++) text_in[i+k]=1;
 654             /* Actually write the abbreviation in the story file. */
 655             abbreviations[j].freq++;
 656             /* Abbreviations run from MAX_DYNAMIC_STRINGS to 96. */
 657             j += MAX_DYNAMIC_STRINGS;
 658             write_z_char_z(j/32+1); write_z_char_z(j%32);
 659         }
 660
 661
 662         /* If Unicode switch set, use text_to_unicode to perform UTF-8
 663            decoding */
 664         if (character_set_unicode && (text_in[i] & 0x80))
 665         {   unicode = text_to_unicode((char *) (text_in+i));
 666             zscii = unicode_to_zscii(unicode);
 667             if (zscii != 5) write_zscii(zscii);
 668             else
 669             {   unicode_char_error(
 670                     "Character can only be used if declared in \
 671 advance as part of 'Zcharacter table':", unicode);
 672             }
 673             i += textual_form_length - 1;
 674             continue;
 675         }
 676
 677         /*  '@' is the escape character in Inform string notation: the various
 678             possibilities are:
 679
 680                 @@decimalnumber  :  write this ZSCII char (0 to 1023)
 681                 @twodigits or    :  write the abbreviation string with this
 682                 @(digits)           decimal number
 683                 @(symbol)        :  write the abbreviation string with this
 684                                     (constant) value
 685                 @accentcode      :  this accented character: e.g.,
 686                                         for @'e write an E-acute
 687                 @{...}           :  this Unicode char (in hex)              */
 688
 689         if (text_in[i]=='@')
 690         {   if (text_in[i+1]=='@')
 691             {
 692                 /*   @@... (ascii value)  */
 693
 694                 i+=2; j=atoi((char *) (text_in+i));
 695                 switch(j)
 696                 {   /* Prevent ~ and ^ from being translated to double-quote
 697                        and new-line, as they ordinarily would be */
 698
 699                     case 94:   write_z_char_z(5); write_z_char_z(6);
 700                                write_z_char_z(94/32); write_z_char_z(94%32);
 701                                break;
 702                     case 126:  write_z_char_z(5); write_z_char_z(6);
 703                                write_z_char_z(126/32); write_z_char_z(126%32);
 704                                break;
 705
 706                     default:   write_zscii(j); break;
 707                 }
 708                 while (isdigit(text_in[i])) i++; i--;
 709             }
 710             else if (text_in[i+1]=='(')
 711             {
 712                 /*   @(...) (dynamic string)   */
 713                 int len = 0, digits = 0;
 714                 i += 2;
 715                 /* This accepts "12xyz" as a symbol, which it really isn't,
 716                    but that just means it won't be found. */
 717                 while ((text_in[i] == '_' || isalnum(text_in[i]))) {
 718                     char ch = text_in[i++];
 719                     if (isdigit(ch)) digits++;
 720                     ensure_memory_list_available(&temp_symbol_memlist, len+1);
 721                     temp_symbol[len++] = ch;
 722                 }
 723                 ensure_memory_list_available(&temp_symbol_memlist, len+1);
 724                 temp_symbol[len] = '\0';
 725                 j = -1;
 726                 /* We would like to parse temp_symbol as *either* a decimal
 727                    number or a constant symbol. */
 728                 if (text_in[i] != ')' || len == 0) {
 729                     error("'@(...)' abbreviation must contain a symbol");
 730                 }
 731                 else if (digits == len) {
 732                     /* all digits; parse as decimal */
 733                     j = atoi(temp_symbol);
 734                 }
 735                 else {
 736                     int sym = get_symbol_index(temp_symbol);
 737                     if (sym < 0 || (symbols[sym].flags & UNKNOWN_SFLAG) || symbols[sym].type != CONSTANT_T || symbols[sym].marker) {
 738                         error_named("'@(...)' abbreviation expected a known constant value, but contained", temp_symbol);
 739                     }
 740                     else {
 741                         symbols[sym].flags |= USED_SFLAG;
 742                         j = symbols[sym].value;
 743                     }
 744                 }
 745                 if (!glulx_mode && j >= 96) {
 746                     error_max_dynamic_strings(j);
 747                     j = -1;
 748                 }
 749                 if (j >= MAX_DYNAMIC_STRINGS) {
 750                     error_max_dynamic_strings(j);
 751                     j = -1;
 752                 }
 753                 if (j >= 0) {
 754                     write_z_char_z(j/32+1); write_z_char_z(j%32);
 755                 }
 756                 else {
 757                     write_z_char_z(' '); /* error fallback */
 758                 }
 759             }
 760             else if (isdigit(text_in[i+1])!=0)
 761             {   int d1, d2;
 762
 763                 /*   @.. (dynamic string)   */
 764
 765                 d1 = character_digit_value[text_in[i+1]];
 766                 d2 = character_digit_value[text_in[i+2]];
 767                 if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10))
 768                     error("'@..' must have two decimal digits");
 769                 else
 770                 {
 771                     j = d1*10 + d2;
 772                     if (!glulx_mode && j >= 96) {
 773                         error_max_dynamic_strings(j);
 774                         j = -1;
 775                     }
 776                     if (j >= MAX_DYNAMIC_STRINGS) {
 777                         /* Shouldn't get here with two digits */
 778                         error_max_dynamic_strings(j);
 779                         j = -1;
 780                     }
 781                     i+=2;
 782                     if (j >= 0) {
 783                         write_z_char_z(j/32+1); write_z_char_z(j%32);
 784                     }
 785                     else {
 786                         write_z_char_z(' '); /* error fallback */
 787                     }
 788                 }
 789             }
 790             else
 791             {
 792                 /*   A string escape specifying an unusual character   */
 793
 794                 unicode = text_to_unicode((char *) (text_in+i));
 795                 zscii = unicode_to_zscii(unicode);
 796                 if (zscii != 5) write_zscii(zscii);
 797                 else
 798                 {   unicode_char_error(
 799                        "Character can only be used if declared in \
 800 advance as part of 'Zcharacter table':", unicode);
 801                 }
 802                 i += textual_form_length - 1;
 803             }
 804         }
 805         else
 806         {   /*  Skip a character which has been over-written with the null
 807                 value 1 earlier on                                           */
 808
 809             if (text_in[i]!=1)
 810             {   if (text_in[i]==' ') write_z_char_z(0);
 811                 else
 812                 {   j = (int) text_in[i];
 813                     lookup_value = iso_to_alphabet_grid[j];
 814                     if (lookup_value < 0)
 815                     {   /*  The character isn't in the standard alphabets, so
 816                             we have to use the ZSCII 4-Z-char sequence */
 817
 818                         if (lookup_value == -5)
 819                         {   /*  Character isn't in the ZSCII set at all */
 820
 821                             unicode = iso_to_unicode(j);
 822                             unicode_char_error(
 823                                 "Character can only be used if declared in \
 824 advance as part of 'Zcharacter table':", unicode);
 825                             write_zscii(0x200 + unicode/0x100);
 826                             write_zscii(0x300 + unicode%0x100);
 827                         }
 828                         else write_zscii(-lookup_value);
 829                     }
 830                     else
 831                     {   /*  The character is in one of the standard alphabets:
 832                             write a SHIFT to temporarily change alphabet if
 833                             it isn't in alphabet 0, then write the Z-char    */
 834
 835                         alphabet_used[lookup_value] = 'Y';
 836                         in_alphabet = lookup_value/26;
 837                         if (in_alphabet==1) write_z_char_z(4);  /* SHIFT to A1 */
 838                         if (in_alphabet==2) write_z_char_z(5);  /* SHIFT to A2 */
 839                         write_z_char_z(lookup_value%26 + 6);
 840                     }
 841                 }
 842             }
 843         }
 844     }
 845
 846     /*  Flush the Z-characters output buffer and set the "end" bit           */
 847
 848     end_z_chars();
 849   }
 850   else {
 851
 852     /* The text storage here is, of course, temporary. Compression
 853        will occur when we're finished compiling, so that all the
 854        clever Huffman stuff will work.
 855        In the stored text, we use "@@" to indicate @,
 856        "@0" to indicate a zero byte,
 857        "@ANNNN" to indicate an abbreviation,
 858        "@DNNNN" to indicate a dynamic string thing.
 859        "@UNNNN" to indicate a four-byte Unicode value (0x100 or higher).
 860        (NNNN is a four-digit hex number using the letters A-P... an
 861        ugly representation but a convenient one.)
 862     */
 863
 864     for (i=0; text_in[i]!=0; i++) {
 865
 866       /*  Contract ".  " into ". " if double-space-removing switch set:
 867           likewise "?  " and "!  " if the setting is high enough. */
 868       if ((double_space_setting >= 1)
 869         && (text_in[i+1]==' ') && (text_in[i+2]==' ')) {
 870         if (text_in[i]=='.'
 871           || (double_space_setting >= 2
 872             && (text_in[i]=='?' || text_in[i]=='!'))) {
 873           text_in[i+1] = text_in[i];
 874           i++;
 875         }
 876       }
 877
 878       total_chars_trans++;
 879
 880       /*  Try abbreviations if the economy switch set. We have to be in
 881           compression mode too, since the abbreviation mechanism is part
 882           of string decompression. */
 883
 884       if ((economy_switch) && (compression_switch) && (!is_abbreviation)
 885         && ((k=abbrevs_lookup[text_in[i]])!=-1)
 886         && ((j=try_abbreviations_from(text_in, i, k)) != -1)) {
 887         char *cx = abbreviation_text(j);
 888         i += (strlen(cx)-1);
 889         write_z_char_g('@');
 890         write_z_char_g('A');
 891         write_z_char_g('A' + ((j >>12) & 0x0F));
 892         write_z_char_g('A' + ((j >> 8) & 0x0F));
 893         write_z_char_g('A' + ((j >> 4) & 0x0F));
 894         write_z_char_g('A' + ((j     ) & 0x0F));
 895       }
 896       else if (text_in[i] == '@') {
 897         if (text_in[i+1]=='@') {
 898           /* An ASCII code */
 899           i+=2; j=atoi((char *) (text_in+i));
 900           if (j == '@' || j == '\0') {
 901             write_z_char_g('@');
 902             if (j == 0) {
 903               j = '0';
 904               if (!compression_switch)
 905                 warning("Ascii @@0 will prematurely terminate non-compressed \
 906 string.");
 907             }
 908           }
 909           write_z_char_g(j);
 910           while (isdigit(text_in[i])) i++; i--;
 911         }
 912         else if (text_in[i+1]=='(') {
 913             int len = 0, digits = 0;
 914             i += 2;
 915             /* This accepts "12xyz" as a symbol, which it really isn't,
 916                but that just means it won't be found. */
 917             while ((text_in[i] == '_' || isalnum(text_in[i]))) {
 918                 char ch = text_in[i++];
 919                 if (isdigit(ch)) digits++;
 920                 ensure_memory_list_available(&temp_symbol_memlist, len+1);
 921                 temp_symbol[len++] = ch;
 922             }
 923             ensure_memory_list_available(&temp_symbol_memlist, len+1);
 924             temp_symbol[len] = '\0';
 925             j = -1;
 926             /* We would like to parse temp_symbol as *either* a decimal
 927                number or a constant symbol. */
 928             if (text_in[i] != ')' || len == 0) {
 929                 error("'@(...)' abbreviation must contain a symbol");
 930             }
 931             else if (digits == len) {
 932                 /* all digits; parse as decimal */
 933                 j = atoi(temp_symbol);
 934             }
 935             else {
 936                 int sym = get_symbol_index(temp_symbol);
 937                 if (sym < 0 || (symbols[sym].flags & UNKNOWN_SFLAG) || symbols[sym].type != CONSTANT_T || symbols[sym].marker) {
 938                     error_named("'@(...)' abbreviation expected a known constant value, but contained", temp_symbol);
 939                 }
 940                 else {
 941                     symbols[sym].flags |= USED_SFLAG;
 942                     j = symbols[sym].value;
 943                 }
 944             }
 945             if (j >= MAX_DYNAMIC_STRINGS) {
 946                 error_max_dynamic_strings(j);
 947                 j = -1;
 948             }
 949             if (j+1 >= no_dynamic_strings)
 950                 no_dynamic_strings = j+1;
 951             if (j >= 0) {
 952                 write_z_char_g('@');
 953                 write_z_char_g('D');
 954                 write_z_char_g('A' + ((j >>12) & 0x0F));
 955                 write_z_char_g('A' + ((j >> 8) & 0x0F));
 956                 write_z_char_g('A' + ((j >> 4) & 0x0F));
 957                 write_z_char_g('A' + ((j     ) & 0x0F));
 958             }
 959             else {
 960                 write_z_char_g(' '); /* error fallback */
 961             }
 962         }
 963         else if (isdigit(text_in[i+1])) {
 964           int d1, d2;
 965           d1 = character_digit_value[text_in[i+1]];
 966           d2 = character_digit_value[text_in[i+2]];
 967           if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10)) {
 968             error("'@..' must have two decimal digits");
 969           }
 970           else {
 971             if (!compression_switch)
 972               warning("'@..' print variable will not work in non-compressed \
 973 string; substituting '   '.");
 974             i += 2;
 975             j = d1*10 + d2;
 976             if (j >= MAX_DYNAMIC_STRINGS) {
 977               error_max_dynamic_strings(j);
 978               j = -1;
 979             }
 980             if (j+1 >= no_dynamic_strings)
 981               no_dynamic_strings = j+1;
 982             if (j >= 0) {
 983                 write_z_char_g('@');
 984                 write_z_char_g('D');
 985                 write_z_char_g('A' + ((j >>12) & 0x0F));
 986                 write_z_char_g('A' + ((j >> 8) & 0x0F));
 987                 write_z_char_g('A' + ((j >> 4) & 0x0F));
 988                 write_z_char_g('A' + ((j     ) & 0x0F));
 989             }
 990             else {
 991                 write_z_char_g(' '); /* error fallback */
 992             }
 993           }
 994         }
 995         else {
 996           unicode = text_to_unicode((char *) (text_in+i));
 997           i += textual_form_length - 1;
 998           if (unicode == '@' || unicode == '\0') {
 999             write_z_char_g('@');
1000             write_z_char_g(unicode ? '@' : '0');
1001           }
1002           else if (unicode >= 0 && unicode < 256) {
1003             write_z_char_g(unicode);
1004           }
1005           else {
1006             if (!compression_switch) {
1007               warning("Unicode characters will not work in non-compressed \
1008 string; substituting '?'.");
1009               write_z_char_g('?');
1010             }
1011             else {
1012               j = unicode_entity_index(unicode);
1013               write_z_char_g('@');
1014               write_z_char_g('U');
1015               write_z_char_g('A' + ((j >>12) & 0x0F));
1016               write_z_char_g('A' + ((j >> 8) & 0x0F));
1017               write_z_char_g('A' + ((j >> 4) & 0x0F));
1018               write_z_char_g('A' + ((j     ) & 0x0F));
1019             }
1020           }
1021         }
1022       }
1023       else if (text_in[i] == '^')
1024         write_z_char_g(0x0A);
1025       else if (text_in[i] == '~')
1026         write_z_char_g('"');
1027       else if (character_set_unicode) {
1028         if (text_in[i] & 0x80) {
1029           unicode = text_to_unicode((char *) (text_in+i));
1030           i += textual_form_length - 1;
1031           if (unicode >= 0 && unicode < 256) {
1032             write_z_char_g(unicode);
1033           }
1034           else {
1035             if (!compression_switch) {
1036               warning("Unicode characters will not work in non-compressed \
1037 string; substituting '?'.");
1038               write_z_char_g('?');
1039             }
1040             else {
1041               j = unicode_entity_index(unicode);
1042               write_z_char_g('@');
1043               write_z_char_g('U');
1044               write_z_char_g('A' + ((j >>12) & 0x0F));
1045               write_z_char_g('A' + ((j >> 8) & 0x0F));
1046               write_z_char_g('A' + ((j >> 4) & 0x0F));
1047               write_z_char_g('A' + ((j     ) & 0x0F));
1048             }
1049           }
1050         }
1051         else {
1052           write_z_char_g(text_in[i]);
1053         }
1054       }
1055       else {
1056         unicode = iso_to_unicode_grid[text_in[i]];
1057         if (unicode >= 0 && unicode < 256) {
1058           write_z_char_g(unicode);
1059         }
1060         else {
1061           if (!compression_switch) {
1062             warning("Unicode characters will not work in non-compressed \
1063 string; substituting '?'.");
1064             write_z_char_g('?');
1065           }
1066           else {
1067             j = unicode_entity_index(unicode);
1068             write_z_char_g('@');
1069             write_z_char_g('U');
1070             write_z_char_g('A' + ((j >>12) & 0x0F));
1071             write_z_char_g('A' + ((j >> 8) & 0x0F));
1072             write_z_char_g('A' + ((j >> 4) & 0x0F));
1073             write_z_char_g('A' + ((j     ) & 0x0F));
1074           }
1075         }
1076       }
1077     }
1078     write_z_char_g(0);
1079     zchars_trans_in_last_string=total_zchars_trans-zchars_trans_in_last_string;
1080
1081   }
1082
1083   if (text_out_overflow)
1084       return -1;
1085   else
1086       return text_out_pos;
1087 }
1088
1089 static int unicode_entity_index(int32 unicode)
1090 {
1091   int j;
1092   int buck = unicode % UNICODE_HASH_BUCKETS;
1093
1094   for (j = unicode_usage_hash[buck]; j >= 0; j=unicode_usage_entries[j].next) {
1095     if (unicode_usage_entries[j].ch == unicode)
1096       break;
1097   }
1098   if (j < 0) {
1099     ensure_memory_list_available(&unicode_usage_entries_memlist, no_unicode_chars+1);
1100     j = no_unicode_chars++;
1101     unicode_usage_entries[j].ch = unicode;
1102     unicode_usage_entries[j].next = unicode_usage_hash[buck];
1103     unicode_usage_hash[buck] = j;
1104   }
1105
1106   return j;
1107 }
1108
1109 /* ------------------------------------------------------------------------- */
1110 /*   Glulx compression code                                                  */
1111 /* ------------------------------------------------------------------------- */
1112
1113
1114 static void compress_makebits(int entnum, int depth, int prevbit,
1115   huffbitlist_t *bits);
1116
1117 /*   The compressor. This uses the usual Huffman compression algorithm. */
1118 void compress_game_text()
1119 {
1120   int entities=0, branchstart, branches;
1121   int numlive;
1122   int32 lx;
1123   int jx;
1124   int ch;
1125   int32 ix;
1126   int max_char_set;
1127   huffbitlist_t bits;
1128
1129   if (compression_switch) {
1130     max_char_set = 257 + no_abbreviations + no_dynamic_strings + no_unicode_chars;
1131
1132     huff_entities = my_calloc(sizeof(huffentity_t), max_char_set*2+1,
1133       "huffman entities");
1134     hufflist = my_calloc(sizeof(huffentity_t *), max_char_set,
1135       "huffman node list");
1136
1137     /* How many entities have we currently got? Well, 256 plus the
1138        string-terminator plus Unicode chars plus abbrevations plus
1139        dynamic strings. */
1140     entities = 256+1;
1141     huff_unicode_start = entities;
1142     entities += no_unicode_chars;
1143     huff_abbrev_start = entities;
1144     if (economy_switch)
1145       entities += no_abbreviations;
1146     huff_dynam_start = entities;
1147     entities += no_dynamic_strings;
1148
1149     if (entities > max_char_set)
1150       compiler_error("Too many entities for max_char_set");
1151
1152     /* Characters */
1153     for (jx=0; jx<256; jx++) {
1154       huff_entities[jx].type = 2;
1155       huff_entities[jx].count = 0;
1156       huff_entities[jx].u.ch = jx;
1157     }
1158     /* Terminator */
1159     huff_entities[256].type = 1;
1160     huff_entities[256].count = 0;
1161     for (jx=0; jx<no_unicode_chars; jx++) {
1162       huff_entities[huff_unicode_start+jx].type = 4;
1163       huff_entities[huff_unicode_start+jx].count = 0;
1164       huff_entities[huff_unicode_start+jx].u.val = jx;
1165     }
1166     if (economy_switch) {
1167       for (jx=0; jx<no_abbreviations; jx++) {
1168         huff_entities[huff_abbrev_start+jx].type = 3;
1169         huff_entities[huff_abbrev_start+jx].count = 0;
1170         huff_entities[huff_abbrev_start+jx].u.val = jx;
1171       }
1172     }
1173     for (jx=0; jx<no_dynamic_strings; jx++) {
1174       huff_entities[huff_dynam_start+jx].type = 9;
1175       huff_entities[huff_dynam_start+jx].count = 0;
1176       huff_entities[huff_dynam_start+jx].u.val = jx;
1177     }
1178   }
1179   else {
1180     /* No compression; use defaults that will make it easy to check
1181        for errors. */
1182     no_huff_entities = 257;
1183     huff_unicode_start = 257;
1184     huff_abbrev_start = 257;
1185     huff_dynam_start = 257+no_abbreviations;
1186     compression_table_size = 0;
1187   }
1188
1189   if (compression_switch) {
1190
1191     for (lx=0, ix=0; lx<no_strings; lx++) {
1192       int escapelen=0, escapetype=0;
1193       int done=FALSE;
1194       int32 escapeval=0;
1195       while (!done) {
1196         ch = static_strings_area[ix];
1197         ix++;
1198         if (ix > static_strings_extent || ch < 0)
1199           compiler_error("Read too much not-yet-compressed text.");
1200         if (escapelen == -1) {
1201           escapelen = 0;
1202           if (ch == '@') {
1203             ch = '@';
1204           }
1205           else if (ch == '0') {
1206             ch = '\0';
1207           }
1208           else if (ch == 'A' || ch == 'D' || ch == 'U') {
1209             escapelen = 4;
1210             escapetype = ch;
1211             escapeval = 0;
1212             continue;
1213           }
1214           else {
1215             compiler_error("Strange @ escape in processed text.");
1216           }
1217         }
1218         else if (escapelen) {
1219           escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
1220           escapelen--;
1221           if (escapelen == 0) {
1222             if (escapetype == 'A') {
1223               ch = huff_abbrev_start+escapeval;
1224             }
1225             else if (escapetype == 'D') {
1226               ch = huff_dynam_start+escapeval;
1227             }
1228             else if (escapetype == 'U') {
1229               ch = huff_unicode_start+escapeval;
1230             }
1231             else {
1232               compiler_error("Strange @ escape in processed text.");
1233             }
1234           }
1235           else
1236             continue;
1237         }
1238         else {
1239           if (ch == '@') {
1240             escapelen = -1;
1241             continue;
1242           }
1243           if (ch == 0) {
1244             ch = 256;
1245             done = TRUE;
1246           }
1247         }
1248         huff_entities[ch].count++;
1249       }
1250     }
1251
1252     numlive = 0;
1253     for (jx=0; jx<entities; jx++) {
1254       if (huff_entities[jx].count) {
1255         hufflist[numlive] = &(huff_entities[jx]);
1256         numlive++;
1257       }
1258     }
1259
1260     branchstart = entities;
1261     branches = 0;
1262
1263     while (numlive > 1) {
1264       int best1, best2;
1265       int best1num, best2num;
1266       huffentity_t *bran;
1267
1268       if (hufflist[0]->count < hufflist[1]->count) {
1269         best1 = 0;
1270         best2 = 1;
1271       }
1272       else {
1273         best2 = 0;
1274         best1 = 1;
1275       }
1276
1277       best1num = hufflist[best1]->count;
1278       best2num = hufflist[best2]->count;
1279
1280       for (jx=2; jx<numlive; jx++) {
1281         if (hufflist[jx]->count < best1num) {
1282           best2 = best1;
1283           best2num = best1num;
1284           best1 = jx;
1285           best1num = hufflist[best1]->count;
1286         }
1287         else if (hufflist[jx]->count < best2num) {
1288           best2 = jx;
1289           best2num = hufflist[best2]->count;
1290         }
1291       }
1292
1293       bran = &(huff_entities[branchstart+branches]);
1294       branches++;
1295       bran->type = 0;
1296       bran->count = hufflist[best1]->count + hufflist[best2]->count;
1297       bran->u.branch[0] = (hufflist[best1] - huff_entities);
1298       bran->u.branch[1] = (hufflist[best2] - huff_entities);
1299       hufflist[best1] = bran;
1300       if (best2 < numlive-1) {
1301         memmove(&(hufflist[best2]), &(hufflist[best2+1]),
1302           ((numlive-1) - best2) * sizeof(huffentity_t *));
1303       }
1304       numlive--;
1305     }
1306
1307     huff_entity_root = (hufflist[0] - huff_entities);
1308
1309     for (ix=0; ix<MAXHUFFBYTES; ix++)
1310       bits.b[ix] = 0;
1311     compression_table_size = 12;
1312
1313     no_huff_entities = 0; /* compress_makebits will total this up */
1314     compress_makebits(huff_entity_root, 0, -1, &bits);
1315   }
1316
1317   /* Now, sadly, we have to compute the size of the string section,
1318      without actually doing the compression. */
1319   compression_string_size = 0;
1320
1321   ensure_memory_list_available(&compressed_offsets_memlist, no_strings);
1322
1323   for (lx=0, ix=0; lx<no_strings; lx++) {
1324     int escapelen=0, escapetype=0;
1325     int done=FALSE;
1326     int32 escapeval=0;
1327     jx = 0;
1328     compressed_offsets[lx] = compression_table_size + compression_string_size;
1329     compression_string_size++; /* for the type byte */
1330     while (!done) {
1331       ch = static_strings_area[ix];
1332       ix++;
1333       if (ix > static_strings_extent || ch < 0)
1334         compiler_error("Read too much not-yet-compressed text.");
1335       if (escapelen == -1) {
1336         escapelen = 0;
1337         if (ch == '@') {
1338           ch = '@';
1339         }
1340         else if (ch == '0') {
1341           ch = '\0';
1342         }
1343         else if (ch == 'A' || ch == 'D' || ch == 'U') {
1344           escapelen = 4;
1345           escapetype = ch;
1346           escapeval = 0;
1347           continue;
1348         }
1349         else {
1350           compiler_error("Strange @ escape in processed text.");
1351         }
1352       }
1353       else if (escapelen) {
1354         escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
1355         escapelen--;
1356         if (escapelen == 0) {
1357           if (escapetype == 'A') {
1358             ch = huff_abbrev_start+escapeval;
1359           }
1360           else if (escapetype == 'D') {
1361             ch = huff_dynam_start+escapeval;
1362           }
1363           else if (escapetype == 'U') {
1364             ch = huff_unicode_start+escapeval;
1365           }
1366           else {
1367             compiler_error("Strange @ escape in processed text.");
1368           }
1369         }
1370         else
1371           continue;
1372       }
1373       else {
1374         if (ch == '@') {
1375           escapelen = -1;
1376           continue;
1377         }
1378         if (ch == 0) {
1379           ch = 256;
1380           done = TRUE;
1381         }
1382       }
1383
1384       if (compression_switch) {
1385         jx += huff_entities[ch].depth;
1386         compression_string_size += (jx/8);
1387         jx = (jx % 8);
1388       }
1389       else {
1390         if (ch >= huff_dynam_start) {
1391           compression_string_size += 3;
1392         }
1393         else if (ch >= huff_unicode_start) {
1394           compiler_error("Abbreviation/Unicode in non-compressed string \
1395 should be impossible.");
1396         }
1397         else
1398           compression_string_size += 1;
1399       }
1400     }
1401     if (compression_switch && jx)
1402       compression_string_size++;
1403   }
1404
1405   done_compression = TRUE;
1406 }
1407
1408 static void compress_makebits(int entnum, int depth, int prevbit,
1409   huffbitlist_t *bits)
1410 {
1411   huffentity_t *ent = &(huff_entities[entnum]);
1412   char *cx;
1413
1414   no_huff_entities++;
1415   ent->addr = compression_table_size;
1416   ent->depth = depth;
1417   ent->bits = *bits;
1418   if (depth > 0) {
1419     if (prevbit)
1420       ent->bits.b[(depth-1) / 8] |= (1 << ((depth-1) % 8));
1421   }
1422
1423   switch (ent->type) {
1424   case 0:
1425     compression_table_size += 9;
1426     compress_makebits(ent->u.branch[0], depth+1, 0, &ent->bits);
1427     compress_makebits(ent->u.branch[1], depth+1, 1, &ent->bits);
1428     break;
1429   case 1:
1430     compression_table_size += 1;
1431     break;
1432   case 2:
1433     compression_table_size += 2;
1434     break;
1435   case 3:
1436     cx = abbreviation_text(ent->u.val);
1437     compression_table_size += (1 + 1 + strlen(cx));
1438     break;
1439   case 4:
1440   case 9:
1441     compression_table_size += 5;
1442     break;
1443   }
1444 }
1445
1446 /* ------------------------------------------------------------------------- */
1447 /*   The abbreviations optimiser                                             */
1448 /*                                                                           */
1449 /*   This is a very complex, memory and time expensive algorithm to          */
1450 /*   approximately solve the problem of which abbreviation strings would     */
1451 /*   minimise the total number of Z-chars to which the game text translates. */
1452 /*   It is in some ways a quite separate program but remains inside Inform   */
1453 /*   for compatibility with previous releases.                               */
1454 /* ------------------------------------------------------------------------- */
1455
1456 /* The complete game text. */
1457 static char *opttext;
1458 static int32 opttextlen;
1459
1460 typedef struct tlb_s
1461 {   char text[4];
1462     int32 intab, occurrences;
1463 } tlb;
1464 static tlb *tlbtab; /* Three-letter blocks (allocated up to no_occs) */
1465 static memory_list tlbtab_memlist;
1466 static int32 no_occs;
1467
1468 static int32 *grandtable;
1469 static int32 *grandflags;
1470 typedef struct optab_s
1471 {   int32  length;
1472     int32  popularity;
1473     int32  score;
1474     int32  location;
1475     char  *text; /* allocated to textsize, min 4 */
1476     int32  textsize;
1477 } optab;
1478 static int32 MAX_BESTYET;
1479 static optab *bestyet; /* High-score entries (up to MAX_BESTYET used/allocated) */
1480 static optab *bestyet2; /* The selected entries (up to selected used; allocated to MAX_ABBREVS) */
1481
1482 static void optab_copy(optab *dest, const optab *src)
1483 {
1484     dest->length = src->length;
1485     dest->popularity = src->popularity;
1486     dest->score = src->score;
1487     dest->location = src->location;
1488     if (src->length+1 > dest->textsize) {
1489         int32 oldsize = dest->textsize;
1490         dest->textsize = (src->length+1)*2;
1491         my_realloc(&dest->text, oldsize, dest->textsize, "bestyet2.text");
1492     }
1493     strcpy(dest->text, src->text);
1494 }
1495
1496 static int pass_no;
1497
1498 static void optimise_pass(void)
1499 {
1500     TIMEVALUE t1, t2;
1501     float duration;
1502     int32 i;
1503     int32 j, j2, k, nl, matches, noflags, score, min, minat=0, x, scrabble, c;
1504     for (i=0; i<MAX_BESTYET; i++) bestyet[i].length=0;
1505     for (i=0; i<no_occs; i++)
1506     {   if ((*(tlbtab[i].text)!=(int) '\n')&&(tlbtab[i].occurrences!=0))
1507         {
1508 #ifdef MAC_FACE
1509             if (i%((**g_pm_hndl).linespercheck) == 0)
1510             {   ProcessEvents (&g_proc);
1511                 if (g_proc != true)
1512                 {   ao_free_arrays();
1513                     longjmp (g_fallback, 1);
1514                 }
1515             }
1516 #endif
1517             if (optabbrevs_trace_setting >= 2) {
1518                 printf("Pass %d, %4ld/%ld '%s' (%ld occurrences) ",
1519                     pass_no, (long int) i, (long int) no_occs, tlbtab[i].text,
1520                     (long int) tlbtab[i].occurrences);
1521             }
1522             TIMEVALUE_NOW(&t1);
1523             for (j=0; j<tlbtab[i].occurrences; j++)
1524             {   for (j2=0; j2<tlbtab[i].occurrences; j2++) grandflags[j2]=1;
1525                 nl=2; noflags=tlbtab[i].occurrences;
1526                 while (noflags>=2)
1527                 {   nl++;
1528                     for (j2=0; j2<nl; j2++)
1529                         if (opttext[grandtable[tlbtab[i].intab+j]+j2]=='\n')
1530                             goto FinishEarly;
1531                     matches=0;
1532                     for (j2=j; j2<tlbtab[i].occurrences; j2++)
1533                     {   if (grandflags[j2]==1)
1534                         {   x=grandtable[tlbtab[i].intab+j2]
1535                               - grandtable[tlbtab[i].intab+j];
1536                          if (((x>-nl)&&(x<nl))
1537                             || (memcmp(opttext+grandtable[tlbtab[i].intab+j],
1538                                        opttext+grandtable[tlbtab[i].intab+j2],
1539                                        nl)!=0))
1540                             {   grandflags[j2]=0; noflags--; }
1541                             else matches++;
1542                         }
1543                     }
1544                     scrabble=0;
1545                     for (k=0; k<nl; k++)
1546                     {   scrabble++;
1547                         c=opttext[grandtable[tlbtab[i].intab+j+k]];
1548                         if (c!=(int) ' ')
1549                         {   if (iso_to_alphabet_grid[c]<0)
1550                                 scrabble+=2;
1551                             else
1552                                 if (iso_to_alphabet_grid[c]>=26)
1553                                     scrabble++;
1554                         }
1555                     }
1556                     score=(matches-1)*(scrabble-2);
1557                     min=score;
1558                     for (j2=0; j2<MAX_BESTYET; j2++)
1559                     {   if ((nl==bestyet[j2].length)
1560                                 && (memcmp(opttext+bestyet[j2].location,
1561                                        opttext+grandtable[tlbtab[i].intab+j],
1562                                        nl)==0))
1563                         {   j2=MAX_BESTYET; min=score; }
1564                         else
1565                         {   if (bestyet[j2].score<min)
1566                             {   min=bestyet[j2].score; minat=j2;
1567                             }
1568                         }
1569                     }
1570                     if (min!=score)
1571                     {   bestyet[minat].score=score;
1572                         bestyet[minat].length=nl;
1573                         bestyet[minat].location=grandtable[tlbtab[i].intab+j];
1574                         bestyet[minat].popularity=matches;
1575                     }
1576                 }
1577                 FinishEarly: ;
1578             }
1579             if (optabbrevs_trace_setting >= 2) {
1580                 TIMEVALUE_NOW(&t2);
1581                 duration = TIMEVALUE_DIFFERENCE(&t1, &t2);
1582                 printf(" (%.4f seconds)\n", duration);
1583             }
1584         }
1585     }
1586 }
1587
1588 static int any_overlap(char *s1, char *s2)
1589 {   int a, b, i, j, flag;
1590     a=strlen(s1); b=strlen(s2);
1591     for (i=1-b; i<a; i++)
1592     {   flag=0;
1593         for (j=0; j<b; j++)
1594             if ((0<=i+j)&&(i+j<=a-1))
1595                 if (s1[i+j]!=s2[j]) flag=1;
1596         if (flag==0) return(1);
1597     }
1598     return(0);
1599 }
1600
1601 extern void optimise_abbreviations(void)
1602 {   int32 i, j, tcount, max=0, MAX_GTABLE;
1603     int32 j2, selected, available, maxat=0, nl;
1604
1605     if (opttext == NULL)
1606         return;
1607
1608     /* We insist that the first two abbreviations will be ". " and ", ". */
1609     if (MAX_ABBREVS < 2)
1610         return;
1611
1612     /* Note that it's safe to access opttext[opttextlen+2]. There are
1613        two newlines and a null beyond opttextlen. */
1614
1615     printf("Beginning calculation of optimal abbreviations...\n");
1616
1617     pass_no = 0;
1618
1619     initialise_memory_list(&tlbtab_memlist,
1620         sizeof(tlb), 1000, (void**)&tlbtab,
1621         "three-letter-blocks buffer");
1622
1623     no_occs=0;
1624
1625     /* Not sure what the optimal size is for MAX_BESTYET. The original code always created 64 abbreviations and used MAX_BESTYET=256. I'm guessing that 4*MAX_ABBREVS is reasonable. */
1626     MAX_BESTYET = 4 * MAX_ABBREVS;
1627
1628     bestyet=my_calloc(sizeof(optab), MAX_BESTYET, "bestyet");
1629     for (i=0; i<MAX_BESTYET; i++) {
1630         bestyet[i].length = 0;
1631         bestyet[i].popularity = 0;
1632         bestyet[i].score = 0;
1633         bestyet[i].location = 0;
1634         bestyet[i].textsize = 4;
1635         bestyet[i].text = my_malloc(bestyet[i].textsize, "bestyet.text");
1636     }
1637
1638     bestyet2=my_calloc(sizeof(optab), MAX_ABBREVS, "bestyet2");
1639     for (i=0; i<MAX_ABBREVS; i++) {
1640         bestyet2[i].length = 0;
1641         bestyet2[i].popularity = 0;
1642         bestyet2[i].score = 0;
1643         bestyet2[i].location = 0;
1644         bestyet2[i].textsize = 4;
1645         bestyet2[i].text = my_malloc(bestyet2[i].textsize, "bestyet2.text");
1646     }
1647
1648     bestyet2[0].text[0]='.';
1649     bestyet2[0].text[1]=' ';
1650     bestyet2[0].text[2]=0;
1651
1652     bestyet2[1].text[0]=',';
1653     bestyet2[1].text[1]=' ';
1654     bestyet2[1].text[2]=0;
1655
1656     selected=2;
1657
1658     for (i=0; i<opttextlen; i++)
1659     {
1660         if ((opttext[i]=='.') && (opttext[i+1]==' ') && (opttext[i+2]==' '))
1661         {   opttext[i]='\n'; opttext[i+1]='\n'; opttext[i+2]='\n';
1662             bestyet2[0].popularity++;
1663         }
1664
1665         if ((opttext[i]=='.') && (opttext[i+1]==' '))
1666         {   opttext[i]='\n'; opttext[i+1]='\n';
1667             bestyet2[0].popularity++;
1668         }
1669
1670         if ((opttext[i]==',') && (opttext[i+1]==' '))
1671         {   opttext[i]='\n'; opttext[i+1]='\n';
1672             bestyet2[1].popularity++;
1673         }
1674     }
1675
1676     MAX_GTABLE=opttextlen+1;
1677     grandtable=my_calloc(4*sizeof(int32), MAX_GTABLE/4, "grandtable");
1678
1679     for (i=0, tcount=0; i<opttextlen; i++)
1680     {
1681         tlb test;
1682         test.text[0]=opttext[i];
1683         test.text[1]=opttext[i+1];
1684         test.text[2]=opttext[i+2];
1685         test.text[3]=0;
1686         if ((test.text[0]=='\n')||(test.text[1]=='\n')||(test.text[2]=='\n'))
1687             goto DontKeep;
1688         for (j=0; j<no_occs; j++) {
1689             if (strcmp(test.text,tlbtab[j].text)==0)
1690                 goto DontKeep;
1691         }
1692         test.occurrences=0;
1693         test.intab=0;
1694         for (j=i+3; j<opttextlen; j++)
1695         {
1696 #ifdef MAC_FACE
1697             if (j%((**g_pm_hndl).linespercheck) == 0)
1698             {   ProcessEvents (&g_proc);
1699                 if (g_proc != true)
1700                 {   ao_free_arrays();
1701                     longjmp (g_fallback, 1);
1702                 }
1703             }
1704 #endif
1705             if ((opttext[i]==opttext[j])
1706                  && (opttext[i+1]==opttext[j+1])
1707                  && (opttext[i+2]==opttext[j+2]))
1708                  {   grandtable[tcount+test.occurrences]=j;
1709                      test.occurrences++;
1710                      if (tcount+test.occurrences==MAX_GTABLE)
1711                      {   printf("All %ld cross-references used\n",
1712                              (long int) MAX_GTABLE);
1713                          goto Built;
1714                      }
1715                  }
1716         }
1717         if (test.occurrences>=2)
1718         {
1719             ensure_memory_list_available(&tlbtab_memlist, no_occs+1);
1720             tlbtab[no_occs]=test;
1721             tlbtab[no_occs].intab=tcount;
1722             tcount += tlbtab[no_occs].occurrences;
1723             if (max<tlbtab[no_occs].occurrences)
1724                 max=tlbtab[no_occs].occurrences;
1725             no_occs++;
1726         }
1727         DontKeep: ;
1728     }
1729
1730     Built:
1731     grandflags=my_calloc(sizeof(int), max, "grandflags");
1732
1733
1734     if (optabbrevs_trace_setting >= 1) {
1735         printf("Cross-reference table (%ld entries) built...\n",
1736             (long int) no_occs);
1737     }
1738     /*  for (i=0; i<no_occs; i++)
1739             printf("%4d %4d '%s' %d\n",i,tlbtab[i].intab,tlbtab[i].text,
1740                 tlbtab[i].occurrences);
1741     */
1742
1743     for (i=0; i<MAX_ABBREVS; i++) bestyet2[i].length=0;
1744     available=MAX_BESTYET;
1745     while ((available>0)&&(selected<MAX_ABBREVS))
1746     {
1747         pass_no++;
1748         if (optabbrevs_trace_setting >= 1) {
1749             printf("Pass %d\n", pass_no);
1750         }
1751
1752         optimise_pass();
1753         available=0;
1754         for (i=0; i<MAX_BESTYET; i++)
1755             if (bestyet[i].score!=0)
1756             {   available++;
1757                 nl=bestyet[i].length;
1758                 if (nl+1 > bestyet[i].textsize) {
1759                     int32 oldsize = bestyet[i].textsize;
1760                     bestyet[i].textsize = (nl+1)*2;
1761                     my_realloc(&bestyet[i].text, oldsize, bestyet[i].textsize, "bestyet.text");
1762                 }
1763                 for (j2=0; j2<nl; j2++) bestyet[i].text[j2]=
1764                     opttext[bestyet[i].location+j2];
1765                 bestyet[i].text[nl]=0;
1766             }
1767
1768     /*  printf("End of pass results:\n");
1769         printf("\nno   score  freq   string\n");
1770         for (i=0; i<MAX_BESTYET; i++)
1771             if (bestyet[i].score>0)
1772                 printf("%02d:  %4d   %4d   '%s'\n", i, bestyet[i].score,
1773                     bestyet[i].popularity, bestyet[i].text);
1774     */
1775
1776         do
1777         {   max=0;
1778             for (i=0; i<MAX_BESTYET; i++)
1779                 if (max<bestyet[i].score)
1780                 {   max=bestyet[i].score;
1781                     maxat=i;
1782                 }
1783
1784             if (max>0)
1785             {
1786                 char testtext[4];
1787                 optab_copy(&bestyet2[selected++], &bestyet[maxat]);
1788
1789                 if (optabbrevs_trace_setting >= 1) {
1790                     printf(
1791                         "Selection %2ld: '%s' (repeated %ld times, scoring %ld)\n",
1792                         (long int) selected,bestyet[maxat].text,
1793                         (long int) bestyet[maxat].popularity,
1794                         (long int) bestyet[maxat].score);
1795                 }
1796
1797                 testtext[0]=bestyet[maxat].text[0];
1798                 testtext[1]=bestyet[maxat].text[1];
1799                 testtext[2]=bestyet[maxat].text[2];
1800                 testtext[3]=0;
1801
1802                 for (i=0; i<no_occs; i++)
1803                     if (strcmp(testtext,tlbtab[i].text)==0)
1804                         break;
1805
1806                 for (j=0; j<tlbtab[i].occurrences; j++)
1807                 {   if (memcmp(bestyet[maxat].text,
1808                                opttext+grandtable[tlbtab[i].intab+j],
1809                                bestyet[maxat].length)==0)
1810                     {   for (j2=0; j2<bestyet[maxat].length; j2++)
1811                             opttext[grandtable[tlbtab[i].intab+j]+j2]='\n';
1812                     }
1813                 }
1814
1815                 for (i=0; i<MAX_BESTYET; i++)
1816                     if ((bestyet[i].score>0)&&
1817                         (any_overlap(bestyet[maxat].text,bestyet[i].text)==1))
1818                     {   bestyet[i].score=0;
1819                        /* printf("Discarding '%s' as overlapping\n",
1820                             bestyet[i].text); */
1821                     }
1822             }
1823         } while ((max>0)&&(available>0)&&(selected<MAX_ABBREVS));
1824     }
1825
1826     printf("\nChosen abbreviations (in Inform syntax):\n\n");
1827     for (i=0; i<selected; i++)
1828         printf("Abbreviate \"%s\";\n", bestyet2[i].text);
1829
1830     text_free_arrays();
1831 }
1832
1833 /* ------------------------------------------------------------------------- */
1834 /*   The dictionary manager begins here.                                     */
1835 /*                                                                           */
1836 /*   Speed is extremely important in these algorithms.  If a linear-time     */
1837 /*   routine were used to search the dictionary words so far built up, then  */
1838 /*   Inform would crawl.                                                     */
1839 /*                                                                           */
1840 /*   Instead, the dictionary is stored as a binary tree, which is kept       */
1841 /*   balanced with the red-black algorithm.                                  */
1842 /* ------------------------------------------------------------------------- */
1843 /*   A dictionary table similar to the Z-machine format is kept: there is a  */
1844 /*   7-byte header (left blank here to be filled in at the                   */
1845 /*   construct_storyfile() stage in "tables.c") and then a sequence of       */
1846 /*   records, one per word, in the form                                      */
1847 /*                                                                           */
1848 /*        <Z-coded text>    <flags>   <verbnumber>     <adjectivenumber>     */
1849 /*        4 or 6 bytes       byte        byte             byte               */
1850 /*                                                                           */
1851 /*   For Glulx, the form is instead: (See below about Unicode-valued         */
1852 /*   dictionaries and DICT_WORD_BYTES.)                                      */
1853 /*                                                                           */
1854 /*        <tag>  <plain text>    <flags>  <verbnumber>   <adjectivenumber>   */
1855 /*         $60    DICT_WORD_BYTES short    short          short              */
1856 /*                                                                           */
1857 /*   These records are stored in "accession order" (i.e. in order of their   */
1858 /*   first being received by these routines) and only alphabetically sorted  */
1859 /*   by construct_storyfile() (using the array below).                       */
1860 /* ------------------------------------------------------------------------- */
1861 /*                                                                           */
1862 /*   Further notes about the data fields...                                  */
1863 /*   The flags are currently:                                                */
1864 /*     bit 0: word is used as a verb (in verb grammar)                       */
1865 /*     bit 1: word is used as a meta verb                                    */
1866 /*     bit 2: word is plural (set by '//p')                                  */
1867 /*     bit 3: word is used as a preposition (in verb grammar)                */
1868 /*     bit 6: set for all verbs, but not used by the parser?                 */
1869 /*     bit 7: word is used as a noun (set for every word that appears in     */
1870 /*       code or in an object property)                                      */
1871 /*                                                                           */
1872 /*   In grammar version 2, the third field (adjectivenumber) is unused (and  */
1873 /*   zero).                                                                  */
1874 /*                                                                           */
1875 /*   The compiler generates special constants #dict_par1, #dict_par2,        */
1876 /*   #dict_par3 to refer to the byte offsets of the three fields. In         */
1877 /*   Z-code v3, these are 4/5/6; in v4+, they are 6/7/8. In Glulx, they      */
1878 /*   are $DICT_WORD_SIZE+2/4/6, referring to the *low* bytes of the three    */
1879 /*   fields. (The high bytes are $DICT_WORD_SIZE+1/3/5.)                     */
1880 /* ------------------------------------------------------------------------- */
1881
1882 uchar *dictionary;                    /* (These two variables are externally
1883                                          used only in "tables.c" when
1884                                          building the story-file)            */
1885 static memory_list dictionary_memlist;
1886 int32 dictionary_top;                 /* Position of the next free record
1887                                          in dictionary (i.e., the current
1888                                          number of bytes)                    */
1889
1890 int dict_entries;                     /* Total number of records entered     */
1891
1892 /* ------------------------------------------------------------------------- */
1893 /*   dict_word was originally a typedef for a struct of 6 unsigned chars.    */
1894 /*   It held the (4 or) 6 bytes of Z-coded text of a word.                   */
1895 /*   Usefully, because the PAD character 5 is < all alphabetic characters,   */
1896 /*   alphabetic order corresponds to numeric order.  For this reason, the    */
1897 /*   dict_word is called the "sort code" of the original text word.          */
1898 /*                                                                           */
1899 /*   In modifying the compiler for Glulx, I found it easier to discard the   */
1900 /*   typedef, and operate directly on uchar arrays of length DICT_WORD_SIZE. */
1901 /*   In Z-code, DICT_WORD_SIZE will be 6, so the Z-code compiler will work   */
1902 /*   as before. In Glulx, it can be any value.                               */
1903 /*                                                                           */
1904 /*   In further modifying the compiler to generate a Unicode dictionary,     */
1905 /*   I have to store four-byte values in the uchar array. We make the array  */
1906 /*   size DICT_WORD_BYTES (which is DICT_WORD_SIZE*DICT_CHAR_SIZE).          */
1907 /*   Then we store the 32-bit character value big-endian. This lets us       */
1908 /*   continue to compare arrays bytewise, which is a nice simplification.    */
1909 /* ------------------------------------------------------------------------- */
1910
1911 extern int compare_sorts(uchar *d1, uchar *d2)
1912 {   int i;
1913     for (i=0; i<DICT_WORD_BYTES; i++)
1914         if (d1[i]!=d2[i]) return((int)(d1[i]) - (int)(d2[i]));
1915     /* (since memcmp(d1, d2, DICT_WORD_BYTES); runs into a bug on some Unix
1916        libraries) */
1917     return(0);
1918 }
1919
1920 extern void copy_sorts(uchar *d1, uchar *d2)
1921 {   int i;
1922     for (i=0; i<DICT_WORD_BYTES; i++)
1923         d1[i] = d2[i];
1924 }
1925
1926 static memory_list prepared_sort_memlist;
1927 static uchar *prepared_sort;    /* Holds the sort code of current word */
1928
1929 static int prepared_dictflags_pos;  /* Dict flags set by the current word */
1930 static int prepared_dictflags_neg;  /* Dict flags *not* set by the word */
1931
1932 /* Also used by verbs.c */
1933 static void dictionary_prepare_z(char *dword, uchar *optresult)
1934 {   int i, j, k, k2, wd[13];
1935     int32 tot;
1936     int negflag;
1937
1938     /* A rapid text translation algorithm using only the simplified rules
1939        applying to the text of dictionary entries: first produce a sequence
1940        of 6 (v3) or 9 (v4+) Z-characters                                     */
1941
1942     int dictsize = (version_number==3) ? 6 : 9;
1943
1944     prepared_dictflags_pos = 0;
1945     prepared_dictflags_neg = 0;
1946
1947     for (i=0, j=0; dword[j]!=0; j++)
1948     {
1949         if ((dword[j] == '/') && (dword[j+1] == '/'))
1950         {
1951             /* The rest of the word is dict flags. Run through them. */
1952             negflag = FALSE;
1953             for (j+=2; dword[j] != 0; j++)
1954             {
1955                 switch(dword[j])
1956                 {
1957                     case '~':
1958                         if (!dword[j+1])
1959                             error_named("'//~' with no flag character (pn) in dict word", dword);
1960                         negflag = !negflag;
1961                         break;
1962                     case 'p':
1963                         if (!negflag)
1964                             prepared_dictflags_pos |= 4;
1965                         else
1966                             prepared_dictflags_neg |= 4;
1967                         negflag = FALSE;
1968                         break;
1969                     case 'n':
1970                         if (!negflag)
1971                             prepared_dictflags_pos |= 128;
1972                         else
1973                             prepared_dictflags_neg |= 128;
1974                         negflag = FALSE;
1975                         break;
1976                     default:
1977                         error_named("Expected flag character (pn~) after '//' in dict word", dword);
1978                         break;
1979                 }
1980             }
1981             break;
1982         }
1983
1984         /* LONG_DICT_FLAG_BUG emulates the old behavior where we stop looping
1985            at dictsize. */
1986         if (LONG_DICT_FLAG_BUG && i>=dictsize)
1987             break;
1988
1989         k=(int) dword[j];
1990         if (k==(int) '\'')
1991             warning_named("Obsolete usage: use the ^ character for the \
1992 apostrophe in", dword);
1993         if (k==(int) '^') k=(int) '\'';
1994         if (k=='\"') k='~';
1995
1996         if (k==(int) '@' || (character_set_unicode && (k & 0x80)))
1997         {   int unicode = text_to_unicode(dword+j);
1998             if ((unicode < 128) && isupper(unicode)) unicode = tolower(unicode);
1999             k = unicode_to_zscii(unicode);
2000             j += textual_form_length - 1;
2001             if ((k == 5) || (k >= 0x100))
2002             {   unicode_char_error(
2003                    "Character can be printed but not input:", unicode);
2004                 k = '?';
2005             }
2006             k2 = zscii_to_alphabet_grid[(uchar) k];
2007         }
2008         else
2009         {   if (isupper(k)) k = tolower(k);
2010             k2 = iso_to_alphabet_grid[(uchar) k];
2011         }
2012
2013         if (k2 < 0)
2014         {   if ((k2 == -5) || (k2 <= -0x100))
2015                 char_error("Character can be printed but not input:", k);
2016             else
2017             {   /* Use 4 more Z-chars to encode a ZSCII escape sequence      */
2018                 if (i<dictsize)
2019                     wd[i++] = 5;
2020                 if (i<dictsize)
2021                     wd[i++] = 6;
2022                 k2 = -k2;
2023                 if (i<dictsize)
2024                     wd[i++] = k2/32;
2025                 if (i<dictsize)
2026                     wd[i++] = k2%32;
2027             }
2028         }
2029         else
2030         {   alphabet_used[k2] = 'Y';
2031             if ((k2/26)!=0 && i<dictsize)
2032                 wd[i++]=3+(k2/26);            /* Change alphabet for symbols */
2033             if (i<dictsize)
2034                 wd[i++]=6+(k2%26);            /* Write the Z character       */
2035         }
2036     }
2037
2038     if (i > dictsize)
2039         compiler_error("dict word buffer overflow");
2040
2041     /* Fill up to the end of the dictionary block with PAD characters
2042        (for safety, we right-pad to 9 chars even in V3)                      */
2043
2044     for (; i<9; i++) wd[i]=5;
2045
2046     /* The array of Z-chars is converted to two or three 2-byte blocks       */
2047     ensure_memory_list_available(&prepared_sort_memlist, DICT_WORD_BYTES);
2048
2049     tot = wd[2] + wd[1]*(1<<5) + wd[0]*(1<<10);
2050     prepared_sort[1]=tot%0x100;
2051     prepared_sort[0]=(tot/0x100)%0x100;
2052     tot = wd[5] + wd[4]*(1<<5) + wd[3]*(1<<10);
2053     prepared_sort[3]=tot%0x100;
2054     prepared_sort[2]=(tot/0x100)%0x100;
2055     if (version_number==3)
2056         tot = 0;
2057     else
2058         tot = wd[8] + wd[7]*(1<<5) + wd[6]*(1<<10);
2059     prepared_sort[5]=tot%0x100;
2060     prepared_sort[4]=(tot/0x100)%0x100;
2061
2062     /* Set the "end bit" on the 2nd (in v3) or the 3rd (v4+) 2-byte block    */
2063
2064     if (version_number==3) prepared_sort[2]+=0x80;
2065                       else prepared_sort[4]+=0x80;
2066
2067     if (optresult) copy_sorts(optresult, prepared_sort);
2068 }
2069
2070 /* Also used by verbs.c */
2071 static void dictionary_prepare_g(char *dword, uchar *optresult)
2072 {
2073   int i, j, k;
2074   int32 unicode;
2075   int negflag;
2076
2077   prepared_dictflags_pos = 0;
2078   prepared_dictflags_neg = 0;
2079
2080   for (i=0, j=0; (dword[j]!=0); j++) {
2081     if ((dword[j] == '/') && (dword[j+1] == '/')) {
2082       /* The rest of the word is dict flags. Run through them. */
2083       negflag = FALSE;
2084       for (j+=2; dword[j] != 0; j++) {
2085         switch(dword[j]) {
2086         case '~':
2087             if (!dword[j+1])
2088                 error_named("'//~' with no flag character (pn) in dict word", dword);
2089             negflag = !negflag;
2090             break;
2091         case 'p':
2092             if (!negflag)
2093                 prepared_dictflags_pos |= 4;
2094             else
2095                 prepared_dictflags_neg |= 4;
2096             negflag = FALSE;
2097             break;
2098         case 'n':
2099             if (!negflag)
2100                 prepared_dictflags_pos |= 128;
2101             else
2102                 prepared_dictflags_neg |= 128;
2103             negflag = FALSE;
2104             break;
2105         default:
2106           error_named("Expected flag character (pn~) after '//' in dict word", dword);
2107           break;
2108         }
2109       }
2110       break;
2111     }
2112
2113     /* LONG_DICT_FLAG_BUG emulates the old behavior where we stop looping
2114        at DICT_WORD_SIZE. */
2115     if (LONG_DICT_FLAG_BUG && i>=DICT_WORD_SIZE)
2116         break;
2117
2118     k= ((unsigned char *)dword)[j];
2119     if (k=='\'')
2120       warning_named("Obsolete usage: use the ^ character for the \
2121 apostrophe in", dword);
2122     if (k=='^')
2123       k='\'';
2124     if (k=='~') /* as in iso_to_alphabet_grid */
2125       k='\"';
2126
2127     if (k=='@' || (character_set_unicode && (k & 0x80))) {
2128       unicode = text_to_unicode(dword+j);
2129       j += textual_form_length - 1;
2130     }
2131     else {
2132       unicode = iso_to_unicode_grid[k];
2133     }
2134
2135     if (DICT_CHAR_SIZE != 1 || (unicode >= 0 && unicode < 256)) {
2136       k = unicode;
2137     }
2138     else {
2139       error("The dictionary cannot contain Unicode characters beyond Latin-1. \
2140 Define DICT_CHAR_SIZE=4 for a Unicode-compatible dictionary.");
2141       k = '?';
2142     }
2143
2144     if (k >= (unsigned)'A' && k <= (unsigned)'Z')
2145       k += ('a' - 'A');
2146
2147     ensure_memory_list_available(&prepared_sort_memlist, DICT_WORD_BYTES);
2148
2149     if (DICT_CHAR_SIZE == 1) {
2150       if (i<DICT_WORD_SIZE)
2151         prepared_sort[i++] = k;
2152     }
2153     else {
2154       if (i<DICT_WORD_SIZE) {
2155         prepared_sort[4*i]   = (k >> 24) & 0xFF;
2156         prepared_sort[4*i+1] = (k >> 16) & 0xFF;
2157         prepared_sort[4*i+2] = (k >>  8) & 0xFF;
2158         prepared_sort[4*i+3] = (k)       & 0xFF;
2159         i++;
2160       }
2161     }
2162   }
2163
2164   if (i > DICT_WORD_SIZE)
2165     compiler_error("dict word buffer overflow");
2166
2167   /* Right-pad with zeroes */
2168   if (DICT_CHAR_SIZE == 1) {
2169     for (; i<DICT_WORD_SIZE; i++)
2170       prepared_sort[i] = 0;
2171   }
2172   else {
2173     for (; i<DICT_WORD_SIZE; i++) {
2174       prepared_sort[4*i]   = 0;
2175       prepared_sort[4*i+1] = 0;
2176       prepared_sort[4*i+2] = 0;
2177       prepared_sort[4*i+3] = 0;
2178     }
2179   }
2180
2181   if (optresult) copy_sorts(optresult, prepared_sort);
2182 }
2183
2184 extern void dictionary_prepare(char *dword, uchar *optresult)
2185 {
2186   if (!glulx_mode)
2187     dictionary_prepare_z(dword, optresult);
2188   else
2189     dictionary_prepare_g(dword, optresult);
2190 }
2191
2192 /* ------------------------------------------------------------------------- */
2193 /*   The arrays below are all concerned with the problem of alphabetically   */
2194 /*   sorting the dictionary during the compilation pass.                     */
2195 /*   Note that it is not enough simply to apply qsort to the dictionary at   */
2196 /*   the end of the pass: we need to ensure that no duplicates are ever      */
2197 /*   created.                                                                */
2198 /*                                                                           */
2199 /*   dict_sort_codes[n]     the sort code of record n: i.e., of the nth      */
2200 /*                          word to be entered into the dictionary, where    */
2201 /*                          n counts upward from 0                           */
2202 /*                          (n is also called the "accession number")        */
2203 /*                                                                           */
2204 /*   The tree structure encodes an ordering.  The special value VACANT means */
2205 /*   "no node here": otherwise, node numbers are the same as accession       */
2206 /*   numbers.  At all times, "root" holds the node number of the top of the  */
2207 /*   tree; each node has up to two branches, such that the subtree of the    */
2208 /*   left branch is always alphabetically before what's at the node, and     */
2209 /*   the subtree to the right is always after; and all branches are coloured */
2210 /*   either "black" or "red".  These colours are used to detect points where */
2211 /*   the tree is growing asymmetrically (and therefore becoming inefficient  */
2212 /*   to search).                                                             */
2213 /* ------------------------------------------------------------------------- */
2214
2215 #define RED    'r'
2216 #define BLACK  'b'
2217 #define VACANT -1
2218
2219 static int root;
2220 typedef struct dict_tree_node_s
2221 {   int  branch[2];               /* Branch 0 is "left", 1 is "right" */
2222     char colour;                  /* The colour of the branch to the parent */
2223 } dict_tree_node;
2224
2225 static dict_tree_node *dtree;     /* Allocated to dict_entries */
2226 static memory_list dtree_memlist;
2227
2228 static uchar *dict_sort_codes;  /* Allocated to dict_entries*DICT_WORD_BYTES */
2229 static memory_list dict_sort_codes_memlist;
2230
2231 int   *final_dict_order;          /* Allocated at sort_dictionary() time */
2232
2233 static void dictionary_begin_pass(void)
2234 {
2235     /*  Leave room for the 7-byte header (added in "tables.c" much later)    */
2236     /*  Glulx has a 4-byte header instead. */
2237
2238     if (!glulx_mode)
2239         dictionary_top = 7;
2240     else
2241         dictionary_top = 4;
2242
2243     ensure_memory_list_available(&dictionary_memlist, dictionary_top);
2244
2245     root = VACANT;
2246     dict_entries = 0;
2247 }
2248
2249 static int fdo_count;
2250 static void recursively_sort(int node)
2251 {   if (dtree[node].branch[0] != VACANT)
2252         recursively_sort(dtree[node].branch[0]);
2253     final_dict_order[node] = fdo_count++;
2254     if (dtree[node].branch[1] != VACANT)
2255         recursively_sort(dtree[node].branch[1]);
2256 }
2257
2258 extern void sort_dictionary(void)
2259 {
2260     final_dict_order = my_calloc(sizeof(int), dict_entries, "final dictionary ordering table");
2261
2262     if (root != VACANT)
2263     {   fdo_count = 0; recursively_sort(root);
2264     }
2265 }
2266
2267 /* ------------------------------------------------------------------------- */
2268 /*   If "dword" is in the dictionary, return its accession number plus 1;    */
2269 /*   If not, return 0.                                                       */
2270 /* ------------------------------------------------------------------------- */
2271
2272 static int dictionary_find(char *dword)
2273 {   int at = root, n;
2274
2275     dictionary_prepare(dword, NULL);
2276
2277     while (at != VACANT)
2278     {   n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
2279         if (n==0) return at + 1;
2280         if (n>0) at = dtree[at].branch[1]; else at = dtree[at].branch[0];
2281     }
2282     return 0;
2283 }
2284
2285 /* ------------------------------------------------------------------------- */
2286 /*  Add "dword" to the dictionary with (flag1,flag2,flag3) as its data       */
2287 /*  fields; unless it already exists, in which case OR the data fields with  */
2288 /*  those flags.                                                             */
2289 /*                                                                           */
2290 /*  These fields are one byte each in Z-code, two bytes each in Glulx.       */
2291 /*                                                                           */
2292 /*  Returns: the accession number.                                           */
2293 /* ------------------------------------------------------------------------- */
2294
2295 extern int dictionary_add(char *dword, int flag1, int flag2, int flag3)
2296 {   int n; uchar *p;
2297     int ggfr = 0, gfr = 0, fr = 0, r = 0;
2298     int ggf = VACANT, gf = VACANT, f = VACANT, at = root;
2299     int a, b;
2300     int res=((version_number==3)?4:6);
2301
2302     /* Fill in prepared_sort and prepared_dictflags. */
2303     dictionary_prepare(dword, NULL);
2304
2305     /* Adjust flag1 according to prepared_dictflags. */
2306     flag1 &= (~prepared_dictflags_neg);
2307     flag1 |= prepared_dictflags_pos;
2308
2309     if (root == VACANT)
2310     {   root = 0; goto CreateEntry;
2311     }
2312     while (TRUE)
2313     {
2314         n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
2315         if (n==0)
2316         {
2317             if (!glulx_mode) {
2318                 p = dictionary+7 + at*DICT_ENTRY_BYTE_LENGTH + res;
2319                 p[0] |= flag1; p[1] |= flag2;
2320                 if (!ZCODE_LESS_DICT_DATA)
2321                     p[2] |= flag3;
2322             }
2323             else {
2324                 p = dictionary+4 + at*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
2325                 p[0] |= (flag1/256); p[1] |= (flag1%256);
2326                 p[2] |= (flag2/256); p[3] |= (flag2%256);
2327                 p[4] |= (flag3/256); p[5] |= (flag3%256);
2328             }
2329             return at;
2330         }
2331         if (n>0) r=1; else r=0;
2332
2333         a = dtree[at].branch[0]; b = dtree[at].branch[1];
2334         if ((a != VACANT) && (dtree[a].colour == RED) &&
2335             (b != VACANT) && (dtree[b].colour == RED))
2336         {   dtree[a].colour = BLACK;
2337             dtree[b].colour = BLACK;
2338
2339             dtree[at].colour = RED;
2340
2341         /* A tree rotation may be needed to avoid two red links in a row:
2342            e.g.
2343              ggf   (or else gf is root)         ggf (or f is root)
2344               |                                  |
2345               gf                                 f
2346              / \(red)                           / \ (both red)
2347                 f            becomes          gf   at
2348                / \(red)                      /  \ /  \
2349                   at
2350                  /  \
2351
2352            In effect we rehang the "gf" subtree from "f".
2353            See the Technical Manual for further details.
2354         */
2355
2356             if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
2357             {
2358               if (fr == gfr)
2359               { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
2360                 dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
2361                 dtree[f].branch[1-fr] = gf;
2362                 dtree[f].colour = BLACK;
2363                 dtree[gf].colour = RED;
2364                 gf = ggf; gfr = ggfr;
2365               }
2366               else
2367               { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
2368                 dtree[at].colour = BLACK;
2369                 dtree[gf].colour = RED;
2370                 dtree[f].branch[fr] = dtree[at].branch[gfr];
2371                 dtree[gf].branch[gfr] = dtree[at].branch[fr];
2372                 dtree[at].branch[gfr] = f;
2373                 dtree[at].branch[fr] = gf;
2374
2375                 r = 1-r; n = at; if (r==fr) at = f; else at = gf;
2376                 f = n; gf = ggf; fr = 1-r; gfr = ggfr;
2377               }
2378             }
2379         }
2380
2381         if (dtree[at].branch[r] == VACANT)
2382         {   dtree[at].colour = RED;
2383
2384             if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
2385             { if (fr == gfr)
2386               { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
2387                 dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
2388                 dtree[f].branch[1-fr] = gf;
2389                 dtree[f].colour = BLACK;
2390                 dtree[gf].colour = RED;
2391               }
2392               else
2393               { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
2394                 dtree[at].colour = BLACK;
2395                 dtree[gf].colour = RED;
2396                 dtree[f].branch[fr] = dtree[at].branch[gfr];
2397                 dtree[gf].branch[gfr] = dtree[at].branch[fr];
2398                 dtree[at].branch[gfr] = f;
2399                 dtree[at].branch[fr] = gf;
2400
2401                 r = 1-r; n = at; if (r==fr) at = f; else at = gf;
2402                 f = n; gf = ggf;
2403               }
2404             }
2405             dtree[at].branch[r] = dict_entries;
2406             goto CreateEntry;
2407         }
2408         ggf = gf; gf = f; f = at; at = dtree[at].branch[r];
2409         ggfr = gfr; gfr = fr; fr = r;
2410     }
2411
2412     CreateEntry:
2413
2414     ensure_memory_list_available(&dtree_memlist, dict_entries+1);
2415     ensure_memory_list_available(&dict_sort_codes_memlist, (dict_entries+1)*DICT_WORD_BYTES);
2416
2417     dtree[dict_entries].branch[0] = VACANT;
2418     dtree[dict_entries].branch[1] = VACANT;
2419     dtree[dict_entries].colour    = BLACK;
2420
2421     /*  Address in Inform's own dictionary table to write the record to      */
2422
2423     if (!glulx_mode) {
2424
2425         ensure_memory_list_available(&dictionary_memlist, dictionary_top + DICT_ENTRY_BYTE_LENGTH);
2426         p = dictionary + DICT_ENTRY_BYTE_LENGTH*dict_entries + 7;
2427
2428         /*  So copy in the 4 (or 6) bytes of Z-coded text and the 3 data
2429             bytes */
2430
2431         p[0]=prepared_sort[0]; p[1]=prepared_sort[1];
2432         p[2]=prepared_sort[2]; p[3]=prepared_sort[3];
2433         if (version_number > 3)
2434           {   p[4]=prepared_sort[4]; p[5]=prepared_sort[5]; }
2435         p[res]=flag1; p[res+1]=flag2;
2436         if (!ZCODE_LESS_DICT_DATA) p[res+2]=flag3;
2437
2438         dictionary_top += DICT_ENTRY_BYTE_LENGTH;
2439
2440     }
2441     else {
2442         int i;
2443         ensure_memory_list_available(&dictionary_memlist, dictionary_top + DICT_ENTRY_BYTE_LENGTH);
2444         p = dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*dict_entries;
2445         p[0] = 0x60; /* type byte -- dict word */
2446
2447         p += DICT_CHAR_SIZE;
2448         for (i=0; i<DICT_WORD_BYTES; i++)
2449           p[i] = prepared_sort[i];
2450
2451         p += DICT_WORD_BYTES;
2452         p[0] = (flag1/256); p[1] = (flag1%256);
2453         p[2] = (flag2/256); p[3] = (flag2%256);
2454         p[4] = (flag3/256); p[5] = (flag3%256);
2455
2456         dictionary_top += DICT_ENTRY_BYTE_LENGTH;
2457
2458     }
2459
2460     copy_sorts(dict_sort_codes+dict_entries*DICT_WORD_BYTES, prepared_sort);
2461
2462     return dict_entries++;
2463 }
2464
2465 /* ------------------------------------------------------------------------- */
2466 /*   Used in "tables.c" for "Extend ... only", to renumber a verb-word to a  */
2467 /*   new verb syntax of its own.  (Otherwise existing verb-words never       */
2468 /*   change their verb-numbers.)                                             */
2469 /* ------------------------------------------------------------------------- */
2470
2471 extern void dictionary_set_verb_number(char *dword, int to)
2472 {   int i; uchar *p;
2473     int res=((version_number==3)?4:6);
2474     i=dictionary_find(dword);
2475     if (i!=0)
2476     {
2477         if (!glulx_mode) {
2478             p=dictionary+7+(i-1)*DICT_ENTRY_BYTE_LENGTH+res;
2479             p[1]=to;
2480         }
2481         else {
2482             p=dictionary+4 + (i-1)*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
2483             p[2]=to/256; p[3]=to%256;
2484         }
2485     }
2486 }
2487
2488 /* ------------------------------------------------------------------------- */
2489 /*   Tracing code for the dictionary: used by "trace" and text               */
2490 /*   transcription.                                                          */
2491 /* ------------------------------------------------------------------------- */
2492
2493 /* In the dictionary-showing code, if d_show_buf is NULL, the text is
2494    printed directly. (The "Trace dictionary" directive does this.)
2495    If d_show_buf is not NULL, we add words to it (reallocing if necessary)
2496    until it's a page-width.
2497 */
2498 static char *d_show_buf = NULL;
2499 static int d_show_size; /* allocated size */
2500 static int d_show_len;  /* current length */
2501
2502 static void show_char(char c)
2503 {
2504     if (d_show_buf == NULL) {
2505         printf("%c", c);
2506     }
2507     else {
2508         if (d_show_len+2 >= d_show_size) {
2509             int newsize = 2 * d_show_len + 16;
2510             my_realloc(&d_show_buf, d_show_size, newsize, "dictionary display buffer");
2511             d_show_size = newsize;
2512         }
2513         d_show_buf[d_show_len++] = c;
2514         d_show_buf[d_show_len] = '\0';
2515     }
2516 }
2517
2518 /* Display a Unicode character in user-readable form. This uses the same
2519    character encoding as the source code. */
2520 static void show_uchar(uint32 c)
2521 {
2522     char buf[16];
2523     int ix;
2524
2525     if (c < 0x80) {
2526         /* ASCII always works */
2527         show_char(c);
2528         return;
2529     }
2530     if (character_set_unicode) {
2531         /* UTF-8 the character */
2532         if (c < 0x80) {
2533             show_char(c);
2534         }
2535         else if (c < 0x800) {
2536             show_char((0xC0 | ((c & 0x7C0) >> 6)));
2537             show_char((0x80 |  (c & 0x03F)     ));
2538         }
2539         else if (c < 0x10000) {
2540             show_char((0xE0 | ((c & 0xF000) >> 12)));
2541             show_char((0x80 | ((c & 0x0FC0) >>  6)));
2542             show_char((0x80 |  (c & 0x003F)      ));
2543         }
2544         else if (c < 0x200000) {
2545             show_char((0xF0 | ((c & 0x1C0000) >> 18)));
2546             show_char((0x80 | ((c & 0x03F000) >> 12)));
2547             show_char((0x80 | ((c & 0x000FC0) >>  6)));
2548             show_char((0x80 |  (c & 0x00003F)      ));
2549         }
2550         else {
2551             show_char('?');
2552         }
2553         return;
2554     }
2555     if (character_set_setting == 1 && c < 0x100) {
2556         /* Fits in Latin-1 */
2557         show_char(c);
2558         return;
2559     }
2560     /* Supporting other character_set_setting is harder; not currently implemented. */
2561
2562     /* Use the escaped form */
2563     sprintf(buf, "@{%x}", c);
2564     for (ix=0; buf[ix]; ix++)
2565         show_char(buf[ix]);
2566 }
2567
2568 extern void word_to_ascii(uchar *p, char *results)
2569 {   int i, shift, cc, zchar; uchar encoded_word[9];
2570     encoded_word[0] = (((int) p[0])&0x7c)/4;
2571     encoded_word[1] = 8*(((int) p[0])&0x3) + (((int) p[1])&0xe0)/32;
2572     encoded_word[2] = ((int) p[1])&0x1f;
2573     encoded_word[3] = (((int) p[2])&0x7c)/4;
2574     encoded_word[4] = 8*(((int) p[2])&0x3) + (((int) p[3])&0xe0)/32;
2575     encoded_word[5] = ((int) p[3])&0x1f;
2576     if (version_number > 3)
2577     {   encoded_word[6] = (((int) p[4])&0x7c)/4;
2578         encoded_word[7] = 8*(((int) p[4])&0x3) + (((int) p[5])&0xe0)/32;
2579         encoded_word[8] = ((int) p[5])&0x1f;
2580     }
2581     else
2582     {
2583         encoded_word[6] = encoded_word[7] = encoded_word[8] = 0;
2584     }
2585
2586     shift = 0; cc = 0;
2587     for (i=0; i< ((version_number==3)?6:9); i++)
2588     {   zchar = encoded_word[i];
2589
2590         if (zchar == 4) shift = 1;
2591         else
2592         if (zchar == 5) shift = 2;
2593         else
2594         {   if ((shift == 2) && (zchar == 6))
2595             {   zchar = 32*encoded_word[i+1] + encoded_word[i+2];
2596                 i += 2;
2597                 if ((zchar>=32) && (zchar<=126))
2598                     results[cc++] = zchar;
2599                 else
2600                 {   zscii_to_text(results+cc, zchar);
2601                     cc = strlen(results);
2602                 }
2603             }
2604             else
2605             {   zscii_to_text(results+cc, (alphabet[shift])[zchar-6]);
2606                 cc = strlen(results);
2607             }
2608             shift = 0;
2609         }
2610     }
2611     results[cc] = 0;
2612 }
2613
2614 /* Print a dictionary word to stdout.
2615    (This assumes that d_show_buf is null.)
2616  */
2617 void print_dict_word(int node)
2618 {
2619     uchar *p;
2620     int cprinted;
2621
2622     if (!glulx_mode) {
2623         char textual_form[32];
2624         p = (uchar *)dictionary + 7 + DICT_ENTRY_BYTE_LENGTH*node;
2625
2626         word_to_ascii(p, textual_form);
2627
2628         for (cprinted = 0; textual_form[cprinted]!=0; cprinted++)
2629             show_char(textual_form[cprinted]);
2630     }
2631     else {
2632         p = (uchar *)dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*node;
2633
2634         for (cprinted = 0; cprinted<DICT_WORD_SIZE; cprinted++)
2635         {
2636             uint32 ch;
2637             if (DICT_CHAR_SIZE == 1)
2638                 ch = p[1+cprinted];
2639             else
2640                 ch = (p[4*cprinted+4] << 24) + (p[4*cprinted+5] << 16) + (p[4*cprinted+6] << 8) + (p[4*cprinted+7]);
2641             if (!ch)
2642                 break;
2643             show_uchar(ch);
2644         }
2645     }
2646 }
2647
2648 static void recursively_show_z(int node, int level)
2649 {   int i, cprinted, flags; uchar *p;
2650     char textual_form[32];
2651     int res = (version_number == 3)?4:6; /* byte length of encoded text */
2652
2653     if (dtree[node].branch[0] != VACANT)
2654         recursively_show_z(dtree[node].branch[0], level);
2655
2656     p = (uchar *)dictionary + 7 + DICT_ENTRY_BYTE_LENGTH*node;
2657
2658     word_to_ascii(p, textual_form);
2659
2660     for (cprinted = 0; textual_form[cprinted]!=0; cprinted++)
2661         show_char(textual_form[cprinted]);
2662     for (; cprinted < 4 + ((version_number==3)?6:9); cprinted++)
2663         show_char(' ');
2664
2665     /* The level-1 info can only be printfed (d_show_buf must be null). */
2666     if (d_show_buf == NULL && level >= 1)
2667     {
2668         if (level >= 2) {
2669             for (i=0; i<DICT_ENTRY_BYTE_LENGTH; i++) printf("%02x ",p[i]);
2670         }
2671
2672         flags = (int) p[res];
2673         if (flags & 128)
2674             printf("noun ");
2675         else
2676             printf("     ");
2677         if (flags & 4)
2678             printf("p ");
2679         else
2680             printf("  ");
2681         if (flags & 8)
2682         {   if (grammar_version_number == 1)
2683                 printf("preposition:%d  ", (int) p[res+2]);
2684             else
2685                 printf("preposition    ");
2686         }
2687         if ((flags & 3) == 3) printf("metaverb:%d  ", (int) p[res+1]);
2688         else if ((flags & 3) == 1) printf("verb:%d  ", (int) p[res+1]);
2689         printf("\n");
2690     }
2691
2692     /* Show five words per line in classic TRANSCRIPT_FORMAT; one per line in the new format. */
2693     if (d_show_buf && (d_show_len >= 64 || TRANSCRIPT_FORMAT == 1))
2694     {
2695         write_to_transcript_file(d_show_buf, STRCTX_DICT);
2696         d_show_len = 0;
2697     }
2698
2699     if (dtree[node].branch[1] != VACANT)
2700         recursively_show_z(dtree[node].branch[1], level);
2701 }
2702
2703 static void recursively_show_g(int node, int level)
2704 {   int i, cprinted;
2705     uchar *p;
2706
2707     if (dtree[node].branch[0] != VACANT)
2708         recursively_show_g(dtree[node].branch[0], level);
2709
2710     p = (uchar *)dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*node;
2711
2712     for (cprinted = 0; cprinted<DICT_WORD_SIZE; cprinted++)
2713     {
2714         uint32 ch;
2715         if (DICT_CHAR_SIZE == 1)
2716             ch = p[1+cprinted];
2717         else
2718             ch = (p[4*cprinted+4] << 24) + (p[4*cprinted+5] << 16) + (p[4*cprinted+6] << 8) + (p[4*cprinted+7]);
2719         if (!ch)
2720             break;
2721         show_uchar(ch);
2722     }
2723     for (; cprinted<DICT_WORD_SIZE+4; cprinted++)
2724         show_char(' ');
2725
2726     /* The level-1 info can only be printfed (d_show_buf must be null). */
2727     if (d_show_buf == NULL && level >= 1)
2728     {   int flagpos = (DICT_CHAR_SIZE == 1) ? (DICT_WORD_SIZE+1) : (DICT_WORD_BYTES+4);
2729         int flags = (p[flagpos+0] << 8) | (p[flagpos+1]);
2730         int verbnum = (p[flagpos+2] << 8) | (p[flagpos+3]);
2731         if (level >= 2) {
2732             for (i=0; i<DICT_ENTRY_BYTE_LENGTH; i++) printf("%02x ",p[i]);
2733         }
2734         if (flags & 128)
2735             printf("noun ");
2736         else
2737             printf("     ");
2738         if (flags & 4)
2739             printf("p ");
2740         else
2741             printf("  ");
2742         if (flags & 8)
2743         {   printf("preposition    ");
2744         }
2745         if ((flags & 3) == 3) printf("metaverb:%d  ", verbnum);
2746         else if ((flags & 3) == 1) printf("verb:%d  ", verbnum);
2747         printf("\n");
2748     }
2749
2750     /* Show five words per line in classic TRANSCRIPT_FORMAT; one per line in the new format. */
2751     if (d_show_buf && (d_show_len >= 64 || TRANSCRIPT_FORMAT == 1))
2752     {
2753         write_to_transcript_file(d_show_buf, STRCTX_DICT);
2754         d_show_len = 0;
2755     }
2756
2757     if (dtree[node].branch[1] != VACANT)
2758         recursively_show_g(dtree[node].branch[1], level);
2759 }
2760
2761 static void show_alphabet(int i)
2762 {   int j, c; char chartext[8];
2763
2764     for (j=0; j<26; j++)
2765     {   c = alphabet[i][j];
2766
2767         if (alphabet_used[26*i+j] == 'N') printf("("); else printf(" ");
2768
2769         zscii_to_text(chartext, c);
2770         printf("%s", chartext);
2771
2772         if (alphabet_used[26*i+j] == 'N') printf(")"); else printf(" ");
2773     }
2774     printf("\n");
2775 }
2776
2777 extern void show_dictionary(int level)
2778 {
2779     /* Level 0: show words only. Level 1: show words and flags.
2780        Level 2: also show bytes.*/
2781     printf("Dictionary contains %d entries:\n",dict_entries);
2782     if (dict_entries != 0)
2783     {   d_show_len = 0; d_show_buf = NULL;
2784         if (!glulx_mode)
2785             recursively_show_z(root, level);
2786         else
2787             recursively_show_g(root, level);
2788     }
2789     if (!glulx_mode)
2790     {
2791         printf("\nZ-machine alphabet entries:\n");
2792         show_alphabet(0);
2793         show_alphabet(1);
2794         show_alphabet(2);
2795     }
2796 }
2797
2798 extern void write_dictionary_to_transcript(void)
2799 {
2800     d_show_size = 80; /* initial size */
2801     d_show_buf = my_malloc(d_show_size, "dictionary display buffer");
2802
2803     write_to_transcript_file("", STRCTX_INFO);
2804     sprintf(d_show_buf, "[Dictionary contains %d entries:]", dict_entries);
2805     write_to_transcript_file(d_show_buf, STRCTX_INFO);
2806
2807     d_show_len = 0;
2808
2809     if (dict_entries != 0)
2810     {
2811         if (!glulx_mode)
2812             recursively_show_z(root, 0);
2813         else
2814             recursively_show_g(root, 0);
2815     }
2816     if (d_show_len != 0) write_to_transcript_file(d_show_buf, STRCTX_DICT);
2817
2818     my_free(&d_show_buf, "dictionary display buffer");
2819     d_show_len = 0; d_show_buf = NULL;
2820 }
2821
2822 /* ========================================================================= */
2823 /*   Data structure management routines                                      */
2824 /* ------------------------------------------------------------------------- */
2825
2826 extern void init_text_vars(void)
2827 {   int j;
2828
2829     opttext = NULL;
2830     opttextlen = 0;
2831     bestyet = NULL;
2832     bestyet2 = NULL;
2833     tlbtab = NULL;
2834     grandtable = NULL;
2835     grandflags = NULL;
2836
2837     translated_text = NULL;
2838     temp_symbol = NULL;
2839     all_text = NULL;
2840
2841     for (j=0; j<256; j++) abbrevs_lookup[j] = -1;
2842
2843     total_zchars_trans = 0;
2844
2845     dictionary = NULL;
2846     dictionary_top = 0;
2847     dtree = NULL;
2848     final_dict_order = NULL;
2849     dict_sort_codes = NULL;
2850     prepared_sort = NULL;
2851     dict_entries=0;
2852
2853     static_strings_area = NULL;
2854     abbreviations_optimal_parse_schedule = NULL;
2855     abbreviations_optimal_parse_scores = NULL;
2856
2857     compressed_offsets = NULL;
2858     huff_entities = NULL;
2859     hufflist = NULL;
2860     unicode_usage_entries = NULL;
2861 }
2862
2863 extern void text_begin_pass(void)
2864 {   abbrevs_lookup_table_made = FALSE;
2865     no_abbreviations=0;
2866     abbreviations_totaltext=0;
2867     total_chars_trans=0; total_bytes_trans=0;
2868     all_text_top=0;
2869     dictionary_begin_pass();
2870     low_strings_top = 0;
2871
2872     static_strings_extent = 0;
2873     no_strings = 0;
2874     no_dynamic_strings = 0;
2875     no_unicode_chars = 0;
2876 }
2877
2878 /*  Note: for allocation and deallocation of all_the_text, see inform.c      */
2879
2880 extern void text_allocate_arrays(void)
2881 {
2882     int ix;
2883
2884     initialise_memory_list(&translated_text_memlist,
2885         sizeof(uchar), 8000, (void**)&translated_text,
2886         "translated text holding area");
2887
2888     initialise_memory_list(&temp_symbol_memlist,
2889         sizeof(char), 32, (void**)&temp_symbol,
2890         "temporary symbol name");
2891
2892     initialise_memory_list(&all_text_memlist,
2893         sizeof(char), 0, (void**)&all_text,
2894         "transcription text for optimise");
2895
2896     initialise_memory_list(&static_strings_area_memlist,
2897         sizeof(uchar), 128, (void**)&static_strings_area,
2898         "static strings area");
2899
2900     initialise_memory_list(&abbreviations_text_memlist,
2901         sizeof(char), 64, (void**)&abbreviations_text,
2902         "abbreviation text");
2903
2904     initialise_memory_list(&abbreviations_memlist,
2905         sizeof(abbreviation), 64, (void**)&abbreviations,
2906         "abbreviations");
2907
2908     initialise_memory_list(&abbreviations_optimal_parse_schedule_memlist,
2909         sizeof(int), 0, (void**)&abbreviations_optimal_parse_schedule,
2910         "abbreviations optimal parse schedule");
2911     initialise_memory_list(&abbreviations_optimal_parse_scores_memlist,
2912         sizeof(int), 0, (void**)&abbreviations_optimal_parse_scores,
2913         "abbreviations optimal parse scores");
2914
2915     initialise_memory_list(&dtree_memlist,
2916         sizeof(dict_tree_node), 1500, (void**)&dtree,
2917         "red-black tree for dictionary");
2918     initialise_memory_list(&dict_sort_codes_memlist,
2919         sizeof(uchar), 1500*DICT_WORD_BYTES, (void**)&dict_sort_codes,
2920         "dictionary sort codes");
2921     initialise_memory_list(&prepared_sort_memlist,
2922         sizeof(uchar), DICT_WORD_BYTES, (void**)&prepared_sort,
2923         "prepared sort buffer");
2924
2925     final_dict_order = NULL; /* will be allocated at sort_dictionary() time */
2926
2927     /* The exact size will be 7+7*num for z3, 7+9*num for z4+,
2928        4+DICT_ENTRY_BYTE_LENGTH*num for Glulx. But this is just an initial
2929        allocation; we don't have to be precise. */
2930     initialise_memory_list(&dictionary_memlist,
2931         sizeof(uchar), 1000*DICT_ENTRY_BYTE_LENGTH, (void**)&dictionary,
2932         "dictionary");
2933
2934     initialise_memory_list(&low_strings_memlist,
2935         sizeof(uchar), 1024, (void**)&low_strings,
2936         "low (abbreviation) strings");
2937
2938     d_show_buf = NULL;
2939     d_show_size = 0;
2940     d_show_len = 0;
2941
2942     huff_entities = NULL;
2943     hufflist = NULL;
2944     unicode_usage_entries = NULL;
2945     done_compression = FALSE;
2946     compression_table_size = 0;
2947     compressed_offsets = NULL;
2948
2949     initialise_memory_list(&unicode_usage_entries_memlist,
2950         sizeof(unicode_usage_t), 0, (void**)&unicode_usage_entries,
2951         "unicode entity entries");
2952
2953     /* hufflist and huff_entities will be allocated at compress_game_text() time. */
2954
2955     /* This hash table is only used in Glulx */
2956     for (ix=0; ix<UNICODE_HASH_BUCKETS; ix++)
2957         unicode_usage_hash[ix] = -1;
2958
2959     initialise_memory_list(&compressed_offsets_memlist,
2960         sizeof(int32), 0, (void**)&compressed_offsets,
2961         "static strings index table");
2962 }
2963
2964 extern void extract_all_text()
2965 {
2966     /* optimise_abbreviations() is called after free_arrays(). Therefore,
2967        we need to preserve the text transcript where it will not be
2968        freed up. We do this by copying the pointer to opttext. */
2969     opttext = all_text;
2970     opttextlen = all_text_top;
2971
2972     /* Re-init all_text_memlist. This causes it to forget all about the
2973        old pointer. Deallocating it in text_free_arrays() will be a no-op. */
2974     initialise_memory_list(&all_text_memlist,
2975         sizeof(char), 0, (void**)&all_text,
2976         "dummy transcription text");
2977 }
2978
2979 extern void text_free_arrays(void)
2980 {
2981     deallocate_memory_list(&translated_text_memlist);
2982     deallocate_memory_list(&temp_symbol_memlist);
2983
2984     deallocate_memory_list(&all_text_memlist);
2985
2986     deallocate_memory_list(&low_strings_memlist);
2987     deallocate_memory_list(&abbreviations_text_memlist);
2988     deallocate_memory_list(&abbreviations_memlist);
2989
2990     deallocate_memory_list(&abbreviations_optimal_parse_schedule_memlist);
2991     deallocate_memory_list(&abbreviations_optimal_parse_scores_memlist);
2992
2993     deallocate_memory_list(&dtree_memlist);
2994     deallocate_memory_list(&dict_sort_codes_memlist);
2995     deallocate_memory_list(&prepared_sort_memlist);
2996     my_free(&final_dict_order, "final dictionary ordering table");
2997
2998     deallocate_memory_list(&dictionary_memlist);
2999
3000     deallocate_memory_list(&compressed_offsets_memlist);
3001     my_free(&hufflist, "huffman node list");
3002     my_free(&huff_entities, "huffman entities");
3003
3004     deallocate_memory_list(&unicode_usage_entries_memlist);
3005
3006     deallocate_memory_list(&static_strings_area_memlist);
3007 }
3008
3009 extern void ao_free_arrays(void)
3010 {
3011     /* Called only after optimise_abbreviations() runs. */
3012
3013     int32 i;
3014     if (bestyet) {
3015         for (i=0; i<MAX_BESTYET; i++) {
3016             my_free(&bestyet[i].text, "bestyet.text");
3017         }
3018     }
3019     if (bestyet2) {
3020         for (i=0; i<MAX_ABBREVS; i++) {
3021             my_free(&bestyet2[i].text, "bestyet2.text");
3022         }
3023     }
3024
3025     my_free (&opttext,"stashed transcript for optimisation");
3026     my_free (&bestyet,"bestyet");
3027     my_free (&bestyet2,"bestyet2");
3028     my_free (&grandtable,"grandtable");
3029     my_free (&grandflags,"grandflags");
3030
3031     deallocate_memory_list(&tlbtab_memlist);
3032
3033     /* This was re-inited, so we should re-deallocate it. */
3034     deallocate_memory_list(&all_text_memlist);
3035 }
3036
3037 /* ========================================================================= */