1 /* ------------------------------------------------------------------------- */
2 /* "text" : Text translation, the abbreviations optimiser, the dictionary */
4 /* Part of Inform 6.35 */
5 /* copyright (c) Graham Nelson 1993 - 2021 */
7 /* Inform is free software: you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation, either version 3 of the License, or */
10 /* (at your option) any later version. */
12 /* Inform is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with Inform. If not, see https://gnu.org/licenses/ *
20 /* ------------------------------------------------------------------------- */
24 uchar *low_strings, *low_strings_top; /* Start and next free byte in the low
27 int32 static_strings_extent; /* Number of bytes of static strings
29 memory_block static_strings_area; /* Used if (!temporary_files_switch) to
30 hold the static strings area so far */
32 static uchar *strings_holding_area; /* Area holding translated strings
33 until they are moved into either
34 a temporary file, or the
35 static_strings_area below */
37 char *all_text, *all_text_top; /* Start and next byte free in (large)
38 text buffer holding the entire text
39 of the game, when it is being
41 int is_abbreviation, /* When TRUE, the string being trans
42 is itself an abbreviation string
43 so can't make use of abbreviations */
44 abbrevs_lookup_table_made, /* The abbreviations lookup table is
45 constructed when the first non-
46 abbreviation string is translated:
47 this flag is TRUE after that */
48 abbrevs_lookup[256]; /* Once this has been constructed,
49 abbrevs_lookup[n] = the smallest
50 number of any abbreviation beginning
51 with ASCII character n, or -1
52 if none of the abbreviations do */
53 int no_abbreviations; /* No of abbreviations defined so far */
54 uchar *abbreviations_at; /* Memory to hold the text of any
55 abbreviation strings declared */
56 /* ------------------------------------------------------------------------- */
57 /* Glulx string compression storage */
58 /* ------------------------------------------------------------------------- */
60 int no_strings; /* No of strings in static strings
62 int no_dynamic_strings; /* No. of @.. string escapes used
63 (actually, the highest value used
65 int no_unicode_chars; /* Number of distinct Unicode chars
66 used. (Beyond 0xFF.) */
68 static int MAX_CHARACTER_SET; /* Number of possible entities */
69 huffentity_t *huff_entities; /* The list of entities (characters,
70 abbreviations, @.. escapes, and
72 static huffentity_t **hufflist; /* Copy of the list, for sorting */
74 int no_huff_entities; /* The number of entities in the list */
75 int huff_unicode_start; /* Position in the list where Unicode
77 int huff_abbrev_start; /* Position in the list where string
78 abbreviations begin. */
79 int huff_dynam_start; /* Position in the list where @..
81 int huff_entity_root; /* The position in the list of the root
82 entry (when considering the table
85 int done_compression; /* Has the game text been compressed? */
86 int32 compression_table_size; /* Length of the Huffman table, in
88 int32 compression_string_size; /* Length of the compressed string
90 int32 *compressed_offsets; /* The beginning of every string in
91 the game, relative to the beginning
92 of the Huffman table. (So entry 0
93 is equal to compression_table_size)*/
95 #define UNICODE_HASH_BUCKETS (64)
96 unicode_usage_t *unicode_usage_entries;
97 static unicode_usage_t *unicode_usage_hash[UNICODE_HASH_BUCKETS];
99 static int unicode_entity_index(int32 unicode);
101 /* ------------------------------------------------------------------------- */
102 /* Abbreviation arrays */
103 /* ------------------------------------------------------------------------- */
109 /* ------------------------------------------------------------------------- */
111 int32 total_chars_trans, /* Number of ASCII chars of text in */
112 total_bytes_trans, /* Number of bytes of Z-code text out */
113 zchars_trans_in_last_string; /* Number of Z-chars in last string:
114 needed only for abbrev efficiency
115 calculation in "directs.c" */
116 static int32 total_zchars_trans, /* Number of Z-chars of text out
117 (only used to calculate the above) */
118 no_chars_transcribed; /* Number of ASCII chars written to
119 the text transcription area (used
120 for the -r and -u switches) */
122 static int zchars_out_buffer[3], /* During text translation, a buffer of
123 3 Z-chars at a time: when it's full
124 these are written as a 2-byte word */
125 zob_index; /* Index (0 to 2) into it */
127 static unsigned char *text_out_pc; /* The "program counter" during text
128 translation: the next address to
129 write Z-coded text output to */
131 static unsigned char *text_out_limit; /* The upper limit of text_out_pc
132 during text translation */
134 static int text_out_overflow; /* During text translation, becomes
135 true if text_out_pc tries to pass
138 /* ------------------------------------------------------------------------- */
139 /* For variables/arrays used by the dictionary manager, see below */
140 /* ------------------------------------------------------------------------- */
142 /* ------------------------------------------------------------------------- */
143 /* Prepare the abbreviations lookup table (used to speed up abbreviation */
144 /* detection in text translation). We first bubble-sort the abbrevs into */
145 /* alphabetical order (this is necessary for the detection algorithm to */
146 /* to work). Since the table is only prepared once, and for a table */
147 /* of size at most 96, there's no point using an efficient sort algorithm. */
148 /* ------------------------------------------------------------------------- */
150 static void make_abbrevs_lookup(void)
151 { int bubble_sort, j, k, l; char p[MAX_ABBREV_LENGTH]; char *p1, *p2;
153 { bubble_sort = FALSE;
154 for (j=0; j<no_abbreviations; j++)
155 for (k=j+1; k<no_abbreviations; k++)
156 { p1=(char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
157 p2=(char *)abbreviations_at+k*MAX_ABBREV_LENGTH;
159 { strcpy(p,p1); strcpy(p1,p2); strcpy(p2,p);
160 l=abbrev_values[j]; abbrev_values[j]=abbrev_values[k];
162 l=abbrev_quality[j]; abbrev_quality[j]=abbrev_quality[k];
167 } while (bubble_sort);
169 for (j=no_abbreviations-1; j>=0; j--)
170 { p1=(char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
171 abbrevs_lookup[(uchar)p1[0]]=j;
174 abbrevs_lookup_table_made = TRUE;
177 /* ------------------------------------------------------------------------- */
178 /* Search the abbreviations lookup table (a routine which must be fast). */
179 /* The source text to compare is text[i], text[i+1], ... and this routine */
180 /* is only called if text[i] is indeed the first character of at least one */
181 /* abbreviation, "from" begin the least index into the abbreviations table */
182 /* of an abbreviation for which text[i] is the first character. Recall */
183 /* that the abbrevs table is in alphabetical order. */
185 /* The return value is -1 if there is no match. If there is a match, the */
186 /* text to be abbreviated out is over-written by a string of null chars */
187 /* with "ASCII" value 1, and the abbreviation number is returned. */
189 /* In Glulx, we *do not* do this overwriting with 1's. */
190 /* ------------------------------------------------------------------------- */
192 static int try_abbreviations_from(unsigned char *text, int i, int from)
193 { int j, k; uchar *p, c;
195 for (j=from, p=(uchar *)abbreviations_at+from*MAX_ABBREV_LENGTH;
196 (j<no_abbreviations)&&(c==p[0]); j++, p+=MAX_ABBREV_LENGTH)
197 { if (text[i+1]==p[1])
198 { for (k=2; p[k]!=0; k++)
199 if (text[i+k]!=p[k]) goto NotMatched;
201 for (k=0; p[k]!=0; k++) text[i+k]=1;
211 extern void make_abbreviation(char *text)
213 strcpy((char *)abbreviations_at
214 + no_abbreviations*MAX_ABBREV_LENGTH, text);
216 is_abbreviation = TRUE;
217 abbrev_values[no_abbreviations] = compile_string(text, TRUE, TRUE);
218 is_abbreviation = FALSE;
220 /* The quality is the number of Z-chars saved by using this */
221 /* abbreviation: note that it takes 2 Z-chars to print it. */
223 abbrev_quality[no_abbreviations++] = zchars_trans_in_last_string - 2;
226 /* ------------------------------------------------------------------------- */
227 /* The front end routine for text translation */
228 /* ------------------------------------------------------------------------- */
230 extern int32 compile_string(char *b, int in_low_memory, int is_abbrev)
231 { int i, j; uchar *c;
233 is_abbreviation = is_abbrev;
235 /* Put into the low memory pool (at 0x100 in the Z-machine) of strings */
236 /* which may be wanted as possible entries in the abbreviations table */
238 if (!glulx_mode && in_low_memory)
239 { j=subtract_pointers(low_strings_top,low_strings);
240 low_strings_top=translate_text(low_strings_top, low_strings+MAX_LOW_STRINGS, b);
241 if (!low_strings_top)
242 memoryerror("MAX_LOW_STRINGS", MAX_LOW_STRINGS);
243 is_abbreviation = FALSE;
247 if (glulx_mode && done_compression)
248 compiler_error("Tried to add a string after compression was done.");
250 c = translate_text(strings_holding_area, strings_holding_area+MAX_STATIC_STRINGS, b);
252 memoryerror("MAX_STATIC_STRINGS",MAX_STATIC_STRINGS);
254 i = subtract_pointers(c, strings_holding_area);
256 /* Insert null bytes as needed to ensure that the next static string */
257 /* also occurs at an address expressible as a packed address */
261 if (oddeven_packing_switch)
262 textalign = scale_factor*2;
264 textalign = scale_factor;
265 while ((i%textalign)!=0)
267 if (i+2 > MAX_STATIC_STRINGS)
268 memoryerror("MAX_STATIC_STRINGS",MAX_STATIC_STRINGS);
269 i+=2; *c++ = 0; *c++ = 0;
273 j = static_strings_extent;
275 if (temporary_files_switch)
276 for (c=strings_holding_area; c<strings_holding_area+i;
277 c++, static_strings_extent++)
280 for (c=strings_holding_area; c<strings_holding_area+i;
281 c++, static_strings_extent++)
282 write_byte_to_memory_block(&static_strings_area,
283 static_strings_extent, *c);
285 is_abbreviation = FALSE;
288 return(j/scale_factor);
291 /* The marker value is a one-based string number. (We reserve zero
292 to mean "not a string at all". */
293 return (++no_strings);
297 /* ------------------------------------------------------------------------- */
298 /* Output a single Z-character into the buffer, and flush it if full */
299 /* ------------------------------------------------------------------------- */
301 static void write_z_char_z(int i)
304 total_zchars_trans++;
305 zchars_out_buffer[zob_index++]=(i%32);
306 if (zob_index!=3) return;
308 j= zchars_out_buffer[0]*0x0400 + zchars_out_buffer[1]*0x0020
309 + zchars_out_buffer[2];
310 if (text_out_pc+2 > text_out_limit) {
311 text_out_overflow = TRUE;
314 text_out_pc[0] = j/256; text_out_pc[1] = j%256; text_out_pc+=2;
315 total_bytes_trans+=2;
318 static void write_zscii(int zsc)
320 int lookup_value, in_alphabet;
327 if (zsc < 0x100) lookup_value = zscii_to_alphabet_grid[zsc];
329 else lookup_value = -1;
331 if (lookup_value >= 0)
332 { alphabet_used[lookup_value] = 'Y';
333 in_alphabet = lookup_value/26;
334 if (in_alphabet==1) write_z_char_z(4); /* SHIFT to A1 */
335 if (in_alphabet==2) write_z_char_z(5); /* SHIFT to A2 */
336 write_z_char_z(lookup_value%26 + 6);
339 { write_z_char_z(5); write_z_char_z(6);
340 write_z_char_z(zsc/32); write_z_char_z(zsc%32);
344 /* ------------------------------------------------------------------------- */
345 /* Finish a Z-coded string, padding out with Z-char 5s if necessary and */
346 /* setting the "end" bit on the final 2-byte word */
347 /* ------------------------------------------------------------------------- */
349 static void end_z_chars(void)
351 zchars_trans_in_last_string=total_zchars_trans-zchars_trans_in_last_string;
352 while (zob_index!=0) write_z_char_z(5);
353 p=(unsigned char *) text_out_pc;
357 /* Glulx handles this much more simply -- compression is done elsewhere. */
358 static void write_z_char_g(int i)
361 if (text_out_pc+1 > text_out_limit) {
362 text_out_overflow = TRUE;
365 total_zchars_trans++;
371 /* ------------------------------------------------------------------------- */
372 /* The main routine "text.c" provides to the rest of Inform: the text */
373 /* translator. p is the address to write output to, s_text the source text */
374 /* and the return value is the next free address to write output to. */
375 /* The return value will not exceed p_limit. If the translation tries to */
376 /* overflow this boundary, the return value will be NULL (and you should */
377 /* display an error). */
378 /* Note that the source text may be corrupted by this routine. */
379 /* ------------------------------------------------------------------------- */
381 extern uchar *translate_text(uchar *p, uchar *p_limit, char *s_text)
382 { int i, j, k, in_alphabet, lookup_value;
383 int32 unicode; int zscii;
384 unsigned char *text_in;
386 /* Cast the input and output streams to unsigned char: text_out_pc will
387 advance as bytes of Z-coded text are written, but text_in doesn't */
389 text_in = (unsigned char *) s_text;
390 text_out_pc = (unsigned char *) p;
391 text_out_limit = (unsigned char *) p_limit;
392 text_out_overflow = FALSE;
394 /* Remember the Z-chars total so that later we can subtract to find the
395 number of Z-chars translated on this string */
397 zchars_trans_in_last_string = total_zchars_trans;
399 /* Start with the Z-characters output buffer empty */
403 /* If this is the first text translated since the abbreviations were
404 declared, and if some were declared, then it's time to make the
405 lookup table for abbreviations
407 (Except: we don't if the text being translated is itself
408 the text of an abbreviation currently being defined) */
410 if ((!abbrevs_lookup_table_made) && (no_abbreviations > 0)
411 && (!is_abbreviation))
412 make_abbrevs_lookup();
414 /* If we're storing the whole game text to memory, then add this text */
416 if ((!is_abbreviation) && (store_the_text))
417 { no_chars_transcribed += strlen(s_text)+2;
418 if (no_chars_transcribed >= MAX_TRANSCRIPT_SIZE)
419 memoryerror("MAX_TRANSCRIPT_SIZE", MAX_TRANSCRIPT_SIZE);
420 sprintf(all_text_top, "%s\n\n", s_text);
421 all_text_top += strlen(all_text_top);
424 if (transcript_switch && (!veneer_mode))
425 write_to_transcript_file(s_text);
429 /* The empty string of Z-text is illegal, since it can't carry an end
430 bit: so we translate an empty string of ASCII text to just the
431 pad character 5. Printing this causes nothing to appear on screen. */
433 if (text_in[0]==0) write_z_char_z(5);
435 /* Loop through the characters of the null-terminated input text: note
436 that if 1 is written over a character in the input text, it is
437 afterwards ignored */
439 for (i=0; text_in[i]!=0; i++)
440 { total_chars_trans++;
442 /* Contract ". " into ". " if double-space-removing switch set:
443 likewise "? " and "! " if the setting is high enough */
445 if ((double_space_setting >= 1)
446 && (text_in[i+1]==' ') && (text_in[i+2]==' '))
447 { if (text_in[i]=='.') text_in[i+2]=1;
448 if (double_space_setting >= 2)
449 { if (text_in[i]=='?') text_in[i+2]=1;
450 if (text_in[i]=='!') text_in[i+2]=1;
454 /* Try abbreviations if the economy switch set */
456 if ((economy_switch) && (!is_abbreviation)
457 && ((k=abbrevs_lookup[text_in[i]])!=-1))
458 { if ((j=try_abbreviations_from(text_in, i, k))!=-1)
459 { /* abbreviations run from MAX_DYNAMIC_STRINGS to 96 */
460 j += MAX_DYNAMIC_STRINGS;
461 write_z_char_z(j/32+1); write_z_char_z(j%32);
465 /* If Unicode switch set, use text_to_unicode to perform UTF-8
467 if (character_set_unicode && (text_in[i] & 0x80))
468 { unicode = text_to_unicode((char *) (text_in+i));
469 zscii = unicode_to_zscii(unicode);
470 if (zscii != 5) write_zscii(zscii);
472 { unicode_char_error(
473 "Character can only be used if declared in \
474 advance as part of 'Zcharacter table':", unicode);
476 i += textual_form_length - 1;
480 /* '@' is the escape character in Inform string notation: the various
484 @@decimalnumber : write this ZSCII char (0 to 1023)
485 @twodigits : write the abbreviation string with this
489 @accentcode : this accented character: e.g.,
490 for @'e write an E-acute
491 @{...} : this Unicode char (in hex) */
494 { if (text_in[i+1]=='@')
498 i+=2; j=atoi((char *) (text_in+i));
500 { /* Prevent ~ and ^ from being translated to double-quote
501 and new-line, as they ordinarily would be */
503 case 94: write_z_char_z(5); write_z_char_z(6);
504 write_z_char_z(94/32); write_z_char_z(94%32);
506 case 126: write_z_char_z(5); write_z_char_z(6);
507 write_z_char_z(126/32); write_z_char_z(126%32);
510 default: write_zscii(j); break;
512 while (isdigit(text_in[i])) i++; i--;
514 else if (isdigit(text_in[i+1])!=0)
517 /* @.. (dynamic string) */
519 d1 = character_digit_value[text_in[i+1]];
520 d2 = character_digit_value[text_in[i+2]];
521 if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10))
522 error("'@..' must have two decimal digits");
526 if (!glulx_mode && j >= 96)
527 { error("Z-machine dynamic strings are limited to 96");
530 if (j >= MAX_DYNAMIC_STRINGS) {
531 memoryerror("MAX_DYNAMIC_STRINGS", MAX_DYNAMIC_STRINGS);
535 write_z_char_z(j/32+1); write_z_char_z(j%32);
540 /* A string escape specifying an unusual character */
542 unicode = text_to_unicode((char *) (text_in+i));
543 zscii = unicode_to_zscii(unicode);
544 if (zscii != 5) write_zscii(zscii);
546 { unicode_char_error(
547 "Character can only be used if declared in \
548 advance as part of 'Zcharacter table':", unicode);
550 i += textual_form_length - 1;
554 { /* Skip a character which has been over-written with the null
555 value 1 earlier on */
558 { if (text_in[i]==' ') write_z_char_z(0);
560 { j = (int) text_in[i];
561 lookup_value = iso_to_alphabet_grid[j];
562 if (lookup_value < 0)
563 { /* The character isn't in the standard alphabets, so
564 we have to use the ZSCII 4-Z-char sequence */
566 if (lookup_value == -5)
567 { /* Character isn't in the ZSCII set at all */
569 unicode = iso_to_unicode(j);
571 "Character can only be used if declared in \
572 advance as part of 'Zcharacter table':", unicode);
573 write_zscii(0x200 + unicode/0x100);
574 write_zscii(0x300 + unicode%0x100);
576 else write_zscii(-lookup_value);
579 { /* The character is in one of the standard alphabets:
580 write a SHIFT to temporarily change alphabet if
581 it isn't in alphabet 0, then write the Z-char */
583 alphabet_used[lookup_value] = 'Y';
584 in_alphabet = lookup_value/26;
585 if (in_alphabet==1) write_z_char_z(4); /* SHIFT to A1 */
586 if (in_alphabet==2) write_z_char_z(5); /* SHIFT to A2 */
587 write_z_char_z(lookup_value%26 + 6);
594 /* Flush the Z-characters output buffer and set the "end" bit */
601 /* The text storage here is, of course, temporary. Compression
602 will occur when we're finished compiling, so that all the
603 clever Huffman stuff will work.
604 In the stored text, we use "@@" to indicate @,
605 "@0" to indicate a zero byte,
606 "@ANNNN" to indicate an abbreviation,
607 "@DNNNN" to indicate a dynamic string thing.
608 "@UNNNN" to indicate a four-byte Unicode value (0x100 or higher).
609 (NNNN is a four-digit hex number using the letters A-P... an
610 ugly representation but a convenient one.)
613 for (i=0; text_in[i]!=0; i++) {
615 /* Contract ". " into ". " if double-space-removing switch set:
616 likewise "? " and "! " if the setting is high enough. */
617 if ((double_space_setting >= 1)
618 && (text_in[i+1]==' ') && (text_in[i+2]==' ')) {
620 || (double_space_setting >= 2
621 && (text_in[i]=='?' || text_in[i]=='!'))) {
622 text_in[i+1] = text_in[i];
629 /* Try abbreviations if the economy switch set. We have to be in
630 compression mode too, since the abbreviation mechanism is part
631 of string decompression. */
633 if ((economy_switch) && (compression_switch) && (!is_abbreviation)
634 && ((k=abbrevs_lookup[text_in[i]])!=-1)
635 && ((j=try_abbreviations_from(text_in, i, k)) != -1)) {
636 char *cx = (char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
640 write_z_char_g('A' + ((j >>12) & 0x0F));
641 write_z_char_g('A' + ((j >> 8) & 0x0F));
642 write_z_char_g('A' + ((j >> 4) & 0x0F));
643 write_z_char_g('A' + ((j ) & 0x0F));
645 else if (text_in[i] == '@') {
646 if (text_in[i+1]=='@') {
648 i+=2; j=atoi((char *) (text_in+i));
649 if (j == '@' || j == '\0') {
653 if (!compression_switch)
654 warning("Ascii @@0 will prematurely terminate non-compressed \
659 while (isdigit(text_in[i])) i++; i--;
661 else if (isdigit(text_in[i+1])) {
663 d1 = character_digit_value[text_in[i+1]];
664 d2 = character_digit_value[text_in[i+2]];
665 if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10)) {
666 error("'@..' must have two decimal digits");
669 if (!compression_switch)
670 warning("'@..' print variable will not work in non-compressed \
671 string; substituting ' '.");
674 if (j >= MAX_DYNAMIC_STRINGS) {
675 memoryerror("MAX_DYNAMIC_STRINGS", MAX_DYNAMIC_STRINGS);
678 if (j+1 >= no_dynamic_strings)
679 no_dynamic_strings = j+1;
682 write_z_char_g('A' + ((j >>12) & 0x0F));
683 write_z_char_g('A' + ((j >> 8) & 0x0F));
684 write_z_char_g('A' + ((j >> 4) & 0x0F));
685 write_z_char_g('A' + ((j ) & 0x0F));
689 unicode = text_to_unicode((char *) (text_in+i));
690 i += textual_form_length - 1;
691 if (unicode == '@' || unicode == '\0') {
693 write_z_char_g(unicode ? '@' : '0');
695 else if (unicode >= 0 && unicode < 256) {
696 write_z_char_g(unicode);
699 if (!compression_switch) {
700 warning("Unicode characters will not work in non-compressed \
701 string; substituting '?'.");
705 j = unicode_entity_index(unicode);
708 write_z_char_g('A' + ((j >>12) & 0x0F));
709 write_z_char_g('A' + ((j >> 8) & 0x0F));
710 write_z_char_g('A' + ((j >> 4) & 0x0F));
711 write_z_char_g('A' + ((j ) & 0x0F));
716 else if (text_in[i] == '^')
717 write_z_char_g(0x0A);
718 else if (text_in[i] == '~')
720 else if (character_set_unicode) {
721 if (text_in[i] & 0x80) {
722 unicode = text_to_unicode((char *) (text_in+i));
723 i += textual_form_length - 1;
724 if (unicode >= 0 && unicode < 256) {
725 write_z_char_g(unicode);
728 if (!compression_switch) {
729 warning("Unicode characters will not work in non-compressed \
730 string; substituting '?'.");
734 j = unicode_entity_index(unicode);
737 write_z_char_g('A' + ((j >>12) & 0x0F));
738 write_z_char_g('A' + ((j >> 8) & 0x0F));
739 write_z_char_g('A' + ((j >> 4) & 0x0F));
740 write_z_char_g('A' + ((j ) & 0x0F));
745 write_z_char_g(text_in[i]);
749 unicode = iso_to_unicode_grid[text_in[i]];
750 if (unicode >= 0 && unicode < 256) {
751 write_z_char_g(unicode);
754 if (!compression_switch) {
755 warning("Unicode characters will not work in non-compressed \
756 string; substituting '?'.");
760 j = unicode_entity_index(unicode);
763 write_z_char_g('A' + ((j >>12) & 0x0F));
764 write_z_char_g('A' + ((j >> 8) & 0x0F));
765 write_z_char_g('A' + ((j >> 4) & 0x0F));
766 write_z_char_g('A' + ((j ) & 0x0F));
775 if (text_out_overflow)
778 return((uchar *) text_out_pc);
781 static int unicode_entity_index(int32 unicode)
783 unicode_usage_t *uptr;
785 int buck = unicode % UNICODE_HASH_BUCKETS;
787 for (uptr = unicode_usage_hash[buck]; uptr; uptr=uptr->next) {
788 if (uptr->ch == unicode)
792 j = (uptr - unicode_usage_entries);
795 if (no_unicode_chars >= MAX_UNICODE_CHARS) {
796 memoryerror("MAX_UNICODE_CHARS", MAX_UNICODE_CHARS);
800 j = no_unicode_chars;
802 uptr = unicode_usage_entries + j;
804 uptr->next = unicode_usage_hash[buck];
805 unicode_usage_hash[buck] = uptr;
812 /* ------------------------------------------------------------------------- */
813 /* Glulx compression code */
814 /* ------------------------------------------------------------------------- */
817 static void compress_makebits(int entnum, int depth, int prevbit,
818 huffbitlist_t *bits);
820 /* The compressor. This uses the usual Huffman compression algorithm. */
821 void compress_game_text()
823 int entities=0, branchstart, branches;
831 if (compression_switch) {
833 /* How many entities have we currently got? Well, 256 plus the
834 string-terminator plus Unicode chars plus abbrevations plus
837 huff_unicode_start = entities;
838 entities += no_unicode_chars;
839 huff_abbrev_start = entities;
841 entities += no_abbreviations;
842 huff_dynam_start = entities;
843 entities += no_dynamic_strings;
845 if (entities > MAX_CHARACTER_SET)
846 memoryerror("MAX_CHARACTER_SET",MAX_CHARACTER_SET);
849 for (jx=0; jx<256; jx++) {
850 huff_entities[jx].type = 2;
851 huff_entities[jx].count = 0;
852 huff_entities[jx].u.ch = jx;
855 huff_entities[256].type = 1;
856 huff_entities[256].count = 0;
857 for (jx=0; jx<no_unicode_chars; jx++) {
858 huff_entities[huff_unicode_start+jx].type = 4;
859 huff_entities[huff_unicode_start+jx].count = 0;
860 huff_entities[huff_unicode_start+jx].u.val = jx;
862 if (economy_switch) {
863 for (jx=0; jx<no_abbreviations; jx++) {
864 huff_entities[huff_abbrev_start+jx].type = 3;
865 huff_entities[huff_abbrev_start+jx].count = 0;
866 huff_entities[huff_abbrev_start+jx].u.val = jx;
869 for (jx=0; jx<no_dynamic_strings; jx++) {
870 huff_entities[huff_dynam_start+jx].type = 9;
871 huff_entities[huff_dynam_start+jx].count = 0;
872 huff_entities[huff_dynam_start+jx].u.val = jx;
876 /* No compression; use defaults that will make it easy to check
878 no_huff_entities = 257;
879 huff_unicode_start = 257;
880 huff_abbrev_start = 257;
881 huff_dynam_start = 257+MAX_ABBREVS;
882 compression_table_size = 0;
885 if (temporary_files_switch) {
887 Temp1_fp=fopen(Temp1_Name,"rb");
889 fatalerror("I/O failure: couldn't reopen temporary file 1");
892 if (compression_switch) {
894 for (lx=0, ix=0; lx<no_strings; lx++) {
895 int escapelen=0, escapetype=0;
899 if (temporary_files_switch)
900 ch = fgetc(Temp1_fp);
902 ch = read_byte_from_memory_block(&static_strings_area, ix);
904 if (ix > static_strings_extent || ch < 0)
905 compiler_error("Read too much not-yet-compressed text.");
906 if (escapelen == -1) {
911 else if (ch == '0') {
914 else if (ch == 'A' || ch == 'D' || ch == 'U') {
921 compiler_error("Strange @ escape in processed text.");
924 else if (escapelen) {
925 escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
927 if (escapelen == 0) {
928 if (escapetype == 'A') {
929 ch = huff_abbrev_start+escapeval;
931 else if (escapetype == 'D') {
932 ch = huff_dynam_start+escapeval;
934 else if (escapetype == 'U') {
935 ch = huff_unicode_start+escapeval;
938 compiler_error("Strange @ escape in processed text.");
954 huff_entities[ch].count++;
959 for (jx=0; jx<entities; jx++) {
960 if (huff_entities[jx].count) {
961 hufflist[numlive] = &(huff_entities[jx]);
966 branchstart = entities;
969 while (numlive > 1) {
971 int best1num, best2num;
974 if (hufflist[0]->count < hufflist[1]->count) {
983 best1num = hufflist[best1]->count;
984 best2num = hufflist[best2]->count;
986 for (jx=2; jx<numlive; jx++) {
987 if (hufflist[jx]->count < best1num) {
991 best1num = hufflist[best1]->count;
993 else if (hufflist[jx]->count < best2num) {
995 best2num = hufflist[best2]->count;
999 bran = &(huff_entities[branchstart+branches]);
1002 bran->count = hufflist[best1]->count + hufflist[best2]->count;
1003 bran->u.branch[0] = (hufflist[best1] - huff_entities);
1004 bran->u.branch[1] = (hufflist[best2] - huff_entities);
1005 hufflist[best1] = bran;
1006 if (best2 < numlive-1) {
1007 memmove(&(hufflist[best2]), &(hufflist[best2+1]),
1008 ((numlive-1) - best2) * sizeof(huffentity_t *));
1013 huff_entity_root = (hufflist[0] - huff_entities);
1015 for (ix=0; ix<MAXHUFFBYTES; ix++)
1017 compression_table_size = 12;
1019 no_huff_entities = 0; /* compress_makebits will total this up */
1020 compress_makebits(huff_entity_root, 0, -1, &bits);
1023 /* Now, sadly, we have to compute the size of the string section,
1024 without actually doing the compression. */
1025 compression_string_size = 0;
1027 if (temporary_files_switch) {
1028 fseek(Temp1_fp, 0, SEEK_SET);
1031 if (no_strings >= MAX_NUM_STATIC_STRINGS)
1032 memoryerror("MAX_NUM_STATIC_STRINGS", MAX_NUM_STATIC_STRINGS);
1034 for (lx=0, ix=0; lx<no_strings; lx++) {
1035 int escapelen=0, escapetype=0;
1039 compressed_offsets[lx] = compression_table_size + compression_string_size;
1040 compression_string_size++; /* for the type byte */
1042 if (temporary_files_switch)
1043 ch = fgetc(Temp1_fp);
1045 ch = read_byte_from_memory_block(&static_strings_area, ix);
1047 if (ix > static_strings_extent || ch < 0)
1048 compiler_error("Read too much not-yet-compressed text.");
1049 if (escapelen == -1) {
1054 else if (ch == '0') {
1057 else if (ch == 'A' || ch == 'D' || ch == 'U') {
1064 compiler_error("Strange @ escape in processed text.");
1067 else if (escapelen) {
1068 escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
1070 if (escapelen == 0) {
1071 if (escapetype == 'A') {
1072 ch = huff_abbrev_start+escapeval;
1074 else if (escapetype == 'D') {
1075 ch = huff_dynam_start+escapeval;
1077 else if (escapetype == 'U') {
1078 ch = huff_unicode_start+escapeval;
1081 compiler_error("Strange @ escape in processed text.");
1098 if (compression_switch) {
1099 jx += huff_entities[ch].depth;
1100 compression_string_size += (jx/8);
1104 if (ch >= huff_dynam_start) {
1105 compression_string_size += 3;
1107 else if (ch >= huff_unicode_start) {
1108 compiler_error("Abbreviation/Unicode in non-compressed string \
1109 should be impossible.");
1112 compression_string_size += 1;
1115 if (compression_switch && jx)
1116 compression_string_size++;
1119 done_compression = TRUE;
1122 static void compress_makebits(int entnum, int depth, int prevbit,
1123 huffbitlist_t *bits)
1125 huffentity_t *ent = &(huff_entities[entnum]);
1129 ent->addr = compression_table_size;
1134 ent->bits.b[(depth-1) / 8] |= (1 << ((depth-1) % 8));
1137 switch (ent->type) {
1139 compression_table_size += 9;
1140 compress_makebits(ent->u.branch[0], depth+1, 0, &ent->bits);
1141 compress_makebits(ent->u.branch[1], depth+1, 1, &ent->bits);
1144 compression_table_size += 1;
1147 compression_table_size += 2;
1150 cx = (char *)abbreviations_at + ent->u.val*MAX_ABBREV_LENGTH;
1151 compression_table_size += (1 + 1 + strlen(cx));
1155 compression_table_size += 5;
1160 /* ------------------------------------------------------------------------- */
1161 /* The abbreviations optimiser */
1163 /* This is a very complex, memory and time expensive algorithm to */
1164 /* approximately solve the problem of which abbreviation strings would */
1165 /* minimise the total number of Z-chars to which the game text translates. */
1166 /* It is in some ways a quite separate program but remains inside Inform */
1167 /* for compatibility with previous releases. */
1168 /* ------------------------------------------------------------------------- */
1170 typedef struct tlb_s
1172 int32 intab, occurrences;
1175 static int32 no_occs;
1177 static int32 *grandtable;
1178 static int32 *grandflags;
1179 typedef struct optab_s
1184 char text[MAX_ABBREV_LENGTH];
1186 static optab *bestyet, *bestyet2;
1190 static char *sub_buffer;
1192 static void optimise_pass(void)
1193 { int32 i; int t1, t2;
1194 int32 j, j2, k, nl, matches, noflags, score, min, minat=0, x, scrabble, c;
1195 for (i=0; i<256; i++) bestyet[i].length=0;
1196 for (i=0; i<no_occs; i++)
1197 { if ((*(tlbtab[i].text)!=(int) '\n')&&(tlbtab[i].occurrences!=0))
1200 if (i%((**g_pm_hndl).linespercheck) == 0)
1201 { ProcessEvents (&g_proc);
1205 my_free(&all_text,"transcription text");
1206 longjmp (g_fallback, 1);
1210 printf("Pass %d, %4ld/%ld '%s' (%ld occurrences) ",
1211 pass_no, (long int) i, (long int) no_occs, tlbtab[i].text,
1212 (long int) tlbtab[i].occurrences);
1214 for (j=0; j<tlbtab[i].occurrences; j++)
1215 { for (j2=0; j2<tlbtab[i].occurrences; j2++) grandflags[j2]=1;
1216 nl=2; noflags=tlbtab[i].occurrences;
1217 while ((noflags>=2)&&(nl<=62))
1219 for (j2=0; j2<nl; j2++)
1220 if (all_text[grandtable[tlbtab[i].intab+j]+j2]=='\n')
1223 for (j2=j; j2<tlbtab[i].occurrences; j2++)
1224 { if (grandflags[j2]==1)
1225 { x=grandtable[tlbtab[i].intab+j2]
1226 - grandtable[tlbtab[i].intab+j];
1227 if (((x>-nl)&&(x<nl))
1228 || (memcmp(all_text+grandtable[tlbtab[i].intab+j],
1229 all_text+grandtable[tlbtab[i].intab+j2],
1231 { grandflags[j2]=0; noflags--; }
1236 for (k=0; k<nl; k++)
1238 c=all_text[grandtable[tlbtab[i].intab+j+k]];
1240 { if (iso_to_alphabet_grid[c]<0)
1243 if (iso_to_alphabet_grid[c]>=26)
1247 score=(matches-1)*(scrabble-2);
1249 for (j2=0; j2<256; j2++)
1250 { if ((nl==bestyet[j2].length)
1251 && (memcmp(all_text+bestyet[j2].location,
1252 all_text+grandtable[tlbtab[i].intab+j],
1254 { j2=256; min=score; }
1256 { if (bestyet[j2].score<min)
1257 { min=bestyet[j2].score; minat=j2;
1262 { bestyet[minat].score=score;
1263 bestyet[minat].length=nl;
1264 bestyet[minat].location=grandtable[tlbtab[i].intab+j];
1265 bestyet[minat].popularity=matches;
1266 for (j2=0; j2<nl; j2++) sub_buffer[j2]=
1267 all_text[bestyet[minat].location+j2];
1273 t2=((int) time(0)) - t1;
1274 printf(" (%d seconds)\n",t2);
1279 static int any_overlap(char *s1, char *s2)
1280 { int a, b, i, j, flag;
1281 a=strlen(s1); b=strlen(s2);
1282 for (i=1-b; i<a; i++)
1285 if ((0<=i+j)&&(i+j<=a-1))
1286 if (s1[i+j]!=s2[j]) flag=1;
1287 if (flag==0) return(1);
1292 #define MAX_TLBS 8000
1294 extern void optimise_abbreviations(void)
1295 { int32 i, j, t, max=0, MAX_GTABLE;
1296 int32 j2, selected, available, maxat=0, nl;
1299 printf("Beginning calculation of optimal abbreviations...\n");
1302 tlbtab=my_calloc(sizeof(tlb), MAX_TLBS, "tlb table"); no_occs=0;
1303 sub_buffer=my_calloc(sizeof(char), 4000, "sub_buffer");
1304 for (i=0; i<MAX_TLBS; i++) tlbtab[i].occurrences=0;
1306 bestyet=my_calloc(sizeof(optab), 256, "bestyet");
1307 bestyet2=my_calloc(sizeof(optab), 64, "bestyet2");
1309 bestyet2[0].text[0]='.';
1310 bestyet2[0].text[1]=' ';
1311 bestyet2[0].text[2]=0;
1313 bestyet2[1].text[0]=',';
1314 bestyet2[1].text[1]=' ';
1315 bestyet2[1].text[2]=0;
1317 for (i=0; all_text+i<all_text_top; i++)
1319 if ((all_text[i]=='.') && (all_text[i+1]==' ') && (all_text[i+2]==' '))
1320 { all_text[i]='\n'; all_text[i+1]='\n'; all_text[i+2]='\n';
1321 bestyet2[0].popularity++;
1324 if ((all_text[i]=='.') && (all_text[i+1]==' '))
1325 { all_text[i]='\n'; all_text[i+1]='\n';
1326 bestyet2[0].popularity++;
1329 if ((all_text[i]==',') && (all_text[i+1]==' '))
1330 { all_text[i]='\n'; all_text[i+1]='\n';
1331 bestyet2[1].popularity++;
1335 MAX_GTABLE=subtract_pointers(all_text_top,all_text)+1;
1336 grandtable=my_calloc(4*sizeof(int32), MAX_GTABLE/4, "grandtable");
1338 for (i=0, t=0; all_text+i<all_text_top; i++)
1339 { test.text[0]=all_text[i];
1340 test.text[1]=all_text[i+1];
1341 test.text[2]=all_text[i+2];
1343 if ((test.text[0]=='\n')||(test.text[1]=='\n')||(test.text[2]=='\n'))
1345 for (j=0; j<no_occs; j++)
1346 if (strcmp(test.text,tlbtab[j].text)==0)
1349 for (j=i+3; all_text+j<all_text_top; j++)
1352 if (j%((**g_pm_hndl).linespercheck) == 0)
1353 { ProcessEvents (&g_proc);
1357 my_free(&all_text,"transcription text");
1358 longjmp (g_fallback, 1);
1362 if ((all_text[i]==all_text[j])
1363 && (all_text[i+1]==all_text[j+1])
1364 && (all_text[i+2]==all_text[j+2]))
1365 { grandtable[t+test.occurrences]=j;
1367 if (t+test.occurrences==MAX_GTABLE)
1368 { printf("All %ld cross-references used\n",
1369 (long int) MAX_GTABLE);
1374 if (test.occurrences>=2)
1375 { tlbtab[no_occs]=test;
1376 tlbtab[no_occs].intab=t; t+=tlbtab[no_occs].occurrences;
1377 if (max<tlbtab[no_occs].occurrences)
1378 max=tlbtab[no_occs].occurrences;
1380 if (no_occs==MAX_TLBS)
1381 { printf("All %d three-letter-blocks used\n",
1390 grandflags=my_calloc(sizeof(int), max, "grandflags");
1393 printf("Cross-reference table (%ld entries) built...\n",
1394 (long int) no_occs);
1395 /* for (i=0; i<no_occs; i++)
1396 printf("%4d %4d '%s' %d\n",i,tlbtab[i].intab,tlbtab[i].text,
1397 tlbtab[i].occurrences);
1400 for (i=0; i<64; i++) bestyet2[i].length=0; selected=2;
1402 while ((available>0)&&(selected<64))
1403 { printf("Pass %d\n", ++pass_no);
1407 for (i=0; i<256; i++)
1408 if (bestyet[i].score!=0)
1410 nl=bestyet[i].length;
1411 for (j2=0; j2<nl; j2++) bestyet[i].text[j2]=
1412 all_text[bestyet[i].location+j2];
1413 bestyet[i].text[nl]=0;
1416 /* printf("End of pass results:\n");
1417 printf("\nno score freq string\n");
1418 for (i=0; i<256; i++)
1419 if (bestyet[i].score>0)
1420 printf("%02d: %4d %4d '%s'\n", i, bestyet[i].score,
1421 bestyet[i].popularity, bestyet[i].text);
1426 for (i=0; i<256; i++)
1427 if (max<bestyet[i].score)
1428 { max=bestyet[i].score;
1433 { bestyet2[selected++]=bestyet[maxat];
1436 "Selection %2ld: '%s' (repeated %ld times, scoring %ld)\n",
1437 (long int) selected,bestyet[maxat].text,
1438 (long int) bestyet[maxat].popularity,
1439 (long int) bestyet[maxat].score);
1441 test.text[0]=bestyet[maxat].text[0];
1442 test.text[1]=bestyet[maxat].text[1];
1443 test.text[2]=bestyet[maxat].text[2];
1446 for (i=0; i<no_occs; i++)
1447 if (strcmp(test.text,tlbtab[i].text)==0)
1450 for (j=0; j<tlbtab[i].occurrences; j++)
1451 { if (memcmp(bestyet[maxat].text,
1452 all_text+grandtable[tlbtab[i].intab+j],
1453 bestyet[maxat].length)==0)
1454 { for (j2=0; j2<bestyet[maxat].length; j2++)
1455 all_text[grandtable[tlbtab[i].intab+j]+j2]='\n';
1459 for (i=0; i<256; i++)
1460 if ((bestyet[i].score>0)&&
1461 (any_overlap(bestyet[maxat].text,bestyet[i].text)==1))
1462 { bestyet[i].score=0;
1463 /* printf("Discarding '%s' as overlapping\n",
1464 bestyet[i].text); */
1467 } while ((max>0)&&(available>0)&&(selected<64));
1470 printf("\nChosen abbreviations (in Inform syntax):\n\n");
1471 for (i=0; i<selected; i++)
1472 printf("Abbreviate \"%s\";\n", bestyet2[i].text);
1477 /* ------------------------------------------------------------------------- */
1478 /* The dictionary manager begins here. */
1480 /* Speed is extremely important in these algorithms. If a linear-time */
1481 /* routine were used to search the dictionary words so far built up, then */
1482 /* Inform would crawl. */
1484 /* Instead, the dictionary is stored as a binary tree, which is kept */
1485 /* balanced with the red-black algorithm. */
1486 /* ------------------------------------------------------------------------- */
1487 /* A dictionary table similar to the Z-machine format is kept: there is a */
1488 /* 7-byte header (left blank here to be filled in at the */
1489 /* construct_storyfile() stage in "tables.c") and then a sequence of */
1490 /* records, one per word, in the form */
1492 /* <Z-coded text> <flags> <verbnumber> <adjectivenumber> */
1493 /* 4 or 6 bytes byte byte byte */
1495 /* For Glulx, the form is instead: (But see below about Unicode-valued */
1496 /* dictionaries and my heinie.) */
1498 /* <plain text> <flags> <verbnumber> <adjectivenumber> */
1499 /* DICT_WORD_SIZE short short short */
1501 /* These records are stored in "accession order" (i.e. in order of their */
1502 /* first being received by these routines) and only alphabetically sorted */
1503 /* by construct_storyfile() (using the array below). */
1504 /* ------------------------------------------------------------------------- */
1506 /* Further notes about the data fields... */
1507 /* The flags are currently: */
1508 /* bit 0: word is used as a verb (in verb grammar) */
1509 /* bit 1: word is used as a meta verb */
1510 /* bit 2: word is plural (set by '//p') */
1511 /* bit 3: word is used as a preposition (in verb grammar) */
1512 /* bit 6: set for all verbs, but not used by the parser? */
1513 /* bit 7: word is used as a noun (set for every word that appears in */
1514 /* code or in an object property) */
1516 /* In grammar version 2, the third field (adjectivenumber) is unused (and */
1519 /* The compiler generates special constants #dict_par1, #dict_par2, */
1520 /* #dict_par3 to refer to the byte offsets of the three fields. In */
1521 /* Z-code v3, these are 4/5/6; in v4+, they are 6/7/8. In Glulx, they */
1522 /* are $DICT_WORD_SIZE+2/4/6, referring to the *low* bytes of the three */
1523 /* fields. (The high bytes are $DICT_WORD_SIZE+1/3/5.) */
1524 /* ------------------------------------------------------------------------- */
1526 uchar *dictionary, /* (These two pointers are externally
1527 used only in "tables.c" when
1528 building the story-file) */
1529 *dictionary_top; /* Pointer to next free record */
1531 int dict_entries; /* Total number of records entered */
1533 /* ------------------------------------------------------------------------- */
1534 /* dict_word is a typedef for a struct of 6 unsigned chars (defined in */
1535 /* "header.h"): it holds the (4 or) 6 bytes of Z-coded text of a word. */
1536 /* Usefully, because the PAD character 5 is < all alphabetic characters, */
1537 /* alphabetic order corresponds to numeric order. For this reason, the */
1538 /* dict_word is called the "sort code" of the original text word. */
1540 /* ###- In modifying the compiler, I've found it easier to discard the */
1541 /* typedef, and operate directly on uchar arrays of length DICT_WORD_SIZE. */
1542 /* In Z-code, DICT_WORD_SIZE will be 6, so the Z-code compiler will work */
1543 /* as before. In Glulx, it can be any value up to MAX_DICT_WORD_SIZE. */
1544 /* (That limit is defined as 40 in the header; it exists only for a few */
1545 /* static buffers, and can be increased without using significant memory.) */
1547 /* ###- Well, that certainly bit me on the butt, didn't it. In further */
1548 /* modifying the compiler to generate a Unicode dictionary, I have to */
1549 /* store four-byte values in the uchar array. This is handled by making */
1550 /* the array size DICT_WORD_BYTES (which is DICT_WORD_SIZE*DICT_CHAR_SIZE).*/
1551 /* Then we store the 32-bit character value big-endian. This lets us */
1552 /* continue to compare arrays bytewise, which is a nice simplification. */
1553 /* ------------------------------------------------------------------------- */
1555 extern int compare_sorts(uchar *d1, uchar *d2)
1557 for (i=0; i<DICT_WORD_BYTES; i++)
1558 if (d1[i]!=d2[i]) return((int)(d1[i]) - (int)(d2[i]));
1559 /* (since memcmp(d1, d2, DICT_WORD_BYTES); runs into a bug on some Unix
1564 extern void copy_sorts(uchar *d1, uchar *d2)
1566 for (i=0; i<DICT_WORD_BYTES; i++)
1570 static uchar prepared_sort[MAX_DICT_WORD_BYTES]; /* Holds the sort code
1573 static int number_and_case;
1575 /* Also used by verbs.c */
1576 static void dictionary_prepare_z(char *dword, uchar *optresult)
1577 { int i, j, k, k2, wd[13]; int32 tot;
1579 /* A rapid text translation algorithm using only the simplified rules
1580 applying to the text of dictionary entries: first produce a sequence
1581 of 6 (v3) or 9 (v4+) Z-characters */
1583 int dictsize = (version_number==3) ? 6 : 9;
1585 number_and_case = 0;
1587 for (i=0, j=0; dword[j]!=0; i++, j++)
1588 { if ((dword[j] == '/') && (dword[j+1] == '/'))
1589 { for (j+=2; dword[j] != 0; j++)
1591 { case 'p': number_and_case |= 4; break;
1593 error_named("Expected 'p' after '//' \
1594 to give number of dictionary word", dword);
1600 if (i>=dictsize) break;
1604 warning_named("Obsolete usage: use the ^ character for the \
1605 apostrophe in", dword);
1606 if (k==(int) '^') k=(int) '\'';
1609 if (k==(int) '@' || (character_set_unicode && (k & 0x80)))
1610 { int unicode = text_to_unicode(dword+j);
1611 if ((unicode < 128) && isupper(unicode)) unicode = tolower(unicode);
1612 k = unicode_to_zscii(unicode);
1613 j += textual_form_length - 1;
1614 if ((k == 5) || (k >= 0x100))
1615 { unicode_char_error(
1616 "Character can be printed but not input:", unicode);
1619 k2 = zscii_to_alphabet_grid[(uchar) k];
1622 { if (isupper(k)) k = tolower(k);
1623 k2 = iso_to_alphabet_grid[(uchar) k];
1627 { if ((k2 == -5) || (k2 <= -0x100))
1628 char_error("Character can be printed but not input:", k);
1630 { /* Use 4 more Z-chars to encode a ZSCII escape sequence */
1632 wd[i++] = 5; wd[i++] = 6;
1634 wd[i++] = k2/32; wd[i] = k2%32;
1638 { alphabet_used[k2] = 'Y';
1640 wd[i++]=3+(k2/26); /* Change alphabet for symbols */
1641 wd[i]=6+(k2%26); /* Write the Z character */
1645 /* Fill up to the end of the dictionary block with PAD characters */
1647 for (; i<9; i++) wd[i]=5;
1649 /* The array of Z-chars is converted to three 2-byte blocks */
1651 tot = wd[2] + wd[1]*(1<<5) + wd[0]*(1<<10);
1652 prepared_sort[1]=tot%0x100;
1653 prepared_sort[0]=(tot/0x100)%0x100;
1654 tot = wd[5] + wd[4]*(1<<5) + wd[3]*(1<<10);
1655 prepared_sort[3]=tot%0x100;
1656 prepared_sort[2]=(tot/0x100)%0x100;
1657 tot = wd[8] + wd[7]*(1<<5) + wd[6]*(1<<10);
1658 prepared_sort[5]=tot%0x100;
1659 prepared_sort[4]=(tot/0x100)%0x100;
1661 /* Set the "end bit" on the 2nd (in v3) or the 3rd (v4+) 2-byte block */
1663 if (version_number==3) prepared_sort[2]+=0x80;
1664 else prepared_sort[4]+=0x80;
1666 if (optresult) copy_sorts(optresult, prepared_sort);
1669 /* Also used by verbs.c */
1670 static void dictionary_prepare_g(char *dword, uchar *optresult)
1675 number_and_case = 0;
1677 for (i=0, j=0; (dword[j]!=0); i++, j++) {
1678 if ((dword[j] == '/') && (dword[j+1] == '/')) {
1679 for (j+=2; dword[j] != 0; j++) {
1682 number_and_case |= 4;
1685 error_named("Expected 'p' after '//' \
1686 to give gender or number of dictionary word", dword);
1692 if (i>=DICT_WORD_SIZE) break;
1694 k= ((unsigned char *)dword)[j];
1696 warning_named("Obsolete usage: use the ^ character for the \
1697 apostrophe in", dword);
1700 if (k=='~') /* as in iso_to_alphabet_grid */
1703 if (k=='@' || (character_set_unicode && (k & 0x80))) {
1704 unicode = text_to_unicode(dword+j);
1705 j += textual_form_length - 1;
1708 unicode = iso_to_unicode_grid[k];
1711 if (DICT_CHAR_SIZE != 1 || (unicode >= 0 && unicode < 256)) {
1715 error("The dictionary cannot contain Unicode characters beyond Latin-1. \
1716 Define DICT_CHAR_SIZE=4 for a Unicode-compatible dictionary.");
1720 if (k >= (unsigned)'A' && k <= (unsigned)'Z')
1723 if (DICT_CHAR_SIZE == 1) {
1724 prepared_sort[i] = k;
1727 prepared_sort[4*i] = (k >> 24) & 0xFF;
1728 prepared_sort[4*i+1] = (k >> 16) & 0xFF;
1729 prepared_sort[4*i+2] = (k >> 8) & 0xFF;
1730 prepared_sort[4*i+3] = (k) & 0xFF;
1734 if (DICT_CHAR_SIZE == 1) {
1735 for (; i<DICT_WORD_SIZE; i++)
1736 prepared_sort[i] = 0;
1739 for (; i<DICT_WORD_SIZE; i++) {
1740 prepared_sort[4*i] = 0;
1741 prepared_sort[4*i+1] = 0;
1742 prepared_sort[4*i+2] = 0;
1743 prepared_sort[4*i+3] = 0;
1747 if (optresult) copy_sorts(optresult, prepared_sort);
1750 extern void dictionary_prepare(char *dword, uchar *optresult)
1753 dictionary_prepare_z(dword, optresult);
1755 dictionary_prepare_g(dword, optresult);
1758 /* ------------------------------------------------------------------------- */
1759 /* The arrays below are all concerned with the problem of alphabetically */
1760 /* sorting the dictionary during the compilation pass. */
1761 /* Note that it is not enough simply to apply qsort to the dictionary at */
1762 /* the end of the pass: we need to ensure that no duplicates are ever */
1765 /* dict_sort_codes[n] the sort code of record n: i.e., of the nth */
1766 /* word to be entered into the dictionary, where */
1767 /* n counts upward from 0 */
1768 /* (n is also called the "accession number") */
1770 /* The tree structure encodes an ordering. The special value VACANT means */
1771 /* "no node here": otherwise, node numbers are the same as accession */
1772 /* numbers. At all times, "root" holds the node number of the top of the */
1773 /* tree; each node has up to two branches, such that the subtree of the */
1774 /* left branch is always alphabetically before what's at the node, and */
1775 /* the subtree to the right is always after; and all branches are coloured */
1776 /* either "black" or "red". These colours are used to detect points where */
1777 /* the tree is growing asymmetrically (and therefore becoming inefficient */
1779 /* ------------------------------------------------------------------------- */
1786 typedef struct dict_tree_node_s
1787 { int branch[2]; /* Branch 0 is "left", 1 is "right" */
1788 char colour; /* The colour of the branch to the parent */
1791 static dict_tree_node *dtree;
1793 int *final_dict_order;
1794 static uchar *dict_sort_codes;
1796 static void dictionary_begin_pass(void)
1798 /* Leave room for the 7-byte header (added in "tables.c" much later) */
1799 /* Glulx has a 4-byte header instead. */
1802 dictionary_top=dictionary+7;
1804 dictionary_top=dictionary+4;
1810 static int fdo_count;
1811 static void recursively_sort(int node)
1812 { if (dtree[node].branch[0] != VACANT)
1813 recursively_sort(dtree[node].branch[0]);
1814 final_dict_order[node] = fdo_count++;
1815 if (dtree[node].branch[1] != VACANT)
1816 recursively_sort(dtree[node].branch[1]);
1819 extern void sort_dictionary(void)
1822 { for (i=0; i<dict_entries; i++)
1823 final_dict_order[i] = i;
1828 { fdo_count = 0; recursively_sort(root);
1832 /* ------------------------------------------------------------------------- */
1833 /* If "dword" is in the dictionary, return its accession number plus 1; */
1834 /* If not, return 0. */
1835 /* ------------------------------------------------------------------------- */
1837 static int dictionary_find(char *dword)
1840 dictionary_prepare(dword, NULL);
1842 while (at != VACANT)
1843 { n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
1844 if (n==0) return at + 1;
1845 if (n>0) at = dtree[at].branch[1]; else at = dtree[at].branch[0];
1850 /* ------------------------------------------------------------------------- */
1851 /* Add "dword" to the dictionary with (x,y,z) as its data fields; unless */
1852 /* it already exists, in which case OR the data with (x,y,z) */
1854 /* These fields are one byte each in Z-code, two bytes each in Glulx. */
1856 /* Returns: the accession number. */
1857 /* ------------------------------------------------------------------------- */
1859 extern int dictionary_add(char *dword, int x, int y, int z)
1861 int ggfr = 0, gfr = 0, fr = 0, r = 0;
1862 int ggf = VACANT, gf = VACANT, f = VACANT, at = root;
1864 int res=((version_number==3)?4:6);
1866 dictionary_prepare(dword, NULL);
1869 { root = 0; goto CreateEntry;
1873 n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
1877 p = dictionary+7 + at*(3+res) + res;
1878 p[0]=(p[0])|x; p[1]=(p[1])|y; p[2]=(p[2])|z;
1879 if (x & 128) p[0] = (p[0])|number_and_case;
1882 p = dictionary+4 + at*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
1883 p[0]=(p[0])|(x/256); p[1]=(p[1])|(x%256);
1884 p[2]=(p[2])|(y/256); p[3]=(p[3])|(y%256);
1885 p[4]=(p[4])|(z/256); p[5]=(p[5])|(z%256);
1886 if (x & 128) p[1] = (p[1]) | number_and_case;
1890 if (n>0) r=1; else r=0;
1892 a = dtree[at].branch[0]; b = dtree[at].branch[1];
1893 if ((a != VACANT) && (dtree[a].colour == RED) &&
1894 (b != VACANT) && (dtree[b].colour == RED))
1895 { dtree[a].colour = BLACK;
1896 dtree[b].colour = BLACK;
1898 dtree[at].colour = RED;
1900 /* A tree rotation may be needed to avoid two red links in a row:
1902 ggf (or else gf is root) ggf (or f is root)
1905 / \(red) / \ (both red)
1911 In effect we rehang the "gf" subtree from "f".
1912 See the Technical Manual for further details.
1915 if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
1918 { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
1919 dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
1920 dtree[f].branch[1-fr] = gf;
1921 dtree[f].colour = BLACK;
1922 dtree[gf].colour = RED;
1923 gf = ggf; gfr = ggfr;
1926 { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
1927 dtree[at].colour = BLACK;
1928 dtree[gf].colour = RED;
1929 dtree[f].branch[fr] = dtree[at].branch[gfr];
1930 dtree[gf].branch[gfr] = dtree[at].branch[fr];
1931 dtree[at].branch[gfr] = f;
1932 dtree[at].branch[fr] = gf;
1934 r = 1-r; n = at; if (r==fr) at = f; else at = gf;
1935 f = n; gf = ggf; fr = 1-r; gfr = ggfr;
1940 if (dtree[at].branch[r] == VACANT)
1941 { dtree[at].colour = RED;
1943 if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
1945 { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
1946 dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
1947 dtree[f].branch[1-fr] = gf;
1948 dtree[f].colour = BLACK;
1949 dtree[gf].colour = RED;
1952 { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
1953 dtree[at].colour = BLACK;
1954 dtree[gf].colour = RED;
1955 dtree[f].branch[fr] = dtree[at].branch[gfr];
1956 dtree[gf].branch[gfr] = dtree[at].branch[fr];
1957 dtree[at].branch[gfr] = f;
1958 dtree[at].branch[fr] = gf;
1960 r = 1-r; n = at; if (r==fr) at = f; else at = gf;
1964 dtree[at].branch[r] = dict_entries;
1967 ggf = gf; gf = f; f = at; at = dtree[at].branch[r];
1968 ggfr = gfr; gfr = fr; fr = r;
1973 if (dict_entries==MAX_DICT_ENTRIES)
1974 memoryerror("MAX_DICT_ENTRIES",MAX_DICT_ENTRIES);
1976 dtree[dict_entries].branch[0] = VACANT;
1977 dtree[dict_entries].branch[1] = VACANT;
1978 dtree[dict_entries].colour = BLACK;
1980 /* Address in Inform's own dictionary table to write the record to */
1984 p = dictionary + (3+res)*dict_entries + 7;
1986 /* So copy in the 4 (or 6) bytes of Z-coded text and the 3 data
1989 p[0]=prepared_sort[0]; p[1]=prepared_sort[1];
1990 p[2]=prepared_sort[2]; p[3]=prepared_sort[3];
1991 if (version_number > 3)
1992 { p[4]=prepared_sort[4]; p[5]=prepared_sort[5]; }
1993 p[res]=x; p[res+1]=y; p[res+2]=z;
1994 if (x & 128) p[res] = (p[res])|number_and_case;
1996 dictionary_top += res+3;
2001 p = dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*dict_entries;
2002 p[0] = 0x60; /* type byte -- dict word */
2004 p += DICT_CHAR_SIZE;
2005 for (i=0; i<DICT_WORD_BYTES; i++)
2006 p[i] = prepared_sort[i];
2008 p += DICT_WORD_BYTES;
2010 p[2] = y/256; p[3] = y%256;
2013 p[1] |= number_and_case;
2015 dictionary_top += DICT_ENTRY_BYTE_LENGTH;
2019 copy_sorts(dict_sort_codes+dict_entries*DICT_WORD_BYTES, prepared_sort);
2021 return dict_entries++;
2024 /* ------------------------------------------------------------------------- */
2025 /* Used in "tables.c" for "Extend ... only", to renumber a verb-word to a */
2026 /* new verb syntax of its own. (Otherwise existing verb-words never */
2027 /* change their verb-numbers.) */
2028 /* ------------------------------------------------------------------------- */
2030 extern void dictionary_set_verb_number(char *dword, int to)
2032 int res=((version_number==3)?4:6);
2033 i=dictionary_find(dword);
2037 p=dictionary+7+(i-1)*(3+res)+res;
2041 p=dictionary+4 + (i-1)*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
2042 p[2]=to/256; p[3]=to%256;
2047 /* ------------------------------------------------------------------------- */
2048 /* Tracing code for the dictionary: used not only by "trace" and text */
2049 /* transcription, but also (in the case of "word_to_ascii") in a vital */
2050 /* by the linker. */
2051 /* ------------------------------------------------------------------------- */
2053 static char *d_show_to;
2054 static int d_show_total;
2056 static void show_char(char c)
2057 { if (d_show_to == NULL) printf("%c", c);
2059 { int i = strlen(d_show_to);
2060 d_show_to[i] = c; d_show_to[i+1] = 0;
2064 extern void word_to_ascii(uchar *p, char *results)
2065 { int i, shift, cc, zchar; uchar encoded_word[9];
2066 encoded_word[0] = (((int) p[0])&0x7c)/4;
2067 encoded_word[1] = 8*(((int) p[0])&0x3) + (((int) p[1])&0xe0)/32;
2068 encoded_word[2] = ((int) p[1])&0x1f;
2069 encoded_word[3] = (((int) p[2])&0x7c)/4;
2070 encoded_word[4] = 8*(((int) p[2])&0x3) + (((int) p[3])&0xe0)/32;
2071 encoded_word[5] = ((int) p[3])&0x1f;
2072 if (version_number > 3)
2073 { encoded_word[6] = (((int) p[4])&0x7c)/4;
2074 encoded_word[7] = 8*(((int) p[4])&0x3) + (((int) p[5])&0xe0)/32;
2075 encoded_word[8] = ((int) p[5])&0x1f;
2079 for (i=0; i< ((version_number==3)?6:9); i++)
2080 { zchar = encoded_word[i];
2082 if (zchar == 4) shift = 1;
2084 if (zchar == 5) shift = 2;
2086 { if ((shift == 2) && (zchar == 6))
2087 { zchar = 32*encoded_word[i+1] + encoded_word[i+2];
2089 if ((zchar>=32) && (zchar<=126))
2090 results[cc++] = zchar;
2092 { zscii_to_text(results+cc, zchar);
2093 cc = strlen(results);
2097 { zscii_to_text(results+cc, (alphabet[shift])[zchar-6]);
2098 cc = strlen(results);
2106 static void recursively_show_z(int node)
2107 { int i, cprinted, flags; uchar *p;
2108 char textual_form[32];
2109 int res = (version_number == 3)?4:6;
2111 if (dtree[node].branch[0] != VACANT)
2112 recursively_show_z(dtree[node].branch[0]);
2114 p = (uchar *)dictionary + 7 + (3+res)*node;
2116 word_to_ascii(p, textual_form);
2118 for (cprinted = 0; textual_form[cprinted]!=0; cprinted++)
2119 show_char(textual_form[cprinted]);
2120 for (; cprinted < 4 + ((version_number==3)?6:9); cprinted++)
2123 if (d_show_to == NULL)
2124 { for (i=0; i<3+res; i++) printf("%02x ",p[i]);
2126 flags = (int) p[res];
2129 if (flags & 4) printf("p"); else printf(" ");
2134 { if (grammar_version_number == 1)
2135 printf("preposition:%d ", (int) p[res+2]);
2137 printf("preposition ");
2139 if ((flags & 3) == 3) printf("metaverb:%d ", (int) p[res+1]);
2140 else if ((flags & 3) == 1) printf("verb:%d ", (int) p[res+1]);
2144 if (d_show_total++ == 5)
2146 if (d_show_to != NULL)
2147 { write_to_transcript_file(d_show_to);
2152 if (dtree[node].branch[1] != VACANT)
2153 recursively_show_z(dtree[node].branch[1]);
2156 static void recursively_show_g(int node)
2158 warning("### Glulx dictionary-show not yet implemented.\n");
2161 static void show_alphabet(int i)
2162 { int j, c; char chartext[8];
2164 for (j=0; j<26; j++)
2165 { c = alphabet[i][j];
2167 if (alphabet_used[26*i+j] == 'N') printf("("); else printf(" ");
2169 zscii_to_text(chartext, c);
2170 printf("%s", chartext);
2172 if (alphabet_used[26*i+j] == 'N') printf(")"); else printf(" ");
2177 extern void show_dictionary(void)
2178 { printf("Dictionary contains %d entries:\n",dict_entries);
2179 if (dict_entries != 0)
2180 { d_show_total = 0; d_show_to = NULL;
2182 recursively_show_z(root);
2184 recursively_show_g(root);
2186 printf("\nZ-machine alphabet entries:\n");
2192 extern void write_dictionary_to_transcript(void)
2193 { char d_buffer[81];
2195 sprintf(d_buffer, "\n[Dictionary contains %d entries:]\n", dict_entries);
2197 d_buffer[0] = 0; write_to_transcript_file(d_buffer);
2199 if (dict_entries != 0)
2200 { d_show_total = 0; d_show_to = d_buffer;
2202 recursively_show_z(root);
2204 recursively_show_g(root);
2206 if (d_show_total != 0) write_to_transcript_file(d_buffer);
2209 /* ========================================================================= */
2210 /* Data structure management routines */
2211 /* ------------------------------------------------------------------------- */
2213 extern void init_text_vars(void)
2220 no_chars_transcribed = 0;
2221 is_abbreviation = FALSE;
2223 for (j=0; j<256; j++) abbrevs_lookup[j] = -1;
2225 total_zchars_trans = 0;
2228 final_dict_order = NULL;
2229 dict_sort_codes = NULL;
2232 initialise_memory_block(&static_strings_area);
2235 extern void text_begin_pass(void)
2236 { abbrevs_lookup_table_made = FALSE;
2238 total_chars_trans=0; total_bytes_trans=0;
2239 if (store_the_text) all_text_top=all_text;
2240 dictionary_begin_pass();
2241 low_strings_top = low_strings;
2243 static_strings_extent = 0;
2245 no_dynamic_strings = 0;
2246 no_unicode_chars = 0;
2249 /* Note: for allocation and deallocation of all_the_text, see inform.c */
2251 extern void text_allocate_arrays(void)
2252 { abbreviations_at = my_malloc(MAX_ABBREVS*MAX_ABBREV_LENGTH,
2254 abbrev_values = my_calloc(sizeof(int), MAX_ABBREVS, "abbrev values");
2255 abbrev_quality = my_calloc(sizeof(int), MAX_ABBREVS, "abbrev quality");
2256 abbrev_freqs = my_calloc(sizeof(int), MAX_ABBREVS, "abbrev freqs");
2258 dtree = my_calloc(sizeof(dict_tree_node), MAX_DICT_ENTRIES,
2259 "red-black tree for dictionary");
2260 final_dict_order = my_calloc(sizeof(int), MAX_DICT_ENTRIES,
2261 "final dictionary ordering table");
2262 dict_sort_codes = my_calloc(DICT_WORD_BYTES, MAX_DICT_ENTRIES,
2263 "dictionary sort codes");
2266 dictionary = my_malloc(9*MAX_DICT_ENTRIES+7,
2269 dictionary = my_malloc(DICT_ENTRY_BYTE_LENGTH*MAX_DICT_ENTRIES+4,
2272 strings_holding_area
2273 = my_malloc(MAX_STATIC_STRINGS,"static strings holding area");
2274 low_strings = my_malloc(MAX_LOW_STRINGS,"low (abbreviation) strings");
2276 huff_entities = NULL;
2278 unicode_usage_entries = NULL;
2279 done_compression = FALSE;
2280 compression_table_size = 0;
2281 compressed_offsets = NULL;
2283 MAX_CHARACTER_SET = 0;
2286 if (compression_switch) {
2288 MAX_CHARACTER_SET = 257 + MAX_ABBREVS + MAX_DYNAMIC_STRINGS
2289 + MAX_UNICODE_CHARS;
2290 huff_entities = my_calloc(sizeof(huffentity_t), MAX_CHARACTER_SET*2+1,
2291 "huffman entities");
2292 hufflist = my_calloc(sizeof(huffentity_t *), MAX_CHARACTER_SET,
2293 "huffman node list");
2294 unicode_usage_entries = my_calloc(sizeof(unicode_usage_t),
2295 MAX_UNICODE_CHARS, "unicode entity entries");
2296 for (ix=0; ix<UNICODE_HASH_BUCKETS; ix++)
2297 unicode_usage_hash[ix] = NULL;
2299 compressed_offsets = my_calloc(sizeof(int32), MAX_NUM_STATIC_STRINGS,
2300 "static strings index table");
2304 extern void text_free_arrays(void)
2306 my_free(&strings_holding_area, "static strings holding area");
2307 my_free(&low_strings, "low (abbreviation) strings");
2308 my_free(&abbreviations_at, "abbreviations");
2309 my_free(&abbrev_values, "abbrev values");
2310 my_free(&abbrev_quality, "abbrev quality");
2311 my_free(&abbrev_freqs, "abbrev freqs");
2313 my_free(&dtree, "red-black tree for dictionary");
2314 my_free(&final_dict_order, "final dictionary ordering table");
2315 my_free(&dict_sort_codes, "dictionary sort codes");
2317 my_free(&dictionary,"dictionary");
2319 my_free(&compressed_offsets, "static strings index table");
2320 my_free(&hufflist, "huffman node list");
2321 my_free(&huff_entities, "huffman entities");
2322 my_free(&unicode_usage_entries, "unicode entity entities");
2324 deallocate_memory_block(&static_strings_area);
2327 extern void ao_free_arrays(void)
2328 { my_free (&tlbtab,"tlb table");
2329 my_free (&sub_buffer,"sub_buffer");
2330 my_free (&bestyet,"bestyet");
2331 my_free (&bestyet2,"bestyet2");
2332 my_free (&grandtable,"grandtable");
2333 my_free (&grandflags,"grandflags");
2336 /* ========================================================================= */