1 /* ------------------------------------------------------------------------- */
2 /* "text" : Text translation, the abbreviations optimiser, the dictionary */
4 /* Part of Inform 6.35 */
5 /* copyright (c) Graham Nelson 1993 - 2020 */
7 /* Inform is free software: you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation, either version 3 of the License, or */
10 /* (at your option) any later version. */
12 /* Inform is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with Inform. If not, see https://gnu.org/licenses/ *
20 /* ------------------------------------------------------------------------- */
24 uchar *low_strings, *low_strings_top; /* Start and next free byte in the low
27 int32 static_strings_extent; /* Number of bytes of static strings
29 memory_block static_strings_area; /* Used if (!temporary_files_switch) to
30 hold the static strings area so far */
32 static uchar *strings_holding_area; /* Area holding translated strings
33 until they are moved into either
34 a temporary file, or the
35 static_strings_area below */
37 char *all_text, *all_text_top; /* Start and next byte free in (large)
38 text buffer holding the entire text
39 of the game, when it is being
41 int put_strings_in_low_memory, /* When TRUE, put static strings in
42 the low strings pool at 0x100 rather
43 than in the static strings area */
44 is_abbreviation, /* When TRUE, the string being trans
45 is itself an abbreviation string
46 so can't make use of abbreviations */
47 abbrevs_lookup_table_made, /* The abbreviations lookup table is
48 constructed when the first non-
49 abbreviation string is translated:
50 this flag is TRUE after that */
51 abbrevs_lookup[256]; /* Once this has been constructed,
52 abbrevs_lookup[n] = the smallest
53 number of any abbreviation beginning
54 with ASCII character n, or -1
55 if none of the abbreviations do */
56 int no_abbreviations; /* No of abbreviations defined so far */
57 uchar *abbreviations_at; /* Memory to hold the text of any
58 abbreviation strings declared */
59 /* ------------------------------------------------------------------------- */
60 /* Glulx string compression storage */
61 /* ------------------------------------------------------------------------- */
63 int no_strings; /* No of strings in static strings
65 int no_dynamic_strings; /* No. of @.. string escapes used
66 (actually, the highest value used
68 int no_unicode_chars; /* Number of distinct Unicode chars
69 used. (Beyond 0xFF.) */
71 static int MAX_CHARACTER_SET; /* Number of possible entities */
72 huffentity_t *huff_entities; /* The list of entities (characters,
73 abbreviations, @.. escapes, and
75 static huffentity_t **hufflist; /* Copy of the list, for sorting */
77 int no_huff_entities; /* The number of entities in the list */
78 int huff_unicode_start; /* Position in the list where Unicode
80 int huff_abbrev_start; /* Position in the list where string
81 abbreviations begin. */
82 int huff_dynam_start; /* Position in the list where @..
84 int huff_entity_root; /* The position in the list of the root
85 entry (when considering the table
88 int done_compression; /* Has the game text been compressed? */
89 int32 compression_table_size; /* Length of the Huffman table, in
91 int32 compression_string_size; /* Length of the compressed string
93 int32 *compressed_offsets; /* The beginning of every string in
94 the game, relative to the beginning
95 of the Huffman table. (So entry 0
96 is equal to compression_table_size)*/
98 #define UNICODE_HASH_BUCKETS (64)
99 unicode_usage_t *unicode_usage_entries;
100 static unicode_usage_t *unicode_usage_hash[UNICODE_HASH_BUCKETS];
102 static int unicode_entity_index(int32 unicode);
104 /* ------------------------------------------------------------------------- */
105 /* Abbreviation arrays */
106 /* ------------------------------------------------------------------------- */
112 /* ------------------------------------------------------------------------- */
114 int32 total_chars_trans, /* Number of ASCII chars of text in */
115 total_bytes_trans, /* Number of bytes of Z-code text out */
116 zchars_trans_in_last_string; /* Number of Z-chars in last string:
117 needed only for abbrev efficiency
118 calculation in "directs.c" */
119 static int32 total_zchars_trans, /* Number of Z-chars of text out
120 (only used to calculate the above) */
121 no_chars_transcribed; /* Number of ASCII chars written to
122 the text transcription area (used
123 for the -r and -u switches) */
125 static int zchars_out_buffer[3], /* During text translation, a buffer of
126 3 Z-chars at a time: when it's full
127 these are written as a 2-byte word */
128 zob_index; /* Index (0 to 2) into it */
130 static unsigned char *text_out_pc; /* The "program counter" during text
131 translation: the next address to
132 write Z-coded text output to */
134 static unsigned char *text_out_limit; /* The upper limit of text_out_pc
135 during text translation */
137 static int text_out_overflow; /* During text translation, becomes
138 true if text_out_pc tries to pass
141 /* ------------------------------------------------------------------------- */
142 /* For variables/arrays used by the dictionary manager, see below */
143 /* ------------------------------------------------------------------------- */
145 /* ------------------------------------------------------------------------- */
146 /* Prepare the abbreviations lookup table (used to speed up abbreviation */
147 /* detection in text translation). We first bubble-sort the abbrevs into */
148 /* alphabetical order (this is necessary for the detection algorithm to */
149 /* to work). Since the table is only prepared once, and for a table */
150 /* of size at most 96, there's no point using an efficient sort algorithm. */
151 /* ------------------------------------------------------------------------- */
153 static void make_abbrevs_lookup(void)
154 { int bubble_sort, j, k, l; char p[MAX_ABBREV_LENGTH]; char *p1, *p2;
156 { bubble_sort = FALSE;
157 for (j=0; j<no_abbreviations; j++)
158 for (k=j+1; k<no_abbreviations; k++)
159 { p1=(char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
160 p2=(char *)abbreviations_at+k*MAX_ABBREV_LENGTH;
162 { strcpy(p,p1); strcpy(p1,p2); strcpy(p2,p);
163 l=abbrev_values[j]; abbrev_values[j]=abbrev_values[k];
165 l=abbrev_quality[j]; abbrev_quality[j]=abbrev_quality[k];
170 } while (bubble_sort);
172 for (j=no_abbreviations-1; j>=0; j--)
173 { p1=(char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
174 abbrevs_lookup[(uchar)p1[0]]=j;
177 abbrevs_lookup_table_made = TRUE;
180 /* ------------------------------------------------------------------------- */
181 /* Search the abbreviations lookup table (a routine which must be fast). */
182 /* The source text to compare is text[i], text[i+1], ... and this routine */
183 /* is only called if text[i] is indeed the first character of at least one */
184 /* abbreviation, "from" begin the least index into the abbreviations table */
185 /* of an abbreviation for which text[i] is the first character. Recall */
186 /* that the abbrevs table is in alphabetical order. */
188 /* The return value is -1 if there is no match. If there is a match, the */
189 /* text to be abbreviated out is over-written by a string of null chars */
190 /* with "ASCII" value 1, and the abbreviation number is returned. */
192 /* In Glulx, we *do not* do this overwriting with 1's. */
193 /* ------------------------------------------------------------------------- */
195 static int try_abbreviations_from(unsigned char *text, int i, int from)
196 { int j, k; uchar *p, c;
198 for (j=from, p=(uchar *)abbreviations_at+from*MAX_ABBREV_LENGTH;
199 (j<no_abbreviations)&&(c==p[0]); j++, p+=MAX_ABBREV_LENGTH)
200 { if (text[i+1]==p[1])
201 { for (k=2; p[k]!=0; k++)
202 if (text[i+k]!=p[k]) goto NotMatched;
204 for (k=0; p[k]!=0; k++) text[i+k]=1;
214 extern void make_abbreviation(char *text)
216 strcpy((char *)abbreviations_at
217 + no_abbreviations*MAX_ABBREV_LENGTH, text);
219 is_abbreviation = TRUE;
220 abbrev_values[no_abbreviations] = compile_string(text, TRUE, TRUE);
221 is_abbreviation = FALSE;
223 /* The quality is the number of Z-chars saved by using this */
224 /* abbreviation: note that it takes 2 Z-chars to print it. */
226 abbrev_quality[no_abbreviations++] = zchars_trans_in_last_string - 2;
229 /* ------------------------------------------------------------------------- */
230 /* The front end routine for text translation */
231 /* ------------------------------------------------------------------------- */
233 extern int32 compile_string(char *b, int in_low_memory, int is_abbrev)
234 { int i, j; uchar *c;
236 is_abbreviation = is_abbrev;
238 /* Put into the low memory pool (at 0x100 in the Z-machine) of strings */
239 /* which may be wanted as possible entries in the abbreviations table */
241 if (!glulx_mode && in_low_memory)
242 { j=subtract_pointers(low_strings_top,low_strings);
243 low_strings_top=translate_text(low_strings_top, low_strings+MAX_LOW_STRINGS, b);
244 if (!low_strings_top)
245 memoryerror("MAX_LOW_STRINGS", MAX_LOW_STRINGS);
246 is_abbreviation = FALSE;
250 if (glulx_mode && done_compression)
251 compiler_error("Tried to add a string after compression was done.");
253 c = translate_text(strings_holding_area, strings_holding_area+MAX_STATIC_STRINGS, b);
255 memoryerror("MAX_STATIC_STRINGS",MAX_STATIC_STRINGS);
257 i = subtract_pointers(c, strings_holding_area);
259 /* Insert null bytes as needed to ensure that the next static string */
260 /* also occurs at an address expressible as a packed address */
264 if (oddeven_packing_switch)
265 textalign = scale_factor*2;
267 textalign = scale_factor;
268 while ((i%textalign)!=0)
270 if (i+2 > MAX_STATIC_STRINGS)
271 memoryerror("MAX_STATIC_STRINGS",MAX_STATIC_STRINGS);
272 i+=2; *c++ = 0; *c++ = 0;
276 j = static_strings_extent;
278 if (temporary_files_switch)
279 for (c=strings_holding_area; c<strings_holding_area+i;
280 c++, static_strings_extent++)
283 for (c=strings_holding_area; c<strings_holding_area+i;
284 c++, static_strings_extent++)
285 write_byte_to_memory_block(&static_strings_area,
286 static_strings_extent, *c);
288 is_abbreviation = FALSE;
291 return(j/scale_factor);
294 /* The marker value is a one-based string number. (We reserve zero
295 to mean "not a string at all". */
296 return (++no_strings);
300 /* ------------------------------------------------------------------------- */
301 /* Output a single Z-character into the buffer, and flush it if full */
302 /* ------------------------------------------------------------------------- */
304 static void write_z_char_z(int i)
307 total_zchars_trans++;
308 zchars_out_buffer[zob_index++]=(i%32);
309 if (zob_index!=3) return;
311 j= zchars_out_buffer[0]*0x0400 + zchars_out_buffer[1]*0x0020
312 + zchars_out_buffer[2];
313 if (text_out_pc+2 > text_out_limit) {
314 text_out_overflow = TRUE;
317 text_out_pc[0] = j/256; text_out_pc[1] = j%256; text_out_pc+=2;
318 total_bytes_trans+=2;
321 static void write_zscii(int zsc)
323 int lookup_value, in_alphabet;
330 if (zsc < 0x100) lookup_value = zscii_to_alphabet_grid[zsc];
332 else lookup_value = -1;
334 if (lookup_value >= 0)
335 { alphabet_used[lookup_value] = 'Y';
336 in_alphabet = lookup_value/26;
337 if (in_alphabet==1) write_z_char_z(4); /* SHIFT to A1 */
338 if (in_alphabet==2) write_z_char_z(5); /* SHIFT to A2 */
339 write_z_char_z(lookup_value%26 + 6);
342 { write_z_char_z(5); write_z_char_z(6);
343 write_z_char_z(zsc/32); write_z_char_z(zsc%32);
347 /* ------------------------------------------------------------------------- */
348 /* Finish a Z-coded string, padding out with Z-char 5s if necessary and */
349 /* setting the "end" bit on the final 2-byte word */
350 /* ------------------------------------------------------------------------- */
352 static void end_z_chars(void)
354 zchars_trans_in_last_string=total_zchars_trans-zchars_trans_in_last_string;
355 while (zob_index!=0) write_z_char_z(5);
356 p=(unsigned char *) text_out_pc;
360 /* Glulx handles this much more simply -- compression is done elsewhere. */
361 static void write_z_char_g(int i)
364 if (text_out_pc+1 > text_out_limit) {
365 text_out_overflow = TRUE;
368 total_zchars_trans++;
374 /* ------------------------------------------------------------------------- */
375 /* The main routine "text.c" provides to the rest of Inform: the text */
376 /* translator. p is the address to write output to, s_text the source text */
377 /* and the return value is the next free address to write output to. */
378 /* The return value will not exceed p_limit. If the translation tries to */
379 /* overflow this boundary, the return value will be NULL (and you should */
380 /* display an error). */
381 /* Note that the source text may be corrupted by this routine. */
382 /* ------------------------------------------------------------------------- */
384 extern uchar *translate_text(uchar *p, uchar *p_limit, char *s_text)
385 { int i, j, k, in_alphabet, lookup_value;
386 int32 unicode; int zscii;
387 unsigned char *text_in;
389 /* Cast the input and output streams to unsigned char: text_out_pc will
390 advance as bytes of Z-coded text are written, but text_in doesn't */
392 text_in = (unsigned char *) s_text;
393 text_out_pc = (unsigned char *) p;
394 text_out_limit = (unsigned char *) p_limit;
395 text_out_overflow = FALSE;
397 /* Remember the Z-chars total so that later we can subtract to find the
398 number of Z-chars translated on this string */
400 zchars_trans_in_last_string = total_zchars_trans;
402 /* Start with the Z-characters output buffer empty */
406 /* If this is the first text translated since the abbreviations were
407 declared, and if some were declared, then it's time to make the
408 lookup table for abbreviations
410 (Except: we don't if the text being translated is itself
411 the text of an abbreviation currently being defined) */
413 if ((!abbrevs_lookup_table_made) && (no_abbreviations > 0)
414 && (!is_abbreviation))
415 make_abbrevs_lookup();
417 /* If we're storing the whole game text to memory, then add this text */
419 if ((!is_abbreviation) && (store_the_text))
420 { no_chars_transcribed += strlen(s_text)+2;
421 if (no_chars_transcribed >= MAX_TRANSCRIPT_SIZE)
422 memoryerror("MAX_TRANSCRIPT_SIZE", MAX_TRANSCRIPT_SIZE);
423 sprintf(all_text_top, "%s\n\n", s_text);
424 all_text_top += strlen(all_text_top);
427 if (transcript_switch && (!veneer_mode))
428 write_to_transcript_file(s_text);
432 /* The empty string of Z-text is illegal, since it can't carry an end
433 bit: so we translate an empty string of ASCII text to just the
434 pad character 5. Printing this causes nothing to appear on screen. */
436 if (text_in[0]==0) write_z_char_z(5);
438 /* Loop through the characters of the null-terminated input text: note
439 that if 1 is written over a character in the input text, it is
440 afterwards ignored */
442 for (i=0; text_in[i]!=0; i++)
443 { total_chars_trans++;
445 /* Contract ". " into ". " if double-space-removing switch set:
446 likewise "? " and "! " if the setting is high enough */
448 if ((double_space_setting >= 1)
449 && (text_in[i+1]==' ') && (text_in[i+2]==' '))
450 { if (text_in[i]=='.') text_in[i+2]=1;
451 if (double_space_setting >= 2)
452 { if (text_in[i]=='?') text_in[i+2]=1;
453 if (text_in[i]=='!') text_in[i+2]=1;
457 /* Try abbreviations if the economy switch set */
459 if ((economy_switch) && (!is_abbreviation)
460 && ((k=abbrevs_lookup[text_in[i]])!=-1))
461 { if ((j=try_abbreviations_from(text_in, i, k))!=-1)
462 { if (j<32) { write_z_char_z(2); write_z_char_z(j); }
463 else { write_z_char_z(3); write_z_char_z(j-32); }
467 /* If Unicode switch set, use text_to_unicode to perform UTF-8
469 if (character_set_unicode && (text_in[i] & 0x80))
470 { unicode = text_to_unicode((char *) (text_in+i));
471 zscii = unicode_to_zscii(unicode);
472 if (zscii != 5) write_zscii(zscii);
474 { unicode_char_error(
475 "Character can only be used if declared in \
476 advance as part of 'Zcharacter table':", unicode);
478 i += textual_form_length - 1;
482 /* '@' is the escape character in Inform string notation: the various
486 @@decimalnumber : write this ZSCII char (0 to 1023)
487 @twodigits : write the abbreviation string with this
491 @accentcode : this accented character: e.g.,
492 for @'e write an E-acute
493 @{...} : this Unicode char (in hex) */
496 { if (text_in[i+1]=='@')
500 i+=2; j=atoi((char *) (text_in+i));
502 { /* Prevent ~ and ^ from being translated to double-quote
503 and new-line, as they ordinarily would be */
505 case 94: write_z_char_z(5); write_z_char_z(6);
506 write_z_char_z(94/32); write_z_char_z(94%32);
508 case 126: write_z_char_z(5); write_z_char_z(6);
509 write_z_char_z(126/32); write_z_char_z(126%32);
512 default: write_zscii(j); break;
514 while (isdigit(text_in[i])) i++; i--;
516 else if (isdigit(text_in[i+1])!=0)
521 d1 = character_digit_value[text_in[i+1]];
522 d2 = character_digit_value[text_in[i+2]];
523 if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10))
524 error("'@..' must have two decimal digits");
527 write_z_char_z(1); write_z_char_z(d1*10 + d2);
532 /* A string escape specifying an unusual character */
534 unicode = text_to_unicode((char *) (text_in+i));
535 zscii = unicode_to_zscii(unicode);
536 if (zscii != 5) write_zscii(zscii);
538 { unicode_char_error(
539 "Character can only be used if declared in \
540 advance as part of 'Zcharacter table':", unicode);
542 i += textual_form_length - 1;
546 { /* Skip a character which has been over-written with the null
547 value 1 earlier on */
550 { if (text_in[i]==' ') write_z_char_z(0);
552 { j = (int) text_in[i];
553 lookup_value = iso_to_alphabet_grid[j];
554 if (lookup_value < 0)
555 { /* The character isn't in the standard alphabets, so
556 we have to use the ZSCII 4-Z-char sequence */
558 if (lookup_value == -5)
559 { /* Character isn't in the ZSCII set at all */
561 unicode = iso_to_unicode(j);
563 "Character can only be used if declared in \
564 advance as part of 'Zcharacter table':", unicode);
565 write_zscii(0x200 + unicode/0x100);
566 write_zscii(0x300 + unicode%0x100);
568 else write_zscii(-lookup_value);
571 { /* The character is in one of the standard alphabets:
572 write a SHIFT to temporarily change alphabet if
573 it isn't in alphabet 0, then write the Z-char */
575 alphabet_used[lookup_value] = 'Y';
576 in_alphabet = lookup_value/26;
577 if (in_alphabet==1) write_z_char_z(4); /* SHIFT to A1 */
578 if (in_alphabet==2) write_z_char_z(5); /* SHIFT to A2 */
579 write_z_char_z(lookup_value%26 + 6);
586 /* Flush the Z-characters output buffer and set the "end" bit */
593 /* The text storage here is, of course, temporary. Compression
594 will occur when we're finished compiling, so that all the
595 clever Huffman stuff will work.
596 In the stored text, we use "@@" to indicate @,
597 "@0" to indicate a zero byte,
598 "@ANNNN" to indicate an abbreviation,
599 "@DNNNN" to indicate a dynamic string thing.
600 "@UNNNN" to indicate a four-byte Unicode value (0x100 or higher).
601 (NNNN is a four-digit hex number using the letters A-P... an
602 ugly representation but a convenient one.)
605 for (i=0; text_in[i]!=0; i++) {
607 /* Contract ". " into ". " if double-space-removing switch set:
608 likewise "? " and "! " if the setting is high enough. */
609 if ((double_space_setting >= 1)
610 && (text_in[i+1]==' ') && (text_in[i+2]==' ')) {
612 || (double_space_setting >= 2
613 && (text_in[i]=='?' || text_in[i]=='!'))) {
614 text_in[i+1] = text_in[i];
621 /* Try abbreviations if the economy switch set. We have to be in
622 compression mode too, since the abbreviation mechanism is part
623 of string decompression. */
625 if ((economy_switch) && (compression_switch) && (!is_abbreviation)
626 && ((k=abbrevs_lookup[text_in[i]])!=-1)
627 && ((j=try_abbreviations_from(text_in, i, k)) != -1)) {
628 char *cx = (char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
632 write_z_char_g('A' + ((j >>12) & 0x0F));
633 write_z_char_g('A' + ((j >> 8) & 0x0F));
634 write_z_char_g('A' + ((j >> 4) & 0x0F));
635 write_z_char_g('A' + ((j ) & 0x0F));
637 else if (text_in[i] == '@') {
638 if (text_in[i+1]=='@') {
640 i+=2; j=atoi((char *) (text_in+i));
641 if (j == '@' || j == '\0') {
645 if (!compression_switch)
646 warning("Ascii @@0 will prematurely terminate non-compressed \
651 while (isdigit(text_in[i])) i++; i--;
653 else if (isdigit(text_in[i+1])) {
655 d1 = character_digit_value[text_in[i+1]];
656 d2 = character_digit_value[text_in[i+2]];
657 if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10)) {
658 error("'@..' must have two decimal digits");
661 if (!compression_switch)
662 warning("'@..' print variable will not work in non-compressed \
663 string; substituting ' '.");
666 if (j >= MAX_DYNAMIC_STRINGS) {
667 memoryerror("MAX_DYNAMIC_STRINGS", MAX_DYNAMIC_STRINGS);
670 if (j+1 >= no_dynamic_strings)
671 no_dynamic_strings = j+1;
674 write_z_char_g('A' + ((j >>12) & 0x0F));
675 write_z_char_g('A' + ((j >> 8) & 0x0F));
676 write_z_char_g('A' + ((j >> 4) & 0x0F));
677 write_z_char_g('A' + ((j ) & 0x0F));
681 unicode = text_to_unicode((char *) (text_in+i));
682 i += textual_form_length - 1;
683 if (unicode == '@' || unicode == '\0') {
685 write_z_char_g(unicode ? '@' : '0');
687 else if (unicode >= 0 && unicode < 256) {
688 write_z_char_g(unicode);
691 if (!compression_switch) {
692 warning("Unicode characters will not work in non-compressed \
693 string; substituting '?'.");
697 j = unicode_entity_index(unicode);
700 write_z_char_g('A' + ((j >>12) & 0x0F));
701 write_z_char_g('A' + ((j >> 8) & 0x0F));
702 write_z_char_g('A' + ((j >> 4) & 0x0F));
703 write_z_char_g('A' + ((j ) & 0x0F));
708 else if (text_in[i] == '^')
709 write_z_char_g(0x0A);
710 else if (text_in[i] == '~')
712 else if (character_set_unicode) {
713 if (text_in[i] & 0x80) {
714 unicode = text_to_unicode((char *) (text_in+i));
715 i += textual_form_length - 1;
716 if (unicode >= 0 && unicode < 256) {
717 write_z_char_g(unicode);
720 if (!compression_switch) {
721 warning("Unicode characters will not work in non-compressed \
722 string; substituting '?'.");
726 j = unicode_entity_index(unicode);
729 write_z_char_g('A' + ((j >>12) & 0x0F));
730 write_z_char_g('A' + ((j >> 8) & 0x0F));
731 write_z_char_g('A' + ((j >> 4) & 0x0F));
732 write_z_char_g('A' + ((j ) & 0x0F));
737 write_z_char_g(text_in[i]);
741 unicode = iso_to_unicode_grid[text_in[i]];
742 if (unicode >= 0 && unicode < 256) {
743 write_z_char_g(unicode);
746 if (!compression_switch) {
747 warning("Unicode characters will not work in non-compressed \
748 string; substituting '?'.");
752 j = unicode_entity_index(unicode);
755 write_z_char_g('A' + ((j >>12) & 0x0F));
756 write_z_char_g('A' + ((j >> 8) & 0x0F));
757 write_z_char_g('A' + ((j >> 4) & 0x0F));
758 write_z_char_g('A' + ((j ) & 0x0F));
767 if (text_out_overflow)
770 return((uchar *) text_out_pc);
773 static int unicode_entity_index(int32 unicode)
775 unicode_usage_t *uptr;
777 int buck = unicode % UNICODE_HASH_BUCKETS;
779 for (uptr = unicode_usage_hash[buck]; uptr; uptr=uptr->next) {
780 if (uptr->ch == unicode)
784 j = (uptr - unicode_usage_entries);
787 if (no_unicode_chars >= MAX_UNICODE_CHARS) {
788 memoryerror("MAX_UNICODE_CHARS", MAX_UNICODE_CHARS);
792 j = no_unicode_chars;
794 uptr = unicode_usage_entries + j;
796 uptr->next = unicode_usage_hash[buck];
797 unicode_usage_hash[buck] = uptr;
804 /* ------------------------------------------------------------------------- */
805 /* Glulx compression code */
806 /* ------------------------------------------------------------------------- */
809 static void compress_makebits(int entnum, int depth, int prevbit,
810 huffbitlist_t *bits);
812 /* The compressor. This uses the usual Huffman compression algorithm. */
813 void compress_game_text()
815 int entities=0, branchstart, branches;
823 if (compression_switch) {
825 /* How many entities have we currently got? Well, 256 plus the
826 string-terminator plus Unicode chars plus abbrevations plus
829 huff_unicode_start = entities;
830 entities += no_unicode_chars;
831 huff_abbrev_start = entities;
833 entities += no_abbreviations;
834 huff_dynam_start = entities;
835 entities += no_dynamic_strings;
837 if (entities > MAX_CHARACTER_SET)
838 memoryerror("MAX_CHARACTER_SET",MAX_CHARACTER_SET);
841 for (jx=0; jx<256; jx++) {
842 huff_entities[jx].type = 2;
843 huff_entities[jx].count = 0;
844 huff_entities[jx].u.ch = jx;
847 huff_entities[256].type = 1;
848 huff_entities[256].count = 0;
849 for (jx=0; jx<no_unicode_chars; jx++) {
850 huff_entities[huff_unicode_start+jx].type = 4;
851 huff_entities[huff_unicode_start+jx].count = 0;
852 huff_entities[huff_unicode_start+jx].u.val = jx;
854 if (economy_switch) {
855 for (jx=0; jx<no_abbreviations; jx++) {
856 huff_entities[huff_abbrev_start+jx].type = 3;
857 huff_entities[huff_abbrev_start+jx].count = 0;
858 huff_entities[huff_abbrev_start+jx].u.val = jx;
861 for (jx=0; jx<no_dynamic_strings; jx++) {
862 huff_entities[huff_dynam_start+jx].type = 9;
863 huff_entities[huff_dynam_start+jx].count = 0;
864 huff_entities[huff_dynam_start+jx].u.val = jx;
868 /* No compression; use defaults that will make it easy to check
870 no_huff_entities = 257;
871 huff_unicode_start = 257;
872 huff_abbrev_start = 257;
873 huff_dynam_start = 257+MAX_ABBREVS;
874 compression_table_size = 0;
877 if (temporary_files_switch) {
879 Temp1_fp=fopen(Temp1_Name,"rb");
881 fatalerror("I/O failure: couldn't reopen temporary file 1");
884 if (compression_switch) {
886 for (lx=0, ix=0; lx<no_strings; lx++) {
887 int escapelen=0, escapetype=0;
891 if (temporary_files_switch)
892 ch = fgetc(Temp1_fp);
894 ch = read_byte_from_memory_block(&static_strings_area, ix);
896 if (ix > static_strings_extent || ch < 0)
897 compiler_error("Read too much not-yet-compressed text.");
898 if (escapelen == -1) {
903 else if (ch == '0') {
906 else if (ch == 'A' || ch == 'D' || ch == 'U') {
913 compiler_error("Strange @ escape in processed text.");
916 else if (escapelen) {
917 escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
919 if (escapelen == 0) {
920 if (escapetype == 'A') {
921 ch = huff_abbrev_start+escapeval;
923 else if (escapetype == 'D') {
924 ch = huff_dynam_start+escapeval;
926 else if (escapetype == 'U') {
927 ch = huff_unicode_start+escapeval;
930 compiler_error("Strange @ escape in processed text.");
946 huff_entities[ch].count++;
951 for (jx=0; jx<entities; jx++) {
952 if (huff_entities[jx].count) {
953 hufflist[numlive] = &(huff_entities[jx]);
958 branchstart = entities;
961 while (numlive > 1) {
963 int best1num, best2num;
966 if (hufflist[0]->count < hufflist[1]->count) {
975 best1num = hufflist[best1]->count;
976 best2num = hufflist[best2]->count;
978 for (jx=2; jx<numlive; jx++) {
979 if (hufflist[jx]->count < best1num) {
983 best1num = hufflist[best1]->count;
985 else if (hufflist[jx]->count < best2num) {
987 best2num = hufflist[best2]->count;
991 bran = &(huff_entities[branchstart+branches]);
994 bran->count = hufflist[best1]->count + hufflist[best2]->count;
995 bran->u.branch[0] = (hufflist[best1] - huff_entities);
996 bran->u.branch[1] = (hufflist[best2] - huff_entities);
997 hufflist[best1] = bran;
998 if (best2 < numlive-1) {
999 memmove(&(hufflist[best2]), &(hufflist[best2+1]),
1000 ((numlive-1) - best2) * sizeof(huffentity_t *));
1005 huff_entity_root = (hufflist[0] - huff_entities);
1007 for (ix=0; ix<MAXHUFFBYTES; ix++)
1009 compression_table_size = 12;
1011 no_huff_entities = 0; /* compress_makebits will total this up */
1012 compress_makebits(huff_entity_root, 0, -1, &bits);
1015 /* Now, sadly, we have to compute the size of the string section,
1016 without actually doing the compression. */
1017 compression_string_size = 0;
1019 if (temporary_files_switch) {
1020 fseek(Temp1_fp, 0, SEEK_SET);
1023 if (no_strings >= MAX_NUM_STATIC_STRINGS)
1024 memoryerror("MAX_NUM_STATIC_STRINGS", MAX_NUM_STATIC_STRINGS);
1026 for (lx=0, ix=0; lx<no_strings; lx++) {
1027 int escapelen=0, escapetype=0;
1031 compressed_offsets[lx] = compression_table_size + compression_string_size;
1032 compression_string_size++; /* for the type byte */
1034 if (temporary_files_switch)
1035 ch = fgetc(Temp1_fp);
1037 ch = read_byte_from_memory_block(&static_strings_area, ix);
1039 if (ix > static_strings_extent || ch < 0)
1040 compiler_error("Read too much not-yet-compressed text.");
1041 if (escapelen == -1) {
1046 else if (ch == '0') {
1049 else if (ch == 'A' || ch == 'D' || ch == 'U') {
1056 compiler_error("Strange @ escape in processed text.");
1059 else if (escapelen) {
1060 escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
1062 if (escapelen == 0) {
1063 if (escapetype == 'A') {
1064 ch = huff_abbrev_start+escapeval;
1066 else if (escapetype == 'D') {
1067 ch = huff_dynam_start+escapeval;
1069 else if (escapetype == 'U') {
1070 ch = huff_unicode_start+escapeval;
1073 compiler_error("Strange @ escape in processed text.");
1090 if (compression_switch) {
1091 jx += huff_entities[ch].depth;
1092 compression_string_size += (jx/8);
1096 if (ch >= huff_dynam_start) {
1097 compression_string_size += 3;
1099 else if (ch >= huff_unicode_start) {
1100 compiler_error("Abbreviation/Unicode in non-compressed string \
1101 should be impossible.");
1104 compression_string_size += 1;
1107 if (compression_switch && jx)
1108 compression_string_size++;
1111 done_compression = TRUE;
1114 static void compress_makebits(int entnum, int depth, int prevbit,
1115 huffbitlist_t *bits)
1117 huffentity_t *ent = &(huff_entities[entnum]);
1121 ent->addr = compression_table_size;
1126 ent->bits.b[(depth-1) / 8] |= (1 << ((depth-1) % 8));
1129 switch (ent->type) {
1131 compression_table_size += 9;
1132 compress_makebits(ent->u.branch[0], depth+1, 0, &ent->bits);
1133 compress_makebits(ent->u.branch[1], depth+1, 1, &ent->bits);
1136 compression_table_size += 1;
1139 compression_table_size += 2;
1142 cx = (char *)abbreviations_at + ent->u.val*MAX_ABBREV_LENGTH;
1143 compression_table_size += (1 + 1 + strlen(cx));
1147 compression_table_size += 5;
1152 /* ------------------------------------------------------------------------- */
1153 /* The abbreviations optimiser */
1155 /* This is a very complex, memory and time expensive algorithm to */
1156 /* approximately solve the problem of which abbreviation strings would */
1157 /* minimise the total number of Z-chars to which the game text translates. */
1158 /* It is in some ways a quite separate program but remains inside Inform */
1159 /* for compatibility with previous releases. */
1160 /* ------------------------------------------------------------------------- */
1162 typedef struct tlb_s
1164 int32 intab, occurrences;
1167 static int32 no_occs;
1169 static int32 *grandtable;
1170 static int32 *grandflags;
1171 typedef struct optab_s
1176 char text[MAX_ABBREV_LENGTH];
1178 static optab *bestyet, *bestyet2;
1182 static char *sub_buffer;
1184 static void optimise_pass(void)
1185 { int32 i; int t1, t2;
1186 int32 j, j2, k, nl, matches, noflags, score, min, minat=0, x, scrabble, c;
1187 for (i=0; i<256; i++) bestyet[i].length=0;
1188 for (i=0; i<no_occs; i++)
1189 { if ((*(tlbtab[i].text)!=(int) '\n')&&(tlbtab[i].occurrences!=0))
1192 if (i%((**g_pm_hndl).linespercheck) == 0)
1193 { ProcessEvents (&g_proc);
1197 my_free(&all_text,"transcription text");
1198 longjmp (g_fallback, 1);
1202 printf("Pass %d, %4ld/%ld '%s' (%ld occurrences) ",
1203 pass_no, (long int) i, (long int) no_occs, tlbtab[i].text,
1204 (long int) tlbtab[i].occurrences);
1206 for (j=0; j<tlbtab[i].occurrences; j++)
1207 { for (j2=0; j2<tlbtab[i].occurrences; j2++) grandflags[j2]=1;
1208 nl=2; noflags=tlbtab[i].occurrences;
1209 while ((noflags>=2)&&(nl<=62))
1211 for (j2=0; j2<nl; j2++)
1212 if (all_text[grandtable[tlbtab[i].intab+j]+j2]=='\n')
1215 for (j2=j; j2<tlbtab[i].occurrences; j2++)
1216 { if (grandflags[j2]==1)
1217 { x=grandtable[tlbtab[i].intab+j2]
1218 - grandtable[tlbtab[i].intab+j];
1219 if (((x>-nl)&&(x<nl))
1220 || (memcmp(all_text+grandtable[tlbtab[i].intab+j],
1221 all_text+grandtable[tlbtab[i].intab+j2],
1223 { grandflags[j2]=0; noflags--; }
1228 for (k=0; k<nl; k++)
1230 c=all_text[grandtable[tlbtab[i].intab+j+k]];
1232 { if (iso_to_alphabet_grid[c]<0)
1235 if (iso_to_alphabet_grid[c]>=26)
1239 score=(matches-1)*(scrabble-2);
1241 for (j2=0; j2<256; j2++)
1242 { if ((nl==bestyet[j2].length)
1243 && (memcmp(all_text+bestyet[j2].location,
1244 all_text+grandtable[tlbtab[i].intab+j],
1246 { j2=256; min=score; }
1248 { if (bestyet[j2].score<min)
1249 { min=bestyet[j2].score; minat=j2;
1254 { bestyet[minat].score=score;
1255 bestyet[minat].length=nl;
1256 bestyet[minat].location=grandtable[tlbtab[i].intab+j];
1257 bestyet[minat].popularity=matches;
1258 for (j2=0; j2<nl; j2++) sub_buffer[j2]=
1259 all_text[bestyet[minat].location+j2];
1265 t2=((int) time(0)) - t1;
1266 printf(" (%d seconds)\n",t2);
1271 static int any_overlap(char *s1, char *s2)
1272 { int a, b, i, j, flag;
1273 a=strlen(s1); b=strlen(s2);
1274 for (i=1-b; i<a; i++)
1277 if ((0<=i+j)&&(i+j<=a-1))
1278 if (s1[i+j]!=s2[j]) flag=1;
1279 if (flag==0) return(1);
1284 #define MAX_TLBS 8000
1286 extern void optimise_abbreviations(void)
1287 { int32 i, j, t, max=0, MAX_GTABLE;
1288 int32 j2, selected, available, maxat=0, nl;
1291 printf("Beginning calculation of optimal abbreviations...\n");
1294 tlbtab=my_calloc(sizeof(tlb), MAX_TLBS, "tlb table"); no_occs=0;
1295 sub_buffer=my_calloc(sizeof(char), 4000, "sub_buffer");
1296 for (i=0; i<MAX_TLBS; i++) tlbtab[i].occurrences=0;
1298 bestyet=my_calloc(sizeof(optab), 256, "bestyet");
1299 bestyet2=my_calloc(sizeof(optab), 64, "bestyet2");
1301 bestyet2[0].text[0]='.';
1302 bestyet2[0].text[1]=' ';
1303 bestyet2[0].text[2]=0;
1305 bestyet2[1].text[0]=',';
1306 bestyet2[1].text[1]=' ';
1307 bestyet2[1].text[2]=0;
1309 for (i=0; all_text+i<all_text_top; i++)
1311 if ((all_text[i]=='.') && (all_text[i+1]==' ') && (all_text[i+2]==' '))
1312 { all_text[i]='\n'; all_text[i+1]='\n'; all_text[i+2]='\n';
1313 bestyet2[0].popularity++;
1316 if ((all_text[i]=='.') && (all_text[i+1]==' '))
1317 { all_text[i]='\n'; all_text[i+1]='\n';
1318 bestyet2[0].popularity++;
1321 if ((all_text[i]==',') && (all_text[i+1]==' '))
1322 { all_text[i]='\n'; all_text[i+1]='\n';
1323 bestyet2[1].popularity++;
1327 MAX_GTABLE=subtract_pointers(all_text_top,all_text)+1;
1328 grandtable=my_calloc(4*sizeof(int32), MAX_GTABLE/4, "grandtable");
1330 for (i=0, t=0; all_text+i<all_text_top; i++)
1331 { test.text[0]=all_text[i];
1332 test.text[1]=all_text[i+1];
1333 test.text[2]=all_text[i+2];
1335 if ((test.text[0]=='\n')||(test.text[1]=='\n')||(test.text[2]=='\n'))
1337 for (j=0; j<no_occs; j++)
1338 if (strcmp(test.text,tlbtab[j].text)==0)
1341 for (j=i+3; all_text+j<all_text_top; j++)
1344 if (j%((**g_pm_hndl).linespercheck) == 0)
1345 { ProcessEvents (&g_proc);
1349 my_free(&all_text,"transcription text");
1350 longjmp (g_fallback, 1);
1354 if ((all_text[i]==all_text[j])
1355 && (all_text[i+1]==all_text[j+1])
1356 && (all_text[i+2]==all_text[j+2]))
1357 { grandtable[t+test.occurrences]=j;
1359 if (t+test.occurrences==MAX_GTABLE)
1360 { printf("All %ld cross-references used\n",
1361 (long int) MAX_GTABLE);
1366 if (test.occurrences>=2)
1367 { tlbtab[no_occs]=test;
1368 tlbtab[no_occs].intab=t; t+=tlbtab[no_occs].occurrences;
1369 if (max<tlbtab[no_occs].occurrences)
1370 max=tlbtab[no_occs].occurrences;
1372 if (no_occs==MAX_TLBS)
1373 { printf("All %d three-letter-blocks used\n",
1382 grandflags=my_calloc(sizeof(int), max, "grandflags");
1385 printf("Cross-reference table (%ld entries) built...\n",
1386 (long int) no_occs);
1387 /* for (i=0; i<no_occs; i++)
1388 printf("%4d %4d '%s' %d\n",i,tlbtab[i].intab,tlbtab[i].text,
1389 tlbtab[i].occurrences);
1392 for (i=0; i<64; i++) bestyet2[i].length=0; selected=2;
1394 while ((available>0)&&(selected<64))
1395 { printf("Pass %d\n", ++pass_no);
1399 for (i=0; i<256; i++)
1400 if (bestyet[i].score!=0)
1402 nl=bestyet[i].length;
1403 for (j2=0; j2<nl; j2++) bestyet[i].text[j2]=
1404 all_text[bestyet[i].location+j2];
1405 bestyet[i].text[nl]=0;
1408 /* printf("End of pass results:\n");
1409 printf("\nno score freq string\n");
1410 for (i=0; i<256; i++)
1411 if (bestyet[i].score>0)
1412 printf("%02d: %4d %4d '%s'\n", i, bestyet[i].score,
1413 bestyet[i].popularity, bestyet[i].text);
1418 for (i=0; i<256; i++)
1419 if (max<bestyet[i].score)
1420 { max=bestyet[i].score;
1425 { bestyet2[selected++]=bestyet[maxat];
1428 "Selection %2ld: '%s' (repeated %ld times, scoring %ld)\n",
1429 (long int) selected,bestyet[maxat].text,
1430 (long int) bestyet[maxat].popularity,
1431 (long int) bestyet[maxat].score);
1433 test.text[0]=bestyet[maxat].text[0];
1434 test.text[1]=bestyet[maxat].text[1];
1435 test.text[2]=bestyet[maxat].text[2];
1438 for (i=0; i<no_occs; i++)
1439 if (strcmp(test.text,tlbtab[i].text)==0)
1442 for (j=0; j<tlbtab[i].occurrences; j++)
1443 { if (memcmp(bestyet[maxat].text,
1444 all_text+grandtable[tlbtab[i].intab+j],
1445 bestyet[maxat].length)==0)
1446 { for (j2=0; j2<bestyet[maxat].length; j2++)
1447 all_text[grandtable[tlbtab[i].intab+j]+j2]='\n';
1451 for (i=0; i<256; i++)
1452 if ((bestyet[i].score>0)&&
1453 (any_overlap(bestyet[maxat].text,bestyet[i].text)==1))
1454 { bestyet[i].score=0;
1455 /* printf("Discarding '%s' as overlapping\n",
1456 bestyet[i].text); */
1459 } while ((max>0)&&(available>0)&&(selected<64));
1462 printf("\nChosen abbreviations (in Inform syntax):\n\n");
1463 for (i=0; i<selected; i++)
1464 printf("Abbreviate \"%s\";\n", bestyet2[i].text);
1469 /* ------------------------------------------------------------------------- */
1470 /* The dictionary manager begins here. */
1472 /* Speed is extremely important in these algorithms. If a linear-time */
1473 /* routine were used to search the dictionary words so far built up, then */
1474 /* Inform would crawl. */
1476 /* Instead, the dictionary is stored as a binary tree, which is kept */
1477 /* balanced with the red-black algorithm. */
1478 /* ------------------------------------------------------------------------- */
1479 /* A dictionary table similar to the Z-machine format is kept: there is a */
1480 /* 7-byte header (left blank here to be filled in at the */
1481 /* construct_storyfile() stage in "tables.c") and then a sequence of */
1482 /* records, one per word, in the form */
1484 /* <Z-coded text> <flags> <verbnumber> <adjectivenumber> */
1485 /* 4 or 6 bytes byte byte byte */
1487 /* For Glulx, the form is instead: (But see below about Unicode-valued */
1488 /* dictionaries and my heinie.) */
1490 /* <plain text> <flags> <verbnumber> <adjectivenumber> */
1491 /* DICT_WORD_SIZE short short short */
1493 /* These records are stored in "accession order" (i.e. in order of their */
1494 /* first being received by these routines) and only alphabetically sorted */
1495 /* by construct_storyfile() (using the array below). */
1496 /* ------------------------------------------------------------------------- */
1498 /* Further notes about the data fields... */
1499 /* The flags are currently: */
1500 /* bit 0: word is used as a verb (in verb grammar) */
1501 /* bit 1: word is used as a meta verb */
1502 /* bit 2: word is plural (set by '//p') */
1503 /* bit 3: word is used as a preposition (in verb grammar) */
1504 /* bit 6: set for all verbs, but not used by the parser? */
1505 /* bit 7: word is used as a noun (set for every word that appears in */
1506 /* code or in an object property) */
1508 /* In grammar version 2, the third field (adjectivenumber) is unused (and */
1511 /* The compiler generates special constants #dict_par1, #dict_par2, */
1512 /* #dict_par3 to refer to the byte offsets of the three fields. In */
1513 /* Z-code v3, these are 4/5/6; in v4+, they are 6/7/8. In Glulx, they */
1514 /* are $DICT_WORD_SIZE+2/4/6, referring to the *low* bytes of the three */
1515 /* fields. (The high bytes are $DICT_WORD_SIZE+1/3/5.) */
1516 /* ------------------------------------------------------------------------- */
1518 uchar *dictionary, /* (These two pointers are externally
1519 used only in "tables.c" when
1520 building the story-file) */
1521 *dictionary_top; /* Pointer to next free record */
1523 int dict_entries; /* Total number of records entered */
1525 /* ------------------------------------------------------------------------- */
1526 /* dict_word is a typedef for a struct of 6 unsigned chars (defined in */
1527 /* "header.h"): it holds the (4 or) 6 bytes of Z-coded text of a word. */
1528 /* Usefully, because the PAD character 5 is < all alphabetic characters, */
1529 /* alphabetic order corresponds to numeric order. For this reason, the */
1530 /* dict_word is called the "sort code" of the original text word. */
1532 /* ###- In modifying the compiler, I've found it easier to discard the */
1533 /* typedef, and operate directly on uchar arrays of length DICT_WORD_SIZE. */
1534 /* In Z-code, DICT_WORD_SIZE will be 6, so the Z-code compiler will work */
1535 /* as before. In Glulx, it can be any value up to MAX_DICT_WORD_SIZE. */
1536 /* (That limit is defined as 40 in the header; it exists only for a few */
1537 /* static buffers, and can be increased without using significant memory.) */
1539 /* ###- Well, that certainly bit me on the butt, didn't it. In further */
1540 /* modifying the compiler to generate a Unicode dictionary, I have to */
1541 /* store four-byte values in the uchar array. This is handled by making */
1542 /* the array size DICT_WORD_BYTES (which is DICT_WORD_SIZE*DICT_CHAR_SIZE).*/
1543 /* Then we store the 32-bit character value big-endian. This lets us */
1544 /* continue to compare arrays bytewise, which is a nice simplification. */
1545 /* ------------------------------------------------------------------------- */
1547 extern int compare_sorts(uchar *d1, uchar *d2)
1549 for (i=0; i<DICT_WORD_BYTES; i++)
1550 if (d1[i]!=d2[i]) return((int)(d1[i]) - (int)(d2[i]));
1551 /* (since memcmp(d1, d2, DICT_WORD_BYTES); runs into a bug on some Unix
1556 extern void copy_sorts(uchar *d1, uchar *d2)
1558 for (i=0; i<DICT_WORD_BYTES; i++)
1562 static uchar prepared_sort[MAX_DICT_WORD_BYTES]; /* Holds the sort code
1565 static int number_and_case;
1567 /* Also used by verbs.c */
1568 static void dictionary_prepare_z(char *dword, uchar *optresult)
1569 { int i, j, k, k2, wd[13]; int32 tot;
1571 /* A rapid text translation algorithm using only the simplified rules
1572 applying to the text of dictionary entries: first produce a sequence
1573 of 6 (v3) or 9 (v4+) Z-characters */
1575 int dictsize = (version_number==3) ? 6 : 9;
1577 number_and_case = 0;
1579 for (i=0, j=0; dword[j]!=0; i++, j++)
1580 { if ((dword[j] == '/') && (dword[j+1] == '/'))
1581 { for (j+=2; dword[j] != 0; j++)
1583 { case 'p': number_and_case |= 4; break;
1585 error_named("Expected 'p' after '//' \
1586 to give number of dictionary word", dword);
1592 if (i>=dictsize) break;
1596 warning_named("Obsolete usage: use the ^ character for the \
1597 apostrophe in", dword);
1598 if (k==(int) '^') k=(int) '\'';
1601 if (k==(int) '@' || (character_set_unicode && (k & 0x80)))
1602 { int unicode = text_to_unicode(dword+j);
1603 if ((unicode < 128) && isupper(unicode)) unicode = tolower(unicode);
1604 k = unicode_to_zscii(unicode);
1605 j += textual_form_length - 1;
1606 if ((k == 5) || (k >= 0x100))
1607 { unicode_char_error(
1608 "Character can be printed but not input:", unicode);
1611 k2 = zscii_to_alphabet_grid[(uchar) k];
1614 { if (isupper(k)) k = tolower(k);
1615 k2 = iso_to_alphabet_grid[(uchar) k];
1619 { if ((k2 == -5) || (k2 <= -0x100))
1620 char_error("Character can be printed but not input:", k);
1622 { /* Use 4 more Z-chars to encode a ZSCII escape sequence */
1624 wd[i++] = 5; wd[i++] = 6;
1626 wd[i++] = k2/32; wd[i] = k2%32;
1630 { alphabet_used[k2] = 'Y';
1632 wd[i++]=3+(k2/26); /* Change alphabet for symbols */
1633 wd[i]=6+(k2%26); /* Write the Z character */
1637 /* Fill up to the end of the dictionary block with PAD characters */
1639 for (; i<9; i++) wd[i]=5;
1641 /* The array of Z-chars is converted to three 2-byte blocks */
1643 tot = wd[2] + wd[1]*(1<<5) + wd[0]*(1<<10);
1644 prepared_sort[1]=tot%0x100;
1645 prepared_sort[0]=(tot/0x100)%0x100;
1646 tot = wd[5] + wd[4]*(1<<5) + wd[3]*(1<<10);
1647 prepared_sort[3]=tot%0x100;
1648 prepared_sort[2]=(tot/0x100)%0x100;
1649 tot = wd[8] + wd[7]*(1<<5) + wd[6]*(1<<10);
1650 prepared_sort[5]=tot%0x100;
1651 prepared_sort[4]=(tot/0x100)%0x100;
1653 /* Set the "end bit" on the 2nd (in v3) or the 3rd (v4+) 2-byte block */
1655 if (version_number==3) prepared_sort[2]+=0x80;
1656 else prepared_sort[4]+=0x80;
1658 if (optresult) copy_sorts(optresult, prepared_sort);
1661 /* Also used by verbs.c */
1662 static void dictionary_prepare_g(char *dword, uchar *optresult)
1667 number_and_case = 0;
1669 for (i=0, j=0; (dword[j]!=0); i++, j++) {
1670 if ((dword[j] == '/') && (dword[j+1] == '/')) {
1671 for (j+=2; dword[j] != 0; j++) {
1674 number_and_case |= 4;
1677 error_named("Expected 'p' after '//' \
1678 to give gender or number of dictionary word", dword);
1684 if (i>=DICT_WORD_SIZE) break;
1686 k= ((unsigned char *)dword)[j];
1688 warning_named("Obsolete usage: use the ^ character for the \
1689 apostrophe in", dword);
1692 if (k=='~') /* as in iso_to_alphabet_grid */
1695 if (k=='@' || (character_set_unicode && (k & 0x80))) {
1696 unicode = text_to_unicode(dword+j);
1697 j += textual_form_length - 1;
1700 unicode = iso_to_unicode_grid[k];
1703 if (DICT_CHAR_SIZE != 1 || (unicode >= 0 && unicode < 256)) {
1707 error("The dictionary cannot contain Unicode characters beyond Latin-1. \
1708 Define DICT_CHAR_SIZE=4 for a Unicode-compatible dictionary.");
1712 if (k >= (unsigned)'A' && k <= (unsigned)'Z')
1715 if (DICT_CHAR_SIZE == 1) {
1716 prepared_sort[i] = k;
1719 prepared_sort[4*i] = (k >> 24) & 0xFF;
1720 prepared_sort[4*i+1] = (k >> 16) & 0xFF;
1721 prepared_sort[4*i+2] = (k >> 8) & 0xFF;
1722 prepared_sort[4*i+3] = (k) & 0xFF;
1726 if (DICT_CHAR_SIZE == 1) {
1727 for (; i<DICT_WORD_SIZE; i++)
1728 prepared_sort[i] = 0;
1731 for (; i<DICT_WORD_SIZE; i++) {
1732 prepared_sort[4*i] = 0;
1733 prepared_sort[4*i+1] = 0;
1734 prepared_sort[4*i+2] = 0;
1735 prepared_sort[4*i+3] = 0;
1739 if (optresult) copy_sorts(optresult, prepared_sort);
1742 extern void dictionary_prepare(char *dword, uchar *optresult)
1745 dictionary_prepare_z(dword, optresult);
1747 dictionary_prepare_g(dword, optresult);
1750 /* ------------------------------------------------------------------------- */
1751 /* The arrays below are all concerned with the problem of alphabetically */
1752 /* sorting the dictionary during the compilation pass. */
1753 /* Note that it is not enough simply to apply qsort to the dictionary at */
1754 /* the end of the pass: we need to ensure that no duplicates are ever */
1757 /* dict_sort_codes[n] the sort code of record n: i.e., of the nth */
1758 /* word to be entered into the dictionary, where */
1759 /* n counts upward from 0 */
1760 /* (n is also called the "accession number") */
1762 /* The tree structure encodes an ordering. The special value VACANT means */
1763 /* "no node here": otherwise, node numbers are the same as accession */
1764 /* numbers. At all times, "root" holds the node number of the top of the */
1765 /* tree; each node has up to two branches, such that the subtree of the */
1766 /* left branch is always alphabetically before what's at the node, and */
1767 /* the subtree to the right is always after; and all branches are coloured */
1768 /* either "black" or "red". These colours are used to detect points where */
1769 /* the tree is growing asymmetrically (and therefore becoming inefficient */
1771 /* ------------------------------------------------------------------------- */
1778 typedef struct dict_tree_node_s
1779 { int branch[2]; /* Branch 0 is "left", 1 is "right" */
1780 char colour; /* The colour of the branch to the parent */
1783 static dict_tree_node *dtree;
1785 int *final_dict_order;
1786 static uchar *dict_sort_codes;
1788 static void dictionary_begin_pass(void)
1790 /* Leave room for the 7-byte header (added in "tables.c" much later) */
1791 /* Glulx has a 4-byte header instead. */
1794 dictionary_top=dictionary+7;
1796 dictionary_top=dictionary+4;
1802 static int fdo_count;
1803 static void recursively_sort(int node)
1804 { if (dtree[node].branch[0] != VACANT)
1805 recursively_sort(dtree[node].branch[0]);
1806 final_dict_order[node] = fdo_count++;
1807 if (dtree[node].branch[1] != VACANT)
1808 recursively_sort(dtree[node].branch[1]);
1811 extern void sort_dictionary(void)
1814 { for (i=0; i<dict_entries; i++)
1815 final_dict_order[i] = i;
1820 { fdo_count = 0; recursively_sort(root);
1824 /* ------------------------------------------------------------------------- */
1825 /* If "dword" is in the dictionary, return its accession number plus 1; */
1826 /* If not, return 0. */
1827 /* ------------------------------------------------------------------------- */
1829 static int dictionary_find(char *dword)
1832 dictionary_prepare(dword, NULL);
1834 while (at != VACANT)
1835 { n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
1836 if (n==0) return at + 1;
1837 if (n>0) at = dtree[at].branch[1]; else at = dtree[at].branch[0];
1842 /* ------------------------------------------------------------------------- */
1843 /* Add "dword" to the dictionary with (x,y,z) as its data fields; unless */
1844 /* it already exists, in which case OR the data with (x,y,z) */
1846 /* These fields are one byte each in Z-code, two bytes each in Glulx. */
1848 /* Returns: the accession number. */
1849 /* ------------------------------------------------------------------------- */
1851 extern int dictionary_add(char *dword, int x, int y, int z)
1853 int ggfr = 0, gfr = 0, fr = 0, r = 0;
1854 int ggf = VACANT, gf = VACANT, f = VACANT, at = root;
1856 int res=((version_number==3)?4:6);
1858 dictionary_prepare(dword, NULL);
1861 { root = 0; goto CreateEntry;
1865 n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
1869 p = dictionary+7 + at*(3+res) + res;
1870 p[0]=(p[0])|x; p[1]=(p[1])|y; p[2]=(p[2])|z;
1871 if (x & 128) p[0] = (p[0])|number_and_case;
1874 p = dictionary+4 + at*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
1875 p[0]=(p[0])|(x/256); p[1]=(p[1])|(x%256);
1876 p[2]=(p[2])|(y/256); p[3]=(p[3])|(y%256);
1877 p[4]=(p[4])|(z/256); p[5]=(p[5])|(z%256);
1878 if (x & 128) p[1] = (p[1]) | number_and_case;
1882 if (n>0) r=1; else r=0;
1884 a = dtree[at].branch[0]; b = dtree[at].branch[1];
1885 if ((a != VACANT) && (dtree[a].colour == RED) &&
1886 (b != VACANT) && (dtree[b].colour == RED))
1887 { dtree[a].colour = BLACK;
1888 dtree[b].colour = BLACK;
1890 dtree[at].colour = RED;
1892 /* A tree rotation may be needed to avoid two red links in a row:
1894 ggf (or else gf is root) ggf (or f is root)
1897 / \(red) / \ (both red)
1903 In effect we rehang the "gf" subtree from "f".
1904 See the Technical Manual for further details.
1907 if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
1910 { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
1911 dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
1912 dtree[f].branch[1-fr] = gf;
1913 dtree[f].colour = BLACK;
1914 dtree[gf].colour = RED;
1915 gf = ggf; gfr = ggfr;
1918 { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
1919 dtree[at].colour = BLACK;
1920 dtree[gf].colour = RED;
1921 dtree[f].branch[fr] = dtree[at].branch[gfr];
1922 dtree[gf].branch[gfr] = dtree[at].branch[fr];
1923 dtree[at].branch[gfr] = f;
1924 dtree[at].branch[fr] = gf;
1926 r = 1-r; n = at; if (r==fr) at = f; else at = gf;
1927 f = n; gf = ggf; fr = 1-r; gfr = ggfr;
1932 if (dtree[at].branch[r] == VACANT)
1933 { dtree[at].colour = RED;
1935 if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
1937 { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
1938 dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
1939 dtree[f].branch[1-fr] = gf;
1940 dtree[f].colour = BLACK;
1941 dtree[gf].colour = RED;
1944 { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
1945 dtree[at].colour = BLACK;
1946 dtree[gf].colour = RED;
1947 dtree[f].branch[fr] = dtree[at].branch[gfr];
1948 dtree[gf].branch[gfr] = dtree[at].branch[fr];
1949 dtree[at].branch[gfr] = f;
1950 dtree[at].branch[fr] = gf;
1952 r = 1-r; n = at; if (r==fr) at = f; else at = gf;
1956 dtree[at].branch[r] = dict_entries;
1959 ggf = gf; gf = f; f = at; at = dtree[at].branch[r];
1960 ggfr = gfr; gfr = fr; fr = r;
1965 if (dict_entries==MAX_DICT_ENTRIES)
1966 memoryerror("MAX_DICT_ENTRIES",MAX_DICT_ENTRIES);
1968 dtree[dict_entries].branch[0] = VACANT;
1969 dtree[dict_entries].branch[1] = VACANT;
1970 dtree[dict_entries].colour = BLACK;
1972 /* Address in Inform's own dictionary table to write the record to */
1976 p = dictionary + (3+res)*dict_entries + 7;
1978 /* So copy in the 4 (or 6) bytes of Z-coded text and the 3 data
1981 p[0]=prepared_sort[0]; p[1]=prepared_sort[1];
1982 p[2]=prepared_sort[2]; p[3]=prepared_sort[3];
1983 if (version_number > 3)
1984 { p[4]=prepared_sort[4]; p[5]=prepared_sort[5]; }
1985 p[res]=x; p[res+1]=y; p[res+2]=z;
1986 if (x & 128) p[res] = (p[res])|number_and_case;
1988 dictionary_top += res+3;
1993 p = dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*dict_entries;
1994 p[0] = 0x60; /* type byte -- dict word */
1996 p += DICT_CHAR_SIZE;
1997 for (i=0; i<DICT_WORD_BYTES; i++)
1998 p[i] = prepared_sort[i];
2000 p += DICT_WORD_BYTES;
2002 p[2] = y/256; p[3] = y%256;
2005 p[1] |= number_and_case;
2007 dictionary_top += DICT_ENTRY_BYTE_LENGTH;
2011 copy_sorts(dict_sort_codes+dict_entries*DICT_WORD_BYTES, prepared_sort);
2013 return dict_entries++;
2016 /* ------------------------------------------------------------------------- */
2017 /* Used in "tables.c" for "Extend ... only", to renumber a verb-word to a */
2018 /* new verb syntax of its own. (Otherwise existing verb-words never */
2019 /* change their verb-numbers.) */
2020 /* ------------------------------------------------------------------------- */
2022 extern void dictionary_set_verb_number(char *dword, int to)
2024 int res=((version_number==3)?4:6);
2025 i=dictionary_find(dword);
2029 p=dictionary+7+(i-1)*(3+res)+res;
2033 p=dictionary+4 + (i-1)*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
2034 p[2]=to/256; p[3]=to%256;
2039 /* ------------------------------------------------------------------------- */
2040 /* Tracing code for the dictionary: used not only by "trace" and text */
2041 /* transcription, but also (in the case of "word_to_ascii") in a vital */
2042 /* by the linker. */
2043 /* ------------------------------------------------------------------------- */
2045 static char *d_show_to;
2046 static int d_show_total;
2048 static void show_char(char c)
2049 { if (d_show_to == NULL) printf("%c", c);
2051 { int i = strlen(d_show_to);
2052 d_show_to[i] = c; d_show_to[i+1] = 0;
2056 extern void word_to_ascii(uchar *p, char *results)
2057 { int i, shift, cc, zchar; uchar encoded_word[9];
2058 encoded_word[0] = (((int) p[0])&0x7c)/4;
2059 encoded_word[1] = 8*(((int) p[0])&0x3) + (((int) p[1])&0xe0)/32;
2060 encoded_word[2] = ((int) p[1])&0x1f;
2061 encoded_word[3] = (((int) p[2])&0x7c)/4;
2062 encoded_word[4] = 8*(((int) p[2])&0x3) + (((int) p[3])&0xe0)/32;
2063 encoded_word[5] = ((int) p[3])&0x1f;
2064 if (version_number > 3)
2065 { encoded_word[6] = (((int) p[4])&0x7c)/4;
2066 encoded_word[7] = 8*(((int) p[4])&0x3) + (((int) p[5])&0xe0)/32;
2067 encoded_word[8] = ((int) p[5])&0x1f;
2071 for (i=0; i< ((version_number==3)?6:9); i++)
2072 { zchar = encoded_word[i];
2074 if (zchar == 4) shift = 1;
2076 if (zchar == 5) shift = 2;
2078 { if ((shift == 2) && (zchar == 6))
2079 { zchar = 32*encoded_word[i+1] + encoded_word[i+2];
2081 if ((zchar>=32) && (zchar<=126))
2082 results[cc++] = zchar;
2084 { zscii_to_text(results+cc, zchar);
2085 cc = strlen(results);
2089 { zscii_to_text(results+cc, (alphabet[shift])[zchar-6]);
2090 cc = strlen(results);
2098 static void recursively_show_z(int node)
2099 { int i, cprinted, flags; uchar *p;
2100 char textual_form[32];
2101 int res = (version_number == 3)?4:6;
2103 if (dtree[node].branch[0] != VACANT)
2104 recursively_show_z(dtree[node].branch[0]);
2106 p = (uchar *)dictionary + 7 + (3+res)*node;
2108 word_to_ascii(p, textual_form);
2110 for (cprinted = 0; textual_form[cprinted]!=0; cprinted++)
2111 show_char(textual_form[cprinted]);
2112 for (; cprinted < 4 + ((version_number==3)?6:9); cprinted++)
2115 if (d_show_to == NULL)
2116 { for (i=0; i<3+res; i++) printf("%02x ",p[i]);
2118 flags = (int) p[res];
2121 if (flags & 4) printf("p"); else printf(" ");
2126 { if (grammar_version_number == 1)
2127 printf("preposition:%d ", (int) p[res+2]);
2129 printf("preposition ");
2131 if ((flags & 3) == 3) printf("metaverb:%d ", (int) p[res+1]);
2132 else if ((flags & 3) == 1) printf("verb:%d ", (int) p[res+1]);
2136 if (d_show_total++ == 5)
2138 if (d_show_to != NULL)
2139 { write_to_transcript_file(d_show_to);
2144 if (dtree[node].branch[1] != VACANT)
2145 recursively_show_z(dtree[node].branch[1]);
2148 static void recursively_show_g(int node)
2150 warning("### Glulx dictionary-show not yet implemented.\n");
2153 static void show_alphabet(int i)
2154 { int j, c; char chartext[8];
2156 for (j=0; j<26; j++)
2157 { c = alphabet[i][j];
2159 if (alphabet_used[26*i+j] == 'N') printf("("); else printf(" ");
2161 zscii_to_text(chartext, c);
2162 printf("%s", chartext);
2164 if (alphabet_used[26*i+j] == 'N') printf(")"); else printf(" ");
2169 extern void show_dictionary(void)
2170 { printf("Dictionary contains %d entries:\n",dict_entries);
2171 if (dict_entries != 0)
2172 { d_show_total = 0; d_show_to = NULL;
2174 recursively_show_z(root);
2176 recursively_show_g(root);
2178 printf("\nZ-machine alphabet entries:\n");
2184 extern void write_dictionary_to_transcript(void)
2185 { char d_buffer[81];
2187 sprintf(d_buffer, "\n[Dictionary contains %d entries:]\n", dict_entries);
2189 d_buffer[0] = 0; write_to_transcript_file(d_buffer);
2191 if (dict_entries != 0)
2192 { d_show_total = 0; d_show_to = d_buffer;
2194 recursively_show_z(root);
2196 recursively_show_g(root);
2198 if (d_show_total != 0) write_to_transcript_file(d_buffer);
2201 /* ========================================================================= */
2202 /* Data structure management routines */
2203 /* ------------------------------------------------------------------------- */
2205 extern void init_text_vars(void)
2212 no_chars_transcribed = 0;
2213 is_abbreviation = FALSE;
2214 put_strings_in_low_memory = FALSE;
2216 for (j=0; j<256; j++) abbrevs_lookup[j] = -1;
2218 total_zchars_trans = 0;
2221 final_dict_order = NULL;
2222 dict_sort_codes = NULL;
2225 initialise_memory_block(&static_strings_area);
2228 extern void text_begin_pass(void)
2229 { abbrevs_lookup_table_made = FALSE;
2231 total_chars_trans=0; total_bytes_trans=0;
2232 if (store_the_text) all_text_top=all_text;
2233 dictionary_begin_pass();
2234 low_strings_top = low_strings;
2236 static_strings_extent = 0;
2238 no_dynamic_strings = 0;
2239 no_unicode_chars = 0;
2242 /* Note: for allocation and deallocation of all_the_text, see inform.c */
2244 extern void text_allocate_arrays(void)
2245 { abbreviations_at = my_malloc(MAX_ABBREVS*MAX_ABBREV_LENGTH,
2247 abbrev_values = my_calloc(sizeof(int), MAX_ABBREVS, "abbrev values");
2248 abbrev_quality = my_calloc(sizeof(int), MAX_ABBREVS, "abbrev quality");
2249 abbrev_freqs = my_calloc(sizeof(int), MAX_ABBREVS, "abbrev freqs");
2251 dtree = my_calloc(sizeof(dict_tree_node), MAX_DICT_ENTRIES,
2252 "red-black tree for dictionary");
2253 final_dict_order = my_calloc(sizeof(int), MAX_DICT_ENTRIES,
2254 "final dictionary ordering table");
2255 dict_sort_codes = my_calloc(DICT_WORD_BYTES, MAX_DICT_ENTRIES,
2256 "dictionary sort codes");
2259 dictionary = my_malloc(9*MAX_DICT_ENTRIES+7,
2262 dictionary = my_malloc(DICT_ENTRY_BYTE_LENGTH*MAX_DICT_ENTRIES+4,
2265 strings_holding_area
2266 = my_malloc(MAX_STATIC_STRINGS,"static strings holding area");
2267 low_strings = my_malloc(MAX_LOW_STRINGS,"low (abbreviation) strings");
2269 huff_entities = NULL;
2271 unicode_usage_entries = NULL;
2272 done_compression = FALSE;
2273 compression_table_size = 0;
2274 compressed_offsets = NULL;
2276 MAX_CHARACTER_SET = 0;
2279 if (compression_switch) {
2281 MAX_CHARACTER_SET = 257 + MAX_ABBREVS + MAX_DYNAMIC_STRINGS
2282 + MAX_UNICODE_CHARS;
2283 huff_entities = my_calloc(sizeof(huffentity_t), MAX_CHARACTER_SET*2+1,
2284 "huffman entities");
2285 hufflist = my_calloc(sizeof(huffentity_t *), MAX_CHARACTER_SET,
2286 "huffman node list");
2287 unicode_usage_entries = my_calloc(sizeof(unicode_usage_t),
2288 MAX_UNICODE_CHARS, "unicode entity entries");
2289 for (ix=0; ix<UNICODE_HASH_BUCKETS; ix++)
2290 unicode_usage_hash[ix] = NULL;
2292 compressed_offsets = my_calloc(sizeof(int32), MAX_NUM_STATIC_STRINGS,
2293 "static strings index table");
2297 extern void text_free_arrays(void)
2299 my_free(&strings_holding_area, "static strings holding area");
2300 my_free(&low_strings, "low (abbreviation) strings");
2301 my_free(&abbreviations_at, "abbreviations");
2302 my_free(&abbrev_values, "abbrev values");
2303 my_free(&abbrev_quality, "abbrev quality");
2304 my_free(&abbrev_freqs, "abbrev freqs");
2306 my_free(&dtree, "red-black tree for dictionary");
2307 my_free(&final_dict_order, "final dictionary ordering table");
2308 my_free(&dict_sort_codes, "dictionary sort codes");
2310 my_free(&dictionary,"dictionary");
2312 my_free(&compressed_offsets, "static strings index table");
2313 my_free(&hufflist, "huffman node list");
2314 my_free(&huff_entities, "huffman entities");
2315 my_free(&unicode_usage_entries, "unicode entity entities");
2317 deallocate_memory_block(&static_strings_area);
2320 extern void ao_free_arrays(void)
2321 { my_free (&tlbtab,"tlb table");
2322 my_free (&sub_buffer,"sub_buffer");
2323 my_free (&bestyet,"bestyet");
2324 my_free (&bestyet2,"bestyet2");
2325 my_free (&grandtable,"grandtable");
2326 my_free (&grandflags,"grandflags");
2329 /* ========================================================================= */