1 /* ------------------------------------------------------------------------- */
2 /* "text" : Text translation, the abbreviations optimiser, the dictionary */
4 /* Copyright (c) Graham Nelson 1993 - 2016 */
6 /* This file is part of Inform. */
8 /* Inform is free software: you can redistribute it and/or modify */
9 /* it under the terms of the GNU General Public License as published by */
10 /* the Free Software Foundation, either version 3 of the License, or */
11 /* (at your option) any later version. */
13 /* Inform is distributed in the hope that it will be useful, */
14 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
16 /* GNU General Public License for more details. */
18 /* You should have received a copy of the GNU General Public License */
19 /* along with Inform. If not, see https://gnu.org/licenses/ */
21 /* ------------------------------------------------------------------------- */
25 uchar *low_strings, *low_strings_top; /* Start and next free byte in the low
28 int32 static_strings_extent; /* Number of bytes of static strings
30 memory_block static_strings_area; /* Used if (!temporary_files_switch) to
31 hold the static strings area so far */
33 static uchar *strings_holding_area; /* Area holding translated strings
34 until they are moved into either
35 a temporary file, or the
36 static_strings_area below */
38 char *all_text, *all_text_top; /* Start and next byte free in (large)
39 text buffer holding the entire text
40 of the game, when it is being
42 int put_strings_in_low_memory, /* When TRUE, put static strings in
43 the low strings pool at 0x100 rather
44 than in the static strings area */
45 is_abbreviation, /* When TRUE, the string being trans
46 is itself an abbreviation string
47 so can't make use of abbreviations */
48 abbrevs_lookup_table_made, /* The abbreviations lookup table is
49 constructed when the first non-
50 abbreviation string is translated:
51 this flag is TRUE after that */
52 abbrevs_lookup[256]; /* Once this has been constructed,
53 abbrevs_lookup[n] = the smallest
54 number of any abbreviation beginning
55 with ASCII character n, or -1
56 if none of the abbreviations do */
57 int no_abbreviations; /* No of abbreviations defined so far */
58 uchar *abbreviations_at; /* Memory to hold the text of any
59 abbreviation strings declared */
60 /* ------------------------------------------------------------------------- */
61 /* Glulx string compression storage */
62 /* ------------------------------------------------------------------------- */
64 int no_strings; /* No of strings in static strings
66 int no_dynamic_strings; /* No. of @.. string escapes used
67 (actually, the highest value used
69 int no_unicode_chars; /* Number of distinct Unicode chars
70 used. (Beyond 0xFF.) */
72 static int MAX_CHARACTER_SET; /* Number of possible entities */
73 huffentity_t *huff_entities; /* The list of entities (characters,
74 abbreviations, @.. escapes, and
76 static huffentity_t **hufflist; /* Copy of the list, for sorting */
78 int no_huff_entities; /* The number of entities in the list */
79 int huff_unicode_start; /* Position in the list where Unicode
81 int huff_abbrev_start; /* Position in the list where string
82 abbreviations begin. */
83 int huff_dynam_start; /* Position in the list where @..
85 int huff_entity_root; /* The position in the list of the root
86 entry (when considering the table
89 int done_compression; /* Has the game text been compressed? */
90 int32 compression_table_size; /* Length of the Huffman table, in
92 int32 compression_string_size; /* Length of the compressed string
94 int32 *compressed_offsets; /* The beginning of every string in
95 the game, relative to the beginning
96 of the Huffman table. (So entry 0
97 is equal to compression_table_size)*/
99 #define UNICODE_HASH_BUCKETS (64)
100 unicode_usage_t *unicode_usage_entries;
101 static unicode_usage_t *unicode_usage_hash[UNICODE_HASH_BUCKETS];
103 static int unicode_entity_index(int32 unicode);
105 /* ------------------------------------------------------------------------- */
106 /* Abbreviation arrays */
107 /* ------------------------------------------------------------------------- */
113 /* ------------------------------------------------------------------------- */
115 int32 total_chars_trans, /* Number of ASCII chars of text in */
116 total_bytes_trans, /* Number of bytes of Z-code text out */
117 zchars_trans_in_last_string; /* Number of Z-chars in last string:
118 needed only for abbrev efficiency
119 calculation in "directs.c" */
120 static int32 total_zchars_trans, /* Number of Z-chars of text out
121 (only used to calculate the above) */
122 no_chars_transcribed; /* Number of ASCII chars written to
123 the text transcription area (used
124 for the -r and -u switches) */
126 static int zchars_out_buffer[3], /* During text translation, a buffer of
127 3 Z-chars at a time: when it's full
128 these are written as a 2-byte word */
129 zob_index; /* Index (0 to 2) into it */
131 static unsigned char *text_out_pc; /* The "program counter" during text
132 translation: the next address to
133 write Z-coded text output to */
135 static unsigned char *text_out_limit; /* The upper limit of text_out_pc
136 during text translation */
138 static int text_out_overflow; /* During text translation, becomes
139 true if text_out_pc tries to pass
142 /* ------------------------------------------------------------------------- */
143 /* For variables/arrays used by the dictionary manager, see below */
144 /* ------------------------------------------------------------------------- */
146 /* ------------------------------------------------------------------------- */
147 /* Prepare the abbreviations lookup table (used to speed up abbreviation */
148 /* detection in text translation). We first bubble-sort the abbrevs into */
149 /* alphabetical order (this is necessary for the detection algorithm to */
150 /* to work). Since the table is only prepared once, and for a table */
151 /* of size at most 96, there's no point using an efficient sort algorithm. */
152 /* ------------------------------------------------------------------------- */
154 static void make_abbrevs_lookup(void)
155 { int bubble_sort, j, k, l; char p[MAX_ABBREV_LENGTH]; char *p1, *p2;
157 { bubble_sort = FALSE;
158 for (j=0; j<no_abbreviations; j++)
159 for (k=j+1; k<no_abbreviations; k++)
160 { p1=(char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
161 p2=(char *)abbreviations_at+k*MAX_ABBREV_LENGTH;
163 { strcpy(p,p1); strcpy(p1,p2); strcpy(p2,p);
164 l=abbrev_values[j]; abbrev_values[j]=abbrev_values[k];
166 l=abbrev_quality[j]; abbrev_quality[j]=abbrev_quality[k];
171 } while (bubble_sort);
173 for (j=no_abbreviations-1; j>=0; j--)
174 { p1=(char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
175 abbrevs_lookup[(uchar)p1[0]]=j;
178 abbrevs_lookup_table_made = TRUE;
181 /* ------------------------------------------------------------------------- */
182 /* Search the abbreviations lookup table (a routine which must be fast). */
183 /* The source text to compare is text[i], text[i+1], ... and this routine */
184 /* is only called if text[i] is indeed the first character of at least one */
185 /* abbreviation, "from" begin the least index into the abbreviations table */
186 /* of an abbreviation for which text[i] is the first character. Recall */
187 /* that the abbrevs table is in alphabetical order. */
189 /* The return value is -1 if there is no match. If there is a match, the */
190 /* text to be abbreviated out is over-written by a string of null chars */
191 /* with "ASCII" value 1, and the abbreviation number is returned. */
193 /* In Glulx, we *do not* do this overwriting with 1's. */
194 /* ------------------------------------------------------------------------- */
196 static int try_abbreviations_from(unsigned char *text, int i, int from)
197 { int j, k; uchar *p, c;
199 for (j=from, p=(uchar *)abbreviations_at+from*MAX_ABBREV_LENGTH;
200 (j<no_abbreviations)&&(c==p[0]); j++, p+=MAX_ABBREV_LENGTH)
201 { if (text[i+1]==p[1])
202 { for (k=2; p[k]!=0; k++)
203 if (text[i+k]!=p[k]) goto NotMatched;
205 for (k=0; p[k]!=0; k++) text[i+k]=1;
215 extern void make_abbreviation(char *text)
217 strcpy((char *)abbreviations_at
218 + no_abbreviations*MAX_ABBREV_LENGTH, text);
220 is_abbreviation = TRUE;
221 abbrev_values[no_abbreviations] = compile_string(text, TRUE, TRUE);
222 is_abbreviation = FALSE;
224 /* The quality is the number of Z-chars saved by using this */
225 /* abbreviation: note that it takes 2 Z-chars to print it. */
227 abbrev_quality[no_abbreviations++] = zchars_trans_in_last_string - 2;
230 /* ------------------------------------------------------------------------- */
231 /* The front end routine for text translation */
232 /* ------------------------------------------------------------------------- */
234 extern int32 compile_string(char *b, int in_low_memory, int is_abbrev)
235 { int i, j; uchar *c;
237 is_abbreviation = is_abbrev;
239 /* Put into the low memory pool (at 0x100 in the Z-machine) of strings */
240 /* which may be wanted as possible entries in the abbreviations table */
242 if (!glulx_mode && in_low_memory)
243 { j=subtract_pointers(low_strings_top,low_strings);
244 low_strings_top=translate_text(low_strings_top, low_strings+MAX_LOW_STRINGS, b);
245 if (!low_strings_top)
246 memoryerror("MAX_LOW_STRINGS", MAX_LOW_STRINGS);
247 is_abbreviation = FALSE;
251 if (glulx_mode && done_compression)
252 compiler_error("Tried to add a string after compression was done.");
254 c = translate_text(strings_holding_area, strings_holding_area+MAX_STATIC_STRINGS, b);
256 memoryerror("MAX_STATIC_STRINGS",MAX_STATIC_STRINGS);
258 i = subtract_pointers(c, strings_holding_area);
260 /* Insert null bytes as needed to ensure that the next static string */
261 /* also occurs at an address expressible as a packed address */
265 if (oddeven_packing_switch)
266 textalign = scale_factor*2;
268 textalign = scale_factor;
269 while ((i%textalign)!=0)
271 if (i+2 > MAX_STATIC_STRINGS)
272 memoryerror("MAX_STATIC_STRINGS",MAX_STATIC_STRINGS);
273 i+=2; *c++ = 0; *c++ = 0;
277 j = static_strings_extent;
279 if (temporary_files_switch)
280 for (c=strings_holding_area; c<strings_holding_area+i;
281 c++, static_strings_extent++)
284 for (c=strings_holding_area; c<strings_holding_area+i;
285 c++, static_strings_extent++)
286 write_byte_to_memory_block(&static_strings_area,
287 static_strings_extent, *c);
289 is_abbreviation = FALSE;
292 return(j/scale_factor);
295 /* The marker value is a one-based string number. (We reserve zero
296 to mean "not a string at all". */
297 return (++no_strings);
301 /* ------------------------------------------------------------------------- */
302 /* Output a single Z-character into the buffer, and flush it if full */
303 /* ------------------------------------------------------------------------- */
305 static void write_z_char_z(int i)
308 total_zchars_trans++;
309 zchars_out_buffer[zob_index++]=(i%32);
310 if (zob_index!=3) return;
312 j= zchars_out_buffer[0]*0x0400 + zchars_out_buffer[1]*0x0020
313 + zchars_out_buffer[2];
314 if (text_out_pc+2 > text_out_limit) {
315 text_out_overflow = TRUE;
318 text_out_pc[0] = j/256; text_out_pc[1] = j%256; text_out_pc+=2;
319 total_bytes_trans+=2;
322 static void write_zscii(int zsc)
324 int lookup_value, in_alphabet;
331 if (zsc < 0x100) lookup_value = zscii_to_alphabet_grid[zsc];
333 else lookup_value = -1;
335 if (lookup_value >= 0)
336 { alphabet_used[lookup_value] = 'Y';
337 in_alphabet = lookup_value/26;
338 if (in_alphabet==1) write_z_char_z(4); /* SHIFT to A1 */
339 if (in_alphabet==2) write_z_char_z(5); /* SHIFT to A2 */
340 write_z_char_z(lookup_value%26 + 6);
343 { write_z_char_z(5); write_z_char_z(6);
344 write_z_char_z(zsc/32); write_z_char_z(zsc%32);
348 /* ------------------------------------------------------------------------- */
349 /* Finish a Z-coded string, padding out with Z-char 5s if necessary and */
350 /* setting the "end" bit on the final 2-byte word */
351 /* ------------------------------------------------------------------------- */
353 static void end_z_chars(void)
355 zchars_trans_in_last_string=total_zchars_trans-zchars_trans_in_last_string;
356 while (zob_index!=0) write_z_char_z(5);
357 p=(unsigned char *) text_out_pc;
361 /* Glulx handles this much more simply -- compression is done elsewhere. */
362 static void write_z_char_g(int i)
365 if (text_out_pc+1 > text_out_limit) {
366 text_out_overflow = TRUE;
369 total_zchars_trans++;
375 /* ------------------------------------------------------------------------- */
376 /* The main routine "text.c" provides to the rest of Inform: the text */
377 /* translator. p is the address to write output to, s_text the source text */
378 /* and the return value is the next free address to write output to. */
379 /* The return value will not exceed p_limit. If the translation tries to */
380 /* overflow this boundary, the return value will be NULL (and you should */
381 /* display an error). */
382 /* Note that the source text may be corrupted by this routine. */
383 /* ------------------------------------------------------------------------- */
385 extern uchar *translate_text(uchar *p, uchar *p_limit, char *s_text)
386 { int i, j, k, in_alphabet, lookup_value;
387 int32 unicode; int zscii;
388 unsigned char *text_in;
390 /* Cast the input and output streams to unsigned char: text_out_pc will
391 advance as bytes of Z-coded text are written, but text_in doesn't */
393 text_in = (unsigned char *) s_text;
394 text_out_pc = (unsigned char *) p;
395 text_out_limit = (unsigned char *) p_limit;
396 text_out_overflow = FALSE;
398 /* Remember the Z-chars total so that later we can subtract to find the
399 number of Z-chars translated on this string */
401 zchars_trans_in_last_string = total_zchars_trans;
403 /* Start with the Z-characters output buffer empty */
407 /* If this is the first text translated since the abbreviations were
408 declared, and if some were declared, then it's time to make the
409 lookup table for abbreviations
411 (Except: we don't if the text being translated is itself
412 the text of an abbreviation currently being defined) */
414 if ((!abbrevs_lookup_table_made) && (no_abbreviations > 0)
415 && (!is_abbreviation))
416 make_abbrevs_lookup();
418 /* If we're storing the whole game text to memory, then add this text */
420 if ((!is_abbreviation) && (store_the_text))
421 { no_chars_transcribed += strlen(s_text)+2;
422 if (no_chars_transcribed >= MAX_TRANSCRIPT_SIZE)
423 memoryerror("MAX_TRANSCRIPT_SIZE", MAX_TRANSCRIPT_SIZE);
424 sprintf(all_text_top, "%s\n\n", s_text);
425 all_text_top += strlen(all_text_top);
428 if (transcript_switch && (!veneer_mode))
429 write_to_transcript_file(s_text);
433 /* The empty string of Z-text is illegal, since it can't carry an end
434 bit: so we translate an empty string of ASCII text to just the
435 pad character 5. Printing this causes nothing to appear on screen. */
437 if (text_in[0]==0) write_z_char_z(5);
439 /* Loop through the characters of the null-terminated input text: note
440 that if 1 is written over a character in the input text, it is
441 afterwards ignored */
443 for (i=0; text_in[i]!=0; i++)
444 { total_chars_trans++;
446 /* Contract ". " into ". " if double-space-removing switch set:
447 likewise "? " and "! " if the setting is high enough */
449 if ((double_space_setting >= 1)
450 && (text_in[i+1]==' ') && (text_in[i+2]==' '))
451 { if (text_in[i]=='.') text_in[i+2]=1;
452 if (double_space_setting >= 2)
453 { if (text_in[i]=='?') text_in[i+2]=1;
454 if (text_in[i]=='!') text_in[i+2]=1;
458 /* Try abbreviations if the economy switch set */
460 if ((economy_switch) && (!is_abbreviation)
461 && ((k=abbrevs_lookup[text_in[i]])!=-1))
462 { if ((j=try_abbreviations_from(text_in, i, k))!=-1)
463 { if (j<32) { write_z_char_z(2); write_z_char_z(j); }
464 else { write_z_char_z(3); write_z_char_z(j-32); }
468 /* If Unicode switch set, use text_to_unicode to perform UTF-8
470 if (character_set_unicode && (text_in[i] & 0x80))
471 { unicode = text_to_unicode((char *) (text_in+i));
472 zscii = unicode_to_zscii(unicode);
473 if (zscii != 5) write_zscii(zscii);
475 { unicode_char_error(
476 "Character can only be used if declared in \
477 advance as part of 'Zcharacter table':", unicode);
479 i += textual_form_length - 1;
483 /* '@' is the escape character in Inform string notation: the various
487 @@decimalnumber : write this ZSCII char (0 to 1023)
488 @twodigits : write the abbreviation string with this
492 @accentcode : this accented character: e.g.,
493 for @'e write an E-acute
494 @{...} : this Unicode char (in hex) */
497 { if (text_in[i+1]=='@')
501 i+=2; j=atoi((char *) (text_in+i));
503 { /* Prevent ~ and ^ from being translated to double-quote
504 and new-line, as they ordinarily would be */
506 case 94: write_z_char_z(5); write_z_char_z(6);
507 write_z_char_z(94/32); write_z_char_z(94%32);
509 case 126: write_z_char_z(5); write_z_char_z(6);
510 write_z_char_z(126/32); write_z_char_z(126%32);
513 default: write_zscii(j); break;
515 while (isdigit(text_in[i])) i++; i--;
517 else if (isdigit(text_in[i+1])!=0)
522 d1 = character_digit_value[text_in[i+1]];
523 d2 = character_digit_value[text_in[i+2]];
524 if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10))
525 error("'@..' must have two decimal digits");
528 write_z_char_z(1); write_z_char_z(d1*10 + d2);
533 /* A string escape specifying an unusual character */
535 unicode = text_to_unicode((char *) (text_in+i));
536 zscii = unicode_to_zscii(unicode);
537 if (zscii != 5) write_zscii(zscii);
539 { unicode_char_error(
540 "Character can only be used if declared in \
541 advance as part of 'Zcharacter table':", unicode);
543 i += textual_form_length - 1;
547 { /* Skip a character which has been over-written with the null
548 value 1 earlier on */
551 { if (text_in[i]==' ') write_z_char_z(0);
553 { j = (int) text_in[i];
554 lookup_value = iso_to_alphabet_grid[j];
555 if (lookup_value < 0)
556 { /* The character isn't in the standard alphabets, so
557 we have to use the ZSCII 4-Z-char sequence */
559 if (lookup_value == -5)
560 { /* Character isn't in the ZSCII set at all */
562 unicode = iso_to_unicode(j);
564 "Character can only be used if declared in \
565 advance as part of 'Zcharacter table':", unicode);
566 write_zscii(0x200 + unicode/0x100);
567 write_zscii(0x300 + unicode%0x100);
569 else write_zscii(-lookup_value);
572 { /* The character is in one of the standard alphabets:
573 write a SHIFT to temporarily change alphabet if
574 it isn't in alphabet 0, then write the Z-char */
576 alphabet_used[lookup_value] = 'Y';
577 in_alphabet = lookup_value/26;
578 if (in_alphabet==1) write_z_char_z(4); /* SHIFT to A1 */
579 if (in_alphabet==2) write_z_char_z(5); /* SHIFT to A2 */
580 write_z_char_z(lookup_value%26 + 6);
587 /* Flush the Z-characters output buffer and set the "end" bit */
594 /* The text storage here is, of course, temporary. Compression
595 will occur when we're finished compiling, so that all the
596 clever Huffman stuff will work.
597 In the stored text, we use "@@" to indicate @,
598 "@0" to indicate a zero byte,
599 "@ANNNN" to indicate an abbreviation,
600 "@DNNNN" to indicate a dynamic string thing.
601 "@UNNNN" to indicate a four-byte Unicode value (0x100 or higher).
602 (NNNN is a four-digit hex number using the letters A-P... an
603 ugly representation but a convenient one.)
606 for (i=0; text_in[i]!=0; i++) {
608 /* Contract ". " into ". " if double-space-removing switch set:
609 likewise "? " and "! " if the setting is high enough. */
610 if ((double_space_setting >= 1)
611 && (text_in[i+1]==' ') && (text_in[i+2]==' ')) {
613 || (double_space_setting >= 2
614 && (text_in[i]=='?' || text_in[i]=='!'))) {
615 text_in[i+1] = text_in[i];
622 /* Try abbreviations if the economy switch set. We have to be in
623 compression mode too, since the abbreviation mechanism is part
624 of string decompression. */
626 if ((economy_switch) && (compression_switch) && (!is_abbreviation)
627 && ((k=abbrevs_lookup[text_in[i]])!=-1)
628 && ((j=try_abbreviations_from(text_in, i, k)) != -1)) {
629 char *cx = (char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
633 write_z_char_g('A' + ((j >>12) & 0x0F));
634 write_z_char_g('A' + ((j >> 8) & 0x0F));
635 write_z_char_g('A' + ((j >> 4) & 0x0F));
636 write_z_char_g('A' + ((j ) & 0x0F));
638 else if (text_in[i] == '@') {
639 if (text_in[i+1]=='@') {
641 i+=2; j=atoi((char *) (text_in+i));
642 if (j == '@' || j == '\0') {
646 if (!compression_switch)
647 warning("Ascii @@0 will prematurely terminate non-compressed \
652 while (isdigit(text_in[i])) i++; i--;
654 else if (isdigit(text_in[i+1])) {
656 d1 = character_digit_value[text_in[i+1]];
657 d2 = character_digit_value[text_in[i+2]];
658 if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10)) {
659 error("'@..' must have two decimal digits");
662 if (!compression_switch)
663 warning("'@..' print variable will not work in non-compressed \
664 string; substituting ' '.");
667 if (j >= MAX_DYNAMIC_STRINGS) {
668 memoryerror("MAX_DYNAMIC_STRINGS", MAX_DYNAMIC_STRINGS);
671 if (j+1 >= no_dynamic_strings)
672 no_dynamic_strings = j+1;
675 write_z_char_g('A' + ((j >>12) & 0x0F));
676 write_z_char_g('A' + ((j >> 8) & 0x0F));
677 write_z_char_g('A' + ((j >> 4) & 0x0F));
678 write_z_char_g('A' + ((j ) & 0x0F));
682 unicode = text_to_unicode((char *) (text_in+i));
683 i += textual_form_length - 1;
684 if (unicode == '@' || unicode == '\0') {
686 write_z_char_g(unicode ? '@' : '0');
688 else if (unicode >= 0 && unicode < 256) {
689 write_z_char_g(unicode);
692 if (!compression_switch) {
693 warning("Unicode characters will not work in non-compressed \
694 string; substituting '?'.");
698 j = unicode_entity_index(unicode);
701 write_z_char_g('A' + ((j >>12) & 0x0F));
702 write_z_char_g('A' + ((j >> 8) & 0x0F));
703 write_z_char_g('A' + ((j >> 4) & 0x0F));
704 write_z_char_g('A' + ((j ) & 0x0F));
709 else if (text_in[i] == '^')
710 write_z_char_g(0x0A);
711 else if (text_in[i] == '~')
713 else if (character_set_unicode) {
714 if (text_in[i] & 0x80) {
715 unicode = text_to_unicode((char *) (text_in+i));
716 i += textual_form_length - 1;
717 if (unicode >= 0 && unicode < 256) {
718 write_z_char_g(unicode);
721 if (!compression_switch) {
722 warning("Unicode characters will not work in non-compressed \
723 string; substituting '?'.");
727 j = unicode_entity_index(unicode);
730 write_z_char_g('A' + ((j >>12) & 0x0F));
731 write_z_char_g('A' + ((j >> 8) & 0x0F));
732 write_z_char_g('A' + ((j >> 4) & 0x0F));
733 write_z_char_g('A' + ((j ) & 0x0F));
738 write_z_char_g(text_in[i]);
742 unicode = iso_to_unicode_grid[text_in[i]];
743 if (unicode >= 0 && unicode < 256) {
744 write_z_char_g(unicode);
747 if (!compression_switch) {
748 warning("Unicode characters will not work in non-compressed \
749 string; substituting '?'.");
753 j = unicode_entity_index(unicode);
756 write_z_char_g('A' + ((j >>12) & 0x0F));
757 write_z_char_g('A' + ((j >> 8) & 0x0F));
758 write_z_char_g('A' + ((j >> 4) & 0x0F));
759 write_z_char_g('A' + ((j ) & 0x0F));
768 if (text_out_overflow)
771 return((uchar *) text_out_pc);
774 static int unicode_entity_index(int32 unicode)
776 unicode_usage_t *uptr;
778 int buck = unicode % UNICODE_HASH_BUCKETS;
780 for (uptr = unicode_usage_hash[buck]; uptr; uptr=uptr->next) {
781 if (uptr->ch == unicode)
785 j = (uptr - unicode_usage_entries);
788 if (no_unicode_chars >= MAX_UNICODE_CHARS) {
789 memoryerror("MAX_UNICODE_CHARS", MAX_UNICODE_CHARS);
793 j = no_unicode_chars;
795 uptr = unicode_usage_entries + j;
797 uptr->next = unicode_usage_hash[buck];
798 unicode_usage_hash[buck] = uptr;
805 /* ------------------------------------------------------------------------- */
806 /* Glulx compression code */
807 /* ------------------------------------------------------------------------- */
810 static void compress_makebits(int entnum, int depth, int prevbit,
811 huffbitlist_t *bits);
813 /* The compressor. This uses the usual Huffman compression algorithm. */
814 void compress_game_text()
816 int entities=0, branchstart, branches;
824 if (compression_switch) {
826 /* How many entities have we currently got? Well, 256 plus the
827 string-terminator plus Unicode chars plus abbrevations plus
830 huff_unicode_start = entities;
831 entities += no_unicode_chars;
832 huff_abbrev_start = entities;
834 entities += no_abbreviations;
835 huff_dynam_start = entities;
836 entities += no_dynamic_strings;
838 if (entities > MAX_CHARACTER_SET)
839 memoryerror("MAX_CHARACTER_SET",MAX_CHARACTER_SET);
842 for (jx=0; jx<256; jx++) {
843 huff_entities[jx].type = 2;
844 huff_entities[jx].count = 0;
845 huff_entities[jx].u.ch = jx;
848 huff_entities[256].type = 1;
849 huff_entities[256].count = 0;
850 for (jx=0; jx<no_unicode_chars; jx++) {
851 huff_entities[huff_unicode_start+jx].type = 4;
852 huff_entities[huff_unicode_start+jx].count = 0;
853 huff_entities[huff_unicode_start+jx].u.val = jx;
855 if (economy_switch) {
856 for (jx=0; jx<no_abbreviations; jx++) {
857 huff_entities[huff_abbrev_start+jx].type = 3;
858 huff_entities[huff_abbrev_start+jx].count = 0;
859 huff_entities[huff_abbrev_start+jx].u.val = jx;
862 for (jx=0; jx<no_dynamic_strings; jx++) {
863 huff_entities[huff_dynam_start+jx].type = 9;
864 huff_entities[huff_dynam_start+jx].count = 0;
865 huff_entities[huff_dynam_start+jx].u.val = jx;
869 /* No compression; use defaults that will make it easy to check
871 no_huff_entities = 257;
872 huff_unicode_start = 257;
873 huff_abbrev_start = 257;
874 huff_dynam_start = 257+MAX_ABBREVS;
875 compression_table_size = 0;
878 if (temporary_files_switch) {
880 Temp1_fp=fopen(Temp1_Name,"rb");
882 fatalerror("I/O failure: couldn't reopen temporary file 1");
885 if (compression_switch) {
887 for (lx=0, ix=0; lx<no_strings; lx++) {
888 int escapelen=0, escapetype=0;
892 if (temporary_files_switch)
893 ch = fgetc(Temp1_fp);
895 ch = read_byte_from_memory_block(&static_strings_area, ix);
897 if (ix > static_strings_extent || ch < 0)
898 compiler_error("Read too much not-yet-compressed text.");
899 if (escapelen == -1) {
904 else if (ch == '0') {
907 else if (ch == 'A' || ch == 'D' || ch == 'U') {
914 compiler_error("Strange @ escape in processed text.");
917 else if (escapelen) {
918 escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
920 if (escapelen == 0) {
921 if (escapetype == 'A') {
922 ch = huff_abbrev_start+escapeval;
924 else if (escapetype == 'D') {
925 ch = huff_dynam_start+escapeval;
927 else if (escapetype == 'U') {
928 ch = huff_unicode_start+escapeval;
931 compiler_error("Strange @ escape in processed text.");
947 huff_entities[ch].count++;
952 for (jx=0; jx<entities; jx++) {
953 if (huff_entities[jx].count) {
954 hufflist[numlive] = &(huff_entities[jx]);
959 branchstart = entities;
962 while (numlive > 1) {
964 int best1num, best2num;
967 if (hufflist[0]->count < hufflist[1]->count) {
976 best1num = hufflist[best1]->count;
977 best2num = hufflist[best2]->count;
979 for (jx=2; jx<numlive; jx++) {
980 if (hufflist[jx]->count < best1num) {
984 best1num = hufflist[best1]->count;
986 else if (hufflist[jx]->count < best2num) {
988 best2num = hufflist[best2]->count;
992 bran = &(huff_entities[branchstart+branches]);
995 bran->count = hufflist[best1]->count + hufflist[best2]->count;
996 bran->u.branch[0] = (hufflist[best1] - huff_entities);
997 bran->u.branch[1] = (hufflist[best2] - huff_entities);
998 hufflist[best1] = bran;
999 if (best2 < numlive-1) {
1000 memmove(&(hufflist[best2]), &(hufflist[best2+1]),
1001 ((numlive-1) - best2) * sizeof(huffentity_t *));
1006 huff_entity_root = (hufflist[0] - huff_entities);
1008 for (ix=0; ix<MAXHUFFBYTES; ix++)
1010 compression_table_size = 12;
1012 no_huff_entities = 0; /* compress_makebits will total this up */
1013 compress_makebits(huff_entity_root, 0, -1, &bits);
1016 /* Now, sadly, we have to compute the size of the string section,
1017 without actually doing the compression. */
1018 compression_string_size = 0;
1020 if (temporary_files_switch) {
1021 fseek(Temp1_fp, 0, SEEK_SET);
1024 if (no_strings >= MAX_NUM_STATIC_STRINGS)
1025 memoryerror("MAX_NUM_STATIC_STRINGS", MAX_NUM_STATIC_STRINGS);
1027 for (lx=0, ix=0; lx<no_strings; lx++) {
1028 int escapelen=0, escapetype=0;
1032 compressed_offsets[lx] = compression_table_size + compression_string_size;
1033 compression_string_size++; /* for the type byte */
1035 if (temporary_files_switch)
1036 ch = fgetc(Temp1_fp);
1038 ch = read_byte_from_memory_block(&static_strings_area, ix);
1040 if (ix > static_strings_extent || ch < 0)
1041 compiler_error("Read too much not-yet-compressed text.");
1042 if (escapelen == -1) {
1047 else if (ch == '0') {
1050 else if (ch == 'A' || ch == 'D' || ch == 'U') {
1057 compiler_error("Strange @ escape in processed text.");
1060 else if (escapelen) {
1061 escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
1063 if (escapelen == 0) {
1064 if (escapetype == 'A') {
1065 ch = huff_abbrev_start+escapeval;
1067 else if (escapetype == 'D') {
1068 ch = huff_dynam_start+escapeval;
1070 else if (escapetype == 'U') {
1071 ch = huff_unicode_start+escapeval;
1074 compiler_error("Strange @ escape in processed text.");
1091 if (compression_switch) {
1092 jx += huff_entities[ch].depth;
1093 compression_string_size += (jx/8);
1097 if (ch >= huff_dynam_start) {
1098 compression_string_size += 3;
1100 else if (ch >= huff_unicode_start) {
1101 compiler_error("Abbreviation/Unicode in non-compressed string \
1102 should be impossible.");
1105 compression_string_size += 1;
1108 if (compression_switch && jx)
1109 compression_string_size++;
1112 done_compression = TRUE;
1115 static void compress_makebits(int entnum, int depth, int prevbit,
1116 huffbitlist_t *bits)
1118 huffentity_t *ent = &(huff_entities[entnum]);
1122 ent->addr = compression_table_size;
1127 ent->bits.b[(depth-1) / 8] |= (1 << ((depth-1) % 8));
1130 switch (ent->type) {
1132 compression_table_size += 9;
1133 compress_makebits(ent->u.branch[0], depth+1, 0, &ent->bits);
1134 compress_makebits(ent->u.branch[1], depth+1, 1, &ent->bits);
1137 compression_table_size += 1;
1140 compression_table_size += 2;
1143 cx = (char *)abbreviations_at + ent->u.val*MAX_ABBREV_LENGTH;
1144 compression_table_size += (1 + 1 + strlen(cx));
1148 compression_table_size += 5;
1153 /* ------------------------------------------------------------------------- */
1154 /* The abbreviations optimiser */
1156 /* This is a very complex, memory and time expensive algorithm to */
1157 /* approximately solve the problem of which abbreviation strings would */
1158 /* minimise the total number of Z-chars to which the game text translates. */
1159 /* It is in some ways a quite separate program but remains inside Inform */
1160 /* for compatibility with previous releases. */
1161 /* ------------------------------------------------------------------------- */
1163 typedef struct tlb_s
1165 int32 intab, occurrences;
1168 static int32 no_occs;
1170 static int32 *grandtable;
1171 static int32 *grandflags;
1172 typedef struct optab_s
1177 char text[MAX_ABBREV_LENGTH];
1179 static optab *bestyet, *bestyet2;
1183 static char *sub_buffer;
1185 static void optimise_pass(void)
1186 { int32 i; int t1, t2;
1187 int32 j, j2, k, nl, matches, noflags, score, min, minat=0, x, scrabble, c;
1188 for (i=0; i<256; i++) bestyet[i].length=0;
1189 for (i=0; i<no_occs; i++)
1190 { if ((*(tlbtab[i].text)!=(int) '\n')&&(tlbtab[i].occurrences!=0))
1193 if (i%((**g_pm_hndl).linespercheck) == 0)
1194 { ProcessEvents (&g_proc);
1198 my_free(&all_text,"transcription text");
1199 longjmp (g_fallback, 1);
1203 printf("Pass %d, %4ld/%ld '%s' (%ld occurrences) ",
1204 pass_no, (long int) i, (long int) no_occs, tlbtab[i].text,
1205 (long int) tlbtab[i].occurrences);
1207 for (j=0; j<tlbtab[i].occurrences; j++)
1208 { for (j2=0; j2<tlbtab[i].occurrences; j2++) grandflags[j2]=1;
1209 nl=2; noflags=tlbtab[i].occurrences;
1210 while ((noflags>=2)&&(nl<=62))
1212 for (j2=0; j2<nl; j2++)
1213 if (all_text[grandtable[tlbtab[i].intab+j]+j2]=='\n')
1216 for (j2=j; j2<tlbtab[i].occurrences; j2++)
1217 { if (grandflags[j2]==1)
1218 { x=grandtable[tlbtab[i].intab+j2]
1219 - grandtable[tlbtab[i].intab+j];
1220 if (((x>-nl)&&(x<nl))
1221 || (memcmp(all_text+grandtable[tlbtab[i].intab+j],
1222 all_text+grandtable[tlbtab[i].intab+j2],
1224 { grandflags[j2]=0; noflags--; }
1229 for (k=0; k<nl; k++)
1231 c=all_text[grandtable[tlbtab[i].intab+j+k]];
1233 { if (iso_to_alphabet_grid[c]<0)
1236 if (iso_to_alphabet_grid[c]>=26)
1240 score=(matches-1)*(scrabble-2);
1242 for (j2=0; j2<256; j2++)
1243 { if ((nl==bestyet[j2].length)
1244 && (memcmp(all_text+bestyet[j2].location,
1245 all_text+grandtable[tlbtab[i].intab+j],
1247 { j2=256; min=score; }
1249 { if (bestyet[j2].score<min)
1250 { min=bestyet[j2].score; minat=j2;
1255 { bestyet[minat].score=score;
1256 bestyet[minat].length=nl;
1257 bestyet[minat].location=grandtable[tlbtab[i].intab+j];
1258 bestyet[minat].popularity=matches;
1259 for (j2=0; j2<nl; j2++) sub_buffer[j2]=
1260 all_text[bestyet[minat].location+j2];
1266 t2=((int) time(0)) - t1;
1267 printf(" (%d seconds)\n",t2);
1272 static int any_overlap(char *s1, char *s2)
1273 { int a, b, i, j, flag;
1274 a=strlen(s1); b=strlen(s2);
1275 for (i=1-b; i<a; i++)
1278 if ((0<=i+j)&&(i+j<=a-1))
1279 if (s1[i+j]!=s2[j]) flag=1;
1280 if (flag==0) return(1);
1285 #define MAX_TLBS 8000
1287 extern void optimise_abbreviations(void)
1288 { int32 i, j, t, max=0, MAX_GTABLE;
1289 int32 j2, selected, available, maxat=0, nl;
1292 printf("Beginning calculation of optimal abbreviations...\n");
1295 tlbtab=my_calloc(sizeof(tlb), MAX_TLBS, "tlb table"); no_occs=0;
1296 sub_buffer=my_calloc(sizeof(char), 4000, "sub_buffer");
1297 for (i=0; i<MAX_TLBS; i++) tlbtab[i].occurrences=0;
1299 bestyet=my_calloc(sizeof(optab), 256, "bestyet");
1300 bestyet2=my_calloc(sizeof(optab), 64, "bestyet2");
1302 bestyet2[0].text[0]='.';
1303 bestyet2[0].text[1]=' ';
1304 bestyet2[0].text[2]=0;
1306 bestyet2[1].text[0]=',';
1307 bestyet2[1].text[1]=' ';
1308 bestyet2[1].text[2]=0;
1310 for (i=0; all_text+i<all_text_top; i++)
1312 if ((all_text[i]=='.') && (all_text[i+1]==' ') && (all_text[i+2]==' '))
1313 { all_text[i]='\n'; all_text[i+1]='\n'; all_text[i+2]='\n';
1314 bestyet2[0].popularity++;
1317 if ((all_text[i]=='.') && (all_text[i+1]==' '))
1318 { all_text[i]='\n'; all_text[i+1]='\n';
1319 bestyet2[0].popularity++;
1322 if ((all_text[i]==',') && (all_text[i+1]==' '))
1323 { all_text[i]='\n'; all_text[i+1]='\n';
1324 bestyet2[1].popularity++;
1328 MAX_GTABLE=subtract_pointers(all_text_top,all_text)+1;
1329 grandtable=my_calloc(4*sizeof(int32), MAX_GTABLE/4, "grandtable");
1331 for (i=0, t=0; all_text+i<all_text_top; i++)
1332 { test.text[0]=all_text[i];
1333 test.text[1]=all_text[i+1];
1334 test.text[2]=all_text[i+2];
1336 if ((test.text[0]=='\n')||(test.text[1]=='\n')||(test.text[2]=='\n'))
1338 for (j=0; j<no_occs; j++)
1339 if (strcmp(test.text,tlbtab[j].text)==0)
1342 for (j=i+3; all_text+j<all_text_top; j++)
1345 if (j%((**g_pm_hndl).linespercheck) == 0)
1346 { ProcessEvents (&g_proc);
1350 my_free(&all_text,"transcription text");
1351 longjmp (g_fallback, 1);
1355 if ((all_text[i]==all_text[j])
1356 && (all_text[i+1]==all_text[j+1])
1357 && (all_text[i+2]==all_text[j+2]))
1358 { grandtable[t+test.occurrences]=j;
1360 if (t+test.occurrences==MAX_GTABLE)
1361 { printf("All %ld cross-references used\n",
1362 (long int) MAX_GTABLE);
1367 if (test.occurrences>=2)
1368 { tlbtab[no_occs]=test;
1369 tlbtab[no_occs].intab=t; t+=tlbtab[no_occs].occurrences;
1370 if (max<tlbtab[no_occs].occurrences)
1371 max=tlbtab[no_occs].occurrences;
1373 if (no_occs==MAX_TLBS)
1374 { printf("All %d three-letter-blocks used\n",
1383 grandflags=my_calloc(sizeof(int), max, "grandflags");
1386 printf("Cross-reference table (%ld entries) built...\n",
1387 (long int) no_occs);
1388 /* for (i=0; i<no_occs; i++)
1389 printf("%4d %4d '%s' %d\n",i,tlbtab[i].intab,tlbtab[i].text,
1390 tlbtab[i].occurrences);
1393 for (i=0; i<64; i++) bestyet2[i].length=0; selected=2;
1395 while ((available>0)&&(selected<64))
1396 { printf("Pass %d\n", ++pass_no);
1400 for (i=0; i<256; i++)
1401 if (bestyet[i].score!=0)
1403 nl=bestyet[i].length;
1404 for (j2=0; j2<nl; j2++) bestyet[i].text[j2]=
1405 all_text[bestyet[i].location+j2];
1406 bestyet[i].text[nl]=0;
1409 /* printf("End of pass results:\n");
1410 printf("\nno score freq string\n");
1411 for (i=0; i<256; i++)
1412 if (bestyet[i].score>0)
1413 printf("%02d: %4d %4d '%s'\n", i, bestyet[i].score,
1414 bestyet[i].popularity, bestyet[i].text);
1419 for (i=0; i<256; i++)
1420 if (max<bestyet[i].score)
1421 { max=bestyet[i].score;
1426 { bestyet2[selected++]=bestyet[maxat];
1429 "Selection %2ld: '%s' (repeated %ld times, scoring %ld)\n",
1430 (long int) selected,bestyet[maxat].text,
1431 (long int) bestyet[maxat].popularity,
1432 (long int) bestyet[maxat].score);
1434 test.text[0]=bestyet[maxat].text[0];
1435 test.text[1]=bestyet[maxat].text[1];
1436 test.text[2]=bestyet[maxat].text[2];
1439 for (i=0; i<no_occs; i++)
1440 if (strcmp(test.text,tlbtab[i].text)==0)
1443 for (j=0; j<tlbtab[i].occurrences; j++)
1444 { if (memcmp(bestyet[maxat].text,
1445 all_text+grandtable[tlbtab[i].intab+j],
1446 bestyet[maxat].length)==0)
1447 { for (j2=0; j2<bestyet[maxat].length; j2++)
1448 all_text[grandtable[tlbtab[i].intab+j]+j2]='\n';
1452 for (i=0; i<256; i++)
1453 if ((bestyet[i].score>0)&&
1454 (any_overlap(bestyet[maxat].text,bestyet[i].text)==1))
1455 { bestyet[i].score=0;
1456 /* printf("Discarding '%s' as overlapping\n",
1457 bestyet[i].text); */
1460 } while ((max>0)&&(available>0)&&(selected<64));
1463 printf("\nChosen abbreviations (in Inform syntax):\n\n");
1464 for (i=0; i<selected; i++)
1465 printf("Abbreviate \"%s\";\n", bestyet2[i].text);
1470 /* ------------------------------------------------------------------------- */
1471 /* The dictionary manager begins here. */
1473 /* Speed is extremely important in these algorithms. If a linear-time */
1474 /* routine were used to search the dictionary words so far built up, then */
1475 /* Inform would crawl. */
1477 /* Instead, the dictionary is stored as a binary tree, which is kept */
1478 /* balanced with the red-black algorithm. */
1479 /* ------------------------------------------------------------------------- */
1480 /* A dictionary table similar to the Z-machine format is kept: there is a */
1481 /* 7-byte header (left blank here to be filled in at the */
1482 /* construct_storyfile() stage in "tables.c") and then a sequence of */
1483 /* records, one per word, in the form */
1485 /* <Z-coded text> <flags> <verbnumber> <adjectivenumber> */
1486 /* 4 or 6 bytes byte byte byte */
1488 /* For Glulx, the form is instead: (But see below about Unicode-valued */
1489 /* dictionaries and my heinie.) */
1491 /* <plain text> <flags> <verbnumber> <adjectivenumber> */
1492 /* DICT_WORD_SIZE short short short */
1494 /* These records are stored in "accession order" (i.e. in order of their */
1495 /* first being received by these routines) and only alphabetically sorted */
1496 /* by construct_storyfile() (using the array below). */
1497 /* ------------------------------------------------------------------------- */
1499 /* Further notes about the data fields... */
1500 /* The flags are currently: */
1501 /* bit 0: word is used as a verb (in verb grammar) */
1502 /* bit 1: word is used as a meta verb */
1503 /* bit 2: word is plural (set by '//p') */
1504 /* bit 3: word is used as a preposition (in verb grammar) */
1505 /* bit 6: set for all verbs, but not used by the parser? */
1506 /* bit 7: word is used as a noun (set for every word that appears in */
1507 /* code or in an object property) */
1509 /* In grammar version 2, the third field (adjectivenumber) is unused (and */
1512 /* The compiler generates special constants #dict_par1, #dict_par2, */
1513 /* #dict_par3 to refer to the byte offsets of the three fields. In */
1514 /* Z-code v3, these are 4/5/6; in v4+, they are 6/7/8. In Glulx, they */
1515 /* are $DICT_WORD_SIZE+2/4/6, referring to the *low* bytes of the three */
1516 /* fields. (The high bytes are $DICT_WORD_SIZE+1/3/5.) */
1517 /* ------------------------------------------------------------------------- */
1519 uchar *dictionary, /* (These two pointers are externally
1520 used only in "tables.c" when
1521 building the story-file) */
1522 *dictionary_top; /* Pointer to next free record */
1524 int dict_entries; /* Total number of records entered */
1526 /* ------------------------------------------------------------------------- */
1527 /* dict_word is a typedef for a struct of 6 unsigned chars (defined in */
1528 /* "header.h"): it holds the (4 or) 6 bytes of Z-coded text of a word. */
1529 /* Usefully, because the PAD character 5 is < all alphabetic characters, */
1530 /* alphabetic order corresponds to numeric order. For this reason, the */
1531 /* dict_word is called the "sort code" of the original text word. */
1533 /* ###- In modifying the compiler, I've found it easier to discard the */
1534 /* typedef, and operate directly on uchar arrays of length DICT_WORD_SIZE. */
1535 /* In Z-code, DICT_WORD_SIZE will be 6, so the Z-code compiler will work */
1536 /* as before. In Glulx, it can be any value up to MAX_DICT_WORD_SIZE. */
1537 /* (That limit is defined as 40 in the header; it exists only for a few */
1538 /* static buffers, and can be increased without using significant memory.) */
1540 /* ###- Well, that certainly bit me on the butt, didn't it. In further */
1541 /* modifying the compiler to generate a Unicode dictionary, I have to */
1542 /* store four-byte values in the uchar array. This is handled by making */
1543 /* the array size DICT_WORD_BYTES (which is DICT_WORD_SIZE*DICT_CHAR_SIZE).*/
1544 /* Then we store the 32-bit character value big-endian. This lets us */
1545 /* continue to compare arrays bytewise, which is a nice simplification. */
1546 /* ------------------------------------------------------------------------- */
1548 extern int compare_sorts(uchar *d1, uchar *d2)
1550 for (i=0; i<DICT_WORD_BYTES; i++)
1551 if (d1[i]!=d2[i]) return((int)(d1[i]) - (int)(d2[i]));
1552 /* (since memcmp(d1, d2, DICT_WORD_BYTES); runs into a bug on some Unix
1557 extern void copy_sorts(uchar *d1, uchar *d2)
1559 for (i=0; i<DICT_WORD_BYTES; i++)
1563 static uchar prepared_sort[MAX_DICT_WORD_BYTES]; /* Holds the sort code
1566 static int number_and_case;
1568 /* Also used by verbs.c */
1569 static void dictionary_prepare_z(char *dword, uchar *optresult)
1570 { int i, j, k, k2, wd[13]; int32 tot;
1572 /* A rapid text translation algorithm using only the simplified rules
1573 applying to the text of dictionary entries: first produce a sequence
1574 of 6 (v3) or 9 (v4+) Z-characters */
1576 number_and_case = 0;
1578 for (i=0, j=0; dword[j]!=0; i++, j++)
1579 { if ((dword[j] == '/') && (dword[j+1] == '/'))
1580 { for (j+=2; dword[j] != 0; j++)
1582 { case 'p': number_and_case |= 4; break;
1584 error_named("Expected 'p' after '//' \
1585 to give number of dictionary word", dword);
1595 warning_named("Obsolete usage: use the ^ character for the \
1596 apostrophe in", dword);
1597 if (k==(int) '^') k=(int) '\'';
1600 if (k==(int) '@' || (character_set_unicode && (k & 0x80)))
1601 { int unicode = text_to_unicode(dword+j);
1602 if ((unicode < 128) && isupper(unicode)) unicode = tolower(unicode);
1603 k = unicode_to_zscii(unicode);
1604 j += textual_form_length - 1;
1605 if ((k == 5) || (k >= 0x100))
1606 { unicode_char_error(
1607 "Character can be printed but not input:", unicode);
1610 k2 = zscii_to_alphabet_grid[(uchar) k];
1613 { if (isupper(k)) k = tolower(k);
1614 k2 = iso_to_alphabet_grid[(uchar) k];
1618 { if ((k2 == -5) || (k2 <= -0x100))
1619 char_error("Character can be printed but not input:", k);
1621 { /* Use 4 more Z-chars to encode a ZSCII escape sequence */
1623 wd[i++] = 5; wd[i++] = 6;
1625 wd[i++] = k2/32; wd[i] = k2%32;
1629 { alphabet_used[k2] = 'Y';
1631 wd[i++]=3+(k2/26); /* Change alphabet for symbols */
1632 wd[i]=6+(k2%26); /* Write the Z character */
1636 /* Fill up to the end of the dictionary block with PAD characters */
1638 for (; i<9; i++) wd[i]=5;
1640 /* The array of Z-chars is converted to three 2-byte blocks */
1642 tot = wd[2] + wd[1]*(1<<5) + wd[0]*(1<<10);
1643 prepared_sort[1]=tot%0x100;
1644 prepared_sort[0]=(tot/0x100)%0x100;
1645 tot = wd[5] + wd[4]*(1<<5) + wd[3]*(1<<10);
1646 prepared_sort[3]=tot%0x100;
1647 prepared_sort[2]=(tot/0x100)%0x100;
1648 tot = wd[8] + wd[7]*(1<<5) + wd[6]*(1<<10);
1649 prepared_sort[5]=tot%0x100;
1650 prepared_sort[4]=(tot/0x100)%0x100;
1652 /* Set the "end bit" on the 2nd (in v3) or the 3rd (v4+) 2-byte block */
1654 if (version_number==3) prepared_sort[2]+=0x80;
1655 else prepared_sort[4]+=0x80;
1657 if (optresult) copy_sorts(optresult, prepared_sort);
1660 /* Also used by verbs.c */
1661 static void dictionary_prepare_g(char *dword, uchar *optresult)
1666 number_and_case = 0;
1668 for (i=0, j=0; (dword[j]!=0); i++, j++) {
1669 if ((dword[j] == '/') && (dword[j+1] == '/')) {
1670 for (j+=2; dword[j] != 0; j++) {
1673 number_and_case |= 4;
1676 error_named("Expected 'p' after '//' \
1677 to give gender or number of dictionary word", dword);
1683 if (i>=DICT_WORD_SIZE) break;
1685 k= ((unsigned char *)dword)[j];
1687 warning_named("Obsolete usage: use the ^ character for the \
1688 apostrophe in", dword);
1691 if (k=='~') /* as in iso_to_alphabet_grid */
1694 if (k=='@' || (character_set_unicode && (k & 0x80))) {
1695 unicode = text_to_unicode(dword+j);
1696 j += textual_form_length - 1;
1699 unicode = iso_to_unicode_grid[k];
1702 if (DICT_CHAR_SIZE != 1 || (unicode >= 0 && unicode < 256)) {
1706 error("The dictionary cannot contain Unicode characters beyond Latin-1. \
1707 Define DICT_CHAR_SIZE=4 for a Unicode-compatible dictionary.");
1711 if (k >= (unsigned)'A' && k <= (unsigned)'Z')
1714 if (DICT_CHAR_SIZE == 1) {
1715 prepared_sort[i] = k;
1718 prepared_sort[4*i] = (k >> 24) & 0xFF;
1719 prepared_sort[4*i+1] = (k >> 16) & 0xFF;
1720 prepared_sort[4*i+2] = (k >> 8) & 0xFF;
1721 prepared_sort[4*i+3] = (k) & 0xFF;
1725 if (DICT_CHAR_SIZE == 1) {
1726 for (; i<DICT_WORD_SIZE; i++)
1727 prepared_sort[i] = 0;
1730 for (; i<DICT_WORD_SIZE; i++) {
1731 prepared_sort[4*i] = 0;
1732 prepared_sort[4*i+1] = 0;
1733 prepared_sort[4*i+2] = 0;
1734 prepared_sort[4*i+3] = 0;
1738 if (optresult) copy_sorts(optresult, prepared_sort);
1741 extern void dictionary_prepare(char *dword, uchar *optresult)
1744 dictionary_prepare_z(dword, optresult);
1746 dictionary_prepare_g(dword, optresult);
1749 /* ------------------------------------------------------------------------- */
1750 /* The arrays below are all concerned with the problem of alphabetically */
1751 /* sorting the dictionary during the compilation pass. */
1752 /* Note that it is not enough simply to apply qsort to the dictionary at */
1753 /* the end of the pass: we need to ensure that no duplicates are ever */
1756 /* dict_sort_codes[n] the sort code of record n: i.e., of the nth */
1757 /* word to be entered into the dictionary, where */
1758 /* n counts upward from 0 */
1759 /* (n is also called the "accession number") */
1761 /* The tree structure encodes an ordering. The special value VACANT means */
1762 /* "no node here": otherwise, node numbers are the same as accession */
1763 /* numbers. At all times, "root" holds the node number of the top of the */
1764 /* tree; each node has up to two branches, such that the subtree of the */
1765 /* left branch is always alphabetically before what's at the node, and */
1766 /* the subtree to the right is always after; and all branches are coloured */
1767 /* either "black" or "red". These colours are used to detect points where */
1768 /* the tree is growing asymmetrically (and therefore becoming inefficient */
1770 /* ------------------------------------------------------------------------- */
1777 typedef struct dict_tree_node_s
1778 { int branch[2]; /* Branch 0 is "left", 1 is "right" */
1779 char colour; /* The colour of the branch to the parent */
1782 static dict_tree_node *dtree;
1784 int *final_dict_order;
1785 static uchar *dict_sort_codes;
1787 static void dictionary_begin_pass(void)
1789 /* Leave room for the 7-byte header (added in "tables.c" much later) */
1790 /* Glulx has a 4-byte header instead. */
1793 dictionary_top=dictionary+7;
1795 dictionary_top=dictionary+4;
1801 static int fdo_count;
1802 static void recursively_sort(int node)
1803 { if (dtree[node].branch[0] != VACANT)
1804 recursively_sort(dtree[node].branch[0]);
1805 final_dict_order[node] = fdo_count++;
1806 if (dtree[node].branch[1] != VACANT)
1807 recursively_sort(dtree[node].branch[1]);
1810 extern void sort_dictionary(void)
1813 { for (i=0; i<dict_entries; i++)
1814 final_dict_order[i] = i;
1819 { fdo_count = 0; recursively_sort(root);
1823 /* ------------------------------------------------------------------------- */
1824 /* If "dword" is in the dictionary, return its accession number plus 1; */
1825 /* If not, return 0. */
1826 /* ------------------------------------------------------------------------- */
1828 static int dictionary_find(char *dword)
1831 dictionary_prepare(dword, NULL);
1833 while (at != VACANT)
1834 { n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
1835 if (n==0) return at + 1;
1836 if (n>0) at = dtree[at].branch[1]; else at = dtree[at].branch[0];
1841 /* ------------------------------------------------------------------------- */
1842 /* Add "dword" to the dictionary with (x,y,z) as its data fields; unless */
1843 /* it already exists, in which case OR the data with (x,y,z) */
1845 /* These fields are one byte each in Z-code, two bytes each in Glulx. */
1847 /* Returns: the accession number. */
1848 /* ------------------------------------------------------------------------- */
1850 extern int dictionary_add(char *dword, int x, int y, int z)
1852 int ggfr = 0, gfr = 0, fr = 0, r = 0;
1853 int ggf = VACANT, gf = VACANT, f = VACANT, at = root;
1855 int res=((version_number==3)?4:6);
1857 dictionary_prepare(dword, NULL);
1860 { root = 0; goto CreateEntry;
1864 n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
1868 p = dictionary+7 + at*(3+res) + res;
1869 p[0]=(p[0])|x; p[1]=(p[1])|y; p[2]=(p[2])|z;
1870 if (x & 128) p[0] = (p[0])|number_and_case;
1873 p = dictionary+4 + at*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
1874 p[0]=(p[0])|(x/256); p[1]=(p[1])|(x%256);
1875 p[2]=(p[2])|(y/256); p[3]=(p[3])|(y%256);
1876 p[4]=(p[4])|(z/256); p[5]=(p[5])|(z%256);
1877 if (x & 128) p[1] = (p[1]) | number_and_case;
1881 if (n>0) r=1; else r=0;
1883 a = dtree[at].branch[0]; b = dtree[at].branch[1];
1884 if ((a != VACANT) && (dtree[a].colour == RED) &&
1885 (b != VACANT) && (dtree[b].colour == RED))
1886 { dtree[a].colour = BLACK;
1887 dtree[b].colour = BLACK;
1889 dtree[at].colour = RED;
1891 /* A tree rotation may be needed to avoid two red links in a row:
1893 ggf (or else gf is root) ggf (or f is root)
1896 / \(red) / \ (both red)
1902 In effect we rehang the "gf" subtree from "f".
1903 See the Technical Manual for further details.
1906 if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
1909 { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
1910 dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
1911 dtree[f].branch[1-fr] = gf;
1912 dtree[f].colour = BLACK;
1913 dtree[gf].colour = RED;
1914 gf = ggf; gfr = ggfr;
1917 { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
1918 dtree[at].colour = BLACK;
1919 dtree[gf].colour = RED;
1920 dtree[f].branch[fr] = dtree[at].branch[gfr];
1921 dtree[gf].branch[gfr] = dtree[at].branch[fr];
1922 dtree[at].branch[gfr] = f;
1923 dtree[at].branch[fr] = gf;
1925 r = 1-r; n = at; if (r==fr) at = f; else at = gf;
1926 f = n; gf = ggf; fr = 1-r; gfr = ggfr;
1931 if (dtree[at].branch[r] == VACANT)
1932 { dtree[at].colour = RED;
1934 if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
1936 { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
1937 dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
1938 dtree[f].branch[1-fr] = gf;
1939 dtree[f].colour = BLACK;
1940 dtree[gf].colour = RED;
1943 { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
1944 dtree[at].colour = BLACK;
1945 dtree[gf].colour = RED;
1946 dtree[f].branch[fr] = dtree[at].branch[gfr];
1947 dtree[gf].branch[gfr] = dtree[at].branch[fr];
1948 dtree[at].branch[gfr] = f;
1949 dtree[at].branch[fr] = gf;
1951 r = 1-r; n = at; if (r==fr) at = f; else at = gf;
1955 dtree[at].branch[r] = dict_entries;
1958 ggf = gf; gf = f; f = at; at = dtree[at].branch[r];
1959 ggfr = gfr; gfr = fr; fr = r;
1964 if (dict_entries==MAX_DICT_ENTRIES)
1965 memoryerror("MAX_DICT_ENTRIES",MAX_DICT_ENTRIES);
1967 dtree[dict_entries].branch[0] = VACANT;
1968 dtree[dict_entries].branch[1] = VACANT;
1969 dtree[dict_entries].colour = BLACK;
1971 /* Address in Inform's own dictionary table to write the record to */
1975 p = dictionary + (3+res)*dict_entries + 7;
1977 /* So copy in the 4 (or 6) bytes of Z-coded text and the 3 data
1980 p[0]=prepared_sort[0]; p[1]=prepared_sort[1];
1981 p[2]=prepared_sort[2]; p[3]=prepared_sort[3];
1982 if (version_number > 3)
1983 { p[4]=prepared_sort[4]; p[5]=prepared_sort[5]; }
1984 p[res]=x; p[res+1]=y; p[res+2]=z;
1985 if (x & 128) p[res] = (p[res])|number_and_case;
1987 dictionary_top += res+3;
1992 p = dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*dict_entries;
1993 p[0] = 0x60; /* type byte -- dict word */
1995 p += DICT_CHAR_SIZE;
1996 for (i=0; i<DICT_WORD_BYTES; i++)
1997 p[i] = prepared_sort[i];
1999 p += DICT_WORD_BYTES;
2001 p[2] = y/256; p[3] = y%256;
2004 p[1] |= number_and_case;
2006 dictionary_top += DICT_ENTRY_BYTE_LENGTH;
2010 copy_sorts(dict_sort_codes+dict_entries*DICT_WORD_BYTES, prepared_sort);
2012 return dict_entries++;
2015 /* ------------------------------------------------------------------------- */
2016 /* Used in "tables.c" for "Extend ... only", to renumber a verb-word to a */
2017 /* new verb syntax of its own. (Otherwise existing verb-words never */
2018 /* change their verb-numbers.) */
2019 /* ------------------------------------------------------------------------- */
2021 extern void dictionary_set_verb_number(char *dword, int to)
2023 int res=((version_number==3)?4:6);
2024 i=dictionary_find(dword);
2028 p=dictionary+7+(i-1)*(3+res)+res;
2032 p=dictionary+4 + (i-1)*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
2033 p[2]=to/256; p[3]=to%256;
2038 /* ------------------------------------------------------------------------- */
2039 /* Tracing code for the dictionary: used not only by "trace" and text */
2040 /* transcription, but also (in the case of "word_to_ascii") in a vital */
2041 /* by the linker. */
2042 /* ------------------------------------------------------------------------- */
2044 static char *d_show_to;
2045 static int d_show_total;
2047 static void show_char(char c)
2048 { if (d_show_to == NULL) printf("%c", c);
2050 { int i = strlen(d_show_to);
2051 d_show_to[i] = c; d_show_to[i+1] = 0;
2055 extern void word_to_ascii(uchar *p, char *results)
2056 { int i, shift, cc, zchar; uchar encoded_word[9];
2057 encoded_word[0] = (((int) p[0])&0x7c)/4;
2058 encoded_word[1] = 8*(((int) p[0])&0x3) + (((int) p[1])&0xe0)/32;
2059 encoded_word[2] = ((int) p[1])&0x1f;
2060 encoded_word[3] = (((int) p[2])&0x7c)/4;
2061 encoded_word[4] = 8*(((int) p[2])&0x3) + (((int) p[3])&0xe0)/32;
2062 encoded_word[5] = ((int) p[3])&0x1f;
2063 if (version_number > 3)
2064 { encoded_word[6] = (((int) p[4])&0x7c)/4;
2065 encoded_word[7] = 8*(((int) p[4])&0x3) + (((int) p[5])&0xe0)/32;
2066 encoded_word[8] = ((int) p[5])&0x1f;
2070 for (i=0; i< ((version_number==3)?6:9); i++)
2071 { zchar = encoded_word[i];
2073 if (zchar == 4) shift = 1;
2075 if (zchar == 5) shift = 2;
2077 { if ((shift == 2) && (zchar == 6))
2078 { zchar = 32*encoded_word[i+1] + encoded_word[i+2];
2080 if ((zchar>=32) && (zchar<=126))
2081 results[cc++] = zchar;
2083 { zscii_to_text(results+cc, zchar);
2084 cc = strlen(results);
2088 { zscii_to_text(results+cc, (alphabet[shift])[zchar-6]);
2089 cc = strlen(results);
2097 static void recursively_show_z(int node)
2098 { int i, cprinted, flags; uchar *p;
2099 char textual_form[32];
2100 int res = (version_number == 3)?4:6;
2102 if (dtree[node].branch[0] != VACANT)
2103 recursively_show_z(dtree[node].branch[0]);
2105 p = (uchar *)dictionary + 7 + (3+res)*node;
2107 word_to_ascii(p, textual_form);
2109 for (cprinted = 0; textual_form[cprinted]!=0; cprinted++)
2110 show_char(textual_form[cprinted]);
2111 for (; cprinted < 4 + ((version_number==3)?6:9); cprinted++)
2114 if (d_show_to == NULL)
2115 { for (i=0; i<3+res; i++) printf("%02x ",p[i]);
2117 flags = (int) p[res];
2120 if (flags & 4) printf("p"); else printf(" ");
2125 { if (grammar_version_number == 1)
2126 printf("preposition:%d ", (int) p[res+2]);
2128 printf("preposition ");
2130 if ((flags & 3) == 3) printf("metaverb:%d ", (int) p[res+1]);
2131 else if ((flags & 3) == 1) printf("verb:%d ", (int) p[res+1]);
2135 if (d_show_total++ == 5)
2137 if (d_show_to != NULL)
2138 { write_to_transcript_file(d_show_to);
2143 if (dtree[node].branch[1] != VACANT)
2144 recursively_show_z(dtree[node].branch[1]);
2147 static void recursively_show_g(int node)
2149 warning("### Glulx dictionary-show not yet implemented.\n");
2152 static void show_alphabet(int i)
2153 { int j, c; char chartext[8];
2155 for (j=0; j<26; j++)
2156 { c = alphabet[i][j];
2158 if (alphabet_used[26*i+j] == 'N') printf("("); else printf(" ");
2160 zscii_to_text(chartext, c);
2161 printf("%s", chartext);
2163 if (alphabet_used[26*i+j] == 'N') printf(")"); else printf(" ");
2168 extern void show_dictionary(void)
2169 { printf("Dictionary contains %d entries:\n",dict_entries);
2170 if (dict_entries != 0)
2171 { d_show_total = 0; d_show_to = NULL;
2173 recursively_show_z(root);
2175 recursively_show_g(root);
2177 printf("\nZ-machine alphabet entries:\n");
2183 extern void write_dictionary_to_transcript(void)
2184 { char d_buffer[81];
2186 sprintf(d_buffer, "\n[Dictionary contains %d entries:]\n", dict_entries);
2188 d_buffer[0] = 0; write_to_transcript_file(d_buffer);
2190 if (dict_entries != 0)
2191 { d_show_total = 0; d_show_to = d_buffer;
2193 recursively_show_z(root);
2195 recursively_show_g(root);
2197 if (d_show_total != 0) write_to_transcript_file(d_buffer);
2200 /* ========================================================================= */
2201 /* Data structure management routines */
2202 /* ------------------------------------------------------------------------- */
2204 extern void init_text_vars(void)
2211 no_chars_transcribed = 0;
2212 is_abbreviation = FALSE;
2213 put_strings_in_low_memory = FALSE;
2215 for (j=0; j<256; j++) abbrevs_lookup[j] = -1;
2217 total_zchars_trans = 0;
2220 final_dict_order = NULL;
2221 dict_sort_codes = NULL;
2224 initialise_memory_block(&static_strings_area);
2227 extern void text_begin_pass(void)
2228 { abbrevs_lookup_table_made = FALSE;
2230 total_chars_trans=0; total_bytes_trans=0;
2231 if (store_the_text) all_text_top=all_text;
2232 dictionary_begin_pass();
2233 low_strings_top = low_strings;
2235 static_strings_extent = 0;
2237 no_dynamic_strings = 0;
2238 no_unicode_chars = 0;
2241 /* Note: for allocation and deallocation of all_the_text, see inform.c */
2243 extern void text_allocate_arrays(void)
2244 { abbreviations_at = my_malloc(MAX_ABBREVS*MAX_ABBREV_LENGTH,
2246 abbrev_values = my_calloc(sizeof(int), MAX_ABBREVS, "abbrev values");
2247 abbrev_quality = my_calloc(sizeof(int), MAX_ABBREVS, "abbrev quality");
2248 abbrev_freqs = my_calloc(sizeof(int), MAX_ABBREVS, "abbrev freqs");
2250 dtree = my_calloc(sizeof(dict_tree_node), MAX_DICT_ENTRIES,
2251 "red-black tree for dictionary");
2252 final_dict_order = my_calloc(sizeof(int), MAX_DICT_ENTRIES,
2253 "final dictionary ordering table");
2254 dict_sort_codes = my_calloc(DICT_WORD_BYTES, MAX_DICT_ENTRIES,
2255 "dictionary sort codes");
2258 dictionary = my_malloc(9*MAX_DICT_ENTRIES+7,
2261 dictionary = my_malloc(DICT_ENTRY_BYTE_LENGTH*MAX_DICT_ENTRIES+4,
2264 strings_holding_area
2265 = my_malloc(MAX_STATIC_STRINGS,"static strings holding area");
2266 low_strings = my_malloc(MAX_LOW_STRINGS,"low (abbreviation) strings");
2268 huff_entities = NULL;
2270 unicode_usage_entries = NULL;
2271 done_compression = FALSE;
2272 compression_table_size = 0;
2273 compressed_offsets = NULL;
2275 MAX_CHARACTER_SET = 0;
2278 if (compression_switch) {
2280 MAX_CHARACTER_SET = 257 + MAX_ABBREVS + MAX_DYNAMIC_STRINGS
2281 + MAX_UNICODE_CHARS;
2282 huff_entities = my_calloc(sizeof(huffentity_t), MAX_CHARACTER_SET*2+1,
2283 "huffman entities");
2284 hufflist = my_calloc(sizeof(huffentity_t *), MAX_CHARACTER_SET,
2285 "huffman node list");
2286 unicode_usage_entries = my_calloc(sizeof(unicode_usage_t),
2287 MAX_UNICODE_CHARS, "unicode entity entries");
2288 for (ix=0; ix<UNICODE_HASH_BUCKETS; ix++)
2289 unicode_usage_hash[ix] = NULL;
2291 compressed_offsets = my_calloc(sizeof(int32), MAX_NUM_STATIC_STRINGS,
2292 "static strings index table");
2296 extern void text_free_arrays(void)
2298 my_free(&strings_holding_area, "static strings holding area");
2299 my_free(&low_strings, "low (abbreviation) strings");
2300 my_free(&abbreviations_at, "abbreviations");
2301 my_free(&abbrev_values, "abbrev values");
2302 my_free(&abbrev_quality, "abbrev quality");
2303 my_free(&abbrev_freqs, "abbrev freqs");
2305 my_free(&dtree, "red-black tree for dictionary");
2306 my_free(&final_dict_order, "final dictionary ordering table");
2307 my_free(&dict_sort_codes, "dictionary sort codes");
2309 my_free(&dictionary,"dictionary");
2311 my_free(&compressed_offsets, "static strings index table");
2312 my_free(&hufflist, "huffman node list");
2313 my_free(&huff_entities, "huffman entities");
2314 my_free(&unicode_usage_entries, "unicode entity entities");
2316 deallocate_memory_block(&static_strings_area);
2319 extern void ao_free_arrays(void)
2320 { my_free (&tlbtab,"tlb table");
2321 my_free (&sub_buffer,"sub_buffer");
2322 my_free (&bestyet,"bestyet");
2323 my_free (&bestyet2,"bestyet2");
2324 my_free (&grandtable,"grandtable");
2325 my_free (&grandflags,"grandflags");
2328 /* ========================================================================= */