1 /* ------------------------------------------------------------------------- */
2 /* "text" : Text translation, the abbreviations optimiser, the dictionary */
4 /* Part of Inform 6.40 */
5 /* copyright (c) Graham Nelson 1993 - 2022 */
7 /* Inform is free software: you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation, either version 3 of the License, or */
10 /* (at your option) any later version. */
12 /* Inform is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with Inform. If not, see https://gnu.org/licenses/ */
20 /* ------------------------------------------------------------------------- */
24 uchar *low_strings; /* Allocated to low_strings_top */
25 int32 low_strings_top;
26 static memory_list low_strings_memlist;
28 int32 static_strings_extent; /* Number of bytes of static strings
30 uchar *static_strings_area; /* Used to hold the static strings
32 Allocated to static_strings_extent */
33 memory_list static_strings_area_memlist;
35 static char *all_text; /* Text buffer holding the entire text
36 of the game, when it is being
38 (Allocated to all_text_top) */
39 static memory_list all_text_memlist;
40 static int32 all_text_top;
42 int abbrevs_lookup_table_made, /* The abbreviations lookup table is
43 constructed when the first non-
44 abbreviation string is translated:
45 this flag is TRUE after that */
46 abbrevs_lookup[256]; /* Once this has been constructed,
47 abbrevs_lookup[n] = the smallest
48 number of any abbreviation beginning
49 with ASCII character n, or -1
50 if none of the abbreviations do */
51 int no_abbreviations; /* No of abbreviations defined so far */
52 /* ------------------------------------------------------------------------- */
53 /* Glulx string compression storage */
54 /* ------------------------------------------------------------------------- */
56 int no_strings; /* No of strings in static strings
58 int no_dynamic_strings; /* No. of @.. string escapes used
59 (actually, the highest value used
61 int no_unicode_chars; /* Number of distinct Unicode chars
62 used. (Beyond 0xFF.) */
64 huffentity_t *huff_entities; /* The list of entities (characters,
65 abbreviations, @.. escapes, and
67 static huffentity_t **hufflist; /* Copy of the list, for sorting */
69 int no_huff_entities; /* The number of entities in the list */
70 int huff_unicode_start; /* Position in the list where Unicode
72 int huff_abbrev_start; /* Position in the list where string
73 abbreviations begin. */
74 int huff_dynam_start; /* Position in the list where @..
76 int huff_entity_root; /* The position in the list of the root
77 entry (when considering the table
80 int done_compression; /* Has the game text been compressed? */
81 int32 compression_table_size; /* Length of the Huffman table, in
83 int32 compression_string_size; /* Length of the compressed string
85 int32 *compressed_offsets; /* The beginning of every string in
86 the game, relative to the beginning
87 of the Huffman table. (So entry 0
88 is equal to compression_table_size).
89 Allocated to no_strings at
90 compress_game_text() time. */
91 static memory_list compressed_offsets_memlist;
93 unicode_usage_t *unicode_usage_entries; /* Allocated to no_unicode_chars */
94 static memory_list unicode_usage_entries_memlist;
96 #define UNICODE_HASH_BUCKETS (64)
97 static int unicode_usage_hash[UNICODE_HASH_BUCKETS];
99 static int unicode_entity_index(int32 unicode);
101 /* ------------------------------------------------------------------------- */
102 /* Abbreviation arrays */
103 /* ------------------------------------------------------------------------- */
105 abbreviation *abbreviations; /* Allocated up to no_abbreviations */
106 static memory_list abbreviations_memlist;
108 /* Memory to hold the text of any abbreviation strings declared. This is
109 counted in units of MAX_ABBREV_LENGTH bytes. (An abbreviation must fit
110 in that many bytes, null included.) */
111 uchar *abbreviations_at; /* Allocated up to no_abbreviations */
112 static memory_list abbreviations_at_memlist;
114 static int *abbreviations_optimal_parse_schedule;
115 static memory_list abbreviations_optimal_parse_schedule_memlist;
117 static int *abbreviations_optimal_parse_scores;
118 static memory_list abbreviations_optimal_parse_scores_memlist;
120 /* ------------------------------------------------------------------------- */
122 int32 total_chars_trans, /* Number of ASCII chars of text in */
123 total_bytes_trans, /* Number of bytes of Z-code text out */
124 zchars_trans_in_last_string; /* Number of Z-chars in last string:
125 needed only for abbrev efficiency
126 calculation in "directs.c" */
127 static int32 total_zchars_trans; /* Number of Z-chars of text out
128 (only used to calculate the above) */
130 static int zchars_out_buffer[3], /* During text translation, a buffer of
131 3 Z-chars at a time: when it's full
132 these are written as a 2-byte word */
133 zob_index; /* Index (0 to 2) into it */
135 uchar *translated_text; /* Area holding translated strings
136 until they are moved into the
137 static_strings_area below */
138 static memory_list translated_text_memlist;
140 static int32 text_out_pos; /* The "program counter" during text
141 translation: the next position to
142 write Z-coded text output to */
144 static int32 text_out_limit; /* The upper limit of text_out_pos
145 during text translation (or -1
148 static int text_out_overflow; /* During text translation, becomes
149 true if text_out_pos tries to pass
152 /* ------------------------------------------------------------------------- */
153 /* For variables/arrays used by the dictionary manager, see below */
154 /* ------------------------------------------------------------------------- */
156 /* ------------------------------------------------------------------------- */
157 /* Prepare the abbreviations lookup table (used to speed up abbreviation */
158 /* detection in text translation). We first bubble-sort the abbrevs into */
159 /* alphabetical order (this is necessary for the detection algorithm to */
160 /* to work). Since the table is only prepared once, and for a table */
161 /* of size at most 96, there's no point using an efficient sort algorithm. */
162 /* ------------------------------------------------------------------------- */
164 static void make_abbrevs_lookup(void)
165 { int bubble_sort, j, k, l; char p[MAX_ABBREV_LENGTH]; char *p1, *p2;
167 { bubble_sort = FALSE;
168 for (j=0; j<no_abbreviations; j++)
169 for (k=j+1; k<no_abbreviations; k++)
170 { p1=(char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
171 p2=(char *)abbreviations_at+k*MAX_ABBREV_LENGTH;
173 { strcpy(p,p1); strcpy(p1,p2); strcpy(p2,p);
174 l=abbreviations[j].value; abbreviations[j].value=abbreviations[k].value;
175 abbreviations[k].value=l;
176 l=abbreviations[j].quality; abbreviations[j].quality=abbreviations[k].quality;
177 abbreviations[k].quality=l;
181 } while (bubble_sort);
183 for (j=no_abbreviations-1; j>=0; j--)
184 { p1=(char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
185 abbrevs_lookup[(uchar)p1[0]]=j;
186 abbreviations[j].freq=0;
188 abbrevs_lookup_table_made = TRUE;
191 /* ------------------------------------------------------------------------- */
192 /* Search the abbreviations lookup table (a routine which must be fast). */
193 /* The source text to compare is text[i], text[i+1], ... and this routine */
194 /* is only called if text[i] is indeed the first character of at least one */
195 /* abbreviation, "from" begin the least index into the abbreviations table */
196 /* of an abbreviation for which text[i] is the first character. Recall */
197 /* that the abbrevs table is in alphabetical order. */
199 /* The return value is -1 if there is no match. If there is a match, the */
200 /* text to be abbreviated out is over-written by a string of null chars */
201 /* with "ASCII" value 1, and the abbreviation number is returned. */
203 /* In Glulx, we *do not* do this overwriting with 1's. */
204 /* ------------------------------------------------------------------------- */
206 static int try_abbreviations_from(unsigned char *text, int i, int from)
207 { int j, k; uchar *p, c;
209 for (j=from, p=(uchar *)abbreviations_at+from*MAX_ABBREV_LENGTH;
210 (j<no_abbreviations)&&(c==p[0]); j++, p+=MAX_ABBREV_LENGTH)
211 { if (text[i+1]==p[1])
212 { for (k=2; p[k]!=0; k++)
213 if (text[i+k]!=p[k]) goto NotMatched;
215 for (k=0; p[k]!=0; k++) text[i+k]=1;
217 abbreviations[j].freq++;
225 extern void make_abbreviation(char *text)
227 ensure_memory_list_available(&abbreviations_memlist, no_abbreviations+1);
228 ensure_memory_list_available(&abbreviations_at_memlist, no_abbreviations+1);
230 strcpy((char *)abbreviations_at
231 + no_abbreviations*MAX_ABBREV_LENGTH, text);
233 abbreviations[no_abbreviations].value = compile_string(text, STRCTX_ABBREV);
234 abbreviations[no_abbreviations].freq = 0;
236 /* The quality is the number of Z-chars saved by using this */
237 /* abbreviation: note that it takes 2 Z-chars to print it. */
239 abbreviations[no_abbreviations].quality = zchars_trans_in_last_string - 2;
241 if (abbreviations[no_abbreviations].quality <= 0) {
242 warning_named("Abbreviation does not save any characters:", text);
248 /* ------------------------------------------------------------------------- */
249 /* The front end routine for text translation. */
250 /* strctx indicates the purpose of the string. This is mostly used for */
251 /* informational output (gametext.txt), but we treat some string contexts */
252 /* specially during compilation. */
253 /* ------------------------------------------------------------------------- */
255 extern int32 compile_string(char *b, int strctx)
260 if (execution_never_reaches_here) {
261 /* No need to put strings into gametext.txt or the static/low
263 if (strctx == STRCTX_GAME || strctx == STRCTX_GAMEOPC || strctx == STRCTX_LOWSTRING || strctx == STRCTX_INFIX) {
264 /* VENEER and VENEEROPC are only used at the translate_text level,
265 so we don't have to catch them here. */
270 /* In Z-code, abbreviations go in the low memory pool (0x100). So
271 do strings explicitly defined with the Lowstring directive.
272 (In Glulx, the in_low_memory flag is ignored.) */
273 in_low_memory = (strctx == STRCTX_ABBREV || strctx == STRCTX_LOWSTRING);
275 if (!glulx_mode && in_low_memory)
277 k = translate_text(-1, b, strctx);
279 error("text translation failed");
282 ensure_memory_list_available(&low_strings_memlist, low_strings_top+k);
283 memcpy(low_strings+low_strings_top, translated_text, k);
285 low_strings_top += k;
289 if (glulx_mode && done_compression)
290 compiler_error("Tried to add a string after compression was done.");
292 i = translate_text(-1, b, strctx);
294 error("text translation failed");
298 /* Insert null bytes as needed to ensure that the next static string */
299 /* also occurs at an address expressible as a packed address */
303 if (oddeven_packing_switch)
304 textalign = scale_factor*2;
306 textalign = scale_factor;
307 while ((i%textalign)!=0)
309 ensure_memory_list_available(&translated_text_memlist, i+2);
310 translated_text[i++] = 0;
311 translated_text[i++] = 0;
315 j = static_strings_extent;
317 ensure_memory_list_available(&static_strings_area_memlist, static_strings_extent+i);
318 for (c=translated_text; c<translated_text+i;
319 c++, static_strings_extent++)
320 static_strings_area[static_strings_extent] = *c;
323 return(j/scale_factor);
326 /* The marker value is a one-based string number. (We reserve zero
327 to mean "not a string at all". */
328 return (++no_strings);
332 /* ------------------------------------------------------------------------- */
333 /* Output a single Z-character into the buffer, and flush it if full */
334 /* ------------------------------------------------------------------------- */
336 static void write_z_char_z(int i)
339 total_zchars_trans++;
340 zchars_out_buffer[zob_index++]=(i%32);
341 if (zob_index!=3) return;
343 j= zchars_out_buffer[0]*0x0400 + zchars_out_buffer[1]*0x0020
344 + zchars_out_buffer[2];
346 if (text_out_limit >= 0) {
347 if (text_out_pos+2 > text_out_limit) {
348 text_out_overflow = TRUE;
353 ensure_memory_list_available(&translated_text_memlist, text_out_pos+2);
356 translated_text[text_out_pos++] = j/256; translated_text[text_out_pos++] = j%256;
357 total_bytes_trans+=2;
360 static void write_zscii(int zsc)
362 int lookup_value, in_alphabet;
369 if (zsc < 0x100) lookup_value = zscii_to_alphabet_grid[zsc];
371 else lookup_value = -1;
373 if (lookup_value >= 0)
374 { alphabet_used[lookup_value] = 'Y';
375 in_alphabet = lookup_value/26;
376 if (in_alphabet==1) write_z_char_z(4); /* SHIFT to A1 */
377 if (in_alphabet==2) write_z_char_z(5); /* SHIFT to A2 */
378 write_z_char_z(lookup_value%26 + 6);
381 { write_z_char_z(5); write_z_char_z(6);
382 write_z_char_z(zsc/32); write_z_char_z(zsc%32);
386 /* ------------------------------------------------------------------------- */
387 /* Finish a Z-coded string, padding out with Z-char 5s if necessary and */
388 /* setting the "end" bit on the final 2-byte word */
389 /* ------------------------------------------------------------------------- */
391 static void end_z_chars(void)
393 zchars_trans_in_last_string=total_zchars_trans-zchars_trans_in_last_string;
394 while (zob_index!=0) write_z_char_z(5);
395 if (text_out_pos < 2) {
396 /* Something went wrong. */
397 text_out_overflow = TRUE;
400 translated_text[text_out_pos-2] += 128;
403 /* Glulx handles this much more simply -- compression is done elsewhere. */
404 static void write_z_char_g(int i)
407 if (text_out_limit >= 0) {
408 if (text_out_pos+1 > text_out_limit) {
409 text_out_overflow = TRUE;
414 ensure_memory_list_available(&translated_text_memlist, text_out_pos+1);
416 total_zchars_trans++;
417 translated_text[text_out_pos++] = i;
421 /* Helper routine to compute the weight, in units, of a character handled by the Z-Machine */
422 static int zchar_weight(int c)
424 int lookup = iso_to_alphabet_grid[c];
425 if (lookup < 0) return 4;
426 if (lookup < 26) return 1;
430 /* ------------------------------------------------------------------------- */
431 /* The main routine "text.c" provides to the rest of Inform: the text */
432 /* translator. s_text is the source text and the return value is the */
433 /* number of bytes translated. */
434 /* The translated text will be stored in translated_text. */
436 /* If p_limit is >= 0, the text length will not exceed that many bytes. */
437 /* If the translation tries to overflow this boundary, the return value */
438 /* will be -1. (You should display an error and not read translated_text.) */
440 /* If p_limit is negative, any amount of text is accepted (up to int32 */
443 /* Note that the source text may be corrupted by this routine. */
444 /* ------------------------------------------------------------------------- */
446 extern int32 translate_text(int32 p_limit, char *s_text, int strctx)
447 { int i, j, k, in_alphabet, lookup_value, is_abbreviation;
448 int32 unicode; int zscii;
449 unsigned char *text_in;
452 ensure_memory_list_available(&translated_text_memlist, p_limit);
455 /* For STRCTX_ABBREV, the string being translated is itself an
456 abbreviation string, so it can't make use of abbreviations. Set
457 the is_abbreviation flag to indicate this.
458 The compiler has historically set this flag for the Lowstring
459 directive as well -- the in_low_memory and is_abbreviation flag were
460 always the same. I am preserving that convention. */
461 is_abbreviation = (strctx == STRCTX_ABBREV || strctx == STRCTX_LOWSTRING);
464 /* Cast the input and output streams to unsigned char: text_out_pos will
465 advance as bytes of Z-coded text are written, but text_in doesn't */
467 text_in = (unsigned char *) s_text;
469 text_out_limit = p_limit;
470 text_out_overflow = FALSE;
472 /* Remember the Z-chars total so that later we can subtract to find the
473 number of Z-chars translated on this string */
475 zchars_trans_in_last_string = total_zchars_trans;
477 /* Start with the Z-characters output buffer empty */
481 /* If this is the first text translated since the abbreviations were
482 declared, and if some were declared, then it's time to make the
483 lookup table for abbreviations
485 (Except: we don't if the text being translated is itself
486 the text of an abbreviation currently being defined) */
488 if ((!abbrevs_lookup_table_made) && (no_abbreviations > 0)
489 && (!is_abbreviation))
490 make_abbrevs_lookup();
492 /* If we're storing the whole game text to memory, then add this text.
493 We will put two newlines between each text and four at the very end.
494 (The optimise code does a lot of sloppy text[i+2], so the extra
495 two newlines past all_text_top are necessary.) */
497 if ((!is_abbreviation) && (store_the_text))
498 { int addlen = strlen(s_text);
499 ensure_memory_list_available(&all_text_memlist, all_text_top+addlen+5);
500 sprintf(all_text+all_text_top, "%s\n\n\n\n", s_text);
501 /* Advance past two newlines. */
502 all_text_top += (addlen+2);
505 if (transcript_switch) {
506 /* Omit veneer strings, unless we're using the new transcript format, which includes everything. */
507 if ((!veneer_mode) || TRANSCRIPT_FORMAT == 1) {
510 if (label == STRCTX_GAME)
511 label = STRCTX_VENEER;
512 else if (label == STRCTX_GAMEOPC)
513 label = STRCTX_VENEEROPC;
515 write_to_transcript_file(s_text, label);
519 /* Computing the optimal way to parse strings to insert abbreviations with dynamic programming */
520 /* (ref: R.A. Wagner , "Common phrases and minimum-space text storage", Commun. ACM, 16 (3) (1973)) */
521 /* We compute this optimal way here; it's stored in abbreviations_optimal_parse_schedule */
525 int l, min_score, from;
528 text_in_length = strlen( (char*) text_in);
529 ensure_memory_list_available(&abbreviations_optimal_parse_schedule_memlist, text_in_length);
530 ensure_memory_list_available(&abbreviations_optimal_parse_scores_memlist, text_in_length+1);
532 abbreviations_optimal_parse_scores[text_in_length] = 0;
533 for(j=text_in_length-1; j>=0; j--)
534 { /* Initial values: empty schedule, score = just write the letter without abbreviating. */
535 abbreviations_optimal_parse_schedule[j] = -1;
536 min_score = zchar_weight(text_in[j]) + abbreviations_optimal_parse_scores[j+1];
537 /* If there's an abbreviation starting with that letter... */
538 if ( (from = abbrevs_lookup[text_in[j]]) != -1)
541 /* Loop on all abbreviations starting with what is in c. */
542 for (k=from, q=(uchar *)abbreviations_at+from*MAX_ABBREV_LENGTH;
543 (k<no_abbreviations)&&(c==q[0]); k++, q+=MAX_ABBREV_LENGTH)
545 /* Let's compare; we also keep track of the length of the abbreviation. */
546 for (l=1; q[l]!=0; l++)
547 { if (text_in[j+l]!=q[l]) {goto NotMatched;}
549 /* We have a match (length l), but is it smaller in size? */
550 if (min_score > 2 + abbreviations_optimal_parse_scores[j+l])
551 { /* It is indeed smaller, so let's write it down in our schedule. */
552 min_score = 2 + abbreviations_optimal_parse_scores[j+l];
553 abbreviations_optimal_parse_schedule[j] = k;
558 /* We gave it our best, this is the smallest we got. */
559 abbreviations_optimal_parse_scores[j] = min_score;
567 /* The empty string of Z-text is illegal, since it can't carry an end
568 bit: so we translate an empty string of ASCII text to just the
569 pad character 5. Printing this causes nothing to appear on screen. */
571 if (text_in[0]==0) write_z_char_z(5);
573 /* Loop through the characters of the null-terminated input text: note
574 that if 1 is written over a character in the input text, it is
575 afterwards ignored */
577 for (i=0; text_in[i]!=0; i++)
578 { total_chars_trans++;
580 /* Contract ". " into ". " if double-space-removing switch set:
581 likewise "? " and "! " if the setting is high enough */
583 if ((double_space_setting >= 1)
584 && (text_in[i+1]==' ') && (text_in[i+2]==' '))
585 { if (text_in[i]=='.') text_in[i+2]=1;
586 if (double_space_setting >= 2)
587 { if (text_in[i]=='?') text_in[i+2]=1;
588 if (text_in[i]=='!') text_in[i+2]=1;
592 /* Try abbreviations if the economy switch set. */
593 /* Look at the abbreviation schedule to see if we should abbreviate here. */
594 /* Note: Just because the schedule has something doesn't mean we should abbreviate there; */
595 /* sometimes you abbreviate before because it's better. If we have already replaced the */
596 /* char by a '1', it means we're in the middle of an abbreviation; don't try to abbreviate then. */
597 if ((economy_switch) && (!is_abbreviation) && text_in[i] != 1 &&
598 ((j = abbreviations_optimal_parse_schedule[i]) != -1))
600 /* Fill with 1s, which will get ignored by everyone else. */
601 uchar *p = (uchar *)abbreviations_at+j*MAX_ABBREV_LENGTH;
602 for (k=0; p[k]!=0; k++) text_in[i+k]=1;
603 /* Actually write the abbreviation in the story file. */
604 abbreviations[j].freq++;
605 /* Abbreviations run from MAX_DYNAMIC_STRINGS to 96. */
606 j += MAX_DYNAMIC_STRINGS;
607 write_z_char_z(j/32+1); write_z_char_z(j%32);
611 /* If Unicode switch set, use text_to_unicode to perform UTF-8
613 if (character_set_unicode && (text_in[i] & 0x80))
614 { unicode = text_to_unicode((char *) (text_in+i));
615 zscii = unicode_to_zscii(unicode);
616 if (zscii != 5) write_zscii(zscii);
618 { unicode_char_error(
619 "Character can only be used if declared in \
620 advance as part of 'Zcharacter table':", unicode);
622 i += textual_form_length - 1;
626 /* '@' is the escape character in Inform string notation: the various
629 @@decimalnumber : write this ZSCII char (0 to 1023)
630 @twodigits or : write the abbreviation string with this
631 @(digits) decimal number
632 @(symbol) : write the abbreviation string with this
634 @accentcode : this accented character: e.g.,
635 for @'e write an E-acute
636 @{...} : this Unicode char (in hex) */
639 { if (text_in[i+1]=='@')
641 /* @@... (ascii value) */
643 i+=2; j=atoi((char *) (text_in+i));
645 { /* Prevent ~ and ^ from being translated to double-quote
646 and new-line, as they ordinarily would be */
648 case 94: write_z_char_z(5); write_z_char_z(6);
649 write_z_char_z(94/32); write_z_char_z(94%32);
651 case 126: write_z_char_z(5); write_z_char_z(6);
652 write_z_char_z(126/32); write_z_char_z(126%32);
655 default: write_zscii(j); break;
657 while (isdigit(text_in[i])) i++; i--;
659 else if (text_in[i+1]=='(')
661 /* @(...) (dynamic string) */
662 char dsymbol[MAX_IDENTIFIER_LENGTH+1];
663 int len = 0, digits = 0;
665 /* This accepts "12xyz" as a symbol, which it really isn't,
666 but that just means it won't be found. */
667 while ((text_in[i] == '_' || isalnum(text_in[i])) && len < MAX_IDENTIFIER_LENGTH) {
668 char ch = text_in[i++];
669 if (isdigit(ch)) digits++;
674 /* We would like to parse dsymbol as *either* a decimal
675 number or a constant symbol. */
676 if (text_in[i] != ')' || len == 0) {
677 error("'@(...)' abbreviation must contain a symbol");
679 else if (digits == len) {
680 /* all digits; parse as decimal */
684 int sym = symbol_index(dsymbol, -1);
685 if ((symbols[sym].flags & UNKNOWN_SFLAG) || symbols[sym].type != CONSTANT_T || symbols[sym].marker) {
686 error_named("'@(...)' abbreviation expected a known constant value, but contained", dsymbol);
689 symbols[sym].flags |= USED_SFLAG;
690 j = symbols[sym].value;
693 if (!glulx_mode && j >= 96) {
694 error_max_dynamic_strings(j);
697 if (j >= MAX_DYNAMIC_STRINGS) {
698 error_max_dynamic_strings(j);
702 write_z_char_z(j/32+1); write_z_char_z(j%32);
705 write_z_char_z(' '); /* error fallback */
708 else if (isdigit(text_in[i+1])!=0)
711 /* @.. (dynamic string) */
713 d1 = character_digit_value[text_in[i+1]];
714 d2 = character_digit_value[text_in[i+2]];
715 if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10))
716 error("'@..' must have two decimal digits");
720 if (!glulx_mode && j >= 96) {
721 error_max_dynamic_strings(j);
724 if (j >= MAX_DYNAMIC_STRINGS) {
725 /* Shouldn't get here with two digits */
726 error_max_dynamic_strings(j);
731 write_z_char_z(j/32+1); write_z_char_z(j%32);
734 write_z_char_z(' '); /* error fallback */
740 /* A string escape specifying an unusual character */
742 unicode = text_to_unicode((char *) (text_in+i));
743 zscii = unicode_to_zscii(unicode);
744 if (zscii != 5) write_zscii(zscii);
746 { unicode_char_error(
747 "Character can only be used if declared in \
748 advance as part of 'Zcharacter table':", unicode);
750 i += textual_form_length - 1;
754 { /* Skip a character which has been over-written with the null
755 value 1 earlier on */
758 { if (text_in[i]==' ') write_z_char_z(0);
760 { j = (int) text_in[i];
761 lookup_value = iso_to_alphabet_grid[j];
762 if (lookup_value < 0)
763 { /* The character isn't in the standard alphabets, so
764 we have to use the ZSCII 4-Z-char sequence */
766 if (lookup_value == -5)
767 { /* Character isn't in the ZSCII set at all */
769 unicode = iso_to_unicode(j);
771 "Character can only be used if declared in \
772 advance as part of 'Zcharacter table':", unicode);
773 write_zscii(0x200 + unicode/0x100);
774 write_zscii(0x300 + unicode%0x100);
776 else write_zscii(-lookup_value);
779 { /* The character is in one of the standard alphabets:
780 write a SHIFT to temporarily change alphabet if
781 it isn't in alphabet 0, then write the Z-char */
783 alphabet_used[lookup_value] = 'Y';
784 in_alphabet = lookup_value/26;
785 if (in_alphabet==1) write_z_char_z(4); /* SHIFT to A1 */
786 if (in_alphabet==2) write_z_char_z(5); /* SHIFT to A2 */
787 write_z_char_z(lookup_value%26 + 6);
794 /* Flush the Z-characters output buffer and set the "end" bit */
800 /* The text storage here is, of course, temporary. Compression
801 will occur when we're finished compiling, so that all the
802 clever Huffman stuff will work.
803 In the stored text, we use "@@" to indicate @,
804 "@0" to indicate a zero byte,
805 "@ANNNN" to indicate an abbreviation,
806 "@DNNNN" to indicate a dynamic string thing.
807 "@UNNNN" to indicate a four-byte Unicode value (0x100 or higher).
808 (NNNN is a four-digit hex number using the letters A-P... an
809 ugly representation but a convenient one.)
812 for (i=0; text_in[i]!=0; i++) {
814 /* Contract ". " into ". " if double-space-removing switch set:
815 likewise "? " and "! " if the setting is high enough. */
816 if ((double_space_setting >= 1)
817 && (text_in[i+1]==' ') && (text_in[i+2]==' ')) {
819 || (double_space_setting >= 2
820 && (text_in[i]=='?' || text_in[i]=='!'))) {
821 text_in[i+1] = text_in[i];
828 /* Try abbreviations if the economy switch set. We have to be in
829 compression mode too, since the abbreviation mechanism is part
830 of string decompression. */
832 if ((economy_switch) && (compression_switch) && (!is_abbreviation)
833 && ((k=abbrevs_lookup[text_in[i]])!=-1)
834 && ((j=try_abbreviations_from(text_in, i, k)) != -1)) {
835 char *cx = (char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
839 write_z_char_g('A' + ((j >>12) & 0x0F));
840 write_z_char_g('A' + ((j >> 8) & 0x0F));
841 write_z_char_g('A' + ((j >> 4) & 0x0F));
842 write_z_char_g('A' + ((j ) & 0x0F));
844 else if (text_in[i] == '@') {
845 if (text_in[i+1]=='@') {
847 i+=2; j=atoi((char *) (text_in+i));
848 if (j == '@' || j == '\0') {
852 if (!compression_switch)
853 warning("Ascii @@0 will prematurely terminate non-compressed \
858 while (isdigit(text_in[i])) i++; i--;
860 else if (text_in[i+1]=='(') {
861 char dsymbol[MAX_IDENTIFIER_LENGTH+1];
862 int len = 0, digits = 0;
864 /* This accepts "12xyz" as a symbol, which it really isn't,
865 but that just means it won't be found. */
866 while ((text_in[i] == '_' || isalnum(text_in[i])) && len < MAX_IDENTIFIER_LENGTH) {
867 char ch = text_in[i++];
868 if (isdigit(ch)) digits++;
873 /* We would like to parse dsymbol as *either* a decimal
874 number or a constant symbol. */
875 if (text_in[i] != ')' || len == 0) {
876 error("'@(...)' abbreviation must contain a symbol");
878 else if (digits == len) {
879 /* all digits; parse as decimal */
883 int sym = symbol_index(dsymbol, -1);
884 if ((symbols[sym].flags & UNKNOWN_SFLAG) || symbols[sym].type != CONSTANT_T || symbols[sym].marker) {
885 error_named("'@(...)' abbreviation expected a known constant value, but contained", dsymbol);
888 symbols[sym].flags |= USED_SFLAG;
889 j = symbols[sym].value;
892 if (j >= MAX_DYNAMIC_STRINGS) {
893 error_max_dynamic_strings(j);
896 if (j+1 >= no_dynamic_strings)
897 no_dynamic_strings = j+1;
901 write_z_char_g('A' + ((j >>12) & 0x0F));
902 write_z_char_g('A' + ((j >> 8) & 0x0F));
903 write_z_char_g('A' + ((j >> 4) & 0x0F));
904 write_z_char_g('A' + ((j ) & 0x0F));
907 write_z_char_g(' '); /* error fallback */
910 else if (isdigit(text_in[i+1])) {
912 d1 = character_digit_value[text_in[i+1]];
913 d2 = character_digit_value[text_in[i+2]];
914 if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10)) {
915 error("'@..' must have two decimal digits");
918 if (!compression_switch)
919 warning("'@..' print variable will not work in non-compressed \
920 string; substituting ' '.");
923 if (j >= MAX_DYNAMIC_STRINGS) {
924 error_max_dynamic_strings(j);
927 if (j+1 >= no_dynamic_strings)
928 no_dynamic_strings = j+1;
932 write_z_char_g('A' + ((j >>12) & 0x0F));
933 write_z_char_g('A' + ((j >> 8) & 0x0F));
934 write_z_char_g('A' + ((j >> 4) & 0x0F));
935 write_z_char_g('A' + ((j ) & 0x0F));
938 write_z_char_g(' '); /* error fallback */
943 unicode = text_to_unicode((char *) (text_in+i));
944 i += textual_form_length - 1;
945 if (unicode == '@' || unicode == '\0') {
947 write_z_char_g(unicode ? '@' : '0');
949 else if (unicode >= 0 && unicode < 256) {
950 write_z_char_g(unicode);
953 if (!compression_switch) {
954 warning("Unicode characters will not work in non-compressed \
955 string; substituting '?'.");
959 j = unicode_entity_index(unicode);
962 write_z_char_g('A' + ((j >>12) & 0x0F));
963 write_z_char_g('A' + ((j >> 8) & 0x0F));
964 write_z_char_g('A' + ((j >> 4) & 0x0F));
965 write_z_char_g('A' + ((j ) & 0x0F));
970 else if (text_in[i] == '^')
971 write_z_char_g(0x0A);
972 else if (text_in[i] == '~')
974 else if (character_set_unicode) {
975 if (text_in[i] & 0x80) {
976 unicode = text_to_unicode((char *) (text_in+i));
977 i += textual_form_length - 1;
978 if (unicode >= 0 && unicode < 256) {
979 write_z_char_g(unicode);
982 if (!compression_switch) {
983 warning("Unicode characters will not work in non-compressed \
984 string; substituting '?'.");
988 j = unicode_entity_index(unicode);
991 write_z_char_g('A' + ((j >>12) & 0x0F));
992 write_z_char_g('A' + ((j >> 8) & 0x0F));
993 write_z_char_g('A' + ((j >> 4) & 0x0F));
994 write_z_char_g('A' + ((j ) & 0x0F));
999 write_z_char_g(text_in[i]);
1003 unicode = iso_to_unicode_grid[text_in[i]];
1004 if (unicode >= 0 && unicode < 256) {
1005 write_z_char_g(unicode);
1008 if (!compression_switch) {
1009 warning("Unicode characters will not work in non-compressed \
1010 string; substituting '?'.");
1011 write_z_char_g('?');
1014 j = unicode_entity_index(unicode);
1015 write_z_char_g('@');
1016 write_z_char_g('U');
1017 write_z_char_g('A' + ((j >>12) & 0x0F));
1018 write_z_char_g('A' + ((j >> 8) & 0x0F));
1019 write_z_char_g('A' + ((j >> 4) & 0x0F));
1020 write_z_char_g('A' + ((j ) & 0x0F));
1026 zchars_trans_in_last_string=total_zchars_trans-zchars_trans_in_last_string;
1030 if (text_out_overflow)
1033 return text_out_pos;
1036 static int unicode_entity_index(int32 unicode)
1039 int buck = unicode % UNICODE_HASH_BUCKETS;
1041 for (j = unicode_usage_hash[buck]; j >= 0; j=unicode_usage_entries[j].next) {
1042 if (unicode_usage_entries[j].ch == unicode)
1046 ensure_memory_list_available(&unicode_usage_entries_memlist, no_unicode_chars+1);
1047 j = no_unicode_chars++;
1048 unicode_usage_entries[j].ch = unicode;
1049 unicode_usage_entries[j].next = unicode_usage_hash[buck];
1050 unicode_usage_hash[buck] = j;
1056 /* ------------------------------------------------------------------------- */
1057 /* Glulx compression code */
1058 /* ------------------------------------------------------------------------- */
1061 static void compress_makebits(int entnum, int depth, int prevbit,
1062 huffbitlist_t *bits);
1064 /* The compressor. This uses the usual Huffman compression algorithm. */
1065 void compress_game_text()
1067 int entities=0, branchstart, branches;
1076 if (compression_switch) {
1077 max_char_set = 257 + no_abbreviations + no_dynamic_strings + no_unicode_chars;
1079 huff_entities = my_calloc(sizeof(huffentity_t), max_char_set*2+1,
1080 "huffman entities");
1081 hufflist = my_calloc(sizeof(huffentity_t *), max_char_set,
1082 "huffman node list");
1084 /* How many entities have we currently got? Well, 256 plus the
1085 string-terminator plus Unicode chars plus abbrevations plus
1088 huff_unicode_start = entities;
1089 entities += no_unicode_chars;
1090 huff_abbrev_start = entities;
1092 entities += no_abbreviations;
1093 huff_dynam_start = entities;
1094 entities += no_dynamic_strings;
1096 if (entities > max_char_set)
1097 compiler_error("Too many entities for max_char_set");
1100 for (jx=0; jx<256; jx++) {
1101 huff_entities[jx].type = 2;
1102 huff_entities[jx].count = 0;
1103 huff_entities[jx].u.ch = jx;
1106 huff_entities[256].type = 1;
1107 huff_entities[256].count = 0;
1108 for (jx=0; jx<no_unicode_chars; jx++) {
1109 huff_entities[huff_unicode_start+jx].type = 4;
1110 huff_entities[huff_unicode_start+jx].count = 0;
1111 huff_entities[huff_unicode_start+jx].u.val = jx;
1113 if (economy_switch) {
1114 for (jx=0; jx<no_abbreviations; jx++) {
1115 huff_entities[huff_abbrev_start+jx].type = 3;
1116 huff_entities[huff_abbrev_start+jx].count = 0;
1117 huff_entities[huff_abbrev_start+jx].u.val = jx;
1120 for (jx=0; jx<no_dynamic_strings; jx++) {
1121 huff_entities[huff_dynam_start+jx].type = 9;
1122 huff_entities[huff_dynam_start+jx].count = 0;
1123 huff_entities[huff_dynam_start+jx].u.val = jx;
1127 /* No compression; use defaults that will make it easy to check
1129 no_huff_entities = 257;
1130 huff_unicode_start = 257;
1131 huff_abbrev_start = 257;
1132 huff_dynam_start = 257+no_abbreviations;
1133 compression_table_size = 0;
1136 if (compression_switch) {
1138 for (lx=0, ix=0; lx<no_strings; lx++) {
1139 int escapelen=0, escapetype=0;
1143 ch = static_strings_area[ix];
1145 if (ix > static_strings_extent || ch < 0)
1146 compiler_error("Read too much not-yet-compressed text.");
1147 if (escapelen == -1) {
1152 else if (ch == '0') {
1155 else if (ch == 'A' || ch == 'D' || ch == 'U') {
1162 compiler_error("Strange @ escape in processed text.");
1165 else if (escapelen) {
1166 escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
1168 if (escapelen == 0) {
1169 if (escapetype == 'A') {
1170 ch = huff_abbrev_start+escapeval;
1172 else if (escapetype == 'D') {
1173 ch = huff_dynam_start+escapeval;
1175 else if (escapetype == 'U') {
1176 ch = huff_unicode_start+escapeval;
1179 compiler_error("Strange @ escape in processed text.");
1195 huff_entities[ch].count++;
1200 for (jx=0; jx<entities; jx++) {
1201 if (huff_entities[jx].count) {
1202 hufflist[numlive] = &(huff_entities[jx]);
1207 branchstart = entities;
1210 while (numlive > 1) {
1212 int best1num, best2num;
1215 if (hufflist[0]->count < hufflist[1]->count) {
1224 best1num = hufflist[best1]->count;
1225 best2num = hufflist[best2]->count;
1227 for (jx=2; jx<numlive; jx++) {
1228 if (hufflist[jx]->count < best1num) {
1230 best2num = best1num;
1232 best1num = hufflist[best1]->count;
1234 else if (hufflist[jx]->count < best2num) {
1236 best2num = hufflist[best2]->count;
1240 bran = &(huff_entities[branchstart+branches]);
1243 bran->count = hufflist[best1]->count + hufflist[best2]->count;
1244 bran->u.branch[0] = (hufflist[best1] - huff_entities);
1245 bran->u.branch[1] = (hufflist[best2] - huff_entities);
1246 hufflist[best1] = bran;
1247 if (best2 < numlive-1) {
1248 memmove(&(hufflist[best2]), &(hufflist[best2+1]),
1249 ((numlive-1) - best2) * sizeof(huffentity_t *));
1254 huff_entity_root = (hufflist[0] - huff_entities);
1256 for (ix=0; ix<MAXHUFFBYTES; ix++)
1258 compression_table_size = 12;
1260 no_huff_entities = 0; /* compress_makebits will total this up */
1261 compress_makebits(huff_entity_root, 0, -1, &bits);
1264 /* Now, sadly, we have to compute the size of the string section,
1265 without actually doing the compression. */
1266 compression_string_size = 0;
1268 ensure_memory_list_available(&compressed_offsets_memlist, no_strings);
1270 for (lx=0, ix=0; lx<no_strings; lx++) {
1271 int escapelen=0, escapetype=0;
1275 compressed_offsets[lx] = compression_table_size + compression_string_size;
1276 compression_string_size++; /* for the type byte */
1278 ch = static_strings_area[ix];
1280 if (ix > static_strings_extent || ch < 0)
1281 compiler_error("Read too much not-yet-compressed text.");
1282 if (escapelen == -1) {
1287 else if (ch == '0') {
1290 else if (ch == 'A' || ch == 'D' || ch == 'U') {
1297 compiler_error("Strange @ escape in processed text.");
1300 else if (escapelen) {
1301 escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
1303 if (escapelen == 0) {
1304 if (escapetype == 'A') {
1305 ch = huff_abbrev_start+escapeval;
1307 else if (escapetype == 'D') {
1308 ch = huff_dynam_start+escapeval;
1310 else if (escapetype == 'U') {
1311 ch = huff_unicode_start+escapeval;
1314 compiler_error("Strange @ escape in processed text.");
1331 if (compression_switch) {
1332 jx += huff_entities[ch].depth;
1333 compression_string_size += (jx/8);
1337 if (ch >= huff_dynam_start) {
1338 compression_string_size += 3;
1340 else if (ch >= huff_unicode_start) {
1341 compiler_error("Abbreviation/Unicode in non-compressed string \
1342 should be impossible.");
1345 compression_string_size += 1;
1348 if (compression_switch && jx)
1349 compression_string_size++;
1352 done_compression = TRUE;
1355 static void compress_makebits(int entnum, int depth, int prevbit,
1356 huffbitlist_t *bits)
1358 huffentity_t *ent = &(huff_entities[entnum]);
1362 ent->addr = compression_table_size;
1367 ent->bits.b[(depth-1) / 8] |= (1 << ((depth-1) % 8));
1370 switch (ent->type) {
1372 compression_table_size += 9;
1373 compress_makebits(ent->u.branch[0], depth+1, 0, &ent->bits);
1374 compress_makebits(ent->u.branch[1], depth+1, 1, &ent->bits);
1377 compression_table_size += 1;
1380 compression_table_size += 2;
1383 cx = (char *)abbreviations_at + ent->u.val*MAX_ABBREV_LENGTH;
1384 compression_table_size += (1 + 1 + strlen(cx));
1388 compression_table_size += 5;
1393 /* ------------------------------------------------------------------------- */
1394 /* The abbreviations optimiser */
1396 /* This is a very complex, memory and time expensive algorithm to */
1397 /* approximately solve the problem of which abbreviation strings would */
1398 /* minimise the total number of Z-chars to which the game text translates. */
1399 /* It is in some ways a quite separate program but remains inside Inform */
1400 /* for compatibility with previous releases. */
1401 /* ------------------------------------------------------------------------- */
1403 /* The complete game text. */
1404 static char *opttext;
1405 static int32 opttextlen;
1407 typedef struct tlb_s
1409 int32 intab, occurrences;
1411 static tlb *tlbtab; /* Three-letter blocks (allocated up to no_occs) */
1412 static memory_list tlbtab_memlist;
1413 static int32 no_occs;
1415 static int32 *grandtable;
1416 static int32 *grandflags;
1417 typedef struct optab_s
1422 char text[MAX_ABBREV_LENGTH];
1424 static int32 MAX_BESTYET;
1425 static optab *bestyet; /* High-score entries (up to MAX_BESTYET used/allocated) */
1426 static optab *bestyet2; /* The selected entries (up to selected used; allocated to MAX_ABBREVS) */
1430 static void optimise_pass(void)
1435 int32 j, j2, k, nl, matches, noflags, score, min, minat=0, x, scrabble, c;
1436 for (i=0; i<MAX_BESTYET; i++) bestyet[i].length=0;
1437 for (i=0; i<no_occs; i++)
1438 { if ((*(tlbtab[i].text)!=(int) '\n')&&(tlbtab[i].occurrences!=0))
1441 if (i%((**g_pm_hndl).linespercheck) == 0)
1442 { ProcessEvents (&g_proc);
1445 longjmp (g_fallback, 1);
1449 if (optabbrevs_trace_setting >= 2) {
1450 printf("Pass %d, %4ld/%ld '%s' (%ld occurrences) ",
1451 pass_no, (long int) i, (long int) no_occs, tlbtab[i].text,
1452 (long int) tlbtab[i].occurrences);
1455 for (j=0; j<tlbtab[i].occurrences; j++)
1456 { for (j2=0; j2<tlbtab[i].occurrences; j2++) grandflags[j2]=1;
1457 nl=2; noflags=tlbtab[i].occurrences;
1458 while ((noflags>=2)&&(nl<MAX_ABBREV_LENGTH-1))
1460 for (j2=0; j2<nl; j2++)
1461 if (opttext[grandtable[tlbtab[i].intab+j]+j2]=='\n')
1464 for (j2=j; j2<tlbtab[i].occurrences; j2++)
1465 { if (grandflags[j2]==1)
1466 { x=grandtable[tlbtab[i].intab+j2]
1467 - grandtable[tlbtab[i].intab+j];
1468 if (((x>-nl)&&(x<nl))
1469 || (memcmp(opttext+grandtable[tlbtab[i].intab+j],
1470 opttext+grandtable[tlbtab[i].intab+j2],
1472 { grandflags[j2]=0; noflags--; }
1477 for (k=0; k<nl; k++)
1479 c=opttext[grandtable[tlbtab[i].intab+j+k]];
1481 { if (iso_to_alphabet_grid[c]<0)
1484 if (iso_to_alphabet_grid[c]>=26)
1488 score=(matches-1)*(scrabble-2);
1490 for (j2=0; j2<MAX_BESTYET; j2++)
1491 { if ((nl==bestyet[j2].length)
1492 && (memcmp(opttext+bestyet[j2].location,
1493 opttext+grandtable[tlbtab[i].intab+j],
1495 { j2=MAX_BESTYET; min=score; }
1497 { if (bestyet[j2].score<min)
1498 { min=bestyet[j2].score; minat=j2;
1503 { bestyet[minat].score=score;
1504 bestyet[minat].length=nl;
1505 bestyet[minat].location=grandtable[tlbtab[i].intab+j];
1506 bestyet[minat].popularity=matches;
1511 if (optabbrevs_trace_setting >= 2) {
1513 duration = TIMEVALUE_DIFFERENCE(&t1, &t2);
1514 printf(" (%.4f seconds)\n", duration);
1520 static int any_overlap(char *s1, char *s2)
1521 { int a, b, i, j, flag;
1522 a=strlen(s1); b=strlen(s2);
1523 for (i=1-b; i<a; i++)
1526 if ((0<=i+j)&&(i+j<=a-1))
1527 if (s1[i+j]!=s2[j]) flag=1;
1528 if (flag==0) return(1);
1533 extern void optimise_abbreviations(void)
1534 { int32 i, j, tcount, max=0, MAX_GTABLE;
1535 int32 j2, selected, available, maxat=0, nl;
1537 if (opttext == NULL)
1540 /* We insist that the first two abbreviations will be ". " and ", ". */
1541 if (MAX_ABBREVS < 2)
1544 /* Note that it's safe to access opttext[opttextlen+2]. There are
1545 two newlines and a null beyond opttextlen. */
1547 printf("Beginning calculation of optimal abbreviations...\n");
1551 initialise_memory_list(&tlbtab_memlist,
1552 sizeof(tlb), 1000, (void**)&tlbtab,
1553 "three-letter-blocks buffer");
1557 /* Not sure what the optimal size is for MAX_BESTYET. The original code always created 64 abbreviations and used MAX_BESTYET=256. I'm guessing that 4*MAX_ABBREVS is reasonable. */
1558 MAX_BESTYET = 4 * MAX_ABBREVS;
1560 bestyet=my_calloc(sizeof(optab), MAX_BESTYET, "bestyet");
1561 bestyet2=my_calloc(sizeof(optab), MAX_ABBREVS, "bestyet2");
1563 bestyet2[0].text[0]='.';
1564 bestyet2[0].text[1]=' ';
1565 bestyet2[0].text[2]=0;
1567 bestyet2[1].text[0]=',';
1568 bestyet2[1].text[1]=' ';
1569 bestyet2[1].text[2]=0;
1573 for (i=0; i<opttextlen; i++)
1575 if ((opttext[i]=='.') && (opttext[i+1]==' ') && (opttext[i+2]==' '))
1576 { opttext[i]='\n'; opttext[i+1]='\n'; opttext[i+2]='\n';
1577 bestyet2[0].popularity++;
1580 if ((opttext[i]=='.') && (opttext[i+1]==' '))
1581 { opttext[i]='\n'; opttext[i+1]='\n';
1582 bestyet2[0].popularity++;
1585 if ((opttext[i]==',') && (opttext[i+1]==' '))
1586 { opttext[i]='\n'; opttext[i+1]='\n';
1587 bestyet2[1].popularity++;
1591 MAX_GTABLE=opttextlen+1;
1592 grandtable=my_calloc(4*sizeof(int32), MAX_GTABLE/4, "grandtable");
1594 for (i=0, tcount=0; i<opttextlen; i++)
1597 test.text[0]=opttext[i];
1598 test.text[1]=opttext[i+1];
1599 test.text[2]=opttext[i+2];
1601 if ((test.text[0]=='\n')||(test.text[1]=='\n')||(test.text[2]=='\n'))
1603 for (j=0; j<no_occs; j++) {
1604 if (strcmp(test.text,tlbtab[j].text)==0)
1609 for (j=i+3; j<opttextlen; j++)
1612 if (j%((**g_pm_hndl).linespercheck) == 0)
1613 { ProcessEvents (&g_proc);
1616 longjmp (g_fallback, 1);
1620 if ((opttext[i]==opttext[j])
1621 && (opttext[i+1]==opttext[j+1])
1622 && (opttext[i+2]==opttext[j+2]))
1623 { grandtable[tcount+test.occurrences]=j;
1625 if (tcount+test.occurrences==MAX_GTABLE)
1626 { printf("All %ld cross-references used\n",
1627 (long int) MAX_GTABLE);
1632 if (test.occurrences>=2)
1634 ensure_memory_list_available(&tlbtab_memlist, no_occs+1);
1635 tlbtab[no_occs]=test;
1636 tlbtab[no_occs].intab=tcount;
1637 tcount += tlbtab[no_occs].occurrences;
1638 if (max<tlbtab[no_occs].occurrences)
1639 max=tlbtab[no_occs].occurrences;
1646 grandflags=my_calloc(sizeof(int), max, "grandflags");
1649 if (optabbrevs_trace_setting >= 1) {
1650 printf("Cross-reference table (%ld entries) built...\n",
1651 (long int) no_occs);
1653 /* for (i=0; i<no_occs; i++)
1654 printf("%4d %4d '%s' %d\n",i,tlbtab[i].intab,tlbtab[i].text,
1655 tlbtab[i].occurrences);
1658 for (i=0; i<MAX_ABBREVS; i++) bestyet2[i].length=0;
1659 available=MAX_BESTYET;
1660 while ((available>0)&&(selected<MAX_ABBREVS))
1663 if (optabbrevs_trace_setting >= 1) {
1664 printf("Pass %d\n", pass_no);
1669 for (i=0; i<MAX_BESTYET; i++)
1670 if (bestyet[i].score!=0)
1672 nl=bestyet[i].length;
1673 for (j2=0; j2<nl; j2++) bestyet[i].text[j2]=
1674 opttext[bestyet[i].location+j2];
1675 bestyet[i].text[nl]=0;
1678 /* printf("End of pass results:\n");
1679 printf("\nno score freq string\n");
1680 for (i=0; i<MAX_BESTYET; i++)
1681 if (bestyet[i].score>0)
1682 printf("%02d: %4d %4d '%s'\n", i, bestyet[i].score,
1683 bestyet[i].popularity, bestyet[i].text);
1688 for (i=0; i<MAX_BESTYET; i++)
1689 if (max<bestyet[i].score)
1690 { max=bestyet[i].score;
1697 bestyet2[selected++]=bestyet[maxat];
1699 if (optabbrevs_trace_setting >= 1) {
1701 "Selection %2ld: '%s' (repeated %ld times, scoring %ld)\n",
1702 (long int) selected,bestyet[maxat].text,
1703 (long int) bestyet[maxat].popularity,
1704 (long int) bestyet[maxat].score);
1707 testtext[0]=bestyet[maxat].text[0];
1708 testtext[1]=bestyet[maxat].text[1];
1709 testtext[2]=bestyet[maxat].text[2];
1712 for (i=0; i<no_occs; i++)
1713 if (strcmp(testtext,tlbtab[i].text)==0)
1716 for (j=0; j<tlbtab[i].occurrences; j++)
1717 { if (memcmp(bestyet[maxat].text,
1718 opttext+grandtable[tlbtab[i].intab+j],
1719 bestyet[maxat].length)==0)
1720 { for (j2=0; j2<bestyet[maxat].length; j2++)
1721 opttext[grandtable[tlbtab[i].intab+j]+j2]='\n';
1725 for (i=0; i<MAX_BESTYET; i++)
1726 if ((bestyet[i].score>0)&&
1727 (any_overlap(bestyet[maxat].text,bestyet[i].text)==1))
1728 { bestyet[i].score=0;
1729 /* printf("Discarding '%s' as overlapping\n",
1730 bestyet[i].text); */
1733 } while ((max>0)&&(available>0)&&(selected<MAX_ABBREVS));
1736 printf("\nChosen abbreviations (in Inform syntax):\n\n");
1737 for (i=0; i<selected; i++)
1738 printf("Abbreviate \"%s\";\n", bestyet2[i].text);
1743 /* ------------------------------------------------------------------------- */
1744 /* The dictionary manager begins here. */
1746 /* Speed is extremely important in these algorithms. If a linear-time */
1747 /* routine were used to search the dictionary words so far built up, then */
1748 /* Inform would crawl. */
1750 /* Instead, the dictionary is stored as a binary tree, which is kept */
1751 /* balanced with the red-black algorithm. */
1752 /* ------------------------------------------------------------------------- */
1753 /* A dictionary table similar to the Z-machine format is kept: there is a */
1754 /* 7-byte header (left blank here to be filled in at the */
1755 /* construct_storyfile() stage in "tables.c") and then a sequence of */
1756 /* records, one per word, in the form */
1758 /* <Z-coded text> <flags> <verbnumber> <adjectivenumber> */
1759 /* 4 or 6 bytes byte byte byte */
1761 /* For Glulx, the form is instead: (See below about Unicode-valued */
1762 /* dictionaries and DICT_WORD_BYTES.) */
1764 /* <tag> <plain text> <flags> <verbnumber> <adjectivenumber> */
1765 /* $60 DICT_WORD_BYTES short short short */
1767 /* These records are stored in "accession order" (i.e. in order of their */
1768 /* first being received by these routines) and only alphabetically sorted */
1769 /* by construct_storyfile() (using the array below). */
1770 /* ------------------------------------------------------------------------- */
1772 /* Further notes about the data fields... */
1773 /* The flags are currently: */
1774 /* bit 0: word is used as a verb (in verb grammar) */
1775 /* bit 1: word is used as a meta verb */
1776 /* bit 2: word is plural (set by '//p') */
1777 /* bit 3: word is used as a preposition (in verb grammar) */
1778 /* bit 6: set for all verbs, but not used by the parser? */
1779 /* bit 7: word is used as a noun (set for every word that appears in */
1780 /* code or in an object property) */
1782 /* In grammar version 2, the third field (adjectivenumber) is unused (and */
1785 /* The compiler generates special constants #dict_par1, #dict_par2, */
1786 /* #dict_par3 to refer to the byte offsets of the three fields. In */
1787 /* Z-code v3, these are 4/5/6; in v4+, they are 6/7/8. In Glulx, they */
1788 /* are $DICT_WORD_SIZE+2/4/6, referring to the *low* bytes of the three */
1789 /* fields. (The high bytes are $DICT_WORD_SIZE+1/3/5.) */
1790 /* ------------------------------------------------------------------------- */
1792 uchar *dictionary; /* (These two variables are externally
1793 used only in "tables.c" when
1794 building the story-file) */
1795 static memory_list dictionary_memlist;
1796 int32 dictionary_top; /* Position of the next free record
1797 in dictionary (i.e., the current
1800 int dict_entries; /* Total number of records entered */
1802 /* ------------------------------------------------------------------------- */
1803 /* dict_word was originally a typedef for a struct of 6 unsigned chars. */
1804 /* It held the (4 or) 6 bytes of Z-coded text of a word. */
1805 /* Usefully, because the PAD character 5 is < all alphabetic characters, */
1806 /* alphabetic order corresponds to numeric order. For this reason, the */
1807 /* dict_word is called the "sort code" of the original text word. */
1809 /* In modifying the compiler for Glulx, I found it easier to discard the */
1810 /* typedef, and operate directly on uchar arrays of length DICT_WORD_SIZE. */
1811 /* In Z-code, DICT_WORD_SIZE will be 6, so the Z-code compiler will work */
1812 /* as before. In Glulx, it can be any value up to MAX_DICT_WORD_SIZE. */
1813 /* (That limit is defined as 40 in the header; it exists only for a few */
1814 /* static buffers, and can be increased without using significant memory.) */
1816 /* ...Well, that certainly bit me on the butt, didn't it. In further */
1817 /* modifying the compiler to generate a Unicode dictionary, I have to */
1818 /* store four-byte values in the uchar array. This is handled by making */
1819 /* the array size DICT_WORD_BYTES (which is DICT_WORD_SIZE*DICT_CHAR_SIZE).*/
1820 /* Then we store the 32-bit character value big-endian. This lets us */
1821 /* continue to compare arrays bytewise, which is a nice simplification. */
1822 /* ------------------------------------------------------------------------- */
1824 extern int compare_sorts(uchar *d1, uchar *d2)
1826 for (i=0; i<DICT_WORD_BYTES; i++)
1827 if (d1[i]!=d2[i]) return((int)(d1[i]) - (int)(d2[i]));
1828 /* (since memcmp(d1, d2, DICT_WORD_BYTES); runs into a bug on some Unix
1833 extern void copy_sorts(uchar *d1, uchar *d2)
1835 for (i=0; i<DICT_WORD_BYTES; i++)
1839 static uchar prepared_sort[MAX_DICT_WORD_BYTES]; /* Holds the sort code
1842 static int number_and_case;
1844 /* Also used by verbs.c */
1845 static void dictionary_prepare_z(char *dword, uchar *optresult)
1846 { int i, j, k, k2, wd[13]; int32 tot;
1848 /* A rapid text translation algorithm using only the simplified rules
1849 applying to the text of dictionary entries: first produce a sequence
1850 of 6 (v3) or 9 (v4+) Z-characters */
1852 int dictsize = (version_number==3) ? 6 : 9;
1854 number_and_case = 0;
1856 for (i=0, j=0; dword[j]!=0; i++, j++)
1857 { if ((dword[j] == '/') && (dword[j+1] == '/'))
1858 { for (j+=2; dword[j] != 0; j++)
1860 { case 'p': number_and_case |= 4; break;
1862 error_named("Expected 'p' after '//' \
1863 to give number of dictionary word", dword);
1869 if (i>=dictsize) break;
1873 warning_named("Obsolete usage: use the ^ character for the \
1874 apostrophe in", dword);
1875 if (k==(int) '^') k=(int) '\'';
1878 if (k==(int) '@' || (character_set_unicode && (k & 0x80)))
1879 { int unicode = text_to_unicode(dword+j);
1880 if ((unicode < 128) && isupper(unicode)) unicode = tolower(unicode);
1881 k = unicode_to_zscii(unicode);
1882 j += textual_form_length - 1;
1883 if ((k == 5) || (k >= 0x100))
1884 { unicode_char_error(
1885 "Character can be printed but not input:", unicode);
1888 k2 = zscii_to_alphabet_grid[(uchar) k];
1891 { if (isupper(k)) k = tolower(k);
1892 k2 = iso_to_alphabet_grid[(uchar) k];
1896 { if ((k2 == -5) || (k2 <= -0x100))
1897 char_error("Character can be printed but not input:", k);
1899 { /* Use 4 more Z-chars to encode a ZSCII escape sequence */
1901 wd[i++] = 5; wd[i++] = 6;
1903 wd[i++] = k2/32; wd[i] = k2%32;
1907 { alphabet_used[k2] = 'Y';
1909 wd[i++]=3+(k2/26); /* Change alphabet for symbols */
1910 wd[i]=6+(k2%26); /* Write the Z character */
1914 /* Fill up to the end of the dictionary block with PAD characters */
1916 for (; i<9; i++) wd[i]=5;
1918 /* The array of Z-chars is converted to two or three 2-byte blocks */
1920 tot = wd[2] + wd[1]*(1<<5) + wd[0]*(1<<10);
1921 prepared_sort[1]=tot%0x100;
1922 prepared_sort[0]=(tot/0x100)%0x100;
1923 tot = wd[5] + wd[4]*(1<<5) + wd[3]*(1<<10);
1924 prepared_sort[3]=tot%0x100;
1925 prepared_sort[2]=(tot/0x100)%0x100;
1926 if (version_number==3)
1929 tot = wd[8] + wd[7]*(1<<5) + wd[6]*(1<<10);
1930 prepared_sort[5]=tot%0x100;
1931 prepared_sort[4]=(tot/0x100)%0x100;
1933 /* Set the "end bit" on the 2nd (in v3) or the 3rd (v4+) 2-byte block */
1935 if (version_number==3) prepared_sort[2]+=0x80;
1936 else prepared_sort[4]+=0x80;
1938 if (optresult) copy_sorts(optresult, prepared_sort);
1941 /* Also used by verbs.c */
1942 static void dictionary_prepare_g(char *dword, uchar *optresult)
1947 number_and_case = 0;
1949 for (i=0, j=0; (dword[j]!=0); i++, j++) {
1950 if ((dword[j] == '/') && (dword[j+1] == '/')) {
1951 for (j+=2; dword[j] != 0; j++) {
1954 number_and_case |= 4;
1957 error_named("Expected 'p' after '//' \
1958 to give gender or number of dictionary word", dword);
1964 if (i>=DICT_WORD_SIZE) break;
1966 k= ((unsigned char *)dword)[j];
1968 warning_named("Obsolete usage: use the ^ character for the \
1969 apostrophe in", dword);
1972 if (k=='~') /* as in iso_to_alphabet_grid */
1975 if (k=='@' || (character_set_unicode && (k & 0x80))) {
1976 unicode = text_to_unicode(dword+j);
1977 j += textual_form_length - 1;
1980 unicode = iso_to_unicode_grid[k];
1983 if (DICT_CHAR_SIZE != 1 || (unicode >= 0 && unicode < 256)) {
1987 error("The dictionary cannot contain Unicode characters beyond Latin-1. \
1988 Define DICT_CHAR_SIZE=4 for a Unicode-compatible dictionary.");
1992 if (k >= (unsigned)'A' && k <= (unsigned)'Z')
1995 if (DICT_CHAR_SIZE == 1) {
1996 prepared_sort[i] = k;
1999 prepared_sort[4*i] = (k >> 24) & 0xFF;
2000 prepared_sort[4*i+1] = (k >> 16) & 0xFF;
2001 prepared_sort[4*i+2] = (k >> 8) & 0xFF;
2002 prepared_sort[4*i+3] = (k) & 0xFF;
2006 if (DICT_CHAR_SIZE == 1) {
2007 for (; i<DICT_WORD_SIZE; i++)
2008 prepared_sort[i] = 0;
2011 for (; i<DICT_WORD_SIZE; i++) {
2012 prepared_sort[4*i] = 0;
2013 prepared_sort[4*i+1] = 0;
2014 prepared_sort[4*i+2] = 0;
2015 prepared_sort[4*i+3] = 0;
2019 if (optresult) copy_sorts(optresult, prepared_sort);
2022 extern void dictionary_prepare(char *dword, uchar *optresult)
2025 dictionary_prepare_z(dword, optresult);
2027 dictionary_prepare_g(dword, optresult);
2030 /* ------------------------------------------------------------------------- */
2031 /* The arrays below are all concerned with the problem of alphabetically */
2032 /* sorting the dictionary during the compilation pass. */
2033 /* Note that it is not enough simply to apply qsort to the dictionary at */
2034 /* the end of the pass: we need to ensure that no duplicates are ever */
2037 /* dict_sort_codes[n] the sort code of record n: i.e., of the nth */
2038 /* word to be entered into the dictionary, where */
2039 /* n counts upward from 0 */
2040 /* (n is also called the "accession number") */
2042 /* The tree structure encodes an ordering. The special value VACANT means */
2043 /* "no node here": otherwise, node numbers are the same as accession */
2044 /* numbers. At all times, "root" holds the node number of the top of the */
2045 /* tree; each node has up to two branches, such that the subtree of the */
2046 /* left branch is always alphabetically before what's at the node, and */
2047 /* the subtree to the right is always after; and all branches are coloured */
2048 /* either "black" or "red". These colours are used to detect points where */
2049 /* the tree is growing asymmetrically (and therefore becoming inefficient */
2051 /* ------------------------------------------------------------------------- */
2058 typedef struct dict_tree_node_s
2059 { int branch[2]; /* Branch 0 is "left", 1 is "right" */
2060 char colour; /* The colour of the branch to the parent */
2063 static dict_tree_node *dtree; /* Allocated to dict_entries */
2064 static memory_list dtree_memlist;
2066 static uchar *dict_sort_codes; /* Allocated to dict_entries*DICT_WORD_BYTES */
2067 static memory_list dict_sort_codes_memlist;
2069 int *final_dict_order; /* Allocated at sort_dictionary() time */
2071 static void dictionary_begin_pass(void)
2073 /* Leave room for the 7-byte header (added in "tables.c" much later) */
2074 /* Glulx has a 4-byte header instead. */
2081 ensure_memory_list_available(&dictionary_memlist, dictionary_top);
2087 static int fdo_count;
2088 static void recursively_sort(int node)
2089 { if (dtree[node].branch[0] != VACANT)
2090 recursively_sort(dtree[node].branch[0]);
2091 final_dict_order[node] = fdo_count++;
2092 if (dtree[node].branch[1] != VACANT)
2093 recursively_sort(dtree[node].branch[1]);
2096 extern void sort_dictionary(void)
2099 final_dict_order = my_calloc(sizeof(int), dict_entries, "final dictionary ordering table");
2102 { for (i=0; i<dict_entries; i++)
2103 final_dict_order[i] = i;
2108 { fdo_count = 0; recursively_sort(root);
2112 /* ------------------------------------------------------------------------- */
2113 /* If "dword" is in the dictionary, return its accession number plus 1; */
2114 /* If not, return 0. */
2115 /* ------------------------------------------------------------------------- */
2117 static int dictionary_find(char *dword)
2120 dictionary_prepare(dword, NULL);
2122 while (at != VACANT)
2123 { n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
2124 if (n==0) return at + 1;
2125 if (n>0) at = dtree[at].branch[1]; else at = dtree[at].branch[0];
2130 /* ------------------------------------------------------------------------- */
2131 /* Add "dword" to the dictionary with (x,y,z) as its data fields; unless */
2132 /* it already exists, in which case OR the data with (x,y,z) */
2134 /* These fields are one byte each in Z-code, two bytes each in Glulx. */
2136 /* Returns: the accession number. */
2137 /* ------------------------------------------------------------------------- */
2139 extern int dictionary_add(char *dword, int x, int y, int z)
2141 int ggfr = 0, gfr = 0, fr = 0, r = 0;
2142 int ggf = VACANT, gf = VACANT, f = VACANT, at = root;
2144 int res=((version_number==3)?4:6);
2146 dictionary_prepare(dword, NULL);
2149 { root = 0; goto CreateEntry;
2153 n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
2157 p = dictionary+7 + at*DICT_ENTRY_BYTE_LENGTH + res;
2158 p[0]=(p[0])|x; p[1]=(p[1])|y;
2159 if (!ZCODE_LESS_DICT_DATA)
2161 if (x & 128) p[0] = (p[0])|number_and_case;
2164 p = dictionary+4 + at*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
2165 p[0]=(p[0])|(x/256); p[1]=(p[1])|(x%256);
2166 p[2]=(p[2])|(y/256); p[3]=(p[3])|(y%256);
2167 p[4]=(p[4])|(z/256); p[5]=(p[5])|(z%256);
2168 if (x & 128) p[1] = (p[1]) | number_and_case;
2172 if (n>0) r=1; else r=0;
2174 a = dtree[at].branch[0]; b = dtree[at].branch[1];
2175 if ((a != VACANT) && (dtree[a].colour == RED) &&
2176 (b != VACANT) && (dtree[b].colour == RED))
2177 { dtree[a].colour = BLACK;
2178 dtree[b].colour = BLACK;
2180 dtree[at].colour = RED;
2182 /* A tree rotation may be needed to avoid two red links in a row:
2184 ggf (or else gf is root) ggf (or f is root)
2187 / \(red) / \ (both red)
2193 In effect we rehang the "gf" subtree from "f".
2194 See the Technical Manual for further details.
2197 if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
2200 { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
2201 dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
2202 dtree[f].branch[1-fr] = gf;
2203 dtree[f].colour = BLACK;
2204 dtree[gf].colour = RED;
2205 gf = ggf; gfr = ggfr;
2208 { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
2209 dtree[at].colour = BLACK;
2210 dtree[gf].colour = RED;
2211 dtree[f].branch[fr] = dtree[at].branch[gfr];
2212 dtree[gf].branch[gfr] = dtree[at].branch[fr];
2213 dtree[at].branch[gfr] = f;
2214 dtree[at].branch[fr] = gf;
2216 r = 1-r; n = at; if (r==fr) at = f; else at = gf;
2217 f = n; gf = ggf; fr = 1-r; gfr = ggfr;
2222 if (dtree[at].branch[r] == VACANT)
2223 { dtree[at].colour = RED;
2225 if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
2227 { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
2228 dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
2229 dtree[f].branch[1-fr] = gf;
2230 dtree[f].colour = BLACK;
2231 dtree[gf].colour = RED;
2234 { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
2235 dtree[at].colour = BLACK;
2236 dtree[gf].colour = RED;
2237 dtree[f].branch[fr] = dtree[at].branch[gfr];
2238 dtree[gf].branch[gfr] = dtree[at].branch[fr];
2239 dtree[at].branch[gfr] = f;
2240 dtree[at].branch[fr] = gf;
2242 r = 1-r; n = at; if (r==fr) at = f; else at = gf;
2246 dtree[at].branch[r] = dict_entries;
2249 ggf = gf; gf = f; f = at; at = dtree[at].branch[r];
2250 ggfr = gfr; gfr = fr; fr = r;
2255 ensure_memory_list_available(&dtree_memlist, dict_entries+1);
2256 ensure_memory_list_available(&dict_sort_codes_memlist, (dict_entries+1)*DICT_WORD_BYTES);
2258 dtree[dict_entries].branch[0] = VACANT;
2259 dtree[dict_entries].branch[1] = VACANT;
2260 dtree[dict_entries].colour = BLACK;
2262 /* Address in Inform's own dictionary table to write the record to */
2266 ensure_memory_list_available(&dictionary_memlist, dictionary_top + DICT_ENTRY_BYTE_LENGTH);
2267 p = dictionary + DICT_ENTRY_BYTE_LENGTH*dict_entries + 7;
2269 /* So copy in the 4 (or 6) bytes of Z-coded text and the 3 data
2272 p[0]=prepared_sort[0]; p[1]=prepared_sort[1];
2273 p[2]=prepared_sort[2]; p[3]=prepared_sort[3];
2274 if (version_number > 3)
2275 { p[4]=prepared_sort[4]; p[5]=prepared_sort[5]; }
2276 p[res]=x; p[res+1]=y;
2277 if (!ZCODE_LESS_DICT_DATA) p[res+2]=z;
2278 if (x & 128) p[res] = (p[res])|number_and_case;
2280 dictionary_top += DICT_ENTRY_BYTE_LENGTH;
2285 ensure_memory_list_available(&dictionary_memlist, dictionary_top + DICT_ENTRY_BYTE_LENGTH);
2286 p = dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*dict_entries;
2287 p[0] = 0x60; /* type byte -- dict word */
2289 p += DICT_CHAR_SIZE;
2290 for (i=0; i<DICT_WORD_BYTES; i++)
2291 p[i] = prepared_sort[i];
2293 p += DICT_WORD_BYTES;
2295 p[2] = y/256; p[3] = y%256;
2298 p[1] |= number_and_case;
2300 dictionary_top += DICT_ENTRY_BYTE_LENGTH;
2304 copy_sorts(dict_sort_codes+dict_entries*DICT_WORD_BYTES, prepared_sort);
2306 return dict_entries++;
2309 /* ------------------------------------------------------------------------- */
2310 /* Used in "tables.c" for "Extend ... only", to renumber a verb-word to a */
2311 /* new verb syntax of its own. (Otherwise existing verb-words never */
2312 /* change their verb-numbers.) */
2313 /* ------------------------------------------------------------------------- */
2315 extern void dictionary_set_verb_number(char *dword, int to)
2317 int res=((version_number==3)?4:6);
2318 i=dictionary_find(dword);
2322 p=dictionary+7+(i-1)*DICT_ENTRY_BYTE_LENGTH+res;
2326 p=dictionary+4 + (i-1)*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
2327 p[2]=to/256; p[3]=to%256;
2332 /* ------------------------------------------------------------------------- */
2333 /* Tracing code for the dictionary: used not only by "trace" and text */
2334 /* transcription, but also (in the case of "word_to_ascii") in a vital */
2335 /* by the linker. */
2336 /* ------------------------------------------------------------------------- */
2338 /* In the dictionary-showing code, if d_show_buf is NULL, the text is
2339 printed directly. (The "Trace dictionary" directive does this.)
2340 If d_show_buf is not NULL, we add words to it (reallocing if necessary)
2341 until it's a page-width.
2343 static char *d_show_buf = NULL;
2344 static int d_show_size; /* allocated size */
2345 static int d_show_len; /* current length */
2347 static void show_char(char c)
2349 if (d_show_buf == NULL) {
2353 if (d_show_len+2 >= d_show_size) {
2354 int newsize = 2 * d_show_len + 16;
2355 my_realloc(&d_show_buf, d_show_size, newsize, "dictionary display buffer");
2356 d_show_size = newsize;
2358 d_show_buf[d_show_len++] = c;
2359 d_show_buf[d_show_len] = '\0';
2363 /* Display a Unicode character in user-readable form. This uses the same
2364 character encoding as the source code. */
2365 static void show_uchar(uint32 c)
2371 /* ASCII always works */
2375 if (character_set_unicode) {
2376 /* UTF-8 the character */
2380 else if (c < 0x800) {
2381 show_char((0xC0 | ((c & 0x7C0) >> 6)));
2382 show_char((0x80 | (c & 0x03F) ));
2384 else if (c < 0x10000) {
2385 show_char((0xE0 | ((c & 0xF000) >> 12)));
2386 show_char((0x80 | ((c & 0x0FC0) >> 6)));
2387 show_char((0x80 | (c & 0x003F) ));
2389 else if (c < 0x200000) {
2390 show_char((0xF0 | ((c & 0x1C0000) >> 18)));
2391 show_char((0x80 | ((c & 0x03F000) >> 12)));
2392 show_char((0x80 | ((c & 0x000FC0) >> 6)));
2393 show_char((0x80 | (c & 0x00003F) ));
2400 if (character_set_setting == 1 && c < 0x100) {
2401 /* Fits in Latin-1 */
2405 /* Supporting other character_set_setting is harder; not currently implemented. */
2407 /* Use the escaped form */
2408 sprintf(buf, "@{%x}", c);
2409 for (ix=0; buf[ix]; ix++)
2413 extern void word_to_ascii(uchar *p, char *results)
2414 { int i, shift, cc, zchar; uchar encoded_word[9];
2415 encoded_word[0] = (((int) p[0])&0x7c)/4;
2416 encoded_word[1] = 8*(((int) p[0])&0x3) + (((int) p[1])&0xe0)/32;
2417 encoded_word[2] = ((int) p[1])&0x1f;
2418 encoded_word[3] = (((int) p[2])&0x7c)/4;
2419 encoded_word[4] = 8*(((int) p[2])&0x3) + (((int) p[3])&0xe0)/32;
2420 encoded_word[5] = ((int) p[3])&0x1f;
2421 if (version_number > 3)
2422 { encoded_word[6] = (((int) p[4])&0x7c)/4;
2423 encoded_word[7] = 8*(((int) p[4])&0x3) + (((int) p[5])&0xe0)/32;
2424 encoded_word[8] = ((int) p[5])&0x1f;
2428 encoded_word[6] = encoded_word[7] = encoded_word[8] = 0;
2432 for (i=0; i< ((version_number==3)?6:9); i++)
2433 { zchar = encoded_word[i];
2435 if (zchar == 4) shift = 1;
2437 if (zchar == 5) shift = 2;
2439 { if ((shift == 2) && (zchar == 6))
2440 { zchar = 32*encoded_word[i+1] + encoded_word[i+2];
2442 if ((zchar>=32) && (zchar<=126))
2443 results[cc++] = zchar;
2445 { zscii_to_text(results+cc, zchar);
2446 cc = strlen(results);
2450 { zscii_to_text(results+cc, (alphabet[shift])[zchar-6]);
2451 cc = strlen(results);
2459 /* Print a dictionary word to stdout.
2460 (This assumes that d_show_buf is null.)
2462 void print_dict_word(int node)
2468 char textual_form[32];
2469 p = (uchar *)dictionary + 7 + DICT_ENTRY_BYTE_LENGTH*node;
2471 word_to_ascii(p, textual_form);
2473 for (cprinted = 0; textual_form[cprinted]!=0; cprinted++)
2474 show_char(textual_form[cprinted]);
2477 p = (uchar *)dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*node;
2479 for (cprinted = 0; cprinted<DICT_WORD_SIZE; cprinted++)
2482 if (DICT_CHAR_SIZE == 1)
2485 ch = (p[4*cprinted+4] << 24) + (p[4*cprinted+5] << 16) + (p[4*cprinted+6] << 8) + (p[4*cprinted+7]);
2493 static void recursively_show_z(int node, int level)
2494 { int i, cprinted, flags; uchar *p;
2495 char textual_form[32];
2496 int res = (version_number == 3)?4:6; /* byte length of encoded text */
2498 if (dtree[node].branch[0] != VACANT)
2499 recursively_show_z(dtree[node].branch[0], level);
2501 p = (uchar *)dictionary + 7 + DICT_ENTRY_BYTE_LENGTH*node;
2503 word_to_ascii(p, textual_form);
2505 for (cprinted = 0; textual_form[cprinted]!=0; cprinted++)
2506 show_char(textual_form[cprinted]);
2507 for (; cprinted < 4 + ((version_number==3)?6:9); cprinted++)
2510 /* The level-1 info can only be printfed (d_show_buf must be null). */
2511 if (d_show_buf == NULL && level >= 1)
2514 for (i=0; i<DICT_ENTRY_BYTE_LENGTH; i++) printf("%02x ",p[i]);
2517 flags = (int) p[res];
2520 if (flags & 4) printf("p"); else printf(" ");
2525 { if (grammar_version_number == 1)
2526 printf("preposition:%d ", (int) p[res+2]);
2528 printf("preposition ");
2530 if ((flags & 3) == 3) printf("metaverb:%d ", (int) p[res+1]);
2531 else if ((flags & 3) == 1) printf("verb:%d ", (int) p[res+1]);
2535 /* Show five words per line in classic TRANSCRIPT_FORMAT; one per line in the new format. */
2536 if (d_show_buf && (d_show_len >= 64 || TRANSCRIPT_FORMAT == 1))
2538 write_to_transcript_file(d_show_buf, STRCTX_DICT);
2542 if (dtree[node].branch[1] != VACANT)
2543 recursively_show_z(dtree[node].branch[1], level);
2546 static void recursively_show_g(int node, int level)
2550 if (dtree[node].branch[0] != VACANT)
2551 recursively_show_g(dtree[node].branch[0], level);
2553 p = (uchar *)dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*node;
2555 for (cprinted = 0; cprinted<DICT_WORD_SIZE; cprinted++)
2558 if (DICT_CHAR_SIZE == 1)
2561 ch = (p[4*cprinted+4] << 24) + (p[4*cprinted+5] << 16) + (p[4*cprinted+6] << 8) + (p[4*cprinted+7]);
2566 for (; cprinted<DICT_WORD_SIZE+4; cprinted++)
2569 /* The level-1 info can only be printfed (d_show_buf must be null). */
2570 if (d_show_buf == NULL && level >= 1)
2571 { int flagpos = (DICT_CHAR_SIZE == 1) ? (DICT_WORD_SIZE+1) : (DICT_WORD_BYTES+4);
2572 int flags = (p[flagpos+0] << 8) | (p[flagpos+1]);
2573 int verbnum = (p[flagpos+2] << 8) | (p[flagpos+3]);
2575 for (i=0; i<DICT_ENTRY_BYTE_LENGTH; i++) printf("%02x ",p[i]);
2579 if (flags & 4) printf("p"); else printf(" ");
2584 { printf("preposition ");
2586 if ((flags & 3) == 3) printf("metaverb:%d ", verbnum);
2587 else if ((flags & 3) == 1) printf("verb:%d ", verbnum);
2591 /* Show five words per line in classic TRANSCRIPT_FORMAT; one per line in the new format. */
2592 if (d_show_buf && (d_show_len >= 64 || TRANSCRIPT_FORMAT == 1))
2594 write_to_transcript_file(d_show_buf, STRCTX_DICT);
2598 if (dtree[node].branch[1] != VACANT)
2599 recursively_show_g(dtree[node].branch[1], level);
2602 static void show_alphabet(int i)
2603 { int j, c; char chartext[8];
2605 for (j=0; j<26; j++)
2606 { c = alphabet[i][j];
2608 if (alphabet_used[26*i+j] == 'N') printf("("); else printf(" ");
2610 zscii_to_text(chartext, c);
2611 printf("%s", chartext);
2613 if (alphabet_used[26*i+j] == 'N') printf(")"); else printf(" ");
2618 extern void show_dictionary(int level)
2620 /* Level 0: show words only. Level 1: show words and flags.
2621 Level 2: also show bytes.*/
2622 printf("Dictionary contains %d entries:\n",dict_entries);
2623 if (dict_entries != 0)
2624 { d_show_len = 0; d_show_buf = NULL;
2626 recursively_show_z(root, level);
2628 recursively_show_g(root, level);
2632 printf("\nZ-machine alphabet entries:\n");
2639 extern void write_dictionary_to_transcript(void)
2641 d_show_size = 80; /* initial size */
2642 d_show_buf = my_malloc(d_show_size, "dictionary display buffer");
2644 write_to_transcript_file("", STRCTX_INFO);
2645 sprintf(d_show_buf, "[Dictionary contains %d entries:]", dict_entries);
2646 write_to_transcript_file(d_show_buf, STRCTX_INFO);
2650 if (dict_entries != 0)
2653 recursively_show_z(root, 0);
2655 recursively_show_g(root, 0);
2657 if (d_show_len != 0) write_to_transcript_file(d_show_buf, STRCTX_DICT);
2659 my_free(&d_show_buf, "dictionary display buffer");
2660 d_show_len = 0; d_show_buf = NULL;
2663 /* ========================================================================= */
2664 /* Data structure management routines */
2665 /* ------------------------------------------------------------------------- */
2667 extern void init_text_vars(void)
2680 for (j=0; j<256; j++) abbrevs_lookup[j] = -1;
2682 total_zchars_trans = 0;
2687 final_dict_order = NULL;
2688 dict_sort_codes = NULL;
2691 static_strings_area = NULL;
2692 abbreviations_optimal_parse_schedule = NULL;
2693 abbreviations_optimal_parse_scores = NULL;
2695 compressed_offsets = NULL;
2696 huff_entities = NULL;
2698 unicode_usage_entries = NULL;
2701 extern void text_begin_pass(void)
2702 { abbrevs_lookup_table_made = FALSE;
2704 total_chars_trans=0; total_bytes_trans=0;
2706 dictionary_begin_pass();
2707 low_strings_top = 0;
2709 static_strings_extent = 0;
2711 no_dynamic_strings = 0;
2712 no_unicode_chars = 0;
2715 /* Note: for allocation and deallocation of all_the_text, see inform.c */
2717 extern void text_allocate_arrays(void)
2721 initialise_memory_list(&translated_text_memlist,
2722 sizeof(uchar), 8000, (void**)&translated_text,
2723 "translated text holding area");
2725 initialise_memory_list(&all_text_memlist,
2726 sizeof(char), 0, (void**)&all_text,
2727 "transcription text for optimise");
2729 initialise_memory_list(&static_strings_area_memlist,
2730 sizeof(uchar), 128, (void**)&static_strings_area,
2731 "static strings area");
2733 initialise_memory_list(&abbreviations_at_memlist,
2734 MAX_ABBREV_LENGTH, 64, (void**)&abbreviations_at,
2735 "abbreviation text");
2737 initialise_memory_list(&abbreviations_memlist,
2738 sizeof(abbreviation), 64, (void**)&abbreviations,
2741 initialise_memory_list(&abbreviations_optimal_parse_schedule_memlist,
2742 sizeof(int), 0, (void**)&abbreviations_optimal_parse_schedule,
2743 "abbreviations optimal parse schedule");
2744 initialise_memory_list(&abbreviations_optimal_parse_scores_memlist,
2745 sizeof(int), 0, (void**)&abbreviations_optimal_parse_scores,
2746 "abbreviations optimal parse scores");
2748 initialise_memory_list(&dtree_memlist,
2749 sizeof(dict_tree_node), 1500, (void**)&dtree,
2750 "red-black tree for dictionary");
2751 initialise_memory_list(&dict_sort_codes_memlist,
2752 sizeof(uchar), 1500*DICT_WORD_BYTES, (void**)&dict_sort_codes,
2753 "dictionary sort codes");
2755 final_dict_order = NULL; /* will be allocated at sort_dictionary() time */
2757 /* The exact size will be 7+7*num for z3, 7+9*num for z4+,
2758 4+DICT_ENTRY_BYTE_LENGTH*num for Glulx. But this is just an initial
2759 allocation; we don't have to be precise. */
2760 initialise_memory_list(&dictionary_memlist,
2761 sizeof(uchar), 1000*DICT_ENTRY_BYTE_LENGTH, (void**)&dictionary,
2764 initialise_memory_list(&low_strings_memlist,
2765 sizeof(uchar), 1024, (void**)&low_strings,
2766 "low (abbreviation) strings");
2772 huff_entities = NULL;
2774 unicode_usage_entries = NULL;
2775 done_compression = FALSE;
2776 compression_table_size = 0;
2777 compressed_offsets = NULL;
2779 initialise_memory_list(&unicode_usage_entries_memlist,
2780 sizeof(unicode_usage_t), 0, (void**)&unicode_usage_entries,
2781 "unicode entity entries");
2783 /* hufflist and huff_entities will be allocated at compress_game_text() time. */
2785 /* This hash table is only used in Glulx */
2786 for (ix=0; ix<UNICODE_HASH_BUCKETS; ix++)
2787 unicode_usage_hash[ix] = -1;
2789 initialise_memory_list(&compressed_offsets_memlist,
2790 sizeof(int32), 0, (void**)&compressed_offsets,
2791 "static strings index table");
2794 extern void extract_all_text()
2796 /* optimise_abbreviations() is called after free_arrays(). Therefore,
2797 we need to preserve the text transcript where it will not be
2798 freed up. We do this by copying the pointer to opttext. */
2800 opttextlen = all_text_top;
2802 /* Re-init all_text_memlist. This causes it to forget all about the
2803 old pointer. Deallocating it in text_free_arrays() will be a no-op. */
2804 initialise_memory_list(&all_text_memlist,
2805 sizeof(char), 0, (void**)&all_text,
2806 "dummy transcription text");
2809 extern void text_free_arrays(void)
2811 deallocate_memory_list(&translated_text_memlist);
2813 deallocate_memory_list(&all_text_memlist);
2815 deallocate_memory_list(&low_strings_memlist);
2816 deallocate_memory_list(&abbreviations_at_memlist);
2817 deallocate_memory_list(&abbreviations_memlist);
2819 deallocate_memory_list(&abbreviations_optimal_parse_schedule_memlist);
2820 deallocate_memory_list(&abbreviations_optimal_parse_scores_memlist);
2822 deallocate_memory_list(&dtree_memlist);
2823 deallocate_memory_list(&dict_sort_codes_memlist);
2824 my_free(&final_dict_order, "final dictionary ordering table");
2826 deallocate_memory_list(&dictionary_memlist);
2828 deallocate_memory_list(&compressed_offsets_memlist);
2829 my_free(&hufflist, "huffman node list");
2830 my_free(&huff_entities, "huffman entities");
2832 deallocate_memory_list(&unicode_usage_entries_memlist);
2834 deallocate_memory_list(&static_strings_area_memlist);
2837 extern void ao_free_arrays(void)
2839 /* Called only after optimise_abbreviations() runs. */
2841 my_free (&opttext,"stashed transcript for optimisation");
2842 my_free (&bestyet,"bestyet");
2843 my_free (&bestyet2,"bestyet2");
2844 my_free (&grandtable,"grandtable");
2845 my_free (&grandflags,"grandflags");
2847 deallocate_memory_list(&tlbtab_memlist);
2849 /* This was re-inited, so we should re-deallocate it. */
2850 deallocate_memory_list(&all_text_memlist);
2853 /* ========================================================================= */