1 /* ------------------------------------------------------------------------- */
2 /* "text" : Text translation, the abbreviations optimiser, the dictionary */
4 /* Part of Inform 6.42 */
5 /* copyright (c) Graham Nelson 1993 - 2024 */
7 /* Inform is free software: you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation, either version 3 of the License, or */
10 /* (at your option) any later version. */
12 /* Inform is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with Inform. If not, see https://gnu.org/licenses/ */
20 /* ------------------------------------------------------------------------- */
24 uchar *low_strings; /* Allocated to low_strings_top */
25 int32 low_strings_top;
26 static memory_list low_strings_memlist;
28 int32 static_strings_extent; /* Number of bytes of static strings
30 uchar *static_strings_area; /* Used to hold the static strings
32 Allocated to static_strings_extent */
33 memory_list static_strings_area_memlist;
35 static char *all_text; /* Text buffer holding the entire text
36 of the game, when it is being
38 (Allocated to all_text_top) */
39 static memory_list all_text_memlist;
40 static int32 all_text_top;
42 int abbrevs_lookup_table_made, /* The abbreviations lookup table is
43 constructed when the first non-
44 abbreviation string is translated:
45 this flag is TRUE after that */
46 abbrevs_lookup[256]; /* Once this has been constructed,
47 abbrevs_lookup[n] = the smallest
48 number of any abbreviation beginning
49 with ASCII character n, or -1
50 if none of the abbreviations do */
51 int no_abbreviations; /* No of abbreviations defined so far */
52 /* ------------------------------------------------------------------------- */
53 /* Glulx string compression storage */
54 /* ------------------------------------------------------------------------- */
56 int no_strings; /* No of strings in static strings
58 int no_dynamic_strings; /* No. of @.. string escapes used
59 (actually, the highest value used
61 int no_unicode_chars; /* Number of distinct Unicode chars
62 used. (Beyond 0xFF.) */
64 huffentity_t *huff_entities; /* The list of entities (characters,
65 abbreviations, @.. escapes, and
67 static huffentity_t **hufflist; /* Copy of the list, for sorting */
69 int no_huff_entities; /* The number of entities in the list */
70 int huff_unicode_start; /* Position in the list where Unicode
72 int huff_abbrev_start; /* Position in the list where string
73 abbreviations begin. */
74 int huff_dynam_start; /* Position in the list where @..
76 int huff_entity_root; /* The position in the list of the root
77 entry (when considering the table
80 int done_compression; /* Has the game text been compressed? */
81 int32 compression_table_size; /* Length of the Huffman table, in
83 int32 compression_string_size; /* Length of the compressed string
85 int32 *compressed_offsets; /* The beginning of every string in
86 the game, relative to the beginning
87 of the Huffman table. (So entry 0
88 is equal to compression_table_size).
89 Allocated to no_strings at
90 compress_game_text() time. */
91 static memory_list compressed_offsets_memlist;
93 unicode_usage_t *unicode_usage_entries; /* Allocated to no_unicode_chars */
94 static memory_list unicode_usage_entries_memlist;
96 #define UNICODE_HASH_BUCKETS (64)
97 static int unicode_usage_hash[UNICODE_HASH_BUCKETS];
99 static int unicode_entity_index(int32 unicode);
101 /* ------------------------------------------------------------------------- */
102 /* Abbreviation arrays */
103 /* ------------------------------------------------------------------------- */
105 abbreviation *abbreviations; /* Allocated up to no_abbreviations */
106 static memory_list abbreviations_memlist;
108 /* Memory to hold the text of any abbreviation strings declared. */
109 static int32 abbreviations_totaltext;
110 static char *abbreviations_text; /* Allocated up to abbreviations_totaltext */
111 static memory_list abbreviations_text_memlist;
113 static int *abbreviations_optimal_parse_schedule;
114 static memory_list abbreviations_optimal_parse_schedule_memlist;
116 static int *abbreviations_optimal_parse_scores;
117 static memory_list abbreviations_optimal_parse_scores_memlist;
119 /* ------------------------------------------------------------------------- */
121 int32 total_chars_trans, /* Number of ASCII chars of text in */
122 total_bytes_trans, /* Number of bytes of Z-code text out */
123 zchars_trans_in_last_string; /* Number of Z-chars in last string:
124 needed only for abbrev efficiency
125 calculation in "directs.c" */
126 static int32 total_zchars_trans; /* Number of Z-chars of text out
127 (only used to calculate the above) */
129 static int zchars_out_buffer[3], /* During text translation, a buffer of
130 3 Z-chars at a time: when it's full
131 these are written as a 2-byte word */
132 zob_index; /* Index (0 to 2) into it */
134 uchar *translated_text; /* Area holding translated strings
135 until they are moved into the
136 static_strings_area below */
137 static memory_list translated_text_memlist;
139 static char *temp_symbol; /* Temporary symbol name used while
140 processing "@(...)". */
141 static memory_list temp_symbol_memlist;
144 static int32 text_out_pos; /* The "program counter" during text
145 translation: the next position to
146 write Z-coded text output to */
148 static int32 text_out_limit; /* The upper limit of text_out_pos
149 during text translation (or -1
152 static int text_out_overflow; /* During text translation, becomes
153 true if text_out_pos tries to pass
156 /* ------------------------------------------------------------------------- */
157 /* For variables/arrays used by the dictionary manager, see below */
158 /* ------------------------------------------------------------------------- */
160 /* ------------------------------------------------------------------------- */
161 /* Prepare the abbreviations lookup table (used to speed up abbreviation */
162 /* detection in text translation). We first bubble-sort the abbrevs into */
163 /* alphabetical order (this is necessary for the detection algorithm to */
164 /* to work). Since the table is only prepared once, and for a table */
165 /* of size at most 96, there's no point using an efficient sort algorithm. */
166 /* ------------------------------------------------------------------------- */
168 static void make_abbrevs_lookup(void)
169 { int bubble_sort, j, k;
172 { bubble_sort = FALSE;
173 for (j=0; j<no_abbreviations; j++)
174 for (k=j+1; k<no_abbreviations; k++)
175 { p1=abbreviation_text(j);
176 p2=abbreviation_text(k);
179 abbreviation temp = abbreviations[j];
180 abbreviations[j] = abbreviations[k];
181 abbreviations[k] = temp;
185 } while (bubble_sort);
187 for (j=no_abbreviations-1; j>=0; j--)
188 { p1=abbreviation_text(j);
189 abbrevs_lookup[(uchar)p1[0]]=j;
190 abbreviations[j].freq=0;
192 abbrevs_lookup_table_made = TRUE;
195 /* ------------------------------------------------------------------------- */
196 /* Search the abbreviations lookup table (a routine which must be fast). */
197 /* The source text to compare is text[i], text[i+1], ... and this routine */
198 /* is only called if text[i] is indeed the first character of at least one */
199 /* abbreviation, "from" begin the least index into the abbreviations table */
200 /* of an abbreviation for which text[i] is the first character. Recall */
201 /* that the abbrevs table is in alphabetical order. */
203 /* The return value is -1 if there is no match. If there is a match, the */
204 /* text to be abbreviated out is over-written by a string of null chars */
205 /* with "ASCII" value 1, and the abbreviation number is returned. */
207 /* In Glulx, we *do not* do this overwriting with 1's. */
208 /* ------------------------------------------------------------------------- */
210 static int try_abbreviations_from(unsigned char *text, int i, int from)
211 { int j, k; uchar *p, c;
217 p=(uchar *)abbreviations_text+abbreviations[j].textpos;
218 if (c != p[0]) break;
220 { for (k=2; p[k]!=0; k++)
221 if (text[i+k]!=p[k]) goto NotMatched;
223 for (k=0; p[k]!=0; k++) text[i+k]=1;
225 abbreviations[j].freq++;
233 /* Create an abbreviation. */
234 extern void make_abbreviation(char *text)
239 /* If -e mode is off, we won't waste space creating an abbreviation entry. */
244 pos = abbreviations_totaltext;
246 ensure_memory_list_available(&abbreviations_memlist, no_abbreviations+1);
247 ensure_memory_list_available(&abbreviations_text_memlist, pos+alen+1);
249 strcpy(abbreviations_text+pos, text);
250 abbreviations_totaltext += (alen+1);
252 abbreviations[no_abbreviations].textpos = pos;
253 abbreviations[no_abbreviations].textlen = alen;
254 abbreviations[no_abbreviations].value = compile_string(text, STRCTX_ABBREV);
255 abbreviations[no_abbreviations].freq = 0;
257 /* The quality is the number of Z-chars saved by using this */
258 /* abbreviation: note that it takes 2 Z-chars to print it. */
260 abbreviations[no_abbreviations].quality = zchars_trans_in_last_string - 2;
262 if (abbreviations[no_abbreviations].quality <= 0) {
263 warning_named("Abbreviation does not save any characters:", text);
269 /* Return a pointer to the (uncompressed) abbreviation text.
270 This should be treated as temporary; it is only valid until the next
271 make_abbreviation() call. */
272 extern char *abbreviation_text(int num)
274 if (num < 0 || num >= no_abbreviations) {
275 compiler_error("Invalid abbrev for abbreviation_text()");
279 return abbreviations_text + abbreviations[num].textpos;
282 /* ------------------------------------------------------------------------- */
283 /* The front end routine for text translation. */
284 /* strctx indicates the purpose of the string. This is mostly used for */
285 /* informational output (gametext.txt), but we treat some string contexts */
286 /* specially during compilation. */
287 /* ------------------------------------------------------------------------- */
289 /* TODO: When called from a print statement (parse_print()), it would be
290 nice to detect if the generated string is exactly one character. In that
291 case, we could return the character value and a flag to indicate the
292 caller could use @print_char/@streamchar/@new_line/@streamunichar
293 instead of printing a compiled string.
295 We'd need a new STRCTX value or two to distinguish direct-printed strings
296 from referenceable strings.
298 Currently, parse_print() checks for the "^" case manually, which is a
301 extern int32 compile_string(char *b, int strctx)
306 if (execution_never_reaches_here) {
307 /* No need to put strings into gametext.txt or the static/low
309 if (strctx == STRCTX_GAME || strctx == STRCTX_GAMEOPC || strctx == STRCTX_LOWSTRING || strctx == STRCTX_INFIX) {
310 /* VENEER and VENEEROPC are only used at the translate_text level,
311 so we don't have to catch them here. */
316 /* In Z-code, abbreviations go in the low memory pool (0x100). So
317 do strings explicitly defined with the Lowstring directive.
318 (In Glulx, the in_low_memory flag is ignored.) */
319 in_low_memory = (strctx == STRCTX_ABBREV || strctx == STRCTX_LOWSTRING);
321 if (!glulx_mode && in_low_memory)
323 k = translate_text(-1, b, strctx);
325 error("text translation failed");
328 ensure_memory_list_available(&low_strings_memlist, low_strings_top+k);
329 memcpy(low_strings+low_strings_top, translated_text, k);
331 low_strings_top += k;
335 if (glulx_mode && done_compression)
336 compiler_error("Tried to add a string after compression was done.");
338 i = translate_text(-1, b, strctx);
340 error("text translation failed");
344 /* Insert null bytes as needed to ensure that the next static string */
345 /* also occurs at an address expressible as a packed address */
349 if (oddeven_packing_switch)
350 textalign = scale_factor*2;
352 textalign = scale_factor;
353 while ((i%textalign)!=0)
355 ensure_memory_list_available(&translated_text_memlist, i+2);
356 translated_text[i++] = 0;
357 translated_text[i++] = 0;
361 j = static_strings_extent;
363 ensure_memory_list_available(&static_strings_area_memlist, static_strings_extent+i);
364 for (c=translated_text; c<translated_text+i;
365 c++, static_strings_extent++)
366 static_strings_area[static_strings_extent] = *c;
369 return(j/scale_factor);
372 /* The marker value is a one-based string number. (We reserve zero
373 to mean "not a string at all". */
374 return (++no_strings);
378 /* ------------------------------------------------------------------------- */
379 /* Output a single Z-character into the buffer, and flush it if full */
380 /* ------------------------------------------------------------------------- */
382 static void write_z_char_z(int i)
385 total_zchars_trans++;
386 zchars_out_buffer[zob_index++]=(i%32);
387 if (zob_index!=3) return;
389 j= zchars_out_buffer[0]*0x0400 + zchars_out_buffer[1]*0x0020
390 + zchars_out_buffer[2];
392 if (text_out_limit >= 0) {
393 if (text_out_pos+2 > text_out_limit) {
394 text_out_overflow = TRUE;
399 ensure_memory_list_available(&translated_text_memlist, text_out_pos+2);
402 translated_text[text_out_pos++] = j/256; translated_text[text_out_pos++] = j%256;
403 total_bytes_trans+=2;
406 static void write_zscii(int zsc)
408 int lookup_value, in_alphabet;
415 if (zsc < 0x100) lookup_value = zscii_to_alphabet_grid[zsc];
417 else lookup_value = -1;
419 if (lookup_value >= 0)
420 { alphabet_used[lookup_value] = 'Y';
421 in_alphabet = lookup_value/26;
422 if (in_alphabet==1) write_z_char_z(4); /* SHIFT to A1 */
423 if (in_alphabet==2) write_z_char_z(5); /* SHIFT to A2 */
424 write_z_char_z(lookup_value%26 + 6);
427 { write_z_char_z(5); write_z_char_z(6);
428 write_z_char_z(zsc/32); write_z_char_z(zsc%32);
432 /* ------------------------------------------------------------------------- */
433 /* Finish a Z-coded string, padding out with Z-char 5s if necessary and */
434 /* setting the "end" bit on the final 2-byte word */
435 /* ------------------------------------------------------------------------- */
437 static void end_z_chars(void)
439 zchars_trans_in_last_string=total_zchars_trans-zchars_trans_in_last_string;
440 while (zob_index!=0) write_z_char_z(5);
441 if (text_out_pos < 2) {
442 /* Something went wrong. */
443 text_out_overflow = TRUE;
446 translated_text[text_out_pos-2] += 128;
449 /* Glulx handles this much more simply -- compression is done elsewhere. */
450 static void write_z_char_g(int i)
453 if (text_out_limit >= 0) {
454 if (text_out_pos+1 > text_out_limit) {
455 text_out_overflow = TRUE;
460 ensure_memory_list_available(&translated_text_memlist, text_out_pos+1);
462 total_zchars_trans++;
463 translated_text[text_out_pos++] = i;
467 /* Helper routine to compute the weight, in units, of a character handled by the Z-Machine */
468 static int zchar_weight(int c)
471 if (c == ' ') return 1;
472 lookup = iso_to_alphabet_grid[c];
473 if (lookup < 0) return 4;
474 if (lookup < 26) return 1;
478 /* ------------------------------------------------------------------------- */
479 /* The main routine "text.c" provides to the rest of Inform: the text */
480 /* translator. s_text is the source text and the return value is the */
481 /* number of bytes translated. */
482 /* The translated text will be stored in translated_text. */
484 /* If p_limit is >= 0, the text length will not exceed that many bytes. */
485 /* If the translation tries to overflow this boundary, the return value */
486 /* will be -1. (You should display an error and not read translated_text.) */
488 /* If p_limit is negative, any amount of text is accepted (up to int32 */
491 /* Note that the source text may be corrupted by this routine. */
492 /* ------------------------------------------------------------------------- */
494 extern int32 translate_text(int32 p_limit, char *s_text, int strctx)
495 { int i, j, k, in_alphabet, lookup_value, is_abbreviation;
496 int32 unicode; int zscii;
497 unsigned char *text_in;
500 ensure_memory_list_available(&translated_text_memlist, p_limit);
503 /* For STRCTX_ABBREV, the string being translated is itself an
504 abbreviation string, so it can't make use of abbreviations. Set
505 the is_abbreviation flag to indicate this.
506 The compiler has historically set this flag for the Lowstring
507 directive as well -- the in_low_memory and is_abbreviation flag were
508 always the same. I am preserving that convention. */
509 is_abbreviation = (strctx == STRCTX_ABBREV || strctx == STRCTX_LOWSTRING);
512 /* Cast the input and output streams to unsigned char: text_out_pos will
513 advance as bytes of Z-coded text are written, but text_in doesn't */
515 text_in = (unsigned char *) s_text;
517 text_out_limit = p_limit;
518 text_out_overflow = FALSE;
520 /* Remember the Z-chars total so that later we can subtract to find the
521 number of Z-chars translated on this string */
523 zchars_trans_in_last_string = total_zchars_trans;
525 /* Start with the Z-characters output buffer empty */
529 /* If this is the first text translated since the abbreviations were
530 declared, and if some were declared, then it's time to make the
531 lookup table for abbreviations
533 (Except: we don't if the text being translated is itself
534 the text of an abbreviation currently being defined) */
536 if ((!abbrevs_lookup_table_made) && (no_abbreviations > 0)
537 && (!is_abbreviation))
538 make_abbrevs_lookup();
540 /* If we're storing the whole game text to memory, then add this text.
541 We will put two newlines between each text and four at the very end.
542 (The optimise code does a lot of sloppy text[i+2], so the extra
543 two newlines past all_text_top are necessary.) */
545 if ((!is_abbreviation) && (store_the_text))
546 { int addlen = strlen(s_text);
547 ensure_memory_list_available(&all_text_memlist, all_text_top+addlen+5);
548 sprintf(all_text+all_text_top, "%s\n\n\n\n", s_text);
549 /* Advance past two newlines. */
550 all_text_top += (addlen+2);
553 if (transcript_switch) {
554 /* Omit veneer strings, unless we're using the new transcript format, which includes everything. */
555 if ((!veneer_mode) || TRANSCRIPT_FORMAT == 1) {
558 if (label == STRCTX_GAME)
559 label = STRCTX_VENEER;
560 else if (label == STRCTX_GAMEOPC)
561 label = STRCTX_VENEEROPC;
563 write_to_transcript_file(s_text, label);
567 /* Computing the optimal way to parse strings to insert abbreviations with dynamic programming */
568 /* (ref: R.A. Wagner , "Common phrases and minimum-space text storage", Commun. ACM, 16 (3) (1973)) */
569 /* We compute this optimal way here; it's stored in abbreviations_optimal_parse_schedule */
573 int l, min_score, from;
576 text_in_length = strlen( (char*) text_in);
577 ensure_memory_list_available(&abbreviations_optimal_parse_schedule_memlist, text_in_length);
578 ensure_memory_list_available(&abbreviations_optimal_parse_scores_memlist, text_in_length+1);
580 abbreviations_optimal_parse_scores[text_in_length] = 0;
581 for(j=text_in_length-1; j>=0; j--)
582 { /* Initial values: empty schedule, score = just write the letter without abbreviating. */
583 abbreviations_optimal_parse_schedule[j] = -1;
584 min_score = zchar_weight(text_in[j]) + abbreviations_optimal_parse_scores[j+1];
585 /* If there's an abbreviation starting with that letter... */
586 if ( (from = abbrevs_lookup[text_in[j]]) != -1)
589 /* Loop on all abbreviations starting with what is in c. */
594 q=(uchar *)abbreviations_text+abbreviations[k].textpos;
596 /* Let's compare; we also keep track of the length of the abbreviation. */
597 for (l=1; q[l]!=0; l++)
598 { if (text_in[j+l]!=q[l]) {goto NotMatched;}
600 /* We have a match (length l), but is it smaller in size? */
601 if (min_score > 2 + abbreviations_optimal_parse_scores[j+l])
602 { /* It is indeed smaller, so let's write it down in our schedule. */
603 min_score = 2 + abbreviations_optimal_parse_scores[j+l];
604 abbreviations_optimal_parse_schedule[j] = k;
609 /* We gave it our best, this is the smallest we got. */
610 abbreviations_optimal_parse_scores[j] = min_score;
618 /* The empty string of Z-text is illegal, since it can't carry an end
619 bit: so we translate an empty string of ASCII text to just the
620 pad character 5. Printing this causes nothing to appear on screen. */
622 if (text_in[0]==0) write_z_char_z(5);
624 /* Loop through the characters of the null-terminated input text: note
625 that if 1 is written over a character in the input text, it is
626 afterwards ignored */
628 for (i=0; text_in[i]!=0; i++)
629 { total_chars_trans++;
631 /* Contract ". " into ". " if double-space-removing switch set:
632 likewise "? " and "! " if the setting is high enough */
634 if ((double_space_setting >= 1)
635 && (text_in[i+1]==' ') && (text_in[i+2]==' '))
636 { if (text_in[i]=='.') text_in[i+2]=1;
637 if (double_space_setting >= 2)
638 { if (text_in[i]=='?') text_in[i+2]=1;
639 if (text_in[i]=='!') text_in[i+2]=1;
643 /* Try abbreviations if the economy switch set. */
644 /* Look at the abbreviation schedule to see if we should abbreviate here. */
645 /* Note: Just because the schedule has something doesn't mean we should abbreviate there; */
646 /* sometimes you abbreviate before because it's better. If we have already replaced the */
647 /* char by a '1', it means we're in the middle of an abbreviation; don't try to abbreviate then. */
648 if ((economy_switch) && (!is_abbreviation) && text_in[i] != 1 &&
649 ((j = abbreviations_optimal_parse_schedule[i]) != -1))
651 /* Fill with 1s, which will get ignored by everyone else. */
652 uchar *p = (uchar *)abbreviation_text(j);
653 for (k=0; p[k]!=0; k++) text_in[i+k]=1;
654 /* Actually write the abbreviation in the story file. */
655 abbreviations[j].freq++;
656 /* Abbreviations run from MAX_DYNAMIC_STRINGS to 96. */
657 j += MAX_DYNAMIC_STRINGS;
658 write_z_char_z(j/32+1); write_z_char_z(j%32);
662 /* If Unicode switch set, use text_to_unicode to perform UTF-8
664 if (character_set_unicode && (text_in[i] & 0x80))
665 { unicode = text_to_unicode((char *) (text_in+i));
666 zscii = unicode_to_zscii(unicode);
667 if (zscii != 5) write_zscii(zscii);
669 { unicode_char_error(
670 "Character can only be used if declared in \
671 advance as part of 'Zcharacter table':", unicode);
673 i += textual_form_length - 1;
677 /* '@' is the escape character in Inform string notation: the various
680 @@decimalnumber : write this ZSCII char (0 to 1023)
681 @twodigits or : write the abbreviation string with this
682 @(digits) decimal number
683 @(symbol) : write the abbreviation string with this
685 @accentcode : this accented character: e.g.,
686 for @'e write an E-acute
687 @{...} : this Unicode char (in hex) */
690 { if (text_in[i+1]=='@')
692 /* @@... (ascii value) */
694 i+=2; j=atoi((char *) (text_in+i));
696 { /* Prevent ~ and ^ from being translated to double-quote
697 and new-line, as they ordinarily would be */
699 case 94: write_z_char_z(5); write_z_char_z(6);
700 write_z_char_z(94/32); write_z_char_z(94%32);
702 case 126: write_z_char_z(5); write_z_char_z(6);
703 write_z_char_z(126/32); write_z_char_z(126%32);
706 default: write_zscii(j); break;
708 while (isdigit(text_in[i])) i++; i--;
710 else if (text_in[i+1]=='(')
712 /* @(...) (dynamic string) */
713 int len = 0, digits = 0;
715 /* This accepts "12xyz" as a symbol, which it really isn't,
716 but that just means it won't be found. */
717 while ((text_in[i] == '_' || isalnum(text_in[i]))) {
718 char ch = text_in[i++];
719 if (isdigit(ch)) digits++;
720 ensure_memory_list_available(&temp_symbol_memlist, len+1);
721 temp_symbol[len++] = ch;
723 ensure_memory_list_available(&temp_symbol_memlist, len+1);
724 temp_symbol[len] = '\0';
726 /* We would like to parse temp_symbol as *either* a decimal
727 number or a constant symbol. */
728 if (text_in[i] != ')' || len == 0) {
729 error("'@(...)' abbreviation must contain a symbol");
731 else if (digits == len) {
732 /* all digits; parse as decimal */
733 j = atoi(temp_symbol);
736 int sym = get_symbol_index(temp_symbol);
737 if (sym < 0 || (symbols[sym].flags & UNKNOWN_SFLAG) || symbols[sym].type != CONSTANT_T || symbols[sym].marker) {
738 error_named("'@(...)' abbreviation expected a known constant value, but contained", temp_symbol);
741 symbols[sym].flags |= USED_SFLAG;
742 j = symbols[sym].value;
745 if (!glulx_mode && j >= 96) {
746 error_max_dynamic_strings(j);
749 if (j >= MAX_DYNAMIC_STRINGS) {
750 error_max_dynamic_strings(j);
754 write_z_char_z(j/32+1); write_z_char_z(j%32);
757 write_z_char_z(' '); /* error fallback */
760 else if (isdigit(text_in[i+1])!=0)
763 /* @.. (dynamic string) */
765 d1 = character_digit_value[text_in[i+1]];
766 d2 = character_digit_value[text_in[i+2]];
767 if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10))
768 error("'@..' must have two decimal digits");
772 if (!glulx_mode && j >= 96) {
773 error_max_dynamic_strings(j);
776 if (j >= MAX_DYNAMIC_STRINGS) {
777 /* Shouldn't get here with two digits */
778 error_max_dynamic_strings(j);
783 write_z_char_z(j/32+1); write_z_char_z(j%32);
786 write_z_char_z(' '); /* error fallback */
792 /* A string escape specifying an unusual character */
794 unicode = text_to_unicode((char *) (text_in+i));
795 zscii = unicode_to_zscii(unicode);
796 if (zscii != 5) write_zscii(zscii);
798 { unicode_char_error(
799 "Character can only be used if declared in \
800 advance as part of 'Zcharacter table':", unicode);
802 i += textual_form_length - 1;
806 { /* Skip a character which has been over-written with the null
807 value 1 earlier on */
810 { if (text_in[i]==' ') write_z_char_z(0);
812 { j = (int) text_in[i];
813 lookup_value = iso_to_alphabet_grid[j];
814 if (lookup_value < 0)
815 { /* The character isn't in the standard alphabets, so
816 we have to use the ZSCII 4-Z-char sequence */
818 if (lookup_value == -5)
819 { /* Character isn't in the ZSCII set at all */
821 unicode = iso_to_unicode(j);
823 "Character can only be used if declared in \
824 advance as part of 'Zcharacter table':", unicode);
825 write_zscii(0x200 + unicode/0x100);
826 write_zscii(0x300 + unicode%0x100);
828 else write_zscii(-lookup_value);
831 { /* The character is in one of the standard alphabets:
832 write a SHIFT to temporarily change alphabet if
833 it isn't in alphabet 0, then write the Z-char */
835 alphabet_used[lookup_value] = 'Y';
836 in_alphabet = lookup_value/26;
837 if (in_alphabet==1) write_z_char_z(4); /* SHIFT to A1 */
838 if (in_alphabet==2) write_z_char_z(5); /* SHIFT to A2 */
839 write_z_char_z(lookup_value%26 + 6);
846 /* Flush the Z-characters output buffer and set the "end" bit */
852 /* The text storage here is, of course, temporary. Compression
853 will occur when we're finished compiling, so that all the
854 clever Huffman stuff will work.
855 In the stored text, we use "@@" to indicate @,
856 "@0" to indicate a zero byte,
857 "@ANNNN" to indicate an abbreviation,
858 "@DNNNN" to indicate a dynamic string thing.
859 "@UNNNN" to indicate a four-byte Unicode value (0x100 or higher).
860 (NNNN is a four-digit hex number using the letters A-P... an
861 ugly representation but a convenient one.)
864 for (i=0; text_in[i]!=0; i++) {
866 /* Contract ". " into ". " if double-space-removing switch set:
867 likewise "? " and "! " if the setting is high enough. */
868 if ((double_space_setting >= 1)
869 && (text_in[i+1]==' ') && (text_in[i+2]==' ')) {
871 || (double_space_setting >= 2
872 && (text_in[i]=='?' || text_in[i]=='!'))) {
873 text_in[i+1] = text_in[i];
880 /* Try abbreviations if the economy switch set. We have to be in
881 compression mode too, since the abbreviation mechanism is part
882 of string decompression. */
884 if ((economy_switch) && (compression_switch) && (!is_abbreviation)
885 && ((k=abbrevs_lookup[text_in[i]])!=-1)
886 && ((j=try_abbreviations_from(text_in, i, k)) != -1)) {
887 char *cx = abbreviation_text(j);
891 write_z_char_g('A' + ((j >>12) & 0x0F));
892 write_z_char_g('A' + ((j >> 8) & 0x0F));
893 write_z_char_g('A' + ((j >> 4) & 0x0F));
894 write_z_char_g('A' + ((j ) & 0x0F));
896 else if (text_in[i] == '@') {
897 if (text_in[i+1]=='@') {
899 i+=2; j=atoi((char *) (text_in+i));
900 if (j == '@' || j == '\0') {
904 if (!compression_switch)
905 warning("Ascii @@0 will prematurely terminate non-compressed \
910 while (isdigit(text_in[i])) i++; i--;
912 else if (text_in[i+1]=='(') {
913 int len = 0, digits = 0;
915 /* This accepts "12xyz" as a symbol, which it really isn't,
916 but that just means it won't be found. */
917 while ((text_in[i] == '_' || isalnum(text_in[i]))) {
918 char ch = text_in[i++];
919 if (isdigit(ch)) digits++;
920 ensure_memory_list_available(&temp_symbol_memlist, len+1);
921 temp_symbol[len++] = ch;
923 ensure_memory_list_available(&temp_symbol_memlist, len+1);
924 temp_symbol[len] = '\0';
926 /* We would like to parse temp_symbol as *either* a decimal
927 number or a constant symbol. */
928 if (text_in[i] != ')' || len == 0) {
929 error("'@(...)' abbreviation must contain a symbol");
931 else if (digits == len) {
932 /* all digits; parse as decimal */
933 j = atoi(temp_symbol);
936 int sym = get_symbol_index(temp_symbol);
937 if (sym < 0 || (symbols[sym].flags & UNKNOWN_SFLAG) || symbols[sym].type != CONSTANT_T || symbols[sym].marker) {
938 error_named("'@(...)' abbreviation expected a known constant value, but contained", temp_symbol);
941 symbols[sym].flags |= USED_SFLAG;
942 j = symbols[sym].value;
945 if (j >= MAX_DYNAMIC_STRINGS) {
946 error_max_dynamic_strings(j);
949 if (j+1 >= no_dynamic_strings)
950 no_dynamic_strings = j+1;
954 write_z_char_g('A' + ((j >>12) & 0x0F));
955 write_z_char_g('A' + ((j >> 8) & 0x0F));
956 write_z_char_g('A' + ((j >> 4) & 0x0F));
957 write_z_char_g('A' + ((j ) & 0x0F));
960 write_z_char_g(' '); /* error fallback */
963 else if (isdigit(text_in[i+1])) {
965 d1 = character_digit_value[text_in[i+1]];
966 d2 = character_digit_value[text_in[i+2]];
967 if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10)) {
968 error("'@..' must have two decimal digits");
971 if (!compression_switch)
972 warning("'@..' print variable will not work in non-compressed \
973 string; substituting ' '.");
976 if (j >= MAX_DYNAMIC_STRINGS) {
977 error_max_dynamic_strings(j);
980 if (j+1 >= no_dynamic_strings)
981 no_dynamic_strings = j+1;
985 write_z_char_g('A' + ((j >>12) & 0x0F));
986 write_z_char_g('A' + ((j >> 8) & 0x0F));
987 write_z_char_g('A' + ((j >> 4) & 0x0F));
988 write_z_char_g('A' + ((j ) & 0x0F));
991 write_z_char_g(' '); /* error fallback */
996 unicode = text_to_unicode((char *) (text_in+i));
997 i += textual_form_length - 1;
998 if (unicode == '@' || unicode == '\0') {
1000 write_z_char_g(unicode ? '@' : '0');
1002 else if (unicode >= 0 && unicode < 256) {
1003 write_z_char_g(unicode);
1006 if (!compression_switch) {
1007 warning("Unicode characters will not work in non-compressed \
1008 string; substituting '?'.");
1009 write_z_char_g('?');
1012 j = unicode_entity_index(unicode);
1013 write_z_char_g('@');
1014 write_z_char_g('U');
1015 write_z_char_g('A' + ((j >>12) & 0x0F));
1016 write_z_char_g('A' + ((j >> 8) & 0x0F));
1017 write_z_char_g('A' + ((j >> 4) & 0x0F));
1018 write_z_char_g('A' + ((j ) & 0x0F));
1023 else if (text_in[i] == '^')
1024 write_z_char_g(0x0A);
1025 else if (text_in[i] == '~')
1026 write_z_char_g('"');
1027 else if (character_set_unicode) {
1028 if (text_in[i] & 0x80) {
1029 unicode = text_to_unicode((char *) (text_in+i));
1030 i += textual_form_length - 1;
1031 if (unicode >= 0 && unicode < 256) {
1032 write_z_char_g(unicode);
1035 if (!compression_switch) {
1036 warning("Unicode characters will not work in non-compressed \
1037 string; substituting '?'.");
1038 write_z_char_g('?');
1041 j = unicode_entity_index(unicode);
1042 write_z_char_g('@');
1043 write_z_char_g('U');
1044 write_z_char_g('A' + ((j >>12) & 0x0F));
1045 write_z_char_g('A' + ((j >> 8) & 0x0F));
1046 write_z_char_g('A' + ((j >> 4) & 0x0F));
1047 write_z_char_g('A' + ((j ) & 0x0F));
1052 write_z_char_g(text_in[i]);
1056 unicode = iso_to_unicode_grid[text_in[i]];
1057 if (unicode >= 0 && unicode < 256) {
1058 write_z_char_g(unicode);
1061 if (!compression_switch) {
1062 warning("Unicode characters will not work in non-compressed \
1063 string; substituting '?'.");
1064 write_z_char_g('?');
1067 j = unicode_entity_index(unicode);
1068 write_z_char_g('@');
1069 write_z_char_g('U');
1070 write_z_char_g('A' + ((j >>12) & 0x0F));
1071 write_z_char_g('A' + ((j >> 8) & 0x0F));
1072 write_z_char_g('A' + ((j >> 4) & 0x0F));
1073 write_z_char_g('A' + ((j ) & 0x0F));
1079 zchars_trans_in_last_string=total_zchars_trans-zchars_trans_in_last_string;
1083 if (text_out_overflow)
1086 return text_out_pos;
1089 static int unicode_entity_index(int32 unicode)
1092 int buck = unicode % UNICODE_HASH_BUCKETS;
1094 for (j = unicode_usage_hash[buck]; j >= 0; j=unicode_usage_entries[j].next) {
1095 if (unicode_usage_entries[j].ch == unicode)
1099 ensure_memory_list_available(&unicode_usage_entries_memlist, no_unicode_chars+1);
1100 j = no_unicode_chars++;
1101 unicode_usage_entries[j].ch = unicode;
1102 unicode_usage_entries[j].next = unicode_usage_hash[buck];
1103 unicode_usage_hash[buck] = j;
1109 /* ------------------------------------------------------------------------- */
1110 /* Glulx compression code */
1111 /* ------------------------------------------------------------------------- */
1114 static void compress_makebits(int entnum, int depth, int prevbit,
1115 huffbitlist_t *bits);
1117 /* The compressor. This uses the usual Huffman compression algorithm. */
1118 void compress_game_text()
1120 int entities=0, branchstart, branches;
1129 if (compression_switch) {
1130 max_char_set = 257 + no_abbreviations + no_dynamic_strings + no_unicode_chars;
1132 huff_entities = my_calloc(sizeof(huffentity_t), max_char_set*2+1,
1133 "huffman entities");
1134 hufflist = my_calloc(sizeof(huffentity_t *), max_char_set,
1135 "huffman node list");
1137 /* How many entities have we currently got? Well, 256 plus the
1138 string-terminator plus Unicode chars plus abbrevations plus
1141 huff_unicode_start = entities;
1142 entities += no_unicode_chars;
1143 huff_abbrev_start = entities;
1145 entities += no_abbreviations;
1146 huff_dynam_start = entities;
1147 entities += no_dynamic_strings;
1149 if (entities > max_char_set)
1150 compiler_error("Too many entities for max_char_set");
1153 for (jx=0; jx<256; jx++) {
1154 huff_entities[jx].type = 2;
1155 huff_entities[jx].count = 0;
1156 huff_entities[jx].u.ch = jx;
1159 huff_entities[256].type = 1;
1160 huff_entities[256].count = 0;
1161 for (jx=0; jx<no_unicode_chars; jx++) {
1162 huff_entities[huff_unicode_start+jx].type = 4;
1163 huff_entities[huff_unicode_start+jx].count = 0;
1164 huff_entities[huff_unicode_start+jx].u.val = jx;
1166 if (economy_switch) {
1167 for (jx=0; jx<no_abbreviations; jx++) {
1168 huff_entities[huff_abbrev_start+jx].type = 3;
1169 huff_entities[huff_abbrev_start+jx].count = 0;
1170 huff_entities[huff_abbrev_start+jx].u.val = jx;
1173 for (jx=0; jx<no_dynamic_strings; jx++) {
1174 huff_entities[huff_dynam_start+jx].type = 9;
1175 huff_entities[huff_dynam_start+jx].count = 0;
1176 huff_entities[huff_dynam_start+jx].u.val = jx;
1180 /* No compression; use defaults that will make it easy to check
1182 no_huff_entities = 257;
1183 huff_unicode_start = 257;
1184 huff_abbrev_start = 257;
1185 huff_dynam_start = 257+no_abbreviations;
1186 compression_table_size = 0;
1189 if (compression_switch) {
1191 for (lx=0, ix=0; lx<no_strings; lx++) {
1192 int escapelen=0, escapetype=0;
1196 ch = static_strings_area[ix];
1198 if (ix > static_strings_extent || ch < 0)
1199 compiler_error("Read too much not-yet-compressed text.");
1200 if (escapelen == -1) {
1205 else if (ch == '0') {
1208 else if (ch == 'A' || ch == 'D' || ch == 'U') {
1215 compiler_error("Strange @ escape in processed text.");
1218 else if (escapelen) {
1219 escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
1221 if (escapelen == 0) {
1222 if (escapetype == 'A') {
1223 ch = huff_abbrev_start+escapeval;
1225 else if (escapetype == 'D') {
1226 ch = huff_dynam_start+escapeval;
1228 else if (escapetype == 'U') {
1229 ch = huff_unicode_start+escapeval;
1232 compiler_error("Strange @ escape in processed text.");
1248 huff_entities[ch].count++;
1253 for (jx=0; jx<entities; jx++) {
1254 if (huff_entities[jx].count) {
1255 hufflist[numlive] = &(huff_entities[jx]);
1260 branchstart = entities;
1263 while (numlive > 1) {
1265 int best1num, best2num;
1268 if (hufflist[0]->count < hufflist[1]->count) {
1277 best1num = hufflist[best1]->count;
1278 best2num = hufflist[best2]->count;
1280 for (jx=2; jx<numlive; jx++) {
1281 if (hufflist[jx]->count < best1num) {
1283 best2num = best1num;
1285 best1num = hufflist[best1]->count;
1287 else if (hufflist[jx]->count < best2num) {
1289 best2num = hufflist[best2]->count;
1293 bran = &(huff_entities[branchstart+branches]);
1296 bran->count = hufflist[best1]->count + hufflist[best2]->count;
1297 bran->u.branch[0] = (hufflist[best1] - huff_entities);
1298 bran->u.branch[1] = (hufflist[best2] - huff_entities);
1299 hufflist[best1] = bran;
1300 if (best2 < numlive-1) {
1301 memmove(&(hufflist[best2]), &(hufflist[best2+1]),
1302 ((numlive-1) - best2) * sizeof(huffentity_t *));
1307 huff_entity_root = (hufflist[0] - huff_entities);
1309 for (ix=0; ix<MAXHUFFBYTES; ix++)
1311 compression_table_size = 12;
1313 no_huff_entities = 0; /* compress_makebits will total this up */
1314 compress_makebits(huff_entity_root, 0, -1, &bits);
1317 /* Now, sadly, we have to compute the size of the string section,
1318 without actually doing the compression. */
1319 compression_string_size = 0;
1321 ensure_memory_list_available(&compressed_offsets_memlist, no_strings);
1323 for (lx=0, ix=0; lx<no_strings; lx++) {
1324 int escapelen=0, escapetype=0;
1328 compressed_offsets[lx] = compression_table_size + compression_string_size;
1329 compression_string_size++; /* for the type byte */
1331 ch = static_strings_area[ix];
1333 if (ix > static_strings_extent || ch < 0)
1334 compiler_error("Read too much not-yet-compressed text.");
1335 if (escapelen == -1) {
1340 else if (ch == '0') {
1343 else if (ch == 'A' || ch == 'D' || ch == 'U') {
1350 compiler_error("Strange @ escape in processed text.");
1353 else if (escapelen) {
1354 escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
1356 if (escapelen == 0) {
1357 if (escapetype == 'A') {
1358 ch = huff_abbrev_start+escapeval;
1360 else if (escapetype == 'D') {
1361 ch = huff_dynam_start+escapeval;
1363 else if (escapetype == 'U') {
1364 ch = huff_unicode_start+escapeval;
1367 compiler_error("Strange @ escape in processed text.");
1384 if (compression_switch) {
1385 jx += huff_entities[ch].depth;
1386 compression_string_size += (jx/8);
1390 if (ch >= huff_dynam_start) {
1391 compression_string_size += 3;
1393 else if (ch >= huff_unicode_start) {
1394 compiler_error("Abbreviation/Unicode in non-compressed string \
1395 should be impossible.");
1398 compression_string_size += 1;
1401 if (compression_switch && jx)
1402 compression_string_size++;
1405 done_compression = TRUE;
1408 static void compress_makebits(int entnum, int depth, int prevbit,
1409 huffbitlist_t *bits)
1411 huffentity_t *ent = &(huff_entities[entnum]);
1415 ent->addr = compression_table_size;
1420 ent->bits.b[(depth-1) / 8] |= (1 << ((depth-1) % 8));
1423 switch (ent->type) {
1425 compression_table_size += 9;
1426 compress_makebits(ent->u.branch[0], depth+1, 0, &ent->bits);
1427 compress_makebits(ent->u.branch[1], depth+1, 1, &ent->bits);
1430 compression_table_size += 1;
1433 compression_table_size += 2;
1436 cx = abbreviation_text(ent->u.val);
1437 compression_table_size += (1 + 1 + strlen(cx));
1441 compression_table_size += 5;
1446 /* ------------------------------------------------------------------------- */
1447 /* The abbreviations optimiser */
1449 /* This is a very complex, memory and time expensive algorithm to */
1450 /* approximately solve the problem of which abbreviation strings would */
1451 /* minimise the total number of Z-chars to which the game text translates. */
1452 /* It is in some ways a quite separate program but remains inside Inform */
1453 /* for compatibility with previous releases. */
1454 /* ------------------------------------------------------------------------- */
1456 /* The complete game text. */
1457 static char *opttext;
1458 static int32 opttextlen;
1460 typedef struct tlb_s
1462 int32 intab, occurrences;
1464 static tlb *tlbtab; /* Three-letter blocks (allocated up to no_occs) */
1465 static memory_list tlbtab_memlist;
1466 static int32 no_occs;
1468 static int32 *grandtable;
1469 static int32 *grandflags;
1470 typedef struct optab_s
1475 char *text; /* allocated to textsize, min 4 */
1478 static int32 MAX_BESTYET;
1479 static optab *bestyet; /* High-score entries (up to MAX_BESTYET used/allocated) */
1480 static optab *bestyet2; /* The selected entries (up to selected used; allocated to MAX_ABBREVS) */
1482 static void optab_copy(optab *dest, const optab *src)
1484 dest->length = src->length;
1485 dest->popularity = src->popularity;
1486 dest->score = src->score;
1487 dest->location = src->location;
1488 if (src->length+1 > dest->textsize) {
1489 int32 oldsize = dest->textsize;
1490 dest->textsize = (src->length+1)*2;
1491 my_realloc(&dest->text, oldsize, dest->textsize, "bestyet2.text");
1493 strcpy(dest->text, src->text);
1498 static void optimise_pass(void)
1503 int32 j, j2, k, nl, matches, noflags, score, min, minat=0, x, scrabble, c;
1504 for (i=0; i<MAX_BESTYET; i++) bestyet[i].length=0;
1505 for (i=0; i<no_occs; i++)
1506 { if ((*(tlbtab[i].text)!=(int) '\n')&&(tlbtab[i].occurrences!=0))
1509 if (i%((**g_pm_hndl).linespercheck) == 0)
1510 { ProcessEvents (&g_proc);
1513 longjmp (g_fallback, 1);
1517 if (optabbrevs_trace_setting >= 2) {
1518 printf("Pass %d, %4ld/%ld '%s' (%ld occurrences) ",
1519 pass_no, (long int) i, (long int) no_occs, tlbtab[i].text,
1520 (long int) tlbtab[i].occurrences);
1523 for (j=0; j<tlbtab[i].occurrences; j++)
1524 { for (j2=0; j2<tlbtab[i].occurrences; j2++) grandflags[j2]=1;
1525 nl=2; noflags=tlbtab[i].occurrences;
1528 for (j2=0; j2<nl; j2++)
1529 if (opttext[grandtable[tlbtab[i].intab+j]+j2]=='\n')
1532 for (j2=j; j2<tlbtab[i].occurrences; j2++)
1533 { if (grandflags[j2]==1)
1534 { x=grandtable[tlbtab[i].intab+j2]
1535 - grandtable[tlbtab[i].intab+j];
1536 if (((x>-nl)&&(x<nl))
1537 || (memcmp(opttext+grandtable[tlbtab[i].intab+j],
1538 opttext+grandtable[tlbtab[i].intab+j2],
1540 { grandflags[j2]=0; noflags--; }
1545 for (k=0; k<nl; k++)
1547 c=opttext[grandtable[tlbtab[i].intab+j+k]];
1549 { if (iso_to_alphabet_grid[c]<0)
1552 if (iso_to_alphabet_grid[c]>=26)
1556 score=(matches-1)*(scrabble-2);
1558 for (j2=0; j2<MAX_BESTYET; j2++)
1559 { if ((nl==bestyet[j2].length)
1560 && (memcmp(opttext+bestyet[j2].location,
1561 opttext+grandtable[tlbtab[i].intab+j],
1563 { j2=MAX_BESTYET; min=score; }
1565 { if (bestyet[j2].score<min)
1566 { min=bestyet[j2].score; minat=j2;
1571 { bestyet[minat].score=score;
1572 bestyet[minat].length=nl;
1573 bestyet[minat].location=grandtable[tlbtab[i].intab+j];
1574 bestyet[minat].popularity=matches;
1579 if (optabbrevs_trace_setting >= 2) {
1581 duration = TIMEVALUE_DIFFERENCE(&t1, &t2);
1582 printf(" (%.4f seconds)\n", duration);
1588 static int any_overlap(char *s1, char *s2)
1589 { int a, b, i, j, flag;
1590 a=strlen(s1); b=strlen(s2);
1591 for (i=1-b; i<a; i++)
1594 if ((0<=i+j)&&(i+j<=a-1))
1595 if (s1[i+j]!=s2[j]) flag=1;
1596 if (flag==0) return(1);
1601 extern void optimise_abbreviations(void)
1602 { int32 i, j, tcount, max=0, MAX_GTABLE;
1603 int32 j2, selected, available, maxat=0, nl;
1605 if (opttext == NULL)
1608 /* We insist that the first two abbreviations will be ". " and ", ". */
1609 if (MAX_ABBREVS < 2)
1612 /* Note that it's safe to access opttext[opttextlen+2]. There are
1613 two newlines and a null beyond opttextlen. */
1615 printf("Beginning calculation of optimal abbreviations...\n");
1619 initialise_memory_list(&tlbtab_memlist,
1620 sizeof(tlb), 1000, (void**)&tlbtab,
1621 "three-letter-blocks buffer");
1625 /* Not sure what the optimal size is for MAX_BESTYET. The original code always created 64 abbreviations and used MAX_BESTYET=256. I'm guessing that 4*MAX_ABBREVS is reasonable. */
1626 MAX_BESTYET = 4 * MAX_ABBREVS;
1628 bestyet=my_calloc(sizeof(optab), MAX_BESTYET, "bestyet");
1629 for (i=0; i<MAX_BESTYET; i++) {
1630 bestyet[i].length = 0;
1631 bestyet[i].popularity = 0;
1632 bestyet[i].score = 0;
1633 bestyet[i].location = 0;
1634 bestyet[i].textsize = 4;
1635 bestyet[i].text = my_malloc(bestyet[i].textsize, "bestyet.text");
1638 bestyet2=my_calloc(sizeof(optab), MAX_ABBREVS, "bestyet2");
1639 for (i=0; i<MAX_ABBREVS; i++) {
1640 bestyet2[i].length = 0;
1641 bestyet2[i].popularity = 0;
1642 bestyet2[i].score = 0;
1643 bestyet2[i].location = 0;
1644 bestyet2[i].textsize = 4;
1645 bestyet2[i].text = my_malloc(bestyet2[i].textsize, "bestyet2.text");
1648 bestyet2[0].text[0]='.';
1649 bestyet2[0].text[1]=' ';
1650 bestyet2[0].text[2]=0;
1652 bestyet2[1].text[0]=',';
1653 bestyet2[1].text[1]=' ';
1654 bestyet2[1].text[2]=0;
1658 for (i=0; i<opttextlen; i++)
1660 if ((opttext[i]=='.') && (opttext[i+1]==' ') && (opttext[i+2]==' '))
1661 { opttext[i]='\n'; opttext[i+1]='\n'; opttext[i+2]='\n';
1662 bestyet2[0].popularity++;
1665 if ((opttext[i]=='.') && (opttext[i+1]==' '))
1666 { opttext[i]='\n'; opttext[i+1]='\n';
1667 bestyet2[0].popularity++;
1670 if ((opttext[i]==',') && (opttext[i+1]==' '))
1671 { opttext[i]='\n'; opttext[i+1]='\n';
1672 bestyet2[1].popularity++;
1676 MAX_GTABLE=opttextlen+1;
1677 grandtable=my_calloc(4*sizeof(int32), MAX_GTABLE/4, "grandtable");
1679 for (i=0, tcount=0; i<opttextlen; i++)
1682 test.text[0]=opttext[i];
1683 test.text[1]=opttext[i+1];
1684 test.text[2]=opttext[i+2];
1686 if ((test.text[0]=='\n')||(test.text[1]=='\n')||(test.text[2]=='\n'))
1688 for (j=0; j<no_occs; j++) {
1689 if (strcmp(test.text,tlbtab[j].text)==0)
1694 for (j=i+3; j<opttextlen; j++)
1697 if (j%((**g_pm_hndl).linespercheck) == 0)
1698 { ProcessEvents (&g_proc);
1701 longjmp (g_fallback, 1);
1705 if ((opttext[i]==opttext[j])
1706 && (opttext[i+1]==opttext[j+1])
1707 && (opttext[i+2]==opttext[j+2]))
1708 { grandtable[tcount+test.occurrences]=j;
1710 if (tcount+test.occurrences==MAX_GTABLE)
1711 { printf("All %ld cross-references used\n",
1712 (long int) MAX_GTABLE);
1717 if (test.occurrences>=2)
1719 ensure_memory_list_available(&tlbtab_memlist, no_occs+1);
1720 tlbtab[no_occs]=test;
1721 tlbtab[no_occs].intab=tcount;
1722 tcount += tlbtab[no_occs].occurrences;
1723 if (max<tlbtab[no_occs].occurrences)
1724 max=tlbtab[no_occs].occurrences;
1731 grandflags=my_calloc(sizeof(int), max, "grandflags");
1734 if (optabbrevs_trace_setting >= 1) {
1735 printf("Cross-reference table (%ld entries) built...\n",
1736 (long int) no_occs);
1738 /* for (i=0; i<no_occs; i++)
1739 printf("%4d %4d '%s' %d\n",i,tlbtab[i].intab,tlbtab[i].text,
1740 tlbtab[i].occurrences);
1743 for (i=0; i<MAX_ABBREVS; i++) bestyet2[i].length=0;
1744 available=MAX_BESTYET;
1745 while ((available>0)&&(selected<MAX_ABBREVS))
1748 if (optabbrevs_trace_setting >= 1) {
1749 printf("Pass %d\n", pass_no);
1754 for (i=0; i<MAX_BESTYET; i++)
1755 if (bestyet[i].score!=0)
1757 nl=bestyet[i].length;
1758 if (nl+1 > bestyet[i].textsize) {
1759 int32 oldsize = bestyet[i].textsize;
1760 bestyet[i].textsize = (nl+1)*2;
1761 my_realloc(&bestyet[i].text, oldsize, bestyet[i].textsize, "bestyet.text");
1763 for (j2=0; j2<nl; j2++) bestyet[i].text[j2]=
1764 opttext[bestyet[i].location+j2];
1765 bestyet[i].text[nl]=0;
1768 /* printf("End of pass results:\n");
1769 printf("\nno score freq string\n");
1770 for (i=0; i<MAX_BESTYET; i++)
1771 if (bestyet[i].score>0)
1772 printf("%02d: %4d %4d '%s'\n", i, bestyet[i].score,
1773 bestyet[i].popularity, bestyet[i].text);
1778 for (i=0; i<MAX_BESTYET; i++)
1779 if (max<bestyet[i].score)
1780 { max=bestyet[i].score;
1787 optab_copy(&bestyet2[selected++], &bestyet[maxat]);
1789 if (optabbrevs_trace_setting >= 1) {
1791 "Selection %2ld: '%s' (repeated %ld times, scoring %ld)\n",
1792 (long int) selected,bestyet[maxat].text,
1793 (long int) bestyet[maxat].popularity,
1794 (long int) bestyet[maxat].score);
1797 testtext[0]=bestyet[maxat].text[0];
1798 testtext[1]=bestyet[maxat].text[1];
1799 testtext[2]=bestyet[maxat].text[2];
1802 for (i=0; i<no_occs; i++)
1803 if (strcmp(testtext,tlbtab[i].text)==0)
1806 for (j=0; j<tlbtab[i].occurrences; j++)
1807 { if (memcmp(bestyet[maxat].text,
1808 opttext+grandtable[tlbtab[i].intab+j],
1809 bestyet[maxat].length)==0)
1810 { for (j2=0; j2<bestyet[maxat].length; j2++)
1811 opttext[grandtable[tlbtab[i].intab+j]+j2]='\n';
1815 for (i=0; i<MAX_BESTYET; i++)
1816 if ((bestyet[i].score>0)&&
1817 (any_overlap(bestyet[maxat].text,bestyet[i].text)==1))
1818 { bestyet[i].score=0;
1819 /* printf("Discarding '%s' as overlapping\n",
1820 bestyet[i].text); */
1823 } while ((max>0)&&(available>0)&&(selected<MAX_ABBREVS));
1826 printf("\nChosen abbreviations (in Inform syntax):\n\n");
1827 for (i=0; i<selected; i++)
1828 printf("Abbreviate \"%s\";\n", bestyet2[i].text);
1833 /* ------------------------------------------------------------------------- */
1834 /* The dictionary manager begins here. */
1836 /* Speed is extremely important in these algorithms. If a linear-time */
1837 /* routine were used to search the dictionary words so far built up, then */
1838 /* Inform would crawl. */
1840 /* Instead, the dictionary is stored as a binary tree, which is kept */
1841 /* balanced with the red-black algorithm. */
1842 /* ------------------------------------------------------------------------- */
1843 /* A dictionary table similar to the Z-machine format is kept: there is a */
1844 /* 7-byte header (left blank here to be filled in at the */
1845 /* construct_storyfile() stage in "tables.c") and then a sequence of */
1846 /* records, one per word, in the form */
1848 /* <Z-coded text> <flags> <verbnumber> <adjectivenumber> */
1849 /* 4 or 6 bytes byte byte byte */
1851 /* For Glulx, the form is instead: (See below about Unicode-valued */
1852 /* dictionaries and DICT_WORD_BYTES.) */
1854 /* <tag> <plain text> <flags> <verbnumber> <adjectivenumber> */
1855 /* $60 DICT_WORD_BYTES short short short */
1857 /* These records are stored in "accession order" (i.e. in order of their */
1858 /* first being received by these routines) and only alphabetically sorted */
1859 /* by construct_storyfile() (using the array below). */
1860 /* ------------------------------------------------------------------------- */
1862 /* Further notes about the data fields... */
1863 /* The flags are currently: */
1864 /* bit 0: word is used as a verb (in verb grammar) */
1865 /* bit 1: word is used as a meta verb */
1866 /* bit 2: word is plural (set by '//p') */
1867 /* bit 3: word is used as a preposition (in verb grammar) */
1868 /* bit 6: set for all verbs, but not used by the parser? */
1869 /* bit 7: word is used as a noun (set for every word that appears in */
1870 /* code or in an object property) */
1872 /* In grammar version 2, the third field (adjectivenumber) is unused (and */
1875 /* The compiler generates special constants #dict_par1, #dict_par2, */
1876 /* #dict_par3 to refer to the byte offsets of the three fields. In */
1877 /* Z-code v3, these are 4/5/6; in v4+, they are 6/7/8. In Glulx, they */
1878 /* are $DICT_WORD_SIZE+2/4/6, referring to the *low* bytes of the three */
1879 /* fields. (The high bytes are $DICT_WORD_SIZE+1/3/5.) */
1880 /* ------------------------------------------------------------------------- */
1882 uchar *dictionary; /* (These two variables are externally
1883 used only in "tables.c" when
1884 building the story-file) */
1885 static memory_list dictionary_memlist;
1886 int32 dictionary_top; /* Position of the next free record
1887 in dictionary (i.e., the current
1890 int dict_entries; /* Total number of records entered */
1892 /* ------------------------------------------------------------------------- */
1893 /* dict_word was originally a typedef for a struct of 6 unsigned chars. */
1894 /* It held the (4 or) 6 bytes of Z-coded text of a word. */
1895 /* Usefully, because the PAD character 5 is < all alphabetic characters, */
1896 /* alphabetic order corresponds to numeric order. For this reason, the */
1897 /* dict_word is called the "sort code" of the original text word. */
1899 /* In modifying the compiler for Glulx, I found it easier to discard the */
1900 /* typedef, and operate directly on uchar arrays of length DICT_WORD_SIZE. */
1901 /* In Z-code, DICT_WORD_SIZE will be 6, so the Z-code compiler will work */
1902 /* as before. In Glulx, it can be any value. */
1904 /* In further modifying the compiler to generate a Unicode dictionary, */
1905 /* I have to store four-byte values in the uchar array. We make the array */
1906 /* size DICT_WORD_BYTES (which is DICT_WORD_SIZE*DICT_CHAR_SIZE). */
1907 /* Then we store the 32-bit character value big-endian. This lets us */
1908 /* continue to compare arrays bytewise, which is a nice simplification. */
1909 /* ------------------------------------------------------------------------- */
1911 extern int compare_sorts(uchar *d1, uchar *d2)
1913 for (i=0; i<DICT_WORD_BYTES; i++)
1914 if (d1[i]!=d2[i]) return((int)(d1[i]) - (int)(d2[i]));
1915 /* (since memcmp(d1, d2, DICT_WORD_BYTES); runs into a bug on some Unix
1920 extern void copy_sorts(uchar *d1, uchar *d2)
1922 for (i=0; i<DICT_WORD_BYTES; i++)
1926 static memory_list prepared_sort_memlist;
1927 static uchar *prepared_sort; /* Holds the sort code of current word */
1929 static int prepared_dictflags_pos; /* Dict flags set by the current word */
1930 static int prepared_dictflags_neg; /* Dict flags *not* set by the word */
1932 /* Also used by verbs.c */
1933 static void dictionary_prepare_z(char *dword, uchar *optresult)
1934 { int i, j, k, k2, wd[13];
1938 /* A rapid text translation algorithm using only the simplified rules
1939 applying to the text of dictionary entries: first produce a sequence
1940 of 6 (v3) or 9 (v4+) Z-characters */
1942 int dictsize = (version_number==3) ? 6 : 9;
1944 prepared_dictflags_pos = 0;
1945 prepared_dictflags_neg = 0;
1947 for (i=0, j=0; dword[j]!=0; j++)
1949 if ((dword[j] == '/') && (dword[j+1] == '/'))
1951 /* The rest of the word is dict flags. Run through them. */
1953 for (j+=2; dword[j] != 0; j++)
1959 error_named("'//~' with no flag character (pn) in dict word", dword);
1964 prepared_dictflags_pos |= 4;
1966 prepared_dictflags_neg |= 4;
1971 prepared_dictflags_pos |= 128;
1973 prepared_dictflags_neg |= 128;
1977 error_named("Expected flag character (pn~) after '//' in dict word", dword);
1984 /* LONG_DICT_FLAG_BUG emulates the old behavior where we stop looping
1986 if (LONG_DICT_FLAG_BUG && i>=dictsize)
1991 warning_named("Obsolete usage: use the ^ character for the \
1992 apostrophe in", dword);
1993 if (k==(int) '^') k=(int) '\'';
1996 if (k==(int) '@' || (character_set_unicode && (k & 0x80)))
1997 { int unicode = text_to_unicode(dword+j);
1998 if ((unicode < 128) && isupper(unicode)) unicode = tolower(unicode);
1999 k = unicode_to_zscii(unicode);
2000 j += textual_form_length - 1;
2001 if ((k == 5) || (k >= 0x100))
2002 { unicode_char_error(
2003 "Character can be printed but not input:", unicode);
2006 k2 = zscii_to_alphabet_grid[(uchar) k];
2009 { if (isupper(k)) k = tolower(k);
2010 k2 = iso_to_alphabet_grid[(uchar) k];
2014 { if ((k2 == -5) || (k2 <= -0x100))
2015 char_error("Character can be printed but not input:", k);
2017 { /* Use 4 more Z-chars to encode a ZSCII escape sequence */
2030 { alphabet_used[k2] = 'Y';
2031 if ((k2/26)!=0 && i<dictsize)
2032 wd[i++]=3+(k2/26); /* Change alphabet for symbols */
2034 wd[i++]=6+(k2%26); /* Write the Z character */
2039 compiler_error("dict word buffer overflow");
2041 /* Fill up to the end of the dictionary block with PAD characters
2042 (for safety, we right-pad to 9 chars even in V3) */
2044 for (; i<9; i++) wd[i]=5;
2046 /* The array of Z-chars is converted to two or three 2-byte blocks */
2047 ensure_memory_list_available(&prepared_sort_memlist, DICT_WORD_BYTES);
2049 tot = wd[2] + wd[1]*(1<<5) + wd[0]*(1<<10);
2050 prepared_sort[1]=tot%0x100;
2051 prepared_sort[0]=(tot/0x100)%0x100;
2052 tot = wd[5] + wd[4]*(1<<5) + wd[3]*(1<<10);
2053 prepared_sort[3]=tot%0x100;
2054 prepared_sort[2]=(tot/0x100)%0x100;
2055 if (version_number==3)
2058 tot = wd[8] + wd[7]*(1<<5) + wd[6]*(1<<10);
2059 prepared_sort[5]=tot%0x100;
2060 prepared_sort[4]=(tot/0x100)%0x100;
2062 /* Set the "end bit" on the 2nd (in v3) or the 3rd (v4+) 2-byte block */
2064 if (version_number==3) prepared_sort[2]+=0x80;
2065 else prepared_sort[4]+=0x80;
2067 if (optresult) copy_sorts(optresult, prepared_sort);
2070 /* Also used by verbs.c */
2071 static void dictionary_prepare_g(char *dword, uchar *optresult)
2077 prepared_dictflags_pos = 0;
2078 prepared_dictflags_neg = 0;
2080 for (i=0, j=0; (dword[j]!=0); j++) {
2081 if ((dword[j] == '/') && (dword[j+1] == '/')) {
2082 /* The rest of the word is dict flags. Run through them. */
2084 for (j+=2; dword[j] != 0; j++) {
2088 error_named("'//~' with no flag character (pn) in dict word", dword);
2093 prepared_dictflags_pos |= 4;
2095 prepared_dictflags_neg |= 4;
2100 prepared_dictflags_pos |= 128;
2102 prepared_dictflags_neg |= 128;
2106 error_named("Expected flag character (pn~) after '//' in dict word", dword);
2113 /* LONG_DICT_FLAG_BUG emulates the old behavior where we stop looping
2114 at DICT_WORD_SIZE. */
2115 if (LONG_DICT_FLAG_BUG && i>=DICT_WORD_SIZE)
2118 k= ((unsigned char *)dword)[j];
2120 warning_named("Obsolete usage: use the ^ character for the \
2121 apostrophe in", dword);
2124 if (k=='~') /* as in iso_to_alphabet_grid */
2127 if (k=='@' || (character_set_unicode && (k & 0x80))) {
2128 unicode = text_to_unicode(dword+j);
2129 j += textual_form_length - 1;
2132 unicode = iso_to_unicode_grid[k];
2135 if (DICT_CHAR_SIZE != 1 || (unicode >= 0 && unicode < 256)) {
2139 error("The dictionary cannot contain Unicode characters beyond Latin-1. \
2140 Define DICT_CHAR_SIZE=4 for a Unicode-compatible dictionary.");
2144 if (k >= (unsigned)'A' && k <= (unsigned)'Z')
2147 ensure_memory_list_available(&prepared_sort_memlist, DICT_WORD_BYTES);
2149 if (DICT_CHAR_SIZE == 1) {
2150 if (i<DICT_WORD_SIZE)
2151 prepared_sort[i++] = k;
2154 if (i<DICT_WORD_SIZE) {
2155 prepared_sort[4*i] = (k >> 24) & 0xFF;
2156 prepared_sort[4*i+1] = (k >> 16) & 0xFF;
2157 prepared_sort[4*i+2] = (k >> 8) & 0xFF;
2158 prepared_sort[4*i+3] = (k) & 0xFF;
2164 if (i > DICT_WORD_SIZE)
2165 compiler_error("dict word buffer overflow");
2167 /* Right-pad with zeroes */
2168 if (DICT_CHAR_SIZE == 1) {
2169 for (; i<DICT_WORD_SIZE; i++)
2170 prepared_sort[i] = 0;
2173 for (; i<DICT_WORD_SIZE; i++) {
2174 prepared_sort[4*i] = 0;
2175 prepared_sort[4*i+1] = 0;
2176 prepared_sort[4*i+2] = 0;
2177 prepared_sort[4*i+3] = 0;
2181 if (optresult) copy_sorts(optresult, prepared_sort);
2184 extern void dictionary_prepare(char *dword, uchar *optresult)
2187 dictionary_prepare_z(dword, optresult);
2189 dictionary_prepare_g(dword, optresult);
2192 /* ------------------------------------------------------------------------- */
2193 /* The arrays below are all concerned with the problem of alphabetically */
2194 /* sorting the dictionary during the compilation pass. */
2195 /* Note that it is not enough simply to apply qsort to the dictionary at */
2196 /* the end of the pass: we need to ensure that no duplicates are ever */
2199 /* dict_sort_codes[n] the sort code of record n: i.e., of the nth */
2200 /* word to be entered into the dictionary, where */
2201 /* n counts upward from 0 */
2202 /* (n is also called the "accession number") */
2204 /* The tree structure encodes an ordering. The special value VACANT means */
2205 /* "no node here": otherwise, node numbers are the same as accession */
2206 /* numbers. At all times, "root" holds the node number of the top of the */
2207 /* tree; each node has up to two branches, such that the subtree of the */
2208 /* left branch is always alphabetically before what's at the node, and */
2209 /* the subtree to the right is always after; and all branches are coloured */
2210 /* either "black" or "red". These colours are used to detect points where */
2211 /* the tree is growing asymmetrically (and therefore becoming inefficient */
2213 /* ------------------------------------------------------------------------- */
2220 typedef struct dict_tree_node_s
2221 { int branch[2]; /* Branch 0 is "left", 1 is "right" */
2222 char colour; /* The colour of the branch to the parent */
2225 static dict_tree_node *dtree; /* Allocated to dict_entries */
2226 static memory_list dtree_memlist;
2228 static uchar *dict_sort_codes; /* Allocated to dict_entries*DICT_WORD_BYTES */
2229 static memory_list dict_sort_codes_memlist;
2231 int *final_dict_order; /* Allocated at sort_dictionary() time */
2233 static void dictionary_begin_pass(void)
2235 /* Leave room for the 7-byte header (added in "tables.c" much later) */
2236 /* Glulx has a 4-byte header instead. */
2243 ensure_memory_list_available(&dictionary_memlist, dictionary_top);
2249 static int fdo_count;
2250 static void recursively_sort(int node)
2251 { if (dtree[node].branch[0] != VACANT)
2252 recursively_sort(dtree[node].branch[0]);
2253 final_dict_order[node] = fdo_count++;
2254 if (dtree[node].branch[1] != VACANT)
2255 recursively_sort(dtree[node].branch[1]);
2258 extern void sort_dictionary(void)
2260 final_dict_order = my_calloc(sizeof(int), dict_entries, "final dictionary ordering table");
2263 { fdo_count = 0; recursively_sort(root);
2267 /* ------------------------------------------------------------------------- */
2268 /* If "dword" is in the dictionary, return its accession number plus 1; */
2269 /* If not, return 0. */
2270 /* ------------------------------------------------------------------------- */
2272 static int dictionary_find(char *dword)
2275 dictionary_prepare(dword, NULL);
2277 while (at != VACANT)
2278 { n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
2279 if (n==0) return at + 1;
2280 if (n>0) at = dtree[at].branch[1]; else at = dtree[at].branch[0];
2285 /* ------------------------------------------------------------------------- */
2286 /* Add "dword" to the dictionary with (flag1,flag2,flag3) as its data */
2287 /* fields; unless it already exists, in which case OR the data fields with */
2290 /* These fields are one byte each in Z-code, two bytes each in Glulx. */
2292 /* Returns: the accession number. */
2293 /* ------------------------------------------------------------------------- */
2295 extern int dictionary_add(char *dword, int flag1, int flag2, int flag3)
2297 int ggfr = 0, gfr = 0, fr = 0, r = 0;
2298 int ggf = VACANT, gf = VACANT, f = VACANT, at = root;
2300 int res=((version_number==3)?4:6);
2302 /* Fill in prepared_sort and prepared_dictflags. */
2303 dictionary_prepare(dword, NULL);
2305 /* Adjust flag1 according to prepared_dictflags. */
2306 flag1 &= (~prepared_dictflags_neg);
2307 flag1 |= prepared_dictflags_pos;
2310 { root = 0; goto CreateEntry;
2314 n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
2318 p = dictionary+7 + at*DICT_ENTRY_BYTE_LENGTH + res;
2319 p[0] |= flag1; p[1] |= flag2;
2320 if (!ZCODE_LESS_DICT_DATA)
2324 p = dictionary+4 + at*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
2325 p[0] |= (flag1/256); p[1] |= (flag1%256);
2326 p[2] |= (flag2/256); p[3] |= (flag2%256);
2327 p[4] |= (flag3/256); p[5] |= (flag3%256);
2331 if (n>0) r=1; else r=0;
2333 a = dtree[at].branch[0]; b = dtree[at].branch[1];
2334 if ((a != VACANT) && (dtree[a].colour == RED) &&
2335 (b != VACANT) && (dtree[b].colour == RED))
2336 { dtree[a].colour = BLACK;
2337 dtree[b].colour = BLACK;
2339 dtree[at].colour = RED;
2341 /* A tree rotation may be needed to avoid two red links in a row:
2343 ggf (or else gf is root) ggf (or f is root)
2346 / \(red) / \ (both red)
2352 In effect we rehang the "gf" subtree from "f".
2353 See the Technical Manual for further details.
2356 if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
2359 { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
2360 dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
2361 dtree[f].branch[1-fr] = gf;
2362 dtree[f].colour = BLACK;
2363 dtree[gf].colour = RED;
2364 gf = ggf; gfr = ggfr;
2367 { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
2368 dtree[at].colour = BLACK;
2369 dtree[gf].colour = RED;
2370 dtree[f].branch[fr] = dtree[at].branch[gfr];
2371 dtree[gf].branch[gfr] = dtree[at].branch[fr];
2372 dtree[at].branch[gfr] = f;
2373 dtree[at].branch[fr] = gf;
2375 r = 1-r; n = at; if (r==fr) at = f; else at = gf;
2376 f = n; gf = ggf; fr = 1-r; gfr = ggfr;
2381 if (dtree[at].branch[r] == VACANT)
2382 { dtree[at].colour = RED;
2384 if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
2386 { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
2387 dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
2388 dtree[f].branch[1-fr] = gf;
2389 dtree[f].colour = BLACK;
2390 dtree[gf].colour = RED;
2393 { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
2394 dtree[at].colour = BLACK;
2395 dtree[gf].colour = RED;
2396 dtree[f].branch[fr] = dtree[at].branch[gfr];
2397 dtree[gf].branch[gfr] = dtree[at].branch[fr];
2398 dtree[at].branch[gfr] = f;
2399 dtree[at].branch[fr] = gf;
2401 r = 1-r; n = at; if (r==fr) at = f; else at = gf;
2405 dtree[at].branch[r] = dict_entries;
2408 ggf = gf; gf = f; f = at; at = dtree[at].branch[r];
2409 ggfr = gfr; gfr = fr; fr = r;
2414 ensure_memory_list_available(&dtree_memlist, dict_entries+1);
2415 ensure_memory_list_available(&dict_sort_codes_memlist, (dict_entries+1)*DICT_WORD_BYTES);
2417 dtree[dict_entries].branch[0] = VACANT;
2418 dtree[dict_entries].branch[1] = VACANT;
2419 dtree[dict_entries].colour = BLACK;
2421 /* Address in Inform's own dictionary table to write the record to */
2425 ensure_memory_list_available(&dictionary_memlist, dictionary_top + DICT_ENTRY_BYTE_LENGTH);
2426 p = dictionary + DICT_ENTRY_BYTE_LENGTH*dict_entries + 7;
2428 /* So copy in the 4 (or 6) bytes of Z-coded text and the 3 data
2431 p[0]=prepared_sort[0]; p[1]=prepared_sort[1];
2432 p[2]=prepared_sort[2]; p[3]=prepared_sort[3];
2433 if (version_number > 3)
2434 { p[4]=prepared_sort[4]; p[5]=prepared_sort[5]; }
2435 p[res]=flag1; p[res+1]=flag2;
2436 if (!ZCODE_LESS_DICT_DATA) p[res+2]=flag3;
2438 dictionary_top += DICT_ENTRY_BYTE_LENGTH;
2443 ensure_memory_list_available(&dictionary_memlist, dictionary_top + DICT_ENTRY_BYTE_LENGTH);
2444 p = dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*dict_entries;
2445 p[0] = 0x60; /* type byte -- dict word */
2447 p += DICT_CHAR_SIZE;
2448 for (i=0; i<DICT_WORD_BYTES; i++)
2449 p[i] = prepared_sort[i];
2451 p += DICT_WORD_BYTES;
2452 p[0] = (flag1/256); p[1] = (flag1%256);
2453 p[2] = (flag2/256); p[3] = (flag2%256);
2454 p[4] = (flag3/256); p[5] = (flag3%256);
2456 dictionary_top += DICT_ENTRY_BYTE_LENGTH;
2460 copy_sorts(dict_sort_codes+dict_entries*DICT_WORD_BYTES, prepared_sort);
2462 return dict_entries++;
2465 /* ------------------------------------------------------------------------- */
2466 /* Used in "tables.c" for "Extend ... only", to renumber a verb-word to a */
2467 /* new verb syntax of its own. (Otherwise existing verb-words never */
2468 /* change their verb-numbers.) */
2469 /* ------------------------------------------------------------------------- */
2471 extern void dictionary_set_verb_number(char *dword, int to)
2473 int res=((version_number==3)?4:6);
2474 i=dictionary_find(dword);
2478 p=dictionary+7+(i-1)*DICT_ENTRY_BYTE_LENGTH+res;
2482 p=dictionary+4 + (i-1)*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
2483 p[2]=to/256; p[3]=to%256;
2488 /* ------------------------------------------------------------------------- */
2489 /* Tracing code for the dictionary: used by "trace" and text */
2490 /* transcription. */
2491 /* ------------------------------------------------------------------------- */
2493 /* In the dictionary-showing code, if d_show_buf is NULL, the text is
2494 printed directly. (The "Trace dictionary" directive does this.)
2495 If d_show_buf is not NULL, we add words to it (reallocing if necessary)
2496 until it's a page-width.
2498 static char *d_show_buf = NULL;
2499 static int d_show_size; /* allocated size */
2500 static int d_show_len; /* current length */
2502 static void show_char(char c)
2504 if (d_show_buf == NULL) {
2508 if (d_show_len+2 >= d_show_size) {
2509 int newsize = 2 * d_show_len + 16;
2510 my_realloc(&d_show_buf, d_show_size, newsize, "dictionary display buffer");
2511 d_show_size = newsize;
2513 d_show_buf[d_show_len++] = c;
2514 d_show_buf[d_show_len] = '\0';
2518 /* Display a Unicode character in user-readable form. This uses the same
2519 character encoding as the source code. */
2520 static void show_uchar(uint32 c)
2526 /* ASCII always works */
2530 if (character_set_unicode) {
2531 /* UTF-8 the character */
2535 else if (c < 0x800) {
2536 show_char((0xC0 | ((c & 0x7C0) >> 6)));
2537 show_char((0x80 | (c & 0x03F) ));
2539 else if (c < 0x10000) {
2540 show_char((0xE0 | ((c & 0xF000) >> 12)));
2541 show_char((0x80 | ((c & 0x0FC0) >> 6)));
2542 show_char((0x80 | (c & 0x003F) ));
2544 else if (c < 0x200000) {
2545 show_char((0xF0 | ((c & 0x1C0000) >> 18)));
2546 show_char((0x80 | ((c & 0x03F000) >> 12)));
2547 show_char((0x80 | ((c & 0x000FC0) >> 6)));
2548 show_char((0x80 | (c & 0x00003F) ));
2555 if (character_set_setting == 1 && c < 0x100) {
2556 /* Fits in Latin-1 */
2560 /* Supporting other character_set_setting is harder; not currently implemented. */
2562 /* Use the escaped form */
2563 sprintf(buf, "@{%x}", c);
2564 for (ix=0; buf[ix]; ix++)
2568 extern void word_to_ascii(uchar *p, char *results)
2569 { int i, shift, cc, zchar; uchar encoded_word[9];
2570 encoded_word[0] = (((int) p[0])&0x7c)/4;
2571 encoded_word[1] = 8*(((int) p[0])&0x3) + (((int) p[1])&0xe0)/32;
2572 encoded_word[2] = ((int) p[1])&0x1f;
2573 encoded_word[3] = (((int) p[2])&0x7c)/4;
2574 encoded_word[4] = 8*(((int) p[2])&0x3) + (((int) p[3])&0xe0)/32;
2575 encoded_word[5] = ((int) p[3])&0x1f;
2576 if (version_number > 3)
2577 { encoded_word[6] = (((int) p[4])&0x7c)/4;
2578 encoded_word[7] = 8*(((int) p[4])&0x3) + (((int) p[5])&0xe0)/32;
2579 encoded_word[8] = ((int) p[5])&0x1f;
2583 encoded_word[6] = encoded_word[7] = encoded_word[8] = 0;
2587 for (i=0; i< ((version_number==3)?6:9); i++)
2588 { zchar = encoded_word[i];
2590 if (zchar == 4) shift = 1;
2592 if (zchar == 5) shift = 2;
2594 { if ((shift == 2) && (zchar == 6))
2595 { zchar = 32*encoded_word[i+1] + encoded_word[i+2];
2597 if ((zchar>=32) && (zchar<=126))
2598 results[cc++] = zchar;
2600 { zscii_to_text(results+cc, zchar);
2601 cc = strlen(results);
2605 { zscii_to_text(results+cc, (alphabet[shift])[zchar-6]);
2606 cc = strlen(results);
2614 /* Print a dictionary word to stdout.
2615 (This assumes that d_show_buf is null.)
2617 void print_dict_word(int node)
2623 char textual_form[32];
2624 p = (uchar *)dictionary + 7 + DICT_ENTRY_BYTE_LENGTH*node;
2626 word_to_ascii(p, textual_form);
2628 for (cprinted = 0; textual_form[cprinted]!=0; cprinted++)
2629 show_char(textual_form[cprinted]);
2632 p = (uchar *)dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*node;
2634 for (cprinted = 0; cprinted<DICT_WORD_SIZE; cprinted++)
2637 if (DICT_CHAR_SIZE == 1)
2640 ch = (p[4*cprinted+4] << 24) + (p[4*cprinted+5] << 16) + (p[4*cprinted+6] << 8) + (p[4*cprinted+7]);
2648 static void recursively_show_z(int node, int level)
2649 { int i, cprinted, flags; uchar *p;
2650 char textual_form[32];
2651 int res = (version_number == 3)?4:6; /* byte length of encoded text */
2653 if (dtree[node].branch[0] != VACANT)
2654 recursively_show_z(dtree[node].branch[0], level);
2656 p = (uchar *)dictionary + 7 + DICT_ENTRY_BYTE_LENGTH*node;
2658 word_to_ascii(p, textual_form);
2660 for (cprinted = 0; textual_form[cprinted]!=0; cprinted++)
2661 show_char(textual_form[cprinted]);
2662 for (; cprinted < 4 + ((version_number==3)?6:9); cprinted++)
2665 /* The level-1 info can only be printfed (d_show_buf must be null). */
2666 if (d_show_buf == NULL && level >= 1)
2669 for (i=0; i<DICT_ENTRY_BYTE_LENGTH; i++) printf("%02x ",p[i]);
2672 flags = (int) p[res];
2682 { if (grammar_version_number == 1)
2683 printf("preposition:%d ", (int) p[res+2]);
2685 printf("preposition ");
2687 if ((flags & 3) == 3) printf("metaverb:%d ", (int) p[res+1]);
2688 else if ((flags & 3) == 1) printf("verb:%d ", (int) p[res+1]);
2692 /* Show five words per line in classic TRANSCRIPT_FORMAT; one per line in the new format. */
2693 if (d_show_buf && (d_show_len >= 64 || TRANSCRIPT_FORMAT == 1))
2695 write_to_transcript_file(d_show_buf, STRCTX_DICT);
2699 if (dtree[node].branch[1] != VACANT)
2700 recursively_show_z(dtree[node].branch[1], level);
2703 static void recursively_show_g(int node, int level)
2707 if (dtree[node].branch[0] != VACANT)
2708 recursively_show_g(dtree[node].branch[0], level);
2710 p = (uchar *)dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*node;
2712 for (cprinted = 0; cprinted<DICT_WORD_SIZE; cprinted++)
2715 if (DICT_CHAR_SIZE == 1)
2718 ch = (p[4*cprinted+4] << 24) + (p[4*cprinted+5] << 16) + (p[4*cprinted+6] << 8) + (p[4*cprinted+7]);
2723 for (; cprinted<DICT_WORD_SIZE+4; cprinted++)
2726 /* The level-1 info can only be printfed (d_show_buf must be null). */
2727 if (d_show_buf == NULL && level >= 1)
2728 { int flagpos = (DICT_CHAR_SIZE == 1) ? (DICT_WORD_SIZE+1) : (DICT_WORD_BYTES+4);
2729 int flags = (p[flagpos+0] << 8) | (p[flagpos+1]);
2730 int verbnum = (p[flagpos+2] << 8) | (p[flagpos+3]);
2732 for (i=0; i<DICT_ENTRY_BYTE_LENGTH; i++) printf("%02x ",p[i]);
2743 { printf("preposition ");
2745 if ((flags & 3) == 3) printf("metaverb:%d ", verbnum);
2746 else if ((flags & 3) == 1) printf("verb:%d ", verbnum);
2750 /* Show five words per line in classic TRANSCRIPT_FORMAT; one per line in the new format. */
2751 if (d_show_buf && (d_show_len >= 64 || TRANSCRIPT_FORMAT == 1))
2753 write_to_transcript_file(d_show_buf, STRCTX_DICT);
2757 if (dtree[node].branch[1] != VACANT)
2758 recursively_show_g(dtree[node].branch[1], level);
2761 static void show_alphabet(int i)
2762 { int j, c; char chartext[8];
2764 for (j=0; j<26; j++)
2765 { c = alphabet[i][j];
2767 if (alphabet_used[26*i+j] == 'N') printf("("); else printf(" ");
2769 zscii_to_text(chartext, c);
2770 printf("%s", chartext);
2772 if (alphabet_used[26*i+j] == 'N') printf(")"); else printf(" ");
2777 extern void show_dictionary(int level)
2779 /* Level 0: show words only. Level 1: show words and flags.
2780 Level 2: also show bytes.*/
2781 printf("Dictionary contains %d entries:\n",dict_entries);
2782 if (dict_entries != 0)
2783 { d_show_len = 0; d_show_buf = NULL;
2785 recursively_show_z(root, level);
2787 recursively_show_g(root, level);
2791 printf("\nZ-machine alphabet entries:\n");
2798 extern void write_dictionary_to_transcript(void)
2800 d_show_size = 80; /* initial size */
2801 d_show_buf = my_malloc(d_show_size, "dictionary display buffer");
2803 write_to_transcript_file("", STRCTX_INFO);
2804 sprintf(d_show_buf, "[Dictionary contains %d entries:]", dict_entries);
2805 write_to_transcript_file(d_show_buf, STRCTX_INFO);
2809 if (dict_entries != 0)
2812 recursively_show_z(root, 0);
2814 recursively_show_g(root, 0);
2816 if (d_show_len != 0) write_to_transcript_file(d_show_buf, STRCTX_DICT);
2818 my_free(&d_show_buf, "dictionary display buffer");
2819 d_show_len = 0; d_show_buf = NULL;
2822 /* ========================================================================= */
2823 /* Data structure management routines */
2824 /* ------------------------------------------------------------------------- */
2826 extern void init_text_vars(void)
2837 translated_text = NULL;
2841 for (j=0; j<256; j++) abbrevs_lookup[j] = -1;
2843 total_zchars_trans = 0;
2848 final_dict_order = NULL;
2849 dict_sort_codes = NULL;
2850 prepared_sort = NULL;
2853 static_strings_area = NULL;
2854 abbreviations_optimal_parse_schedule = NULL;
2855 abbreviations_optimal_parse_scores = NULL;
2857 compressed_offsets = NULL;
2858 huff_entities = NULL;
2860 unicode_usage_entries = NULL;
2863 extern void text_begin_pass(void)
2864 { abbrevs_lookup_table_made = FALSE;
2866 abbreviations_totaltext=0;
2867 total_chars_trans=0; total_bytes_trans=0;
2869 dictionary_begin_pass();
2870 low_strings_top = 0;
2872 static_strings_extent = 0;
2874 no_dynamic_strings = 0;
2875 no_unicode_chars = 0;
2878 /* Note: for allocation and deallocation of all_the_text, see inform.c */
2880 extern void text_allocate_arrays(void)
2884 initialise_memory_list(&translated_text_memlist,
2885 sizeof(uchar), 8000, (void**)&translated_text,
2886 "translated text holding area");
2888 initialise_memory_list(&temp_symbol_memlist,
2889 sizeof(char), 32, (void**)&temp_symbol,
2890 "temporary symbol name");
2892 initialise_memory_list(&all_text_memlist,
2893 sizeof(char), 0, (void**)&all_text,
2894 "transcription text for optimise");
2896 initialise_memory_list(&static_strings_area_memlist,
2897 sizeof(uchar), 128, (void**)&static_strings_area,
2898 "static strings area");
2900 initialise_memory_list(&abbreviations_text_memlist,
2901 sizeof(char), 64, (void**)&abbreviations_text,
2902 "abbreviation text");
2904 initialise_memory_list(&abbreviations_memlist,
2905 sizeof(abbreviation), 64, (void**)&abbreviations,
2908 initialise_memory_list(&abbreviations_optimal_parse_schedule_memlist,
2909 sizeof(int), 0, (void**)&abbreviations_optimal_parse_schedule,
2910 "abbreviations optimal parse schedule");
2911 initialise_memory_list(&abbreviations_optimal_parse_scores_memlist,
2912 sizeof(int), 0, (void**)&abbreviations_optimal_parse_scores,
2913 "abbreviations optimal parse scores");
2915 initialise_memory_list(&dtree_memlist,
2916 sizeof(dict_tree_node), 1500, (void**)&dtree,
2917 "red-black tree for dictionary");
2918 initialise_memory_list(&dict_sort_codes_memlist,
2919 sizeof(uchar), 1500*DICT_WORD_BYTES, (void**)&dict_sort_codes,
2920 "dictionary sort codes");
2921 initialise_memory_list(&prepared_sort_memlist,
2922 sizeof(uchar), DICT_WORD_BYTES, (void**)&prepared_sort,
2923 "prepared sort buffer");
2925 final_dict_order = NULL; /* will be allocated at sort_dictionary() time */
2927 /* The exact size will be 7+7*num for z3, 7+9*num for z4+,
2928 4+DICT_ENTRY_BYTE_LENGTH*num for Glulx. But this is just an initial
2929 allocation; we don't have to be precise. */
2930 initialise_memory_list(&dictionary_memlist,
2931 sizeof(uchar), 1000*DICT_ENTRY_BYTE_LENGTH, (void**)&dictionary,
2934 initialise_memory_list(&low_strings_memlist,
2935 sizeof(uchar), 1024, (void**)&low_strings,
2936 "low (abbreviation) strings");
2942 huff_entities = NULL;
2944 unicode_usage_entries = NULL;
2945 done_compression = FALSE;
2946 compression_table_size = 0;
2947 compressed_offsets = NULL;
2949 initialise_memory_list(&unicode_usage_entries_memlist,
2950 sizeof(unicode_usage_t), 0, (void**)&unicode_usage_entries,
2951 "unicode entity entries");
2953 /* hufflist and huff_entities will be allocated at compress_game_text() time. */
2955 /* This hash table is only used in Glulx */
2956 for (ix=0; ix<UNICODE_HASH_BUCKETS; ix++)
2957 unicode_usage_hash[ix] = -1;
2959 initialise_memory_list(&compressed_offsets_memlist,
2960 sizeof(int32), 0, (void**)&compressed_offsets,
2961 "static strings index table");
2964 extern void extract_all_text()
2966 /* optimise_abbreviations() is called after free_arrays(). Therefore,
2967 we need to preserve the text transcript where it will not be
2968 freed up. We do this by copying the pointer to opttext. */
2970 opttextlen = all_text_top;
2972 /* Re-init all_text_memlist. This causes it to forget all about the
2973 old pointer. Deallocating it in text_free_arrays() will be a no-op. */
2974 initialise_memory_list(&all_text_memlist,
2975 sizeof(char), 0, (void**)&all_text,
2976 "dummy transcription text");
2979 extern void text_free_arrays(void)
2981 deallocate_memory_list(&translated_text_memlist);
2982 deallocate_memory_list(&temp_symbol_memlist);
2984 deallocate_memory_list(&all_text_memlist);
2986 deallocate_memory_list(&low_strings_memlist);
2987 deallocate_memory_list(&abbreviations_text_memlist);
2988 deallocate_memory_list(&abbreviations_memlist);
2990 deallocate_memory_list(&abbreviations_optimal_parse_schedule_memlist);
2991 deallocate_memory_list(&abbreviations_optimal_parse_scores_memlist);
2993 deallocate_memory_list(&dtree_memlist);
2994 deallocate_memory_list(&dict_sort_codes_memlist);
2995 deallocate_memory_list(&prepared_sort_memlist);
2996 my_free(&final_dict_order, "final dictionary ordering table");
2998 deallocate_memory_list(&dictionary_memlist);
3000 deallocate_memory_list(&compressed_offsets_memlist);
3001 my_free(&hufflist, "huffman node list");
3002 my_free(&huff_entities, "huffman entities");
3004 deallocate_memory_list(&unicode_usage_entries_memlist);
3006 deallocate_memory_list(&static_strings_area_memlist);
3009 extern void ao_free_arrays(void)
3011 /* Called only after optimise_abbreviations() runs. */
3015 for (i=0; i<MAX_BESTYET; i++) {
3016 my_free(&bestyet[i].text, "bestyet.text");
3020 for (i=0; i<MAX_ABBREVS; i++) {
3021 my_free(&bestyet2[i].text, "bestyet2.text");
3025 my_free (&opttext,"stashed transcript for optimisation");
3026 my_free (&bestyet,"bestyet");
3027 my_free (&bestyet2,"bestyet2");
3028 my_free (&grandtable,"grandtable");
3029 my_free (&grandflags,"grandflags");
3031 deallocate_memory_list(&tlbtab_memlist);
3033 /* This was re-inited, so we should re-deallocate it. */
3034 deallocate_memory_list(&all_text_memlist);
3037 /* ========================================================================= */