1 /* ------------------------------------------------------------------------- */
2 /* "text" : Text translation, the abbreviations optimiser, the dictionary */
4 /* Part of Inform 6.41 */
5 /* copyright (c) Graham Nelson 1993 - 2022 */
7 /* Inform is free software: you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation, either version 3 of the License, or */
10 /* (at your option) any later version. */
12 /* Inform is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with Inform. If not, see https://gnu.org/licenses/ */
20 /* ------------------------------------------------------------------------- */
24 uchar *low_strings; /* Allocated to low_strings_top */
25 int32 low_strings_top;
26 static memory_list low_strings_memlist;
28 int32 static_strings_extent; /* Number of bytes of static strings
30 uchar *static_strings_area; /* Used to hold the static strings
32 Allocated to static_strings_extent */
33 memory_list static_strings_area_memlist;
35 static char *all_text; /* Text buffer holding the entire text
36 of the game, when it is being
38 (Allocated to all_text_top) */
39 static memory_list all_text_memlist;
40 static int32 all_text_top;
42 int abbrevs_lookup_table_made, /* The abbreviations lookup table is
43 constructed when the first non-
44 abbreviation string is translated:
45 this flag is TRUE after that */
46 abbrevs_lookup[256]; /* Once this has been constructed,
47 abbrevs_lookup[n] = the smallest
48 number of any abbreviation beginning
49 with ASCII character n, or -1
50 if none of the abbreviations do */
51 int no_abbreviations; /* No of abbreviations defined so far */
52 /* ------------------------------------------------------------------------- */
53 /* Glulx string compression storage */
54 /* ------------------------------------------------------------------------- */
56 int no_strings; /* No of strings in static strings
58 int no_dynamic_strings; /* No. of @.. string escapes used
59 (actually, the highest value used
61 int no_unicode_chars; /* Number of distinct Unicode chars
62 used. (Beyond 0xFF.) */
64 huffentity_t *huff_entities; /* The list of entities (characters,
65 abbreviations, @.. escapes, and
67 static huffentity_t **hufflist; /* Copy of the list, for sorting */
69 int no_huff_entities; /* The number of entities in the list */
70 int huff_unicode_start; /* Position in the list where Unicode
72 int huff_abbrev_start; /* Position in the list where string
73 abbreviations begin. */
74 int huff_dynam_start; /* Position in the list where @..
76 int huff_entity_root; /* The position in the list of the root
77 entry (when considering the table
80 int done_compression; /* Has the game text been compressed? */
81 int32 compression_table_size; /* Length of the Huffman table, in
83 int32 compression_string_size; /* Length of the compressed string
85 int32 *compressed_offsets; /* The beginning of every string in
86 the game, relative to the beginning
87 of the Huffman table. (So entry 0
88 is equal to compression_table_size).
89 Allocated to no_strings at
90 compress_game_text() time. */
91 static memory_list compressed_offsets_memlist;
93 unicode_usage_t *unicode_usage_entries; /* Allocated to no_unicode_chars */
94 static memory_list unicode_usage_entries_memlist;
96 #define UNICODE_HASH_BUCKETS (64)
97 static int unicode_usage_hash[UNICODE_HASH_BUCKETS];
99 static int unicode_entity_index(int32 unicode);
101 /* ------------------------------------------------------------------------- */
102 /* Abbreviation arrays */
103 /* ------------------------------------------------------------------------- */
105 abbreviation *abbreviations; /* Allocated up to no_abbreviations */
106 static memory_list abbreviations_memlist;
108 /* Memory to hold the text of any abbreviation strings declared. This is
109 counted in units of MAX_ABBREV_LENGTH bytes. (An abbreviation must fit
110 in that many bytes, null included.) */
111 uchar *abbreviations_at; /* Allocated up to no_abbreviations */
112 static memory_list abbreviations_at_memlist;
114 static int *abbreviations_optimal_parse_schedule;
115 static memory_list abbreviations_optimal_parse_schedule_memlist;
117 static int *abbreviations_optimal_parse_scores;
118 static memory_list abbreviations_optimal_parse_scores_memlist;
120 /* ------------------------------------------------------------------------- */
122 int32 total_chars_trans, /* Number of ASCII chars of text in */
123 total_bytes_trans, /* Number of bytes of Z-code text out */
124 zchars_trans_in_last_string; /* Number of Z-chars in last string:
125 needed only for abbrev efficiency
126 calculation in "directs.c" */
127 static int32 total_zchars_trans; /* Number of Z-chars of text out
128 (only used to calculate the above) */
130 static int zchars_out_buffer[3], /* During text translation, a buffer of
131 3 Z-chars at a time: when it's full
132 these are written as a 2-byte word */
133 zob_index; /* Index (0 to 2) into it */
135 uchar *translated_text; /* Area holding translated strings
136 until they are moved into the
137 static_strings_area below */
138 static memory_list translated_text_memlist;
140 static int32 text_out_pos; /* The "program counter" during text
141 translation: the next position to
142 write Z-coded text output to */
144 static int32 text_out_limit; /* The upper limit of text_out_pos
145 during text translation (or -1
148 static int text_out_overflow; /* During text translation, becomes
149 true if text_out_pos tries to pass
152 /* ------------------------------------------------------------------------- */
153 /* For variables/arrays used by the dictionary manager, see below */
154 /* ------------------------------------------------------------------------- */
156 /* ------------------------------------------------------------------------- */
157 /* Prepare the abbreviations lookup table (used to speed up abbreviation */
158 /* detection in text translation). We first bubble-sort the abbrevs into */
159 /* alphabetical order (this is necessary for the detection algorithm to */
160 /* to work). Since the table is only prepared once, and for a table */
161 /* of size at most 96, there's no point using an efficient sort algorithm. */
162 /* ------------------------------------------------------------------------- */
164 static void make_abbrevs_lookup(void)
165 { int bubble_sort, j, k, l; char p[MAX_ABBREV_LENGTH]; char *p1, *p2;
167 { bubble_sort = FALSE;
168 for (j=0; j<no_abbreviations; j++)
169 for (k=j+1; k<no_abbreviations; k++)
170 { p1=(char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
171 p2=(char *)abbreviations_at+k*MAX_ABBREV_LENGTH;
173 { strcpy(p,p1); strcpy(p1,p2); strcpy(p2,p);
174 l=abbreviations[j].value; abbreviations[j].value=abbreviations[k].value;
175 abbreviations[k].value=l;
176 l=abbreviations[j].quality; abbreviations[j].quality=abbreviations[k].quality;
177 abbreviations[k].quality=l;
181 } while (bubble_sort);
183 for (j=no_abbreviations-1; j>=0; j--)
184 { p1=(char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
185 abbrevs_lookup[(uchar)p1[0]]=j;
186 abbreviations[j].freq=0;
188 abbrevs_lookup_table_made = TRUE;
191 /* ------------------------------------------------------------------------- */
192 /* Search the abbreviations lookup table (a routine which must be fast). */
193 /* The source text to compare is text[i], text[i+1], ... and this routine */
194 /* is only called if text[i] is indeed the first character of at least one */
195 /* abbreviation, "from" begin the least index into the abbreviations table */
196 /* of an abbreviation for which text[i] is the first character. Recall */
197 /* that the abbrevs table is in alphabetical order. */
199 /* The return value is -1 if there is no match. If there is a match, the */
200 /* text to be abbreviated out is over-written by a string of null chars */
201 /* with "ASCII" value 1, and the abbreviation number is returned. */
203 /* In Glulx, we *do not* do this overwriting with 1's. */
204 /* ------------------------------------------------------------------------- */
206 static int try_abbreviations_from(unsigned char *text, int i, int from)
207 { int j, k; uchar *p, c;
209 for (j=from, p=(uchar *)abbreviations_at+from*MAX_ABBREV_LENGTH;
210 (j<no_abbreviations)&&(c==p[0]); j++, p+=MAX_ABBREV_LENGTH)
211 { if (text[i+1]==p[1])
212 { for (k=2; p[k]!=0; k++)
213 if (text[i+k]!=p[k]) goto NotMatched;
215 for (k=0; p[k]!=0; k++) text[i+k]=1;
217 abbreviations[j].freq++;
225 extern void make_abbreviation(char *text)
227 /* If -e mode is off, we won't waste space creating an abbreviation entry. */
231 ensure_memory_list_available(&abbreviations_memlist, no_abbreviations+1);
232 ensure_memory_list_available(&abbreviations_at_memlist, no_abbreviations+1);
234 strcpy((char *)abbreviations_at
235 + no_abbreviations*MAX_ABBREV_LENGTH, text);
237 abbreviations[no_abbreviations].value = compile_string(text, STRCTX_ABBREV);
238 abbreviations[no_abbreviations].freq = 0;
240 /* The quality is the number of Z-chars saved by using this */
241 /* abbreviation: note that it takes 2 Z-chars to print it. */
243 abbreviations[no_abbreviations].quality = zchars_trans_in_last_string - 2;
245 if (abbreviations[no_abbreviations].quality <= 0) {
246 warning_named("Abbreviation does not save any characters:", text);
252 /* ------------------------------------------------------------------------- */
253 /* The front end routine for text translation. */
254 /* strctx indicates the purpose of the string. This is mostly used for */
255 /* informational output (gametext.txt), but we treat some string contexts */
256 /* specially during compilation. */
257 /* ------------------------------------------------------------------------- */
259 extern int32 compile_string(char *b, int strctx)
264 if (execution_never_reaches_here) {
265 /* No need to put strings into gametext.txt or the static/low
267 if (strctx == STRCTX_GAME || strctx == STRCTX_GAMEOPC || strctx == STRCTX_LOWSTRING || strctx == STRCTX_INFIX) {
268 /* VENEER and VENEEROPC are only used at the translate_text level,
269 so we don't have to catch them here. */
274 /* In Z-code, abbreviations go in the low memory pool (0x100). So
275 do strings explicitly defined with the Lowstring directive.
276 (In Glulx, the in_low_memory flag is ignored.) */
277 in_low_memory = (strctx == STRCTX_ABBREV || strctx == STRCTX_LOWSTRING);
279 if (!glulx_mode && in_low_memory)
281 k = translate_text(-1, b, strctx);
283 error("text translation failed");
286 ensure_memory_list_available(&low_strings_memlist, low_strings_top+k);
287 memcpy(low_strings+low_strings_top, translated_text, k);
289 low_strings_top += k;
293 if (glulx_mode && done_compression)
294 compiler_error("Tried to add a string after compression was done.");
296 i = translate_text(-1, b, strctx);
298 error("text translation failed");
302 /* Insert null bytes as needed to ensure that the next static string */
303 /* also occurs at an address expressible as a packed address */
307 if (oddeven_packing_switch)
308 textalign = scale_factor*2;
310 textalign = scale_factor;
311 while ((i%textalign)!=0)
313 ensure_memory_list_available(&translated_text_memlist, i+2);
314 translated_text[i++] = 0;
315 translated_text[i++] = 0;
319 j = static_strings_extent;
321 ensure_memory_list_available(&static_strings_area_memlist, static_strings_extent+i);
322 for (c=translated_text; c<translated_text+i;
323 c++, static_strings_extent++)
324 static_strings_area[static_strings_extent] = *c;
327 return(j/scale_factor);
330 /* The marker value is a one-based string number. (We reserve zero
331 to mean "not a string at all". */
332 return (++no_strings);
336 /* ------------------------------------------------------------------------- */
337 /* Output a single Z-character into the buffer, and flush it if full */
338 /* ------------------------------------------------------------------------- */
340 static void write_z_char_z(int i)
343 total_zchars_trans++;
344 zchars_out_buffer[zob_index++]=(i%32);
345 if (zob_index!=3) return;
347 j= zchars_out_buffer[0]*0x0400 + zchars_out_buffer[1]*0x0020
348 + zchars_out_buffer[2];
350 if (text_out_limit >= 0) {
351 if (text_out_pos+2 > text_out_limit) {
352 text_out_overflow = TRUE;
357 ensure_memory_list_available(&translated_text_memlist, text_out_pos+2);
360 translated_text[text_out_pos++] = j/256; translated_text[text_out_pos++] = j%256;
361 total_bytes_trans+=2;
364 static void write_zscii(int zsc)
366 int lookup_value, in_alphabet;
373 if (zsc < 0x100) lookup_value = zscii_to_alphabet_grid[zsc];
375 else lookup_value = -1;
377 if (lookup_value >= 0)
378 { alphabet_used[lookup_value] = 'Y';
379 in_alphabet = lookup_value/26;
380 if (in_alphabet==1) write_z_char_z(4); /* SHIFT to A1 */
381 if (in_alphabet==2) write_z_char_z(5); /* SHIFT to A2 */
382 write_z_char_z(lookup_value%26 + 6);
385 { write_z_char_z(5); write_z_char_z(6);
386 write_z_char_z(zsc/32); write_z_char_z(zsc%32);
390 /* ------------------------------------------------------------------------- */
391 /* Finish a Z-coded string, padding out with Z-char 5s if necessary and */
392 /* setting the "end" bit on the final 2-byte word */
393 /* ------------------------------------------------------------------------- */
395 static void end_z_chars(void)
397 zchars_trans_in_last_string=total_zchars_trans-zchars_trans_in_last_string;
398 while (zob_index!=0) write_z_char_z(5);
399 if (text_out_pos < 2) {
400 /* Something went wrong. */
401 text_out_overflow = TRUE;
404 translated_text[text_out_pos-2] += 128;
407 /* Glulx handles this much more simply -- compression is done elsewhere. */
408 static void write_z_char_g(int i)
411 if (text_out_limit >= 0) {
412 if (text_out_pos+1 > text_out_limit) {
413 text_out_overflow = TRUE;
418 ensure_memory_list_available(&translated_text_memlist, text_out_pos+1);
420 total_zchars_trans++;
421 translated_text[text_out_pos++] = i;
425 /* Helper routine to compute the weight, in units, of a character handled by the Z-Machine */
426 static int zchar_weight(int c)
428 int lookup = iso_to_alphabet_grid[c];
429 if (lookup < 0) return 4;
430 if (lookup < 26) return 1;
434 /* ------------------------------------------------------------------------- */
435 /* The main routine "text.c" provides to the rest of Inform: the text */
436 /* translator. s_text is the source text and the return value is the */
437 /* number of bytes translated. */
438 /* The translated text will be stored in translated_text. */
440 /* If p_limit is >= 0, the text length will not exceed that many bytes. */
441 /* If the translation tries to overflow this boundary, the return value */
442 /* will be -1. (You should display an error and not read translated_text.) */
444 /* If p_limit is negative, any amount of text is accepted (up to int32 */
447 /* Note that the source text may be corrupted by this routine. */
448 /* ------------------------------------------------------------------------- */
450 extern int32 translate_text(int32 p_limit, char *s_text, int strctx)
451 { int i, j, k, in_alphabet, lookup_value, is_abbreviation;
452 int32 unicode; int zscii;
453 unsigned char *text_in;
456 ensure_memory_list_available(&translated_text_memlist, p_limit);
459 /* For STRCTX_ABBREV, the string being translated is itself an
460 abbreviation string, so it can't make use of abbreviations. Set
461 the is_abbreviation flag to indicate this.
462 The compiler has historically set this flag for the Lowstring
463 directive as well -- the in_low_memory and is_abbreviation flag were
464 always the same. I am preserving that convention. */
465 is_abbreviation = (strctx == STRCTX_ABBREV || strctx == STRCTX_LOWSTRING);
468 /* Cast the input and output streams to unsigned char: text_out_pos will
469 advance as bytes of Z-coded text are written, but text_in doesn't */
471 text_in = (unsigned char *) s_text;
473 text_out_limit = p_limit;
474 text_out_overflow = FALSE;
476 /* Remember the Z-chars total so that later we can subtract to find the
477 number of Z-chars translated on this string */
479 zchars_trans_in_last_string = total_zchars_trans;
481 /* Start with the Z-characters output buffer empty */
485 /* If this is the first text translated since the abbreviations were
486 declared, and if some were declared, then it's time to make the
487 lookup table for abbreviations
489 (Except: we don't if the text being translated is itself
490 the text of an abbreviation currently being defined) */
492 if ((!abbrevs_lookup_table_made) && (no_abbreviations > 0)
493 && (!is_abbreviation))
494 make_abbrevs_lookup();
496 /* If we're storing the whole game text to memory, then add this text.
497 We will put two newlines between each text and four at the very end.
498 (The optimise code does a lot of sloppy text[i+2], so the extra
499 two newlines past all_text_top are necessary.) */
501 if ((!is_abbreviation) && (store_the_text))
502 { int addlen = strlen(s_text);
503 ensure_memory_list_available(&all_text_memlist, all_text_top+addlen+5);
504 sprintf(all_text+all_text_top, "%s\n\n\n\n", s_text);
505 /* Advance past two newlines. */
506 all_text_top += (addlen+2);
509 if (transcript_switch) {
510 /* Omit veneer strings, unless we're using the new transcript format, which includes everything. */
511 if ((!veneer_mode) || TRANSCRIPT_FORMAT == 1) {
514 if (label == STRCTX_GAME)
515 label = STRCTX_VENEER;
516 else if (label == STRCTX_GAMEOPC)
517 label = STRCTX_VENEEROPC;
519 write_to_transcript_file(s_text, label);
523 /* Computing the optimal way to parse strings to insert abbreviations with dynamic programming */
524 /* (ref: R.A. Wagner , "Common phrases and minimum-space text storage", Commun. ACM, 16 (3) (1973)) */
525 /* We compute this optimal way here; it's stored in abbreviations_optimal_parse_schedule */
529 int l, min_score, from;
532 text_in_length = strlen( (char*) text_in);
533 ensure_memory_list_available(&abbreviations_optimal_parse_schedule_memlist, text_in_length);
534 ensure_memory_list_available(&abbreviations_optimal_parse_scores_memlist, text_in_length+1);
536 abbreviations_optimal_parse_scores[text_in_length] = 0;
537 for(j=text_in_length-1; j>=0; j--)
538 { /* Initial values: empty schedule, score = just write the letter without abbreviating. */
539 abbreviations_optimal_parse_schedule[j] = -1;
540 min_score = zchar_weight(text_in[j]) + abbreviations_optimal_parse_scores[j+1];
541 /* If there's an abbreviation starting with that letter... */
542 if ( (from = abbrevs_lookup[text_in[j]]) != -1)
545 /* Loop on all abbreviations starting with what is in c. */
546 for (k=from, q=(uchar *)abbreviations_at+from*MAX_ABBREV_LENGTH;
547 (k<no_abbreviations)&&(c==q[0]); k++, q+=MAX_ABBREV_LENGTH)
549 /* Let's compare; we also keep track of the length of the abbreviation. */
550 for (l=1; q[l]!=0; l++)
551 { if (text_in[j+l]!=q[l]) {goto NotMatched;}
553 /* We have a match (length l), but is it smaller in size? */
554 if (min_score > 2 + abbreviations_optimal_parse_scores[j+l])
555 { /* It is indeed smaller, so let's write it down in our schedule. */
556 min_score = 2 + abbreviations_optimal_parse_scores[j+l];
557 abbreviations_optimal_parse_schedule[j] = k;
562 /* We gave it our best, this is the smallest we got. */
563 abbreviations_optimal_parse_scores[j] = min_score;
571 /* The empty string of Z-text is illegal, since it can't carry an end
572 bit: so we translate an empty string of ASCII text to just the
573 pad character 5. Printing this causes nothing to appear on screen. */
575 if (text_in[0]==0) write_z_char_z(5);
577 /* Loop through the characters of the null-terminated input text: note
578 that if 1 is written over a character in the input text, it is
579 afterwards ignored */
581 for (i=0; text_in[i]!=0; i++)
582 { total_chars_trans++;
584 /* Contract ". " into ". " if double-space-removing switch set:
585 likewise "? " and "! " if the setting is high enough */
587 if ((double_space_setting >= 1)
588 && (text_in[i+1]==' ') && (text_in[i+2]==' '))
589 { if (text_in[i]=='.') text_in[i+2]=1;
590 if (double_space_setting >= 2)
591 { if (text_in[i]=='?') text_in[i+2]=1;
592 if (text_in[i]=='!') text_in[i+2]=1;
596 /* Try abbreviations if the economy switch set. */
597 /* Look at the abbreviation schedule to see if we should abbreviate here. */
598 /* Note: Just because the schedule has something doesn't mean we should abbreviate there; */
599 /* sometimes you abbreviate before because it's better. If we have already replaced the */
600 /* char by a '1', it means we're in the middle of an abbreviation; don't try to abbreviate then. */
601 if ((economy_switch) && (!is_abbreviation) && text_in[i] != 1 &&
602 ((j = abbreviations_optimal_parse_schedule[i]) != -1))
604 /* Fill with 1s, which will get ignored by everyone else. */
605 uchar *p = (uchar *)abbreviations_at+j*MAX_ABBREV_LENGTH;
606 for (k=0; p[k]!=0; k++) text_in[i+k]=1;
607 /* Actually write the abbreviation in the story file. */
608 abbreviations[j].freq++;
609 /* Abbreviations run from MAX_DYNAMIC_STRINGS to 96. */
610 j += MAX_DYNAMIC_STRINGS;
611 write_z_char_z(j/32+1); write_z_char_z(j%32);
615 /* If Unicode switch set, use text_to_unicode to perform UTF-8
617 if (character_set_unicode && (text_in[i] & 0x80))
618 { unicode = text_to_unicode((char *) (text_in+i));
619 zscii = unicode_to_zscii(unicode);
620 if (zscii != 5) write_zscii(zscii);
622 { unicode_char_error(
623 "Character can only be used if declared in \
624 advance as part of 'Zcharacter table':", unicode);
626 i += textual_form_length - 1;
630 /* '@' is the escape character in Inform string notation: the various
633 @@decimalnumber : write this ZSCII char (0 to 1023)
634 @twodigits or : write the abbreviation string with this
635 @(digits) decimal number
636 @(symbol) : write the abbreviation string with this
638 @accentcode : this accented character: e.g.,
639 for @'e write an E-acute
640 @{...} : this Unicode char (in hex) */
643 { if (text_in[i+1]=='@')
645 /* @@... (ascii value) */
647 i+=2; j=atoi((char *) (text_in+i));
649 { /* Prevent ~ and ^ from being translated to double-quote
650 and new-line, as they ordinarily would be */
652 case 94: write_z_char_z(5); write_z_char_z(6);
653 write_z_char_z(94/32); write_z_char_z(94%32);
655 case 126: write_z_char_z(5); write_z_char_z(6);
656 write_z_char_z(126/32); write_z_char_z(126%32);
659 default: write_zscii(j); break;
661 while (isdigit(text_in[i])) i++; i--;
663 else if (text_in[i+1]=='(')
665 /* @(...) (dynamic string) */
666 char dsymbol[MAX_IDENTIFIER_LENGTH+1];
667 int len = 0, digits = 0;
669 /* This accepts "12xyz" as a symbol, which it really isn't,
670 but that just means it won't be found. */
671 while ((text_in[i] == '_' || isalnum(text_in[i])) && len < MAX_IDENTIFIER_LENGTH) {
672 char ch = text_in[i++];
673 if (isdigit(ch)) digits++;
678 /* We would like to parse dsymbol as *either* a decimal
679 number or a constant symbol. */
680 if (text_in[i] != ')' || len == 0) {
681 error("'@(...)' abbreviation must contain a symbol");
683 else if (digits == len) {
684 /* all digits; parse as decimal */
688 int sym = symbol_index(dsymbol, -1);
689 if ((symbols[sym].flags & UNKNOWN_SFLAG) || symbols[sym].type != CONSTANT_T || symbols[sym].marker) {
690 error_named("'@(...)' abbreviation expected a known constant value, but contained", dsymbol);
693 symbols[sym].flags |= USED_SFLAG;
694 j = symbols[sym].value;
697 if (!glulx_mode && j >= 96) {
698 error_max_dynamic_strings(j);
701 if (j >= MAX_DYNAMIC_STRINGS) {
702 error_max_dynamic_strings(j);
706 write_z_char_z(j/32+1); write_z_char_z(j%32);
709 write_z_char_z(' '); /* error fallback */
712 else if (isdigit(text_in[i+1])!=0)
715 /* @.. (dynamic string) */
717 d1 = character_digit_value[text_in[i+1]];
718 d2 = character_digit_value[text_in[i+2]];
719 if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10))
720 error("'@..' must have two decimal digits");
724 if (!glulx_mode && j >= 96) {
725 error_max_dynamic_strings(j);
728 if (j >= MAX_DYNAMIC_STRINGS) {
729 /* Shouldn't get here with two digits */
730 error_max_dynamic_strings(j);
735 write_z_char_z(j/32+1); write_z_char_z(j%32);
738 write_z_char_z(' '); /* error fallback */
744 /* A string escape specifying an unusual character */
746 unicode = text_to_unicode((char *) (text_in+i));
747 zscii = unicode_to_zscii(unicode);
748 if (zscii != 5) write_zscii(zscii);
750 { unicode_char_error(
751 "Character can only be used if declared in \
752 advance as part of 'Zcharacter table':", unicode);
754 i += textual_form_length - 1;
758 { /* Skip a character which has been over-written with the null
759 value 1 earlier on */
762 { if (text_in[i]==' ') write_z_char_z(0);
764 { j = (int) text_in[i];
765 lookup_value = iso_to_alphabet_grid[j];
766 if (lookup_value < 0)
767 { /* The character isn't in the standard alphabets, so
768 we have to use the ZSCII 4-Z-char sequence */
770 if (lookup_value == -5)
771 { /* Character isn't in the ZSCII set at all */
773 unicode = iso_to_unicode(j);
775 "Character can only be used if declared in \
776 advance as part of 'Zcharacter table':", unicode);
777 write_zscii(0x200 + unicode/0x100);
778 write_zscii(0x300 + unicode%0x100);
780 else write_zscii(-lookup_value);
783 { /* The character is in one of the standard alphabets:
784 write a SHIFT to temporarily change alphabet if
785 it isn't in alphabet 0, then write the Z-char */
787 alphabet_used[lookup_value] = 'Y';
788 in_alphabet = lookup_value/26;
789 if (in_alphabet==1) write_z_char_z(4); /* SHIFT to A1 */
790 if (in_alphabet==2) write_z_char_z(5); /* SHIFT to A2 */
791 write_z_char_z(lookup_value%26 + 6);
798 /* Flush the Z-characters output buffer and set the "end" bit */
804 /* The text storage here is, of course, temporary. Compression
805 will occur when we're finished compiling, so that all the
806 clever Huffman stuff will work.
807 In the stored text, we use "@@" to indicate @,
808 "@0" to indicate a zero byte,
809 "@ANNNN" to indicate an abbreviation,
810 "@DNNNN" to indicate a dynamic string thing.
811 "@UNNNN" to indicate a four-byte Unicode value (0x100 or higher).
812 (NNNN is a four-digit hex number using the letters A-P... an
813 ugly representation but a convenient one.)
816 for (i=0; text_in[i]!=0; i++) {
818 /* Contract ". " into ". " if double-space-removing switch set:
819 likewise "? " and "! " if the setting is high enough. */
820 if ((double_space_setting >= 1)
821 && (text_in[i+1]==' ') && (text_in[i+2]==' ')) {
823 || (double_space_setting >= 2
824 && (text_in[i]=='?' || text_in[i]=='!'))) {
825 text_in[i+1] = text_in[i];
832 /* Try abbreviations if the economy switch set. We have to be in
833 compression mode too, since the abbreviation mechanism is part
834 of string decompression. */
836 if ((economy_switch) && (compression_switch) && (!is_abbreviation)
837 && ((k=abbrevs_lookup[text_in[i]])!=-1)
838 && ((j=try_abbreviations_from(text_in, i, k)) != -1)) {
839 char *cx = (char *)abbreviations_at+j*MAX_ABBREV_LENGTH;
843 write_z_char_g('A' + ((j >>12) & 0x0F));
844 write_z_char_g('A' + ((j >> 8) & 0x0F));
845 write_z_char_g('A' + ((j >> 4) & 0x0F));
846 write_z_char_g('A' + ((j ) & 0x0F));
848 else if (text_in[i] == '@') {
849 if (text_in[i+1]=='@') {
851 i+=2; j=atoi((char *) (text_in+i));
852 if (j == '@' || j == '\0') {
856 if (!compression_switch)
857 warning("Ascii @@0 will prematurely terminate non-compressed \
862 while (isdigit(text_in[i])) i++; i--;
864 else if (text_in[i+1]=='(') {
865 char dsymbol[MAX_IDENTIFIER_LENGTH+1];
866 int len = 0, digits = 0;
868 /* This accepts "12xyz" as a symbol, which it really isn't,
869 but that just means it won't be found. */
870 while ((text_in[i] == '_' || isalnum(text_in[i])) && len < MAX_IDENTIFIER_LENGTH) {
871 char ch = text_in[i++];
872 if (isdigit(ch)) digits++;
877 /* We would like to parse dsymbol as *either* a decimal
878 number or a constant symbol. */
879 if (text_in[i] != ')' || len == 0) {
880 error("'@(...)' abbreviation must contain a symbol");
882 else if (digits == len) {
883 /* all digits; parse as decimal */
887 int sym = symbol_index(dsymbol, -1);
888 if ((symbols[sym].flags & UNKNOWN_SFLAG) || symbols[sym].type != CONSTANT_T || symbols[sym].marker) {
889 error_named("'@(...)' abbreviation expected a known constant value, but contained", dsymbol);
892 symbols[sym].flags |= USED_SFLAG;
893 j = symbols[sym].value;
896 if (j >= MAX_DYNAMIC_STRINGS) {
897 error_max_dynamic_strings(j);
900 if (j+1 >= no_dynamic_strings)
901 no_dynamic_strings = j+1;
905 write_z_char_g('A' + ((j >>12) & 0x0F));
906 write_z_char_g('A' + ((j >> 8) & 0x0F));
907 write_z_char_g('A' + ((j >> 4) & 0x0F));
908 write_z_char_g('A' + ((j ) & 0x0F));
911 write_z_char_g(' '); /* error fallback */
914 else if (isdigit(text_in[i+1])) {
916 d1 = character_digit_value[text_in[i+1]];
917 d2 = character_digit_value[text_in[i+2]];
918 if ((d1 == 127) || (d1 >= 10) || (d2 == 127) || (d2 >= 10)) {
919 error("'@..' must have two decimal digits");
922 if (!compression_switch)
923 warning("'@..' print variable will not work in non-compressed \
924 string; substituting ' '.");
927 if (j >= MAX_DYNAMIC_STRINGS) {
928 error_max_dynamic_strings(j);
931 if (j+1 >= no_dynamic_strings)
932 no_dynamic_strings = j+1;
936 write_z_char_g('A' + ((j >>12) & 0x0F));
937 write_z_char_g('A' + ((j >> 8) & 0x0F));
938 write_z_char_g('A' + ((j >> 4) & 0x0F));
939 write_z_char_g('A' + ((j ) & 0x0F));
942 write_z_char_g(' '); /* error fallback */
947 unicode = text_to_unicode((char *) (text_in+i));
948 i += textual_form_length - 1;
949 if (unicode == '@' || unicode == '\0') {
951 write_z_char_g(unicode ? '@' : '0');
953 else if (unicode >= 0 && unicode < 256) {
954 write_z_char_g(unicode);
957 if (!compression_switch) {
958 warning("Unicode characters will not work in non-compressed \
959 string; substituting '?'.");
963 j = unicode_entity_index(unicode);
966 write_z_char_g('A' + ((j >>12) & 0x0F));
967 write_z_char_g('A' + ((j >> 8) & 0x0F));
968 write_z_char_g('A' + ((j >> 4) & 0x0F));
969 write_z_char_g('A' + ((j ) & 0x0F));
974 else if (text_in[i] == '^')
975 write_z_char_g(0x0A);
976 else if (text_in[i] == '~')
978 else if (character_set_unicode) {
979 if (text_in[i] & 0x80) {
980 unicode = text_to_unicode((char *) (text_in+i));
981 i += textual_form_length - 1;
982 if (unicode >= 0 && unicode < 256) {
983 write_z_char_g(unicode);
986 if (!compression_switch) {
987 warning("Unicode characters will not work in non-compressed \
988 string; substituting '?'.");
992 j = unicode_entity_index(unicode);
995 write_z_char_g('A' + ((j >>12) & 0x0F));
996 write_z_char_g('A' + ((j >> 8) & 0x0F));
997 write_z_char_g('A' + ((j >> 4) & 0x0F));
998 write_z_char_g('A' + ((j ) & 0x0F));
1003 write_z_char_g(text_in[i]);
1007 unicode = iso_to_unicode_grid[text_in[i]];
1008 if (unicode >= 0 && unicode < 256) {
1009 write_z_char_g(unicode);
1012 if (!compression_switch) {
1013 warning("Unicode characters will not work in non-compressed \
1014 string; substituting '?'.");
1015 write_z_char_g('?');
1018 j = unicode_entity_index(unicode);
1019 write_z_char_g('@');
1020 write_z_char_g('U');
1021 write_z_char_g('A' + ((j >>12) & 0x0F));
1022 write_z_char_g('A' + ((j >> 8) & 0x0F));
1023 write_z_char_g('A' + ((j >> 4) & 0x0F));
1024 write_z_char_g('A' + ((j ) & 0x0F));
1030 zchars_trans_in_last_string=total_zchars_trans-zchars_trans_in_last_string;
1034 if (text_out_overflow)
1037 return text_out_pos;
1040 static int unicode_entity_index(int32 unicode)
1043 int buck = unicode % UNICODE_HASH_BUCKETS;
1045 for (j = unicode_usage_hash[buck]; j >= 0; j=unicode_usage_entries[j].next) {
1046 if (unicode_usage_entries[j].ch == unicode)
1050 ensure_memory_list_available(&unicode_usage_entries_memlist, no_unicode_chars+1);
1051 j = no_unicode_chars++;
1052 unicode_usage_entries[j].ch = unicode;
1053 unicode_usage_entries[j].next = unicode_usage_hash[buck];
1054 unicode_usage_hash[buck] = j;
1060 /* ------------------------------------------------------------------------- */
1061 /* Glulx compression code */
1062 /* ------------------------------------------------------------------------- */
1065 static void compress_makebits(int entnum, int depth, int prevbit,
1066 huffbitlist_t *bits);
1068 /* The compressor. This uses the usual Huffman compression algorithm. */
1069 void compress_game_text()
1071 int entities=0, branchstart, branches;
1080 if (compression_switch) {
1081 max_char_set = 257 + no_abbreviations + no_dynamic_strings + no_unicode_chars;
1083 huff_entities = my_calloc(sizeof(huffentity_t), max_char_set*2+1,
1084 "huffman entities");
1085 hufflist = my_calloc(sizeof(huffentity_t *), max_char_set,
1086 "huffman node list");
1088 /* How many entities have we currently got? Well, 256 plus the
1089 string-terminator plus Unicode chars plus abbrevations plus
1092 huff_unicode_start = entities;
1093 entities += no_unicode_chars;
1094 huff_abbrev_start = entities;
1096 entities += no_abbreviations;
1097 huff_dynam_start = entities;
1098 entities += no_dynamic_strings;
1100 if (entities > max_char_set)
1101 compiler_error("Too many entities for max_char_set");
1104 for (jx=0; jx<256; jx++) {
1105 huff_entities[jx].type = 2;
1106 huff_entities[jx].count = 0;
1107 huff_entities[jx].u.ch = jx;
1110 huff_entities[256].type = 1;
1111 huff_entities[256].count = 0;
1112 for (jx=0; jx<no_unicode_chars; jx++) {
1113 huff_entities[huff_unicode_start+jx].type = 4;
1114 huff_entities[huff_unicode_start+jx].count = 0;
1115 huff_entities[huff_unicode_start+jx].u.val = jx;
1117 if (economy_switch) {
1118 for (jx=0; jx<no_abbreviations; jx++) {
1119 huff_entities[huff_abbrev_start+jx].type = 3;
1120 huff_entities[huff_abbrev_start+jx].count = 0;
1121 huff_entities[huff_abbrev_start+jx].u.val = jx;
1124 for (jx=0; jx<no_dynamic_strings; jx++) {
1125 huff_entities[huff_dynam_start+jx].type = 9;
1126 huff_entities[huff_dynam_start+jx].count = 0;
1127 huff_entities[huff_dynam_start+jx].u.val = jx;
1131 /* No compression; use defaults that will make it easy to check
1133 no_huff_entities = 257;
1134 huff_unicode_start = 257;
1135 huff_abbrev_start = 257;
1136 huff_dynam_start = 257+no_abbreviations;
1137 compression_table_size = 0;
1140 if (compression_switch) {
1142 for (lx=0, ix=0; lx<no_strings; lx++) {
1143 int escapelen=0, escapetype=0;
1147 ch = static_strings_area[ix];
1149 if (ix > static_strings_extent || ch < 0)
1150 compiler_error("Read too much not-yet-compressed text.");
1151 if (escapelen == -1) {
1156 else if (ch == '0') {
1159 else if (ch == 'A' || ch == 'D' || ch == 'U') {
1166 compiler_error("Strange @ escape in processed text.");
1169 else if (escapelen) {
1170 escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
1172 if (escapelen == 0) {
1173 if (escapetype == 'A') {
1174 ch = huff_abbrev_start+escapeval;
1176 else if (escapetype == 'D') {
1177 ch = huff_dynam_start+escapeval;
1179 else if (escapetype == 'U') {
1180 ch = huff_unicode_start+escapeval;
1183 compiler_error("Strange @ escape in processed text.");
1199 huff_entities[ch].count++;
1204 for (jx=0; jx<entities; jx++) {
1205 if (huff_entities[jx].count) {
1206 hufflist[numlive] = &(huff_entities[jx]);
1211 branchstart = entities;
1214 while (numlive > 1) {
1216 int best1num, best2num;
1219 if (hufflist[0]->count < hufflist[1]->count) {
1228 best1num = hufflist[best1]->count;
1229 best2num = hufflist[best2]->count;
1231 for (jx=2; jx<numlive; jx++) {
1232 if (hufflist[jx]->count < best1num) {
1234 best2num = best1num;
1236 best1num = hufflist[best1]->count;
1238 else if (hufflist[jx]->count < best2num) {
1240 best2num = hufflist[best2]->count;
1244 bran = &(huff_entities[branchstart+branches]);
1247 bran->count = hufflist[best1]->count + hufflist[best2]->count;
1248 bran->u.branch[0] = (hufflist[best1] - huff_entities);
1249 bran->u.branch[1] = (hufflist[best2] - huff_entities);
1250 hufflist[best1] = bran;
1251 if (best2 < numlive-1) {
1252 memmove(&(hufflist[best2]), &(hufflist[best2+1]),
1253 ((numlive-1) - best2) * sizeof(huffentity_t *));
1258 huff_entity_root = (hufflist[0] - huff_entities);
1260 for (ix=0; ix<MAXHUFFBYTES; ix++)
1262 compression_table_size = 12;
1264 no_huff_entities = 0; /* compress_makebits will total this up */
1265 compress_makebits(huff_entity_root, 0, -1, &bits);
1268 /* Now, sadly, we have to compute the size of the string section,
1269 without actually doing the compression. */
1270 compression_string_size = 0;
1272 ensure_memory_list_available(&compressed_offsets_memlist, no_strings);
1274 for (lx=0, ix=0; lx<no_strings; lx++) {
1275 int escapelen=0, escapetype=0;
1279 compressed_offsets[lx] = compression_table_size + compression_string_size;
1280 compression_string_size++; /* for the type byte */
1282 ch = static_strings_area[ix];
1284 if (ix > static_strings_extent || ch < 0)
1285 compiler_error("Read too much not-yet-compressed text.");
1286 if (escapelen == -1) {
1291 else if (ch == '0') {
1294 else if (ch == 'A' || ch == 'D' || ch == 'U') {
1301 compiler_error("Strange @ escape in processed text.");
1304 else if (escapelen) {
1305 escapeval = (escapeval << 4) | ((ch-'A') & 0x0F);
1307 if (escapelen == 0) {
1308 if (escapetype == 'A') {
1309 ch = huff_abbrev_start+escapeval;
1311 else if (escapetype == 'D') {
1312 ch = huff_dynam_start+escapeval;
1314 else if (escapetype == 'U') {
1315 ch = huff_unicode_start+escapeval;
1318 compiler_error("Strange @ escape in processed text.");
1335 if (compression_switch) {
1336 jx += huff_entities[ch].depth;
1337 compression_string_size += (jx/8);
1341 if (ch >= huff_dynam_start) {
1342 compression_string_size += 3;
1344 else if (ch >= huff_unicode_start) {
1345 compiler_error("Abbreviation/Unicode in non-compressed string \
1346 should be impossible.");
1349 compression_string_size += 1;
1352 if (compression_switch && jx)
1353 compression_string_size++;
1356 done_compression = TRUE;
1359 static void compress_makebits(int entnum, int depth, int prevbit,
1360 huffbitlist_t *bits)
1362 huffentity_t *ent = &(huff_entities[entnum]);
1366 ent->addr = compression_table_size;
1371 ent->bits.b[(depth-1) / 8] |= (1 << ((depth-1) % 8));
1374 switch (ent->type) {
1376 compression_table_size += 9;
1377 compress_makebits(ent->u.branch[0], depth+1, 0, &ent->bits);
1378 compress_makebits(ent->u.branch[1], depth+1, 1, &ent->bits);
1381 compression_table_size += 1;
1384 compression_table_size += 2;
1387 cx = (char *)abbreviations_at + ent->u.val*MAX_ABBREV_LENGTH;
1388 compression_table_size += (1 + 1 + strlen(cx));
1392 compression_table_size += 5;
1397 /* ------------------------------------------------------------------------- */
1398 /* The abbreviations optimiser */
1400 /* This is a very complex, memory and time expensive algorithm to */
1401 /* approximately solve the problem of which abbreviation strings would */
1402 /* minimise the total number of Z-chars to which the game text translates. */
1403 /* It is in some ways a quite separate program but remains inside Inform */
1404 /* for compatibility with previous releases. */
1405 /* ------------------------------------------------------------------------- */
1407 /* The complete game text. */
1408 static char *opttext;
1409 static int32 opttextlen;
1411 typedef struct tlb_s
1413 int32 intab, occurrences;
1415 static tlb *tlbtab; /* Three-letter blocks (allocated up to no_occs) */
1416 static memory_list tlbtab_memlist;
1417 static int32 no_occs;
1419 static int32 *grandtable;
1420 static int32 *grandflags;
1421 typedef struct optab_s
1426 char text[MAX_ABBREV_LENGTH];
1428 static int32 MAX_BESTYET;
1429 static optab *bestyet; /* High-score entries (up to MAX_BESTYET used/allocated) */
1430 static optab *bestyet2; /* The selected entries (up to selected used; allocated to MAX_ABBREVS) */
1434 static void optimise_pass(void)
1439 int32 j, j2, k, nl, matches, noflags, score, min, minat=0, x, scrabble, c;
1440 for (i=0; i<MAX_BESTYET; i++) bestyet[i].length=0;
1441 for (i=0; i<no_occs; i++)
1442 { if ((*(tlbtab[i].text)!=(int) '\n')&&(tlbtab[i].occurrences!=0))
1445 if (i%((**g_pm_hndl).linespercheck) == 0)
1446 { ProcessEvents (&g_proc);
1449 longjmp (g_fallback, 1);
1453 if (optabbrevs_trace_setting >= 2) {
1454 printf("Pass %d, %4ld/%ld '%s' (%ld occurrences) ",
1455 pass_no, (long int) i, (long int) no_occs, tlbtab[i].text,
1456 (long int) tlbtab[i].occurrences);
1459 for (j=0; j<tlbtab[i].occurrences; j++)
1460 { for (j2=0; j2<tlbtab[i].occurrences; j2++) grandflags[j2]=1;
1461 nl=2; noflags=tlbtab[i].occurrences;
1462 while ((noflags>=2)&&(nl<MAX_ABBREV_LENGTH-1))
1464 for (j2=0; j2<nl; j2++)
1465 if (opttext[grandtable[tlbtab[i].intab+j]+j2]=='\n')
1468 for (j2=j; j2<tlbtab[i].occurrences; j2++)
1469 { if (grandflags[j2]==1)
1470 { x=grandtable[tlbtab[i].intab+j2]
1471 - grandtable[tlbtab[i].intab+j];
1472 if (((x>-nl)&&(x<nl))
1473 || (memcmp(opttext+grandtable[tlbtab[i].intab+j],
1474 opttext+grandtable[tlbtab[i].intab+j2],
1476 { grandflags[j2]=0; noflags--; }
1481 for (k=0; k<nl; k++)
1483 c=opttext[grandtable[tlbtab[i].intab+j+k]];
1485 { if (iso_to_alphabet_grid[c]<0)
1488 if (iso_to_alphabet_grid[c]>=26)
1492 score=(matches-1)*(scrabble-2);
1494 for (j2=0; j2<MAX_BESTYET; j2++)
1495 { if ((nl==bestyet[j2].length)
1496 && (memcmp(opttext+bestyet[j2].location,
1497 opttext+grandtable[tlbtab[i].intab+j],
1499 { j2=MAX_BESTYET; min=score; }
1501 { if (bestyet[j2].score<min)
1502 { min=bestyet[j2].score; minat=j2;
1507 { bestyet[minat].score=score;
1508 bestyet[minat].length=nl;
1509 bestyet[minat].location=grandtable[tlbtab[i].intab+j];
1510 bestyet[minat].popularity=matches;
1515 if (optabbrevs_trace_setting >= 2) {
1517 duration = TIMEVALUE_DIFFERENCE(&t1, &t2);
1518 printf(" (%.4f seconds)\n", duration);
1524 static int any_overlap(char *s1, char *s2)
1525 { int a, b, i, j, flag;
1526 a=strlen(s1); b=strlen(s2);
1527 for (i=1-b; i<a; i++)
1530 if ((0<=i+j)&&(i+j<=a-1))
1531 if (s1[i+j]!=s2[j]) flag=1;
1532 if (flag==0) return(1);
1537 extern void optimise_abbreviations(void)
1538 { int32 i, j, tcount, max=0, MAX_GTABLE;
1539 int32 j2, selected, available, maxat=0, nl;
1541 if (opttext == NULL)
1544 /* We insist that the first two abbreviations will be ". " and ", ". */
1545 if (MAX_ABBREVS < 2)
1548 /* Note that it's safe to access opttext[opttextlen+2]. There are
1549 two newlines and a null beyond opttextlen. */
1551 printf("Beginning calculation of optimal abbreviations...\n");
1555 initialise_memory_list(&tlbtab_memlist,
1556 sizeof(tlb), 1000, (void**)&tlbtab,
1557 "three-letter-blocks buffer");
1561 /* Not sure what the optimal size is for MAX_BESTYET. The original code always created 64 abbreviations and used MAX_BESTYET=256. I'm guessing that 4*MAX_ABBREVS is reasonable. */
1562 MAX_BESTYET = 4 * MAX_ABBREVS;
1564 bestyet=my_calloc(sizeof(optab), MAX_BESTYET, "bestyet");
1565 bestyet2=my_calloc(sizeof(optab), MAX_ABBREVS, "bestyet2");
1567 bestyet2[0].text[0]='.';
1568 bestyet2[0].text[1]=' ';
1569 bestyet2[0].text[2]=0;
1571 bestyet2[1].text[0]=',';
1572 bestyet2[1].text[1]=' ';
1573 bestyet2[1].text[2]=0;
1577 for (i=0; i<opttextlen; i++)
1579 if ((opttext[i]=='.') && (opttext[i+1]==' ') && (opttext[i+2]==' '))
1580 { opttext[i]='\n'; opttext[i+1]='\n'; opttext[i+2]='\n';
1581 bestyet2[0].popularity++;
1584 if ((opttext[i]=='.') && (opttext[i+1]==' '))
1585 { opttext[i]='\n'; opttext[i+1]='\n';
1586 bestyet2[0].popularity++;
1589 if ((opttext[i]==',') && (opttext[i+1]==' '))
1590 { opttext[i]='\n'; opttext[i+1]='\n';
1591 bestyet2[1].popularity++;
1595 MAX_GTABLE=opttextlen+1;
1596 grandtable=my_calloc(4*sizeof(int32), MAX_GTABLE/4, "grandtable");
1598 for (i=0, tcount=0; i<opttextlen; i++)
1601 test.text[0]=opttext[i];
1602 test.text[1]=opttext[i+1];
1603 test.text[2]=opttext[i+2];
1605 if ((test.text[0]=='\n')||(test.text[1]=='\n')||(test.text[2]=='\n'))
1607 for (j=0; j<no_occs; j++) {
1608 if (strcmp(test.text,tlbtab[j].text)==0)
1613 for (j=i+3; j<opttextlen; j++)
1616 if (j%((**g_pm_hndl).linespercheck) == 0)
1617 { ProcessEvents (&g_proc);
1620 longjmp (g_fallback, 1);
1624 if ((opttext[i]==opttext[j])
1625 && (opttext[i+1]==opttext[j+1])
1626 && (opttext[i+2]==opttext[j+2]))
1627 { grandtable[tcount+test.occurrences]=j;
1629 if (tcount+test.occurrences==MAX_GTABLE)
1630 { printf("All %ld cross-references used\n",
1631 (long int) MAX_GTABLE);
1636 if (test.occurrences>=2)
1638 ensure_memory_list_available(&tlbtab_memlist, no_occs+1);
1639 tlbtab[no_occs]=test;
1640 tlbtab[no_occs].intab=tcount;
1641 tcount += tlbtab[no_occs].occurrences;
1642 if (max<tlbtab[no_occs].occurrences)
1643 max=tlbtab[no_occs].occurrences;
1650 grandflags=my_calloc(sizeof(int), max, "grandflags");
1653 if (optabbrevs_trace_setting >= 1) {
1654 printf("Cross-reference table (%ld entries) built...\n",
1655 (long int) no_occs);
1657 /* for (i=0; i<no_occs; i++)
1658 printf("%4d %4d '%s' %d\n",i,tlbtab[i].intab,tlbtab[i].text,
1659 tlbtab[i].occurrences);
1662 for (i=0; i<MAX_ABBREVS; i++) bestyet2[i].length=0;
1663 available=MAX_BESTYET;
1664 while ((available>0)&&(selected<MAX_ABBREVS))
1667 if (optabbrevs_trace_setting >= 1) {
1668 printf("Pass %d\n", pass_no);
1673 for (i=0; i<MAX_BESTYET; i++)
1674 if (bestyet[i].score!=0)
1676 nl=bestyet[i].length;
1677 for (j2=0; j2<nl; j2++) bestyet[i].text[j2]=
1678 opttext[bestyet[i].location+j2];
1679 bestyet[i].text[nl]=0;
1682 /* printf("End of pass results:\n");
1683 printf("\nno score freq string\n");
1684 for (i=0; i<MAX_BESTYET; i++)
1685 if (bestyet[i].score>0)
1686 printf("%02d: %4d %4d '%s'\n", i, bestyet[i].score,
1687 bestyet[i].popularity, bestyet[i].text);
1692 for (i=0; i<MAX_BESTYET; i++)
1693 if (max<bestyet[i].score)
1694 { max=bestyet[i].score;
1701 bestyet2[selected++]=bestyet[maxat];
1703 if (optabbrevs_trace_setting >= 1) {
1705 "Selection %2ld: '%s' (repeated %ld times, scoring %ld)\n",
1706 (long int) selected,bestyet[maxat].text,
1707 (long int) bestyet[maxat].popularity,
1708 (long int) bestyet[maxat].score);
1711 testtext[0]=bestyet[maxat].text[0];
1712 testtext[1]=bestyet[maxat].text[1];
1713 testtext[2]=bestyet[maxat].text[2];
1716 for (i=0; i<no_occs; i++)
1717 if (strcmp(testtext,tlbtab[i].text)==0)
1720 for (j=0; j<tlbtab[i].occurrences; j++)
1721 { if (memcmp(bestyet[maxat].text,
1722 opttext+grandtable[tlbtab[i].intab+j],
1723 bestyet[maxat].length)==0)
1724 { for (j2=0; j2<bestyet[maxat].length; j2++)
1725 opttext[grandtable[tlbtab[i].intab+j]+j2]='\n';
1729 for (i=0; i<MAX_BESTYET; i++)
1730 if ((bestyet[i].score>0)&&
1731 (any_overlap(bestyet[maxat].text,bestyet[i].text)==1))
1732 { bestyet[i].score=0;
1733 /* printf("Discarding '%s' as overlapping\n",
1734 bestyet[i].text); */
1737 } while ((max>0)&&(available>0)&&(selected<MAX_ABBREVS));
1740 printf("\nChosen abbreviations (in Inform syntax):\n\n");
1741 for (i=0; i<selected; i++)
1742 printf("Abbreviate \"%s\";\n", bestyet2[i].text);
1747 /* ------------------------------------------------------------------------- */
1748 /* The dictionary manager begins here. */
1750 /* Speed is extremely important in these algorithms. If a linear-time */
1751 /* routine were used to search the dictionary words so far built up, then */
1752 /* Inform would crawl. */
1754 /* Instead, the dictionary is stored as a binary tree, which is kept */
1755 /* balanced with the red-black algorithm. */
1756 /* ------------------------------------------------------------------------- */
1757 /* A dictionary table similar to the Z-machine format is kept: there is a */
1758 /* 7-byte header (left blank here to be filled in at the */
1759 /* construct_storyfile() stage in "tables.c") and then a sequence of */
1760 /* records, one per word, in the form */
1762 /* <Z-coded text> <flags> <verbnumber> <adjectivenumber> */
1763 /* 4 or 6 bytes byte byte byte */
1765 /* For Glulx, the form is instead: (See below about Unicode-valued */
1766 /* dictionaries and DICT_WORD_BYTES.) */
1768 /* <tag> <plain text> <flags> <verbnumber> <adjectivenumber> */
1769 /* $60 DICT_WORD_BYTES short short short */
1771 /* These records are stored in "accession order" (i.e. in order of their */
1772 /* first being received by these routines) and only alphabetically sorted */
1773 /* by construct_storyfile() (using the array below). */
1774 /* ------------------------------------------------------------------------- */
1776 /* Further notes about the data fields... */
1777 /* The flags are currently: */
1778 /* bit 0: word is used as a verb (in verb grammar) */
1779 /* bit 1: word is used as a meta verb */
1780 /* bit 2: word is plural (set by '//p') */
1781 /* bit 3: word is used as a preposition (in verb grammar) */
1782 /* bit 6: set for all verbs, but not used by the parser? */
1783 /* bit 7: word is used as a noun (set for every word that appears in */
1784 /* code or in an object property) */
1786 /* In grammar version 2, the third field (adjectivenumber) is unused (and */
1789 /* The compiler generates special constants #dict_par1, #dict_par2, */
1790 /* #dict_par3 to refer to the byte offsets of the three fields. In */
1791 /* Z-code v3, these are 4/5/6; in v4+, they are 6/7/8. In Glulx, they */
1792 /* are $DICT_WORD_SIZE+2/4/6, referring to the *low* bytes of the three */
1793 /* fields. (The high bytes are $DICT_WORD_SIZE+1/3/5.) */
1794 /* ------------------------------------------------------------------------- */
1796 uchar *dictionary; /* (These two variables are externally
1797 used only in "tables.c" when
1798 building the story-file) */
1799 static memory_list dictionary_memlist;
1800 int32 dictionary_top; /* Position of the next free record
1801 in dictionary (i.e., the current
1804 int dict_entries; /* Total number of records entered */
1806 /* ------------------------------------------------------------------------- */
1807 /* dict_word was originally a typedef for a struct of 6 unsigned chars. */
1808 /* It held the (4 or) 6 bytes of Z-coded text of a word. */
1809 /* Usefully, because the PAD character 5 is < all alphabetic characters, */
1810 /* alphabetic order corresponds to numeric order. For this reason, the */
1811 /* dict_word is called the "sort code" of the original text word. */
1813 /* In modifying the compiler for Glulx, I found it easier to discard the */
1814 /* typedef, and operate directly on uchar arrays of length DICT_WORD_SIZE. */
1815 /* In Z-code, DICT_WORD_SIZE will be 6, so the Z-code compiler will work */
1816 /* as before. In Glulx, it can be any value up to MAX_DICT_WORD_SIZE. */
1817 /* (That limit is defined as 40 in the header; it exists only for a few */
1818 /* static buffers, and can be increased without using significant memory.) */
1820 /* ...Well, that certainly bit me on the butt, didn't it. In further */
1821 /* modifying the compiler to generate a Unicode dictionary, I have to */
1822 /* store four-byte values in the uchar array. This is handled by making */
1823 /* the array size DICT_WORD_BYTES (which is DICT_WORD_SIZE*DICT_CHAR_SIZE).*/
1824 /* Then we store the 32-bit character value big-endian. This lets us */
1825 /* continue to compare arrays bytewise, which is a nice simplification. */
1826 /* ------------------------------------------------------------------------- */
1828 extern int compare_sorts(uchar *d1, uchar *d2)
1830 for (i=0; i<DICT_WORD_BYTES; i++)
1831 if (d1[i]!=d2[i]) return((int)(d1[i]) - (int)(d2[i]));
1832 /* (since memcmp(d1, d2, DICT_WORD_BYTES); runs into a bug on some Unix
1837 extern void copy_sorts(uchar *d1, uchar *d2)
1839 for (i=0; i<DICT_WORD_BYTES; i++)
1843 static uchar prepared_sort[MAX_DICT_WORD_BYTES]; /* Holds the sort code
1846 static int number_and_case;
1848 /* Also used by verbs.c */
1849 static void dictionary_prepare_z(char *dword, uchar *optresult)
1850 { int i, j, k, k2, wd[13]; int32 tot;
1852 /* A rapid text translation algorithm using only the simplified rules
1853 applying to the text of dictionary entries: first produce a sequence
1854 of 6 (v3) or 9 (v4+) Z-characters */
1856 int dictsize = (version_number==3) ? 6 : 9;
1858 number_and_case = 0;
1860 for (i=0, j=0; dword[j]!=0; i++, j++)
1861 { if ((dword[j] == '/') && (dword[j+1] == '/'))
1862 { for (j+=2; dword[j] != 0; j++)
1864 { case 'p': number_and_case |= 4; break;
1866 error_named("Expected 'p' after '//' \
1867 to give number of dictionary word", dword);
1873 if (i>=dictsize) break;
1877 warning_named("Obsolete usage: use the ^ character for the \
1878 apostrophe in", dword);
1879 if (k==(int) '^') k=(int) '\'';
1882 if (k==(int) '@' || (character_set_unicode && (k & 0x80)))
1883 { int unicode = text_to_unicode(dword+j);
1884 if ((unicode < 128) && isupper(unicode)) unicode = tolower(unicode);
1885 k = unicode_to_zscii(unicode);
1886 j += textual_form_length - 1;
1887 if ((k == 5) || (k >= 0x100))
1888 { unicode_char_error(
1889 "Character can be printed but not input:", unicode);
1892 k2 = zscii_to_alphabet_grid[(uchar) k];
1895 { if (isupper(k)) k = tolower(k);
1896 k2 = iso_to_alphabet_grid[(uchar) k];
1900 { if ((k2 == -5) || (k2 <= -0x100))
1901 char_error("Character can be printed but not input:", k);
1903 { /* Use 4 more Z-chars to encode a ZSCII escape sequence */
1905 wd[i++] = 5; wd[i++] = 6;
1907 wd[i++] = k2/32; wd[i] = k2%32;
1911 { alphabet_used[k2] = 'Y';
1913 wd[i++]=3+(k2/26); /* Change alphabet for symbols */
1914 wd[i]=6+(k2%26); /* Write the Z character */
1918 /* Fill up to the end of the dictionary block with PAD characters */
1920 for (; i<9; i++) wd[i]=5;
1922 /* The array of Z-chars is converted to two or three 2-byte blocks */
1924 tot = wd[2] + wd[1]*(1<<5) + wd[0]*(1<<10);
1925 prepared_sort[1]=tot%0x100;
1926 prepared_sort[0]=(tot/0x100)%0x100;
1927 tot = wd[5] + wd[4]*(1<<5) + wd[3]*(1<<10);
1928 prepared_sort[3]=tot%0x100;
1929 prepared_sort[2]=(tot/0x100)%0x100;
1930 if (version_number==3)
1933 tot = wd[8] + wd[7]*(1<<5) + wd[6]*(1<<10);
1934 prepared_sort[5]=tot%0x100;
1935 prepared_sort[4]=(tot/0x100)%0x100;
1937 /* Set the "end bit" on the 2nd (in v3) or the 3rd (v4+) 2-byte block */
1939 if (version_number==3) prepared_sort[2]+=0x80;
1940 else prepared_sort[4]+=0x80;
1942 if (optresult) copy_sorts(optresult, prepared_sort);
1945 /* Also used by verbs.c */
1946 static void dictionary_prepare_g(char *dword, uchar *optresult)
1951 number_and_case = 0;
1953 for (i=0, j=0; (dword[j]!=0); i++, j++) {
1954 if ((dword[j] == '/') && (dword[j+1] == '/')) {
1955 for (j+=2; dword[j] != 0; j++) {
1958 number_and_case |= 4;
1961 error_named("Expected 'p' after '//' \
1962 to give gender or number of dictionary word", dword);
1968 if (i>=DICT_WORD_SIZE) break;
1970 k= ((unsigned char *)dword)[j];
1972 warning_named("Obsolete usage: use the ^ character for the \
1973 apostrophe in", dword);
1976 if (k=='~') /* as in iso_to_alphabet_grid */
1979 if (k=='@' || (character_set_unicode && (k & 0x80))) {
1980 unicode = text_to_unicode(dword+j);
1981 j += textual_form_length - 1;
1984 unicode = iso_to_unicode_grid[k];
1987 if (DICT_CHAR_SIZE != 1 || (unicode >= 0 && unicode < 256)) {
1991 error("The dictionary cannot contain Unicode characters beyond Latin-1. \
1992 Define DICT_CHAR_SIZE=4 for a Unicode-compatible dictionary.");
1996 if (k >= (unsigned)'A' && k <= (unsigned)'Z')
1999 if (DICT_CHAR_SIZE == 1) {
2000 prepared_sort[i] = k;
2003 prepared_sort[4*i] = (k >> 24) & 0xFF;
2004 prepared_sort[4*i+1] = (k >> 16) & 0xFF;
2005 prepared_sort[4*i+2] = (k >> 8) & 0xFF;
2006 prepared_sort[4*i+3] = (k) & 0xFF;
2010 if (DICT_CHAR_SIZE == 1) {
2011 for (; i<DICT_WORD_SIZE; i++)
2012 prepared_sort[i] = 0;
2015 for (; i<DICT_WORD_SIZE; i++) {
2016 prepared_sort[4*i] = 0;
2017 prepared_sort[4*i+1] = 0;
2018 prepared_sort[4*i+2] = 0;
2019 prepared_sort[4*i+3] = 0;
2023 if (optresult) copy_sorts(optresult, prepared_sort);
2026 extern void dictionary_prepare(char *dword, uchar *optresult)
2029 dictionary_prepare_z(dword, optresult);
2031 dictionary_prepare_g(dword, optresult);
2034 /* ------------------------------------------------------------------------- */
2035 /* The arrays below are all concerned with the problem of alphabetically */
2036 /* sorting the dictionary during the compilation pass. */
2037 /* Note that it is not enough simply to apply qsort to the dictionary at */
2038 /* the end of the pass: we need to ensure that no duplicates are ever */
2041 /* dict_sort_codes[n] the sort code of record n: i.e., of the nth */
2042 /* word to be entered into the dictionary, where */
2043 /* n counts upward from 0 */
2044 /* (n is also called the "accession number") */
2046 /* The tree structure encodes an ordering. The special value VACANT means */
2047 /* "no node here": otherwise, node numbers are the same as accession */
2048 /* numbers. At all times, "root" holds the node number of the top of the */
2049 /* tree; each node has up to two branches, such that the subtree of the */
2050 /* left branch is always alphabetically before what's at the node, and */
2051 /* the subtree to the right is always after; and all branches are coloured */
2052 /* either "black" or "red". These colours are used to detect points where */
2053 /* the tree is growing asymmetrically (and therefore becoming inefficient */
2055 /* ------------------------------------------------------------------------- */
2062 typedef struct dict_tree_node_s
2063 { int branch[2]; /* Branch 0 is "left", 1 is "right" */
2064 char colour; /* The colour of the branch to the parent */
2067 static dict_tree_node *dtree; /* Allocated to dict_entries */
2068 static memory_list dtree_memlist;
2070 static uchar *dict_sort_codes; /* Allocated to dict_entries*DICT_WORD_BYTES */
2071 static memory_list dict_sort_codes_memlist;
2073 int *final_dict_order; /* Allocated at sort_dictionary() time */
2075 static void dictionary_begin_pass(void)
2077 /* Leave room for the 7-byte header (added in "tables.c" much later) */
2078 /* Glulx has a 4-byte header instead. */
2085 ensure_memory_list_available(&dictionary_memlist, dictionary_top);
2091 static int fdo_count;
2092 static void recursively_sort(int node)
2093 { if (dtree[node].branch[0] != VACANT)
2094 recursively_sort(dtree[node].branch[0]);
2095 final_dict_order[node] = fdo_count++;
2096 if (dtree[node].branch[1] != VACANT)
2097 recursively_sort(dtree[node].branch[1]);
2100 extern void sort_dictionary(void)
2102 final_dict_order = my_calloc(sizeof(int), dict_entries, "final dictionary ordering table");
2105 { fdo_count = 0; recursively_sort(root);
2109 /* ------------------------------------------------------------------------- */
2110 /* If "dword" is in the dictionary, return its accession number plus 1; */
2111 /* If not, return 0. */
2112 /* ------------------------------------------------------------------------- */
2114 static int dictionary_find(char *dword)
2117 dictionary_prepare(dword, NULL);
2119 while (at != VACANT)
2120 { n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
2121 if (n==0) return at + 1;
2122 if (n>0) at = dtree[at].branch[1]; else at = dtree[at].branch[0];
2127 /* ------------------------------------------------------------------------- */
2128 /* Add "dword" to the dictionary with (x,y,z) as its data fields; unless */
2129 /* it already exists, in which case OR the data with (x,y,z) */
2131 /* These fields are one byte each in Z-code, two bytes each in Glulx. */
2133 /* Returns: the accession number. */
2134 /* ------------------------------------------------------------------------- */
2136 extern int dictionary_add(char *dword, int x, int y, int z)
2138 int ggfr = 0, gfr = 0, fr = 0, r = 0;
2139 int ggf = VACANT, gf = VACANT, f = VACANT, at = root;
2141 int res=((version_number==3)?4:6);
2143 dictionary_prepare(dword, NULL);
2146 { root = 0; goto CreateEntry;
2150 n = compare_sorts(prepared_sort, dict_sort_codes+at*DICT_WORD_BYTES);
2154 p = dictionary+7 + at*DICT_ENTRY_BYTE_LENGTH + res;
2155 p[0]=(p[0])|x; p[1]=(p[1])|y;
2156 if (!ZCODE_LESS_DICT_DATA)
2158 if (x & 128) p[0] = (p[0])|number_and_case;
2161 p = dictionary+4 + at*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
2162 p[0]=(p[0])|(x/256); p[1]=(p[1])|(x%256);
2163 p[2]=(p[2])|(y/256); p[3]=(p[3])|(y%256);
2164 p[4]=(p[4])|(z/256); p[5]=(p[5])|(z%256);
2165 if (x & 128) p[1] = (p[1]) | number_and_case;
2169 if (n>0) r=1; else r=0;
2171 a = dtree[at].branch[0]; b = dtree[at].branch[1];
2172 if ((a != VACANT) && (dtree[a].colour == RED) &&
2173 (b != VACANT) && (dtree[b].colour == RED))
2174 { dtree[a].colour = BLACK;
2175 dtree[b].colour = BLACK;
2177 dtree[at].colour = RED;
2179 /* A tree rotation may be needed to avoid two red links in a row:
2181 ggf (or else gf is root) ggf (or f is root)
2184 / \(red) / \ (both red)
2190 In effect we rehang the "gf" subtree from "f".
2191 See the Technical Manual for further details.
2194 if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
2197 { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
2198 dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
2199 dtree[f].branch[1-fr] = gf;
2200 dtree[f].colour = BLACK;
2201 dtree[gf].colour = RED;
2202 gf = ggf; gfr = ggfr;
2205 { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
2206 dtree[at].colour = BLACK;
2207 dtree[gf].colour = RED;
2208 dtree[f].branch[fr] = dtree[at].branch[gfr];
2209 dtree[gf].branch[gfr] = dtree[at].branch[fr];
2210 dtree[at].branch[gfr] = f;
2211 dtree[at].branch[fr] = gf;
2213 r = 1-r; n = at; if (r==fr) at = f; else at = gf;
2214 f = n; gf = ggf; fr = 1-r; gfr = ggfr;
2219 if (dtree[at].branch[r] == VACANT)
2220 { dtree[at].colour = RED;
2222 if ((f != VACANT) && (gf != VACANT) && (dtree[f].colour == RED))
2224 { if (ggf == VACANT) root = f; else dtree[ggf].branch[ggfr] = f;
2225 dtree[gf].branch[gfr] = dtree[f].branch[1-fr];
2226 dtree[f].branch[1-fr] = gf;
2227 dtree[f].colour = BLACK;
2228 dtree[gf].colour = RED;
2231 { if (ggf == VACANT) root = at; else dtree[ggf].branch[ggfr] = at;
2232 dtree[at].colour = BLACK;
2233 dtree[gf].colour = RED;
2234 dtree[f].branch[fr] = dtree[at].branch[gfr];
2235 dtree[gf].branch[gfr] = dtree[at].branch[fr];
2236 dtree[at].branch[gfr] = f;
2237 dtree[at].branch[fr] = gf;
2239 r = 1-r; n = at; if (r==fr) at = f; else at = gf;
2243 dtree[at].branch[r] = dict_entries;
2246 ggf = gf; gf = f; f = at; at = dtree[at].branch[r];
2247 ggfr = gfr; gfr = fr; fr = r;
2252 ensure_memory_list_available(&dtree_memlist, dict_entries+1);
2253 ensure_memory_list_available(&dict_sort_codes_memlist, (dict_entries+1)*DICT_WORD_BYTES);
2255 dtree[dict_entries].branch[0] = VACANT;
2256 dtree[dict_entries].branch[1] = VACANT;
2257 dtree[dict_entries].colour = BLACK;
2259 /* Address in Inform's own dictionary table to write the record to */
2263 ensure_memory_list_available(&dictionary_memlist, dictionary_top + DICT_ENTRY_BYTE_LENGTH);
2264 p = dictionary + DICT_ENTRY_BYTE_LENGTH*dict_entries + 7;
2266 /* So copy in the 4 (or 6) bytes of Z-coded text and the 3 data
2269 p[0]=prepared_sort[0]; p[1]=prepared_sort[1];
2270 p[2]=prepared_sort[2]; p[3]=prepared_sort[3];
2271 if (version_number > 3)
2272 { p[4]=prepared_sort[4]; p[5]=prepared_sort[5]; }
2273 p[res]=x; p[res+1]=y;
2274 if (!ZCODE_LESS_DICT_DATA) p[res+2]=z;
2275 if (x & 128) p[res] = (p[res])|number_and_case;
2277 dictionary_top += DICT_ENTRY_BYTE_LENGTH;
2282 ensure_memory_list_available(&dictionary_memlist, dictionary_top + DICT_ENTRY_BYTE_LENGTH);
2283 p = dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*dict_entries;
2284 p[0] = 0x60; /* type byte -- dict word */
2286 p += DICT_CHAR_SIZE;
2287 for (i=0; i<DICT_WORD_BYTES; i++)
2288 p[i] = prepared_sort[i];
2290 p += DICT_WORD_BYTES;
2292 p[2] = y/256; p[3] = y%256;
2295 p[1] |= number_and_case;
2297 dictionary_top += DICT_ENTRY_BYTE_LENGTH;
2301 copy_sorts(dict_sort_codes+dict_entries*DICT_WORD_BYTES, prepared_sort);
2303 return dict_entries++;
2306 /* ------------------------------------------------------------------------- */
2307 /* Used in "tables.c" for "Extend ... only", to renumber a verb-word to a */
2308 /* new verb syntax of its own. (Otherwise existing verb-words never */
2309 /* change their verb-numbers.) */
2310 /* ------------------------------------------------------------------------- */
2312 extern void dictionary_set_verb_number(char *dword, int to)
2314 int res=((version_number==3)?4:6);
2315 i=dictionary_find(dword);
2319 p=dictionary+7+(i-1)*DICT_ENTRY_BYTE_LENGTH+res;
2323 p=dictionary+4 + (i-1)*DICT_ENTRY_BYTE_LENGTH + DICT_ENTRY_FLAG_POS;
2324 p[2]=to/256; p[3]=to%256;
2329 /* ------------------------------------------------------------------------- */
2330 /* Tracing code for the dictionary: used by "trace" and text */
2331 /* transcription. */
2332 /* ------------------------------------------------------------------------- */
2334 /* In the dictionary-showing code, if d_show_buf is NULL, the text is
2335 printed directly. (The "Trace dictionary" directive does this.)
2336 If d_show_buf is not NULL, we add words to it (reallocing if necessary)
2337 until it's a page-width.
2339 static char *d_show_buf = NULL;
2340 static int d_show_size; /* allocated size */
2341 static int d_show_len; /* current length */
2343 static void show_char(char c)
2345 if (d_show_buf == NULL) {
2349 if (d_show_len+2 >= d_show_size) {
2350 int newsize = 2 * d_show_len + 16;
2351 my_realloc(&d_show_buf, d_show_size, newsize, "dictionary display buffer");
2352 d_show_size = newsize;
2354 d_show_buf[d_show_len++] = c;
2355 d_show_buf[d_show_len] = '\0';
2359 /* Display a Unicode character in user-readable form. This uses the same
2360 character encoding as the source code. */
2361 static void show_uchar(uint32 c)
2367 /* ASCII always works */
2371 if (character_set_unicode) {
2372 /* UTF-8 the character */
2376 else if (c < 0x800) {
2377 show_char((0xC0 | ((c & 0x7C0) >> 6)));
2378 show_char((0x80 | (c & 0x03F) ));
2380 else if (c < 0x10000) {
2381 show_char((0xE0 | ((c & 0xF000) >> 12)));
2382 show_char((0x80 | ((c & 0x0FC0) >> 6)));
2383 show_char((0x80 | (c & 0x003F) ));
2385 else if (c < 0x200000) {
2386 show_char((0xF0 | ((c & 0x1C0000) >> 18)));
2387 show_char((0x80 | ((c & 0x03F000) >> 12)));
2388 show_char((0x80 | ((c & 0x000FC0) >> 6)));
2389 show_char((0x80 | (c & 0x00003F) ));
2396 if (character_set_setting == 1 && c < 0x100) {
2397 /* Fits in Latin-1 */
2401 /* Supporting other character_set_setting is harder; not currently implemented. */
2403 /* Use the escaped form */
2404 sprintf(buf, "@{%x}", c);
2405 for (ix=0; buf[ix]; ix++)
2409 extern void word_to_ascii(uchar *p, char *results)
2410 { int i, shift, cc, zchar; uchar encoded_word[9];
2411 encoded_word[0] = (((int) p[0])&0x7c)/4;
2412 encoded_word[1] = 8*(((int) p[0])&0x3) + (((int) p[1])&0xe0)/32;
2413 encoded_word[2] = ((int) p[1])&0x1f;
2414 encoded_word[3] = (((int) p[2])&0x7c)/4;
2415 encoded_word[4] = 8*(((int) p[2])&0x3) + (((int) p[3])&0xe0)/32;
2416 encoded_word[5] = ((int) p[3])&0x1f;
2417 if (version_number > 3)
2418 { encoded_word[6] = (((int) p[4])&0x7c)/4;
2419 encoded_word[7] = 8*(((int) p[4])&0x3) + (((int) p[5])&0xe0)/32;
2420 encoded_word[8] = ((int) p[5])&0x1f;
2424 encoded_word[6] = encoded_word[7] = encoded_word[8] = 0;
2428 for (i=0; i< ((version_number==3)?6:9); i++)
2429 { zchar = encoded_word[i];
2431 if (zchar == 4) shift = 1;
2433 if (zchar == 5) shift = 2;
2435 { if ((shift == 2) && (zchar == 6))
2436 { zchar = 32*encoded_word[i+1] + encoded_word[i+2];
2438 if ((zchar>=32) && (zchar<=126))
2439 results[cc++] = zchar;
2441 { zscii_to_text(results+cc, zchar);
2442 cc = strlen(results);
2446 { zscii_to_text(results+cc, (alphabet[shift])[zchar-6]);
2447 cc = strlen(results);
2455 /* Print a dictionary word to stdout.
2456 (This assumes that d_show_buf is null.)
2458 void print_dict_word(int node)
2464 char textual_form[32];
2465 p = (uchar *)dictionary + 7 + DICT_ENTRY_BYTE_LENGTH*node;
2467 word_to_ascii(p, textual_form);
2469 for (cprinted = 0; textual_form[cprinted]!=0; cprinted++)
2470 show_char(textual_form[cprinted]);
2473 p = (uchar *)dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*node;
2475 for (cprinted = 0; cprinted<DICT_WORD_SIZE; cprinted++)
2478 if (DICT_CHAR_SIZE == 1)
2481 ch = (p[4*cprinted+4] << 24) + (p[4*cprinted+5] << 16) + (p[4*cprinted+6] << 8) + (p[4*cprinted+7]);
2489 static void recursively_show_z(int node, int level)
2490 { int i, cprinted, flags; uchar *p;
2491 char textual_form[32];
2492 int res = (version_number == 3)?4:6; /* byte length of encoded text */
2494 if (dtree[node].branch[0] != VACANT)
2495 recursively_show_z(dtree[node].branch[0], level);
2497 p = (uchar *)dictionary + 7 + DICT_ENTRY_BYTE_LENGTH*node;
2499 word_to_ascii(p, textual_form);
2501 for (cprinted = 0; textual_form[cprinted]!=0; cprinted++)
2502 show_char(textual_form[cprinted]);
2503 for (; cprinted < 4 + ((version_number==3)?6:9); cprinted++)
2506 /* The level-1 info can only be printfed (d_show_buf must be null). */
2507 if (d_show_buf == NULL && level >= 1)
2510 for (i=0; i<DICT_ENTRY_BYTE_LENGTH; i++) printf("%02x ",p[i]);
2513 flags = (int) p[res];
2516 if (flags & 4) printf("p"); else printf(" ");
2521 { if (grammar_version_number == 1)
2522 printf("preposition:%d ", (int) p[res+2]);
2524 printf("preposition ");
2526 if ((flags & 3) == 3) printf("metaverb:%d ", (int) p[res+1]);
2527 else if ((flags & 3) == 1) printf("verb:%d ", (int) p[res+1]);
2531 /* Show five words per line in classic TRANSCRIPT_FORMAT; one per line in the new format. */
2532 if (d_show_buf && (d_show_len >= 64 || TRANSCRIPT_FORMAT == 1))
2534 write_to_transcript_file(d_show_buf, STRCTX_DICT);
2538 if (dtree[node].branch[1] != VACANT)
2539 recursively_show_z(dtree[node].branch[1], level);
2542 static void recursively_show_g(int node, int level)
2546 if (dtree[node].branch[0] != VACANT)
2547 recursively_show_g(dtree[node].branch[0], level);
2549 p = (uchar *)dictionary + 4 + DICT_ENTRY_BYTE_LENGTH*node;
2551 for (cprinted = 0; cprinted<DICT_WORD_SIZE; cprinted++)
2554 if (DICT_CHAR_SIZE == 1)
2557 ch = (p[4*cprinted+4] << 24) + (p[4*cprinted+5] << 16) + (p[4*cprinted+6] << 8) + (p[4*cprinted+7]);
2562 for (; cprinted<DICT_WORD_SIZE+4; cprinted++)
2565 /* The level-1 info can only be printfed (d_show_buf must be null). */
2566 if (d_show_buf == NULL && level >= 1)
2567 { int flagpos = (DICT_CHAR_SIZE == 1) ? (DICT_WORD_SIZE+1) : (DICT_WORD_BYTES+4);
2568 int flags = (p[flagpos+0] << 8) | (p[flagpos+1]);
2569 int verbnum = (p[flagpos+2] << 8) | (p[flagpos+3]);
2571 for (i=0; i<DICT_ENTRY_BYTE_LENGTH; i++) printf("%02x ",p[i]);
2575 if (flags & 4) printf("p"); else printf(" ");
2580 { printf("preposition ");
2582 if ((flags & 3) == 3) printf("metaverb:%d ", verbnum);
2583 else if ((flags & 3) == 1) printf("verb:%d ", verbnum);
2587 /* Show five words per line in classic TRANSCRIPT_FORMAT; one per line in the new format. */
2588 if (d_show_buf && (d_show_len >= 64 || TRANSCRIPT_FORMAT == 1))
2590 write_to_transcript_file(d_show_buf, STRCTX_DICT);
2594 if (dtree[node].branch[1] != VACANT)
2595 recursively_show_g(dtree[node].branch[1], level);
2598 static void show_alphabet(int i)
2599 { int j, c; char chartext[8];
2601 for (j=0; j<26; j++)
2602 { c = alphabet[i][j];
2604 if (alphabet_used[26*i+j] == 'N') printf("("); else printf(" ");
2606 zscii_to_text(chartext, c);
2607 printf("%s", chartext);
2609 if (alphabet_used[26*i+j] == 'N') printf(")"); else printf(" ");
2614 extern void show_dictionary(int level)
2616 /* Level 0: show words only. Level 1: show words and flags.
2617 Level 2: also show bytes.*/
2618 printf("Dictionary contains %d entries:\n",dict_entries);
2619 if (dict_entries != 0)
2620 { d_show_len = 0; d_show_buf = NULL;
2622 recursively_show_z(root, level);
2624 recursively_show_g(root, level);
2628 printf("\nZ-machine alphabet entries:\n");
2635 extern void write_dictionary_to_transcript(void)
2637 d_show_size = 80; /* initial size */
2638 d_show_buf = my_malloc(d_show_size, "dictionary display buffer");
2640 write_to_transcript_file("", STRCTX_INFO);
2641 sprintf(d_show_buf, "[Dictionary contains %d entries:]", dict_entries);
2642 write_to_transcript_file(d_show_buf, STRCTX_INFO);
2646 if (dict_entries != 0)
2649 recursively_show_z(root, 0);
2651 recursively_show_g(root, 0);
2653 if (d_show_len != 0) write_to_transcript_file(d_show_buf, STRCTX_DICT);
2655 my_free(&d_show_buf, "dictionary display buffer");
2656 d_show_len = 0; d_show_buf = NULL;
2659 /* ========================================================================= */
2660 /* Data structure management routines */
2661 /* ------------------------------------------------------------------------- */
2663 extern void init_text_vars(void)
2676 for (j=0; j<256; j++) abbrevs_lookup[j] = -1;
2678 total_zchars_trans = 0;
2683 final_dict_order = NULL;
2684 dict_sort_codes = NULL;
2687 static_strings_area = NULL;
2688 abbreviations_optimal_parse_schedule = NULL;
2689 abbreviations_optimal_parse_scores = NULL;
2691 compressed_offsets = NULL;
2692 huff_entities = NULL;
2694 unicode_usage_entries = NULL;
2697 extern void text_begin_pass(void)
2698 { abbrevs_lookup_table_made = FALSE;
2700 total_chars_trans=0; total_bytes_trans=0;
2702 dictionary_begin_pass();
2703 low_strings_top = 0;
2705 static_strings_extent = 0;
2707 no_dynamic_strings = 0;
2708 no_unicode_chars = 0;
2711 /* Note: for allocation and deallocation of all_the_text, see inform.c */
2713 extern void text_allocate_arrays(void)
2717 initialise_memory_list(&translated_text_memlist,
2718 sizeof(uchar), 8000, (void**)&translated_text,
2719 "translated text holding area");
2721 initialise_memory_list(&all_text_memlist,
2722 sizeof(char), 0, (void**)&all_text,
2723 "transcription text for optimise");
2725 initialise_memory_list(&static_strings_area_memlist,
2726 sizeof(uchar), 128, (void**)&static_strings_area,
2727 "static strings area");
2729 initialise_memory_list(&abbreviations_at_memlist,
2730 MAX_ABBREV_LENGTH, 64, (void**)&abbreviations_at,
2731 "abbreviation text");
2733 initialise_memory_list(&abbreviations_memlist,
2734 sizeof(abbreviation), 64, (void**)&abbreviations,
2737 initialise_memory_list(&abbreviations_optimal_parse_schedule_memlist,
2738 sizeof(int), 0, (void**)&abbreviations_optimal_parse_schedule,
2739 "abbreviations optimal parse schedule");
2740 initialise_memory_list(&abbreviations_optimal_parse_scores_memlist,
2741 sizeof(int), 0, (void**)&abbreviations_optimal_parse_scores,
2742 "abbreviations optimal parse scores");
2744 initialise_memory_list(&dtree_memlist,
2745 sizeof(dict_tree_node), 1500, (void**)&dtree,
2746 "red-black tree for dictionary");
2747 initialise_memory_list(&dict_sort_codes_memlist,
2748 sizeof(uchar), 1500*DICT_WORD_BYTES, (void**)&dict_sort_codes,
2749 "dictionary sort codes");
2751 final_dict_order = NULL; /* will be allocated at sort_dictionary() time */
2753 /* The exact size will be 7+7*num for z3, 7+9*num for z4+,
2754 4+DICT_ENTRY_BYTE_LENGTH*num for Glulx. But this is just an initial
2755 allocation; we don't have to be precise. */
2756 initialise_memory_list(&dictionary_memlist,
2757 sizeof(uchar), 1000*DICT_ENTRY_BYTE_LENGTH, (void**)&dictionary,
2760 initialise_memory_list(&low_strings_memlist,
2761 sizeof(uchar), 1024, (void**)&low_strings,
2762 "low (abbreviation) strings");
2768 huff_entities = NULL;
2770 unicode_usage_entries = NULL;
2771 done_compression = FALSE;
2772 compression_table_size = 0;
2773 compressed_offsets = NULL;
2775 initialise_memory_list(&unicode_usage_entries_memlist,
2776 sizeof(unicode_usage_t), 0, (void**)&unicode_usage_entries,
2777 "unicode entity entries");
2779 /* hufflist and huff_entities will be allocated at compress_game_text() time. */
2781 /* This hash table is only used in Glulx */
2782 for (ix=0; ix<UNICODE_HASH_BUCKETS; ix++)
2783 unicode_usage_hash[ix] = -1;
2785 initialise_memory_list(&compressed_offsets_memlist,
2786 sizeof(int32), 0, (void**)&compressed_offsets,
2787 "static strings index table");
2790 extern void extract_all_text()
2792 /* optimise_abbreviations() is called after free_arrays(). Therefore,
2793 we need to preserve the text transcript where it will not be
2794 freed up. We do this by copying the pointer to opttext. */
2796 opttextlen = all_text_top;
2798 /* Re-init all_text_memlist. This causes it to forget all about the
2799 old pointer. Deallocating it in text_free_arrays() will be a no-op. */
2800 initialise_memory_list(&all_text_memlist,
2801 sizeof(char), 0, (void**)&all_text,
2802 "dummy transcription text");
2805 extern void text_free_arrays(void)
2807 deallocate_memory_list(&translated_text_memlist);
2809 deallocate_memory_list(&all_text_memlist);
2811 deallocate_memory_list(&low_strings_memlist);
2812 deallocate_memory_list(&abbreviations_at_memlist);
2813 deallocate_memory_list(&abbreviations_memlist);
2815 deallocate_memory_list(&abbreviations_optimal_parse_schedule_memlist);
2816 deallocate_memory_list(&abbreviations_optimal_parse_scores_memlist);
2818 deallocate_memory_list(&dtree_memlist);
2819 deallocate_memory_list(&dict_sort_codes_memlist);
2820 my_free(&final_dict_order, "final dictionary ordering table");
2822 deallocate_memory_list(&dictionary_memlist);
2824 deallocate_memory_list(&compressed_offsets_memlist);
2825 my_free(&hufflist, "huffman node list");
2826 my_free(&huff_entities, "huffman entities");
2828 deallocate_memory_list(&unicode_usage_entries_memlist);
2830 deallocate_memory_list(&static_strings_area_memlist);
2833 extern void ao_free_arrays(void)
2835 /* Called only after optimise_abbreviations() runs. */
2837 my_free (&opttext,"stashed transcript for optimisation");
2838 my_free (&bestyet,"bestyet");
2839 my_free (&bestyet2,"bestyet2");
2840 my_free (&grandtable,"grandtable");
2841 my_free (&grandflags,"grandflags");
2843 deallocate_memory_list(&tlbtab_memlist);
2845 /* This was re-inited, so we should re-deallocate it. */
2846 deallocate_memory_list(&all_text_memlist);
2849 /* ========================================================================= */