src/read.c

   1 /*
   2 Copyright (C) 2017-2018 Keziah Wesley
   3
   4 You can redistribute and/or modify this file under the terms of the
   5 GNU Affero General Public License as published by the Free Software
   6 Foundation, either version 3 of the License, or (at your option) any
   7 later version.
   8
   9 This file is distributed in the hope that it will be useful, but
  10 WITHOUT ANY WARRANTY; without even the implied warranty of
  11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12 Affero General Public License for more details.
  13
  14 You should have received a copy of the GNU Affero General Public
  15 License along with this file. If not, see
  16 <http://www.gnu.org/licenses/>.
  17 */
  18
  19 #include "read.h"
  20 #include "object.h"
  21 #include "oblist.h"
  22
  23 #include <assert.h>
  24 #include <stdio.h>
  25
  26 /**
  27 Tokenization
  28 */
  29
  30 // Return the number of whitespace characters at the beginning of the input.
  31 static int
  32 count_whitespace (const char *p)
  33 {
  34   for (int i = 0;; i++)
  35     {
  36       switch (p[i])
  37         {
  38         case 0:
  39         case ' ':
  40         case '\n':
  41         case '\r':
  42         case '\t':
  43         case '\v':
  44         case '\f':
  45           break;
  46         default:
  47           return i;
  48         }
  49     }
  50 }
  51
  52 // Return the number of characters at the beginning of the input
  53 // constituting a valid PNAME.
  54 // If the input ends with '\', it is incomplete, and -1 is returned.
  55 static int
  56 count_pname (const char *p)
  57 {
  58   // rule (1): valid FLOAT / FIX isn't a PNAME.
  59   // Caller must try to read as number first!
  60
  61   // rule (2): dot can't be first
  62   if (p[0] == '.')
  63     return 0;
  64
  65   // rule (3): "if you can't type it interactively, it's only valid non-interactively"
  66   // (not currently enforced; just don't give your atoms names like ^L)
  67
  68   for (int i = 0;; i++)
  69     {
  70       switch (p[i])
  71         {
  72           // separators end the PNAME
  73           // rule (4): whitespace
  74           // rule (5): special characters
  75           // rule (6): brackets
  76         case ' ':
  77         case '\n':
  78         case '\r':
  79         case '\t':
  80         case '\v':
  81         case '\f':
  82         case ',':
  83         case '#':
  84         case '\'':
  85         case ';':
  86         case '%':
  87         case '(':
  88         case ')':
  89         case '[':
  90         case ']':
  91         case '<':
  92         case '>':
  93         case '{':
  94         case '}':
  95         case '"':
  96         case 0:
  97           return i;
  98           // escape: next char becomes normal
  99         case '\\':
 100           if (!p[++i])
 101             return -1;
 102         }
 103     }
 104 }
 105
 106 /**
 107 State machine
 108 */
 109
 110 /*
 111 static uint32_t obj_get_fix32(const object *o) {
 112     assert(o->type == EVALTYPE_FIX32);
 113     return o->v.fix32;
 114 }
 115 */
 116
 117 static int
 118 read_num (const char *p, reader_stack * st)
 119 {
 120   int i = 0;
 121   // Use an unsigned intermediate to simplify overflow checks.
 122   uint64_t x = 0;
 123   // Disallow "number" composed of ancillary number components only.
 124   bool gotdigits = false;
 125   // Skip for now, later we'll check again to potentially negate.
 126   if (p[0] == '-')
 127     i++;
 128   // TODO: asterisk-deliminated *octal*
 129   // TODO: other bases?
 130   for (; p[i]; i++)
 131     {
 132       if (p[i] >= '0' && p[i] <= '9')
 133         {
 134           if (x * 10 + (p[i] - '0') < x)
 135             goto read_float;
 136           x = x * 10 + (p[i] - '0');
 137           gotdigits = true;
 138           continue;
 139         }
 140
 141       // TODO: decimal points, exponent notation
 142       // NB. a terminal decimal denotes a FIX
 143       // NB. exponent notation doesn't necessarily indicate a float
 144
 145       // TODO: distinguish 'delimiter' characters that terminate
 146       // number from 'identifier' chars that cause parsing to fail
 147       // (and potentially be parsed as an atom)
 148       break;
 149     }
 150   if (!gotdigits)
 151     return 0;
 152   if (p[0] != '-')
 153     {
 154       if (x <= INT32_MAX)
 155         (--(st->pos))->fix32 = new_fix32 ((int32_t) x);
 156       else if (x <= INT64_MAX)
 157         (--(st->pos))->fix64 = new_fix64 (x);
 158       else
 159         goto read_float;
 160     }
 161   else
 162     {
 163       if (-x >= (uint64_t) INT32_MIN)
 164         (--(st->pos))->fix32 = new_fix32 (0 - (int32_t) x);
 165       else if (-x >= (uint64_t) INT64_MIN)
 166         (--(st->pos))->fix64 = new_fix64 (0 - (int64_t) x);
 167       else
 168         goto read_float;
 169     }
 170   st->framelen++;
 171   return i;
 172 read_float:
 173   assert (0 && "unimplemented: promote num to float");
 174   return i;
 175 }
 176
 177 // stack[0..len]: objs in current list
 178 // stack[len]: parent len
 179 const char *
 180 read_token (const char *p, reader_stack * st)
 181 {
 182   p += count_whitespace (p);
 183   switch (p[0])
 184     {
 185     case '\0':
 186       break;
 187     case '(':
 188     case '<':
 189     case '[':
 190       {
 191         // opener: push current framelen; start new frame
 192         // store child type and parent framelen in a pseudo-object together
 193         evaltype type;
 194         switch (p++[0])
 195           {
 196           case '(':
 197             type = EVALTYPE_LIST;
 198             break;
 199           case '<':
 200             type = EVALTYPE_FORM;
 201             break;
 202           case '[':
 203             type = EVALTYPE_VECTOR;
 204             break;
 205           default:
 206             assert (0 && "martian opener token?");
 207           }
 208         *--(st->pos) = (object)
 209         {
 210           .fix32.type = type,.fix32.rest = 0,.fix32.val = (fix32_val)
 211           {
 212           .n = st->framelen}
 213         ,};
 214         st->framelen = 0;
 215         break;
 216       }
 217     case ')':
 218     case '>':
 219       {
 220         evaltype type;
 221         switch (p++[0])
 222           {
 223           case ')':
 224             type = EVALTYPE_LIST;
 225             break;
 226           case '>':
 227             type = EVALTYPE_FORM;
 228             break;
 229           default:
 230             assert (0 && "martian list closer token?");
 231           }
 232         // build list from current stack frame
 233         pool_ptr o =
 234           pool_copy_array_rev ((pool_object *) st->pos, st->framelen);
 235         // pop frame, push new LIST
 236         st->pos += st->framelen;
 237         assert (st->pos->type == type);
 238         st->framelen = st->pos->fix32.val.n + 1;
 239         // overwrite the frame marker with the collection it became
 240         st->pos->list = new_list (o);
 241         st->pos->list.type = type;
 242         break;
 243       }
 244     case ']':
 245       {
 246         p++;
 247         // build vector from current stack frame
 248         heap_ptr h = heap_copy_array_rev (st->pos, st->framelen);
 249         // pop frame, push new VECTOR
 250         uint32_t len = st->framelen;
 251         st->pos += st->framelen;
 252         assert (st->pos->type == EVALTYPE_VECTOR);
 253         st->framelen = st->pos->fix32.val.n + 1;
 254         st->pos->vector = new_vector (h, len);
 255         break;
 256       }
 257     default:
 258       {
 259         int n = read_num (p, st);
 260         if (n)
 261           return p + n;
 262
 263         n = count_pname (p);
 264         if (n > 0)
 265           {
 266             (--(st->pos))->atom = oblist_find_or_insert (root, p, n);
 267             st->framelen++;
 268             return p + n;
 269           }
 270
 271         fprintf (stderr, "read unimplemented for char: '%c'\n", *p);
 272         assert (0 && "read unimplemented for char");
 273       }
 274     }
 275   return p;
 276 }