MimIR 0.1
MimIR is my Intermediate Representation
Loading...
Searching...
No Matches
lexer.cpp
Go to the documentation of this file.
1#include "mim/ast/lexer.h"
2
3#include "mim/ast/ast.h"
4
5using namespace std::literals;
6
7namespace mim::ast {
8
9namespace utf8 = fe::utf8;
10using Tag = Tok::Tag;
11
12Lexer::Lexer(AST& ast, std::istream& istream, const fs::path* path /*= nullptr*/, std::ostream* md /*= nullptr*/)
13 : Super(istream, path)
14 , ast_(ast)
15 , md_(md) {
16#define CODE(t, str) keywords_[ast.sym(str)] = Tag::t;
18#undef CODE
19
20#define CODE(str, t) \
21 if (Tag::t != Tag::Nil) keywords_[ast.sym(str)] = Tag::t;
23#undef CODE
24
25 if (start_md())
26 emit_md(true);
27 else
28 md_fence();
29}
30
32 while (true) {
33 if (auto cache = cache_) {
34 cache_.reset();
35 return *cache;
36 }
37
38 start();
39
40 if (accept(utf8::EoF)) return tok(Tag::EoF);
41 if (accept(utf8::isspace)) continue;
42 if (accept(utf8::Null)) {
43 ast().error(loc_, "invalid UTF-8 character");
44 continue;
45 }
46
47 // clang-format off
48 // delimiters
49 if (accept( '(')) return tok(Tag::D_paren_l);
50 if (accept( ')')) return tok(Tag::D_paren_r);
51 if (accept( '[')) return tok(Tag::D_brckt_l);
52 if (accept( ']')) return tok(Tag::D_brckt_r);
53 if (accept( '{')) return tok(Tag::D_brace_l);
54 if (accept( '}')) return tok(Tag::D_brace_r);
55 if (accept(U'«')) return tok(Tag::D_quote_l);
56 if (accept(U'»')) return tok(Tag::D_quote_r);
57 if (accept(U'⟪')) return tok(Tag::D_quote_l);
58 if (accept(U'⟫')) return tok(Tag::D_quote_r);
59 if (accept(U'‹')) return tok(Tag::D_angle_l);
60 if (accept(U'›')) return tok(Tag::D_angle_r);
61 if (accept(U'⟨')) return tok(Tag::D_angle_l);
62 if (accept(U'⟩')) return tok(Tag::D_angle_r);
63 if (accept( '<')) {
64 if (accept( '<')) return tok(Tag::D_quote_l);
65 return tok(Tag::D_angle_l);
66 }
67 if (accept( '>')) {
68 if (accept( '>')) return tok(Tag::D_quote_r);
69 return tok(Tag::D_angle_r);
70 }
71 // further tokens
72 if (accept('`')) return tok(Tag::T_backtick);
73 if (accept(U'→')) return tok(Tag::T_arrow);
74 if (accept( '@')) return tok(Tag::T_at);
75 if (accept( '=')) return tok(Tag::T_assign);
76 if (accept(U'⊥')) return tok(Tag::T_bot);
77 if (accept(U'⊤')) return tok(Tag::T_top);
78 if (accept(U'□')) return tok(Tag::T_box);
79 if (accept( ',')) return tok(Tag::T_comma);
80 if (accept( '$')) return tok(Tag::T_dollar);
81 if (accept( '#')) return tok(Tag::T_extract);
82 if (accept(U'λ')) return tok(Tag::T_lm);
83 if (accept(U'Π')) return tok(Tag::T_Pi);
84 if (accept( ';')) return tok(Tag::T_semicolon);
85 if (accept(U'★')) return tok(Tag::T_star);
86 if (accept( '*')) return tok(Tag::T_star);
87 if (accept( ':')) {
88 if (accept( ':')) return tok(Tag::T_colon_colon);
89 return tok(Tag::T_colon);
90 }
91 if (accept( '|')) {
92 if (accept('~')) {
93 if (accept('|')) return tok(Tag::T_Pi);
94 }
95 ast().error(loc_, "invalid input char '{}'; maybe you wanted to use '|~|'?", str_);
96 continue;
97 }
98 // clang-format on
99
100 if (accept('%')) {
101 if (lex_id()) {
102 auto loc = cache_trailing_dot();
103 return {loc, Tag::M_anx, sym()};
104 }
105 ast().error(loc_, "invalid axiom name '{}'", str_);
106 }
107
108 if (accept('.')) {
109 if (lex_id()) {
110 if (auto i = keywords_.find(sym()); i != keywords_.end()) return tok(i->second);
111 // Split non-keyword into T_dot and M_id; M_id goes into cache_ for next lex().
112 assert(!cache_.has_value());
113 auto id_loc = loc();
114 ++id_loc.begin.col;
115 cache_.emplace(id_loc, Tag::M_id, ast().sym(str_.substr(1)));
116 return {loc().anew_begin(), Tag::T_dot};
117 }
118
119 if (accept(utf8::isdigit)) {
120 parse_digits();
121 parse_exp();
122 return {loc_, f64(std::strtod(str_.c_str(), nullptr))};
123 }
124
125 return tok(Tag::T_dot);
126 }
127
128 if (accept('\'')) {
129 auto c = lex_char();
130 if (accept('\'')) return {loc(), c};
131 ast().error(loc_, "invalid character literal {}", str_);
132 continue;
133 }
134
135 if (accept<Append::Off>('\"')) {
136 while (lex_char() != '"') {}
137 str_.pop_back(); // remove final '"'
138 return {loc_, Tag::L_str, sym()};
139 }
140
141 if (lex_id()) {
142 auto loc = cache_trailing_dot();
143 return {loc, Tag::M_id, sym()};
144 }
145
146 if (utf8::isdigit(ahead()) || utf8::any('+', '-')(ahead())) {
147 if (auto lit = parse_lit()) return *lit;
148 continue;
149 }
150
151 if (start_md()) {
152 emit_md();
153 continue;
154 }
155
156 // comments
157 if (accept('/')) {
158 if (accept('*')) {
159 eat_comments();
160 continue;
161 }
162 if (accept('/')) {
163 while (ahead() != utf8::EoF && ahead() != '\n') next();
164 continue;
165 }
166
167 ast().error({loc_.path, peek_}, "invalid input char '/'; maybe you wanted to start a comment?");
168 continue;
169 }
170
171 ast().error({loc_.path, peek_}, "invalid input char '{}'", utf8::Char32(ahead()));
172 next();
173 }
174}
175
176// A trailing T_dot does not belong to an annex name or identifier and goes into cache_ for next lex().
177Loc Lexer::cache_trailing_dot() {
178 auto l = loc();
179 if (str_.back() == '.') {
180 str_.pop_back();
181 assert(!cache_.has_value());
182 cache_.emplace(l.anew_finis(), Tag::T_dot);
183 --l.finis.col;
184 }
185 return l;
186}
187
188bool Lexer::lex_id() {
189 if (accept([](char32_t c) { return c == '_' || utf8::isalpha(c); })) {
190 while (accept([](char32_t c) { return c == '_' || c == '.' || utf8::isalnum(c); })) {}
191 return true;
192 }
193 return false;
194}
195
196// clang-format off
197std::optional<Tok> Lexer::parse_lit() {
198 int base = 10;
199 std::optional<bool> sign;
200
201 if (accept<Append::Off>('+')) {
202 sign = false;
203 } else if (accept<Append::Off>('-')) {
204 if (accept('>')) return tok(Tag::T_arrow);
205 sign = true;
206 }
207
208 // prefix starting with '0'
209 if (accept<Append::Off>('0')) {
210 if (accept<Append::Off>('b')) base = 2;
211 else if (accept<Append::Off>('B')) base = 2;
212 else if (accept<Append::Off>('o')) base = 8;
213 else if (accept<Append::Off>('O')) base = 8;
214 else if (accept<Append::Off>('x')) base = 16;
215 else if (accept<Append::Off>('X')) base = 16;
216 }
217
218 parse_digits(base);
219
220 if (accept(utf8::any('i', 'I'))) {
221 if (sign) str_.insert(0, "-"sv);
222 auto val = std::strtoull(str_.c_str(), nullptr, base);
223 str_.clear();
224 parse_digits();
225 auto width = std::strtoull(str_.c_str(), nullptr, 10);
226 return Tok{loc_, ast().world().lit_int(width, val)};
227 }
228
229 if (!sign && base == 10) {
230 if (utf8::isrange(ahead(), U'₀', U'₉')) {
231 auto i = std::strtoull(str_.c_str(), nullptr, 10);
232 std::string mod;
233 while (utf8::isrange(ahead(), U'₀', U'₉')) mod += next() - U'₀' + '0';
234 auto m = std::strtoull(mod.c_str(), nullptr, 10);
235 return Tok{loc_, ast().world().lit_idx_mod(m, i)};
236 } else if (accept<Append::Off>('_')) {
237 auto i = std::strtoull(str_.c_str(), nullptr, 10);
238 str_.clear();
239 if (accept(utf8::isdigit)) {
240 parse_digits(10);
241 auto m = std::strtoull(str_.c_str(), nullptr, 10);
242 return Tok{loc_, ast().world().lit_idx_mod(m, i)};
243 } else {
244 ast().error(loc_, "stray underscore in .Idx literal; size is missing");
245 auto i = std::strtoull(str_.c_str(), nullptr, 10);
246 return Tok{loc_, u64(i)};
247 }
248 }
249 }
250
251 bool is_float = false;
252 if (base == 10 || base == 16) {
253 // parse fractional part
254 if (accept('.')) {
255 is_float = true;
256 parse_digits(base);
257 }
258
259 bool has_exp = parse_exp(base);
260 if (base == 16 && is_float && !has_exp) ast().error(loc_, "hexadecimal floating constants require an exponent");
261 is_float |= has_exp;
262 }
263
264 if (sign && str_.empty()) {
265 ast().error(loc_, "stray '{}'", *sign ? "-" : "+");
266 return {};
267 }
268
269 if (is_float && base == 16) str_.insert(0, "0x"sv);
270 if (sign && *sign) str_.insert(0, "-"sv);
271
272 if (is_float) return Tok{loc_, f64(std::strtod (str_.c_str(), nullptr ))};
273 if (sign) return Tok{loc_, u64(std::strtoll (str_.c_str(), nullptr, base))};
274 else return Tok{loc_, u64(std::strtoull(str_.c_str(), nullptr, base))};
275}
276
277void Lexer::parse_digits(int base /*= 10*/) {
278 switch (base) {
279 // clang-format off
280 case 2: while (accept(utf8::isbdigit)) {} break;
281 case 8: while (accept(utf8::isodigit)) {} break;
282 case 10: while (accept(utf8::isdigit)) {} break;
283 case 16: while (accept(utf8::isxdigit)) {} break;
284 // clang-format on
285 default: fe::unreachable();
286 }
287}
288
289bool Lexer::parse_exp(int base /*= 10*/) {
290 if (accept(base == 10 ? utf8::any('e', 'E') : utf8::any('p', 'P'))) {
291 accept(utf8::any('+', '-'));
292 if (!utf8::isdigit(ahead())) ast().error(loc_, "exponent has no digits");
293 parse_digits();
294 return true;
295 }
296 return false;
297}
298// clang-format on
299
300char8_t Lexer::lex_char() {
301 if (accept<Append::Off>('\\')) {
302 // clang-format off
303 if (false) {}
304 else if (accept<Append::Off>('\'')) str_ += '\'';
305 else if (accept<Append::Off>('\\')) str_ += '\\';
306 else if (accept<Append::Off>( '"')) str_ += '\"';
307 else if (accept<Append::Off>( '0')) str_ += '\0';
308 else if (accept<Append::Off>( 'a')) str_ += '\a';
309 else if (accept<Append::Off>( 'b')) str_ += '\b';
310 else if (accept<Append::Off>( 'f')) str_ += '\f';
311 else if (accept<Append::Off>( 'n')) str_ += '\n';
312 else if (accept<Append::Off>( 'r')) str_ += '\r';
313 else if (accept<Append::Off>( 't')) str_ += '\t';
314 else if (accept<Append::Off>( 'v')) str_ += '\v';
315 else ast().error(loc_.anew_finis(), "invalid escape character '\\{}'", (char)ahead());
316 // clang-format on
317 return str_.back();
318 }
319 auto c = next();
320 str_ += c;
321 if (utf8::isascii(c)) return c;
322 ast().error(loc_, "invalid character '{}'", (char)c);
323 return '\0';
324}
325
326void Lexer::eat_comments() {
327 while (true) {
328 while (ahead() != utf8::EoF && ahead() != '*') next();
329 if (accept(utf8::EoF)) {
330 ast().error(loc_, "non-terminated multiline comment");
331 return;
332 }
333 next();
334 if (accept('/')) break;
335 }
336}
337
338void Lexer::emit_md(bool start_of_file) {
339 if (!start_of_file) md_fence();
340
341 do {
342 out_ = false;
343 for (int i = 0; i < 3; ++i) next();
344 accept(' ');
345 out_ = true;
346
347 while (ahead() != utf8::EoF && ahead() != '\n') next();
348 accept('\n');
349 } while (start_md());
350
351 if (ahead() == utf8::EoF)
352 out_ = false;
353 else
354 md_fence();
355}
356
357Sym Lexer::sym() { return ast().sym(str_); }
358
359} // namespace mim::ast
const Lit * lit_idx_mod(nat_t mod, u64 val)
Constructs a Lit of type Idx of size mod.
Definition world.h:423
const Lit * lit_int(nat_t width, u64 val)
Constructs a Lit of type Idx of size $2^width$.
Definition world.h:410
World & world()
Definition ast.h:60
Error & error()
Definition ast.h:62
Sym sym(const char *s)
Definition ast.h:68
Lexer(AST &, std::istream &, const fs::path *path=nullptr, std::ostream *md=nullptr)
Creates a lexer to read *.mim files (see Lexical Structure).
Definition lexer.cpp:12
AST & ast()
Definition lexer.h:24
Loc loc() const
Definition lexer.h:26
Definition ast.h:13
double f64
Definition types.h:41
uint64_t u64
Definition types.h:34
#define MIM_SUBST(m)
Definition tok.h:99
#define CODE(t, str)
Definition tok.h:53
#define MIM_KEY(m)
Definition tok.h:13