MimIR 0.1
MimIR is my Intermediate Representation
Loading...
Searching...
No Matches
lexer.cpp
Go to the documentation of this file.
1#include "mim/ast/lexer.h"
2
3#include "mim/ast/ast.h"
4
5using namespace std::literals;
6
7namespace mim::ast {
8
9namespace utf8 = fe::utf8;
10using Tag = Tok::Tag;
11
12Lexer::Lexer(AST& ast, std::istream& istream, const fs::path* path /*= nullptr*/, std::ostream* md /*= nullptr*/)
13 : Super(istream, path)
14 , ast_(ast)
15 , md_(md) {
16#define CODE(t, str) keywords_[ast.sym(str)] = Tag::t;
18#undef CODE
19
20#define CODE(str, t) \
21 if (Tag::t != Tag::Nil) keywords_[ast.sym(str)] = Tag::t;
23#undef CODE
24
25 if (start_md())
26 emit_md(true);
27 else
28 md_fence();
29}
30
32 while (true) {
33 start();
34
35 if (accept(utf8::EoF)) return tok(Tag::EoF);
36 if (accept(utf8::isspace)) continue;
37 if (accept(utf8::Null)) {
38 ast().error(loc_, "invalid UTF-8 character");
39 continue;
40 }
41
42 // clang-format off
43 // delimiters
44 if (accept( '(')) return tok(Tag::D_paren_l);
45 if (accept( ')')) return tok(Tag::D_paren_r);
46 if (accept( '[')) return tok(Tag::D_brckt_l);
47 if (accept( ']')) return tok(Tag::D_brckt_r);
48 if (accept( '{')) return tok(Tag::D_brace_l);
49 if (accept( '}')) return tok(Tag::D_brace_r);
50 if (accept(U'⦃')) return tok(Tag::D_curly_l);
51 if (accept(U'⦄')) return tok(Tag::D_curly_r);
52 if (accept(U'«')) return tok(Tag::D_quote_l);
53 if (accept(U'»')) return tok(Tag::D_quote_r);
54 if (accept(U'⟪')) return tok(Tag::D_quote_l);
55 if (accept(U'⟫')) return tok(Tag::D_quote_r);
56 if (accept(U'‹')) return tok(Tag::D_angle_l);
57 if (accept(U'›')) return tok(Tag::D_angle_r);
58 if (accept(U'⟨')) return tok(Tag::D_angle_l);
59 if (accept(U'⟩')) return tok(Tag::D_angle_r);
60 if (accept( '<')) {
61 if (accept( '<')) return tok(Tag::D_quote_l);
62 return tok(Tag::D_angle_l);
63 }
64 if (accept( '>')) {
65 if (accept( '>')) return tok(Tag::D_quote_r);
66 return tok(Tag::D_angle_r);
67 }
68 // further tokens
69 if (accept(U'→')) return tok(Tag::T_arrow);
70 if (accept( '@')) return tok(Tag::T_at);
71 if (accept( '=')) {
72 if (accept('>')) return tok(Tag::T_fat_arrow);
73 return tok(Tag::T_assign);
74 }
75 if (accept(U'⊥')) return tok(Tag::T_bot);
76 if (accept(U'⊤')) return tok(Tag::T_top);
77 if (accept(U'□')) return tok(Tag::T_box);
78 if (accept( ',')) return tok(Tag::T_comma);
79 if (accept( '$')) return tok(Tag::T_dollar);
80 if (accept( '#')) return tok(Tag::T_extract);
81 if (accept(U'λ')) return tok(Tag::T_lm);
82 if (accept( '|')) return tok(Tag::T_pipe);
83 if (accept( ';')) return tok(Tag::T_semicolon);
84 if (accept(U'★')) return tok(Tag::T_star);
85 if (accept( '*')) return tok(Tag::T_star);
86 if (accept( ':')) return tok(Tag::T_colon);
87 if (accept(U'∪')) return tok(Tag::T_union);
88 // clang-format on
89
90 if (accept('%')) {
91 if (lex_id()) return {loc_, Tag::M_anx, sym()};
92 ast().error(loc_, "invalid axm name '{}'", str_);
93 continue;
94 }
95
96 if (accept('.')) {
97 if (accept(utf8::isdigit)) {
98 parse_digits();
99 parse_exp();
100 return {loc_, f64(std::strtod(str_.c_str(), nullptr))};
101 }
102
103 return tok(Tag::T_dot);
104 }
105
106 if (accept('\'')) {
107 auto c = lex_char();
108 if (accept('\'')) return {loc_, c};
109 ast().error(loc_, "invalid character literal {}", str_);
110 continue;
111 }
112
113 if (accept<Append::Off>('\"')) {
114 while (lex_char() != '"') {}
115 str_.pop_back(); // remove final '"'
116 return {loc_, Tag::L_str, sym()};
117 }
118
119 if (lex_id()) {
120 if (auto i = keywords_.find(sym()); i != keywords_.end()) return tok(i->second);
121 return {loc_, Tag::M_id, sym()};
122 }
123
124 if (utf8::isdigit(ahead()) || utf8::any('+', '-')(ahead())) {
125 if (auto lit = parse_lit()) return *lit;
126 continue;
127 }
128
129 if (start_md()) {
130 emit_md();
131 continue;
132 }
133
134 // comments
135 if (accept('/')) {
136 if (accept('*')) {
137 eat_comments();
138 continue;
139 }
140 if (accept('/')) {
141 while (ahead() != utf8::EoF && ahead() != '\n')
142 next();
143 continue;
144 }
145
146 ast().error({loc_.path, peek_}, "invalid input char '/'; maybe you wanted to start a comment?");
147 continue;
148 }
149
150 ast().error({loc_.path, peek_}, "invalid input char '{}'", utf8::Char32(ahead()));
151 next();
152 }
153}
154
155bool Lexer::lex_id() {
156 if (accept([](char32_t c) { return c == '_' || utf8::isalpha(c); })) {
157 while (accept([](char32_t c) { return c == '_' || c == '.' || utf8::isalnum(c); })) {}
158 return true;
159 }
160 return false;
161}
162
163// clang-format off
164std::optional<Tok> Lexer::parse_lit() {
165 int base = 10;
166 std::optional<bool> sign;
167
168 if (accept<Append::Off>('+')) {
169 sign = false;
170 } else if (accept<Append::Off>('-')) {
171 if (accept('>')) return tok(Tag::T_arrow);
172 sign = true;
173 }
174
175 // prefix starting with '0'
176 if (accept<Append::Off>('0')) {
177 if (accept<Append::Off>('b')) base = 2;
178 else if (accept<Append::Off>('B')) base = 2;
179 else if (accept<Append::Off>('o')) base = 8;
180 else if (accept<Append::Off>('O')) base = 8;
181 else if (accept<Append::Off>('x')) base = 16;
182 else if (accept<Append::Off>('X')) base = 16;
183 }
184
185 parse_digits(base);
186
187 if (accept(utf8::any('i', 'I'))) {
188 if (sign) str_.insert(0, "-"sv);
189 auto val = std::strtoull(str_.c_str(), nullptr, base);
190 str_.clear();
191 parse_digits();
192 auto width = std::strtoull(str_.c_str(), nullptr, 10);
193 return Tok{loc_, ast().world().lit_int(width, val)};
194 }
195
196 if (!sign && base == 10) {
197 if (utf8::isrange(ahead(), U'₀', U'₉')) {
198 auto i = std::strtoull(str_.c_str(), nullptr, 10);
199 std::string mod;
200 while (utf8::isrange(ahead(), U'₀', U'₉')) mod += next() - U'₀' + '0';
201 auto m = std::strtoull(mod.c_str(), nullptr, 10);
202 return Tok{loc_, ast().world().lit_idx_mod(m, i)};
203 } else if (accept<Append::Off>('_')) {
204 auto i = std::strtoull(str_.c_str(), nullptr, 10);
205 str_.clear();
206 if (accept(utf8::isdigit)) {
207 parse_digits(10);
208 auto m = std::strtoull(str_.c_str(), nullptr, 10);
209 return Tok{loc_, ast().world().lit_idx_mod(m, i)};
210 } else {
211 ast().error(loc_, "stray underscore in Idx literal; size is missing");
212 auto i = std::strtoull(str_.c_str(), nullptr, 10);
213 return Tok{loc_, u64(i)};
214 }
215 }
216 }
217
218 bool is_float = false;
219 if (base == 10 || base == 16) {
220 // parse fractional part
221 if (accept('.')) {
222 is_float = true;
223 parse_digits(base);
224 }
225
226 bool has_exp = parse_exp(base);
227 if (base == 16 && is_float && !has_exp) ast().error(loc_, "hexadecimal floating constants require an exponent");
228 is_float |= has_exp;
229 }
230
231 if (sign && str_.empty()) {
232 ast().error(loc_, "stray '{}'", *sign ? "-" : "+");
233 return {};
234 }
235
236 if (is_float && base == 16) str_.insert(0, "0x"sv);
237 if (sign && *sign) str_.insert(0, "-"sv);
238
239 if (is_float) return Tok{loc_, f64(std::strtod (str_.c_str(), nullptr ))};
240 if (sign) return Tok{loc_, u64(std::strtoll (str_.c_str(), nullptr, base))};
241 else return Tok{loc_, u64(std::strtoull(str_.c_str(), nullptr, base))};
242}
243
244void Lexer::parse_digits(int base /*= 10*/) {
245 switch (base) {
246 // clang-format off
247 case 2: while (accept(utf8::isbdigit)) {} break;
248 case 8: while (accept(utf8::isodigit)) {} break;
249 case 10: while (accept(utf8::isdigit)) {} break;
250 case 16: while (accept(utf8::isxdigit)) {} break;
251 // clang-format on
252 default: fe::unreachable();
253 }
254}
255
256bool Lexer::parse_exp(int base /*= 10*/) {
257 if (accept(base == 10 ? utf8::any('e', 'E') : utf8::any('p', 'P'))) {
258 accept(utf8::any('+', '-'));
259 if (!utf8::isdigit(ahead())) ast().error(loc_, "exponent has no digits");
260 parse_digits();
261 return true;
262 }
263 return false;
264}
265// clang-format on
266
267char8_t Lexer::lex_char() {
268 if (accept<Append::Off>('\\')) {
269 // clang-format off
270 if (false) {}
271 else if (accept<Append::Off>('\'')) str_ += '\'';
272 else if (accept<Append::Off>('\\')) str_ += '\\';
273 else if (accept<Append::Off>( '"')) str_ += '\"';
274 else if (accept<Append::Off>( '0')) str_ += '\0';
275 else if (accept<Append::Off>( 'a')) str_ += '\a';
276 else if (accept<Append::Off>( 'b')) str_ += '\b';
277 else if (accept<Append::Off>( 'f')) str_ += '\f';
278 else if (accept<Append::Off>( 'n')) str_ += '\n';
279 else if (accept<Append::Off>( 'r')) str_ += '\r';
280 else if (accept<Append::Off>( 't')) str_ += '\t';
281 else if (accept<Append::Off>( 'v')) str_ += '\v';
282 else ast().error(loc_.anew_finis(), "invalid escape character '\\{}'", (char)ahead());
283 // clang-format on
284 return str_.back();
285 }
286 auto c = next();
287 str_ += c;
288 if (utf8::isascii(c)) return char8_t(c);
289 ast().error(loc_, "invalid character '{}'", (char)c);
290 return '\0';
291}
292
293void Lexer::eat_comments() {
294 while (true) {
295 while (ahead() != utf8::EoF && ahead() != '*')
296 next();
297 if (accept(utf8::EoF)) {
298 ast().error(loc_, "non-terminated multiline comment");
299 return;
300 }
301 next();
302 if (accept('/')) break;
303 }
304}
305
306void Lexer::emit_md(bool start_of_file) {
307 if (!start_of_file) md_fence();
308
309 do {
310 out_ = false;
311 for (int i = 0; i < 3; ++i)
312 next();
313 accept(' ');
314 out_ = true;
315
316 while (ahead() != utf8::EoF && ahead() != '\n')
317 next();
318 accept('\n');
319 } while (start_md());
320
321 if (ahead() == utf8::EoF)
322 out_ = false;
323 else
324 md_fence();
325}
326
327Sym Lexer::sym() { return ast().sym(str_); }
328
329} // namespace mim::ast
const Lit * lit_idx_mod(nat_t mod, u64 val)
Constructs a Lit of type Idx of size mod.
Definition world.h:490
const Lit * lit_int(nat_t width, u64 val)
Constructs a Lit of type Idx of size 2^width.
Definition world.h:477
World & world()
Definition ast.h:61
Error & error()
Definition ast.h:63
Sym sym(const char *s)
Definition ast.h:69
Lexer(AST &, std::istream &, const fs::path *path=nullptr, std::ostream *md=nullptr)
Creates a lexer to read *.mim files (see Lexical Structure).
Definition lexer.cpp:12
AST & ast()
Definition lexer.h:22
const fs::path * path() const
Definition lexer.h:23
Definition ast.h:14
Tok::Tag Tag
Definition bind.cpp:7
double f64
Definition types.h:42
uint64_t u64
Definition types.h:35
CODE(node, _)
Definition def.h:113
#define MIM_SUBST(m)
Definition tok.h:108
#define MIM_KEY(m)
Definition tok.h:13