Thorin 1.9.0
The Higher ORder INtermediate representation
Loading...
Searching...
No Matches
lexer.cpp
Go to the documentation of this file.
1#include "thorin/fe/lexer.h"
2
3#include "thorin/world.h"
4
5using namespace std::literals;
6
7namespace thorin {
8
9namespace utf8 = fe::utf8;
10using Tag = Tok::Tag;
11
12Lexer::Lexer(World& world, std::istream& istream, const fs::path* path /*= nullptr*/, std::ostream* md /*= nullptr*/)
13 : Super(istream, path)
14 , world_(world)
15 , md_(md) {
16#define CODE(t, str) keywords_[world.sym(str)] = Tag::t;
18#undef CODE
19
20#define CODE(str, t) \
21 if (Tag::t != Tag::Nil) keywords_[world.sym(str)] = Tag::t;
23#undef CODE
24
25 if (start_md())
26 emit_md(true);
27 else
28 md_fence();
29}
30
32 while (true) {
33 if (auto cache = cache_) {
34 cache_.reset();
35 return *cache;
36 }
37
38 start();
39
40 if (accept(utf8::EoF)) return tok(Tag::EoF);
41 if (accept(utf8::isspace)) continue;
42 if (accept(utf8::Null)) error(loc_, "invalid UTF-8 character");
43
44 // clang-format off
45 // delimiters
46 if (accept( '(')) return tok(Tag::D_paren_l);
47 if (accept( ')')) return tok(Tag::D_paren_r);
48 if (accept( '[')) return tok(Tag::D_brckt_l);
49 if (accept( ']')) return tok(Tag::D_brckt_r);
50 if (accept( '{')) return tok(Tag::D_brace_l);
51 if (accept( '}')) return tok(Tag::D_brace_r);
52 if (accept(U'«')) return tok(Tag::D_quote_l);
53 if (accept(U'»')) return tok(Tag::D_quote_r);
54 if (accept(U'⟪')) return tok(Tag::D_quote_l);
55 if (accept(U'⟫')) return tok(Tag::D_quote_r);
56 if (accept(U'‹')) return tok(Tag::D_angle_l);
57 if (accept(U'›')) return tok(Tag::D_angle_r);
58 if (accept(U'⟨')) return tok(Tag::D_angle_l);
59 if (accept(U'⟩')) return tok(Tag::D_angle_r);
60 if (accept( '<')) {
61 if (accept( '<')) return tok(Tag::D_quote_l);
62 return tok(Tag::D_angle_l);
63 }
64 if (accept( '>')) {
65 if (accept( '>')) return tok(Tag::D_quote_r);
66 return tok(Tag::D_angle_r);
67 }
68 // further tokens
69 if (accept('`')) return tok(Tag::T_backtick);
70 if (accept(U'→')) return tok(Tag::T_arrow);
71 if (accept( '@')) return tok(Tag::T_at);
72 if (accept( '=')) return tok(Tag::T_assign);
73 if (accept( '!')) return tok(Tag::T_bang);
74 if (accept(U'⊥')) return tok(Tag::T_bot);
75 if (accept(U'⊤')) return tok(Tag::T_top);
76 if (accept(U'□')) return tok(Tag::T_box);
77 if (accept( ',')) return tok(Tag::T_comma);
78 if (accept( '$')) return tok(Tag::T_dollar);
79 if (accept( '#')) return tok(Tag::T_extract);
80 if (accept(U'λ')) return tok(Tag::T_lm);
81 if (accept(U'Π')) return tok(Tag::T_Pi);
82 if (accept( ';')) return tok(Tag::T_semicolon);
83 if (accept(U'★')) return tok(Tag::T_star);
84 if (accept( '*')) return tok(Tag::T_star);
85 if (accept( ':')) {
86 if (accept( ':')) return tok(Tag::T_colon_colon);
87 return tok(Tag::T_colon);
88 }
89 if (accept( '|')) {
90 if (accept('~')) {
91 if (accept('|')) return tok(Tag::T_Pi);
92 }
93 error(loc_, "invalid input char '{}'; maybe you wanted to use '|~|'?", str_);
94 continue;
95 }
96 // clang-format on
97
98 if (accept('%')) {
99 if (lex_id()) {
100 auto loc = cache_trailing_dot();
101 return {loc, Tag::M_anx, sym()};
102 }
103 error(loc_, "invalid axiom name '{}'", str_);
104 }
105
106 if (accept('.')) {
107 if (lex_id()) {
108 if (auto i = keywords_.find(sym()); i != keywords_.end()) return tok(i->second);
109 // Split non-keyword into T_dot and M_id; M_id goes into cache_ for next lex().
110 assert(!cache_.has_value());
111 auto id_loc = loc();
112 ++id_loc.begin.col;
113 cache_.emplace(id_loc, Tag::M_id, world().sym(str_.substr(1)));
114 return {loc().anew_begin(), Tag::T_dot};
115 }
116
117 if (accept(utf8::isdigit)) {
118 parse_digits();
119 parse_exp();
120 return {loc_, f64(std::strtod(str_.c_str(), nullptr))};
121 }
122
123 return tok(Tag::T_dot);
124 }
125
126 if (accept('\'')) {
127 auto c = lex_char();
128 if (accept('\'')) return {loc(), c};
129 error(loc_, "invalid character literal {}", str_);
130 continue;
131 }
132
133 if (accept<Append::Off>('\"')) {
134 while (lex_char() != '"') {}
135 str_.pop_back(); // remove final '"'
136 return {loc_, Tag::M_str, sym()};
137 }
138
139 if (lex_id()) {
140 auto loc = cache_trailing_dot();
141 return {loc, Tag::M_id, sym()};
142 }
143
144 if (utf8::isdigit(ahead()) || utf8::any('+', '-')(ahead())) {
145 if (auto lit = parse_lit()) return *lit;
146 continue;
147 }
148
149 if (start_md()) {
150 emit_md();
151 continue;
152 }
153
154 // comments
155 if (accept('/')) {
156 if (accept('*')) {
157 eat_comments();
158 continue;
159 }
160 if (accept('/')) {
161 while (ahead() != utf8::EoF && ahead() != '\n') next();
162 continue;
163 }
164
165 error({loc_.path, peek_}, "invalid input char '/'; maybe you wanted to start a comment?");
166 continue;
167 }
168
169 error({loc_.path, peek_}, "invalid input char '{}'", utf8::Char32(ahead()));
170 next();
171 }
172}
173
174// A trailing T_dot does not belong to an annex name or identifier and goes into cache_ for next lex().
175Loc Lexer::cache_trailing_dot() {
176 auto l = loc();
177 if (str_.back() == '.') {
178 str_.pop_back();
179 assert(!cache_.has_value());
180 cache_.emplace(l.anew_finis(), Tag::T_dot);
181 --l.finis.col;
182 }
183 return l;
184}
185
186bool Lexer::lex_id() {
187 if (accept([](char32_t c) { return c == '_' || utf8::isalpha(c); })) {
188 while (accept([](char32_t c) { return c == '_' || c == '.' || utf8::isalnum(c); })) {}
189 return true;
190 }
191 return false;
192}
193
194// clang-format off
195std::optional<Tok> Lexer::parse_lit() {
196 int base = 10;
197 std::optional<bool> sign;
198
199 if (accept<Append::Off>('+')) {
200 sign = false;
201 } else if (accept<Append::Off>('-')) {
202 if (accept('>')) return tok(Tag::T_arrow);
203 sign = true;
204 }
205
206 // prefix starting with '0'
207 if (accept<Append::Off>('0')) {
208 if (accept<Append::Off>('b')) base = 2;
209 else if (accept<Append::Off>('B')) base = 2;
210 else if (accept<Append::Off>('o')) base = 8;
211 else if (accept<Append::Off>('O')) base = 8;
212 else if (accept<Append::Off>('x')) base = 16;
213 else if (accept<Append::Off>('X')) base = 16;
214 }
215
216 parse_digits(base);
217
218 if (accept<Append::Off>('I')) {
219 if (sign) str_.insert(0, "-"sv);
220 auto val = std::strtoull(str_.c_str(), nullptr, base);
221 str_.clear();
222 parse_digits();
223 auto width = std::strtoull(str_.c_str(), nullptr, 10);
224 return Tok{loc_, world().lit_int(width, val)};
225 }
226
227 if (!sign && base == 10) {
228 if (utf8::isrange(ahead(), U'₀', U'₉')) {
229 auto i = std::strtoull(str_.c_str(), nullptr, 10);
230 std::string mod;
231 while (utf8::isrange(ahead(), U'₀', U'₉')) mod += next() - U'₀' + '0';
232 auto m = std::strtoull(mod.c_str(), nullptr, 10);
233 return Tok{loc_, world().lit_idx_mod(m, i)};
234 } else if (accept<Append::Off>('_')) {
235 auto i = std::strtoull(str_.c_str(), nullptr, 10);
236 str_.clear();
237 if (accept(utf8::isdigit)) {
238 parse_digits(10);
239 auto m = std::strtoull(str_.c_str(), nullptr, 10);
240 return Tok{loc_, world().lit_idx_mod(m, i)};
241 } else {
242 error(loc_, "stray underscore in unsigned literal");
243 auto i = std::strtoull(str_.c_str(), nullptr, 10);
244 return Tok{loc_, u64(i)};
245 }
246 }
247 }
248
249 bool is_float = false;
250 if (base == 10 || base == 16) {
251 // parse fractional part
252 if (accept('.')) {
253 is_float = true;
254 parse_digits(base);
255 }
256
257 bool has_exp = parse_exp(base);
258 if (base == 16 && is_float && !has_exp) error(loc_, "hexadecimal floating constants require an exponent");
259 is_float |= has_exp;
260 }
261
262 if (sign && str_.empty()) {
263 error(loc_, "stray '{}'", *sign ? "-" : "+");
264 return {};
265 }
266
267 if (is_float && base == 16) str_.insert(0, "0x"sv);
268 if (sign && *sign) str_.insert(0, "-"sv);
269
270 if (is_float) return Tok{loc_, f64(std::strtod (str_.c_str(), nullptr ))};
271 if (sign) return Tok{loc_, u64(std::strtoll (str_.c_str(), nullptr, base))};
272 else return Tok{loc_, u64(std::strtoull(str_.c_str(), nullptr, base))};
273}
274
275void Lexer::parse_digits(int base /*= 10*/) {
276 switch (base) {
277 // clang-format off
278 case 2: while (accept(utf8::isbdigit)) {} break;
279 case 8: while (accept(utf8::isodigit)) {} break;
280 case 10: while (accept(utf8::isdigit)) {} break;
281 case 16: while (accept(utf8::isxdigit)) {} break;
282 // clang-format on
283 default: fe::unreachable();
284 }
285}
286
287bool Lexer::parse_exp(int base /*= 10*/) {
288 if (accept(base == 10 ? utf8::any('e', 'E') : utf8::any('p', 'P'))) {
289 accept(utf8::any('+', '-'));
290 if (!utf8::isdigit(ahead())) error(loc_, "exponent has no digits");
291 parse_digits();
292 return true;
293 }
294 return false;
295}
296// clang-format on
297
298char8_t Lexer::lex_char() {
299 if (accept<Append::Off>('\\')) {
300 // clang-format off
301 if (false) {}
302 else if (accept<Append::Off>('\'')) str_ += '\'';
303 else if (accept<Append::Off>('\\')) str_ += '\\';
304 else if (accept<Append::Off>( '"')) str_ += '\"';
305 else if (accept<Append::Off>( '0')) str_ += '\0';
306 else if (accept<Append::Off>( 'a')) str_ += '\a';
307 else if (accept<Append::Off>( 'b')) str_ += '\b';
308 else if (accept<Append::Off>( 'f')) str_ += '\f';
309 else if (accept<Append::Off>( 'n')) str_ += '\n';
310 else if (accept<Append::Off>( 'r')) str_ += '\r';
311 else if (accept<Append::Off>( 't')) str_ += '\t';
312 else if (accept<Append::Off>( 'v')) str_ += '\v';
313 else error(loc_.anew_finis(), "invalid escape character '\\{}'", (char)ahead());
314 // clang-format on
315 return str_.back();
316 }
317 auto c = next();
318 str_ += c;
319 if (utf8::isascii(c)) return c;
320 error(loc_, "invalid character '{}'", (char)c);
321}
322
323void Lexer::eat_comments() {
324 while (true) {
325 while (ahead() != utf8::EoF && ahead() != '*') next();
326 if (accept(utf8::EoF)) {
327 error(loc_, "non-terminated multiline comment");
328 return;
329 }
330 next();
331 if (accept('/')) break;
332 }
333}
334
335void Lexer::emit_md(bool start_of_file) {
336 if (!start_of_file) md_fence();
337
338 do {
339 out_ = false;
340 for (int i = 0; i < 3; ++i) next();
341 accept(' ');
342 out_ = true;
343
344 while (ahead() != utf8::EoF && ahead() != '\n') next();
345 accept('\n');
346 } while (start_md());
347
348 if (ahead() == utf8::EoF)
349 out_ = false;
350 else
351 md_fence();
352}
353
354Sym Lexer::sym() { return world().sym(str_); }
355
356} // namespace thorin
Loc loc() const
Definition lexer.h:23
Tok lex()
Definition lexer.cpp:31
Lexer(World &world, std::istream &istream, const fs::path *path=nullptr, std::ostream *md=nullptr)
Creates a lexer to read Thorin files (see Lexical Structure).
Definition lexer.cpp:12
World & world()
Definition lexer.h:21
The World represents the whole program and manages creation of Thorin nodes (Defs).
Definition world.h:35
Sym sym(std::string_view)
Definition world.cpp:77
const Lit * lit_int(nat_t width, u64 val)
Constructs a Lit of type Idx of size $2^{width}$.
Definition world.h:389
const Lit * lit_idx_mod(nat_t mod, u64 val)
Constructs a Lit of type Idx of size mod.
Definition world.h:402
#define CODE(node, name)
Definition def.h:40
Definition cfg.h:11
void error(const Def *def, const char *fmt, Args &&... args)
Definition def.h:622
uint64_t u64
Definition types.h:35
double f64
Definition types.h:42
#define THORIN_SUBST(m)
Definition tok.h:98
#define THORIN_KEY(m)
Definition tok.h:14