5using namespace std::literals;
9namespace utf8 = fe::utf8;
13 : Super(istream, path)
16#define CODE(t, str) keywords_[world.sym(str)] = Tag::t;
21 if (Tag::t != Tag::Nil) keywords_[world.sym(str)] = Tag::t;
33 if (
auto cache = cache_) {
40 if (accept(utf8::EoF))
return tok(Tag::EoF);
41 if (accept(utf8::isspace))
continue;
42 if (accept(utf8::Null))
error(loc_,
"invalid UTF-8 character");
46 if (accept(
'('))
return tok(Tag::D_paren_l);
47 if (accept(
')'))
return tok(Tag::D_paren_r);
48 if (accept(
'['))
return tok(Tag::D_brckt_l);
49 if (accept(
']'))
return tok(Tag::D_brckt_r);
50 if (accept(
'{'))
return tok(Tag::D_brace_l);
51 if (accept(
'}'))
return tok(Tag::D_brace_r);
52 if (accept(U
'«'))
return tok(Tag::D_quote_l);
53 if (accept(U
'»'))
return tok(Tag::D_quote_r);
54 if (accept(U
'⟪'))
return tok(Tag::D_quote_l);
55 if (accept(U
'⟫'))
return tok(Tag::D_quote_r);
56 if (accept(U
'‹'))
return tok(Tag::D_angle_l);
57 if (accept(U
'›'))
return tok(Tag::D_angle_r);
58 if (accept(U
'⟨'))
return tok(Tag::D_angle_l);
59 if (accept(U
'⟩'))
return tok(Tag::D_angle_r);
61 if (accept(
'<'))
return tok(Tag::D_quote_l);
62 return tok(Tag::D_angle_l);
65 if (accept(
'>'))
return tok(Tag::D_quote_r);
66 return tok(Tag::D_angle_r);
69 if (accept(
'`'))
return tok(Tag::T_backtick);
70 if (accept(U
'→'))
return tok(Tag::T_arrow);
71 if (accept(
'@'))
return tok(Tag::T_at);
72 if (accept(
'='))
return tok(Tag::T_assign);
73 if (accept(
'!'))
return tok(Tag::T_bang);
74 if (accept(U
'⊥'))
return tok(Tag::T_bot);
75 if (accept(U
'⊤'))
return tok(Tag::T_top);
76 if (accept(U
'□'))
return tok(Tag::T_box);
77 if (accept(
','))
return tok(Tag::T_comma);
78 if (accept(
'$'))
return tok(Tag::T_dollar);
79 if (accept(
'#'))
return tok(Tag::T_extract);
80 if (accept(U
'λ'))
return tok(Tag::T_lm);
81 if (accept(U
'Π'))
return tok(Tag::T_Pi);
82 if (accept(
';'))
return tok(Tag::T_semicolon);
83 if (accept(U
'★'))
return tok(Tag::T_star);
84 if (accept(
'*'))
return tok(Tag::T_star);
86 if (accept(
':'))
return tok(Tag::T_colon_colon);
87 return tok(Tag::T_colon);
91 if (accept(
'|'))
return tok(Tag::T_Pi);
93 error(loc_,
"invalid input char '{}'; maybe you wanted to use '|~|'?", str_);
100 auto loc = cache_trailing_dot();
101 return {
loc, Tag::M_anx, sym()};
103 error(loc_,
"invalid axiom name '{}'", str_);
108 if (
auto i = keywords_.find(sym()); i != keywords_.end())
return tok(i->second);
110 assert(!cache_.has_value());
113 cache_.emplace(id_loc, Tag::M_id,
world().sym(str_.substr(1)));
114 return {
loc().anew_begin(), Tag::T_dot};
117 if (accept(utf8::isdigit)) {
120 return {loc_,
f64(std::strtod(str_.c_str(),
nullptr))};
123 return tok(Tag::T_dot);
128 if (accept(
'\''))
return {
loc(), c};
129 error(loc_,
"invalid character literal {}", str_);
133 if (accept<Append::Off>(
'\"')) {
134 while (lex_char() !=
'"') {}
136 return {loc_, Tag::M_str, sym()};
140 auto loc = cache_trailing_dot();
141 return {
loc, Tag::M_id, sym()};
144 if (utf8::isdigit(ahead()) || utf8::any(
'+',
'-')(ahead())) {
145 if (
auto lit = parse_lit())
return *lit;
161 while (ahead() != utf8::EoF && ahead() !=
'\n') next();
165 error({loc_.path, peek_},
"invalid input char '/'; maybe you wanted to start a comment?");
169 error({loc_.path, peek_},
"invalid input char '{}'", utf8::Char32(ahead()));
175Loc Lexer::cache_trailing_dot() {
177 if (str_.back() ==
'.') {
179 assert(!cache_.has_value());
180 cache_.emplace(l.anew_finis(), Tag::T_dot);
186bool Lexer::lex_id() {
187 if (accept([](
char32_t c) {
return c ==
'_' || utf8::isalpha(c); })) {
188 while (accept([](
char32_t c) {
return c ==
'_' ||
c ==
'.' || utf8::isalnum(c); })) {}
195std::optional<Tok> Lexer::parse_lit() {
197 std::optional<bool> sign;
199 if (accept<Append::Off>(
'+')) {
201 }
else if (accept<Append::Off>(
'-')) {
202 if (accept(
'>'))
return tok(Tag::T_arrow);
207 if (accept<Append::Off>(
'0')) {
208 if (accept<Append::Off>(
'b')) base = 2;
209 else if (accept<Append::Off>(
'B')) base = 2;
210 else if (accept<Append::Off>(
'o')) base = 8;
211 else if (accept<Append::Off>(
'O')) base = 8;
212 else if (accept<Append::Off>(
'x')) base = 16;
213 else if (accept<Append::Off>(
'X')) base = 16;
218 if (accept<Append::Off>(
'I')) {
219 if (sign) str_.insert(0,
"-"sv);
220 auto val = std::strtoull(str_.c_str(),
nullptr, base);
223 auto width = std::strtoull(str_.c_str(),
nullptr, 10);
227 if (!sign && base == 10) {
228 if (utf8::isrange(ahead(), U
'₀', U
'₉')) {
229 auto i = std::strtoull(str_.c_str(),
nullptr, 10);
231 while (utf8::isrange(ahead(), U
'₀', U
'₉')) mod += next() - U
'₀' +
'0';
232 auto m = std::strtoull(mod.c_str(),
nullptr, 10);
234 }
else if (accept<Append::Off>(
'_')) {
235 auto i = std::strtoull(str_.c_str(),
nullptr, 10);
237 if (accept(utf8::isdigit)) {
239 auto m = std::strtoull(str_.c_str(),
nullptr, 10);
242 error(loc_,
"stray underscore in unsigned literal");
243 auto i = std::strtoull(str_.c_str(),
nullptr, 10);
244 return Tok{loc_,
u64(i)};
249 bool is_float =
false;
250 if (base == 10 || base == 16) {
257 bool has_exp = parse_exp(base);
258 if (base == 16 && is_float && !has_exp)
error(loc_,
"hexadecimal floating constants require an exponent");
262 if (sign && str_.empty()) {
263 error(loc_,
"stray '{}'", *sign ?
"-" :
"+");
267 if (is_float && base == 16) str_.insert(0,
"0x"sv);
268 if (sign && *sign) str_.insert(0,
"-"sv);
270 if (is_float)
return Tok{loc_,
f64(std::strtod (str_.c_str(),
nullptr ))};
271 if (sign)
return Tok{loc_,
u64(std::strtoll (str_.c_str(),
nullptr, base))};
272 else return Tok{loc_,
u64(std::strtoull(str_.c_str(),
nullptr, base))};
275void Lexer::parse_digits(
int base ) {
278 case 2:
while (accept(utf8::isbdigit)) {}
break;
279 case 8:
while (accept(utf8::isodigit)) {}
break;
280 case 10:
while (accept(utf8::isdigit)) {}
break;
281 case 16:
while (accept(utf8::isxdigit)) {}
break;
283 default: fe::unreachable();
287bool Lexer::parse_exp(
int base ) {
288 if (accept(base == 10 ? utf8::any(
'e',
'E') : utf8::
any(
'p',
'P'))) {
289 accept(utf8::any(
'+',
'-'));
290 if (!utf8::isdigit(ahead()))
error(loc_,
"exponent has no digits");
298char8_t Lexer::lex_char() {
299 if (accept<Append::Off>(
'\\')) {
302 else if (accept<Append::Off>(
'\'')) str_ +=
'\'';
303 else if (accept<Append::Off>(
'\\')) str_ +=
'\\';
304 else if (accept<Append::Off>(
'"')) str_ +=
'\"';
305 else if (accept<Append::Off>(
'0')) str_ +=
'\0';
306 else if (accept<Append::Off>(
'a')) str_ +=
'\a';
307 else if (accept<Append::Off>(
'b')) str_ +=
'\b';
308 else if (accept<Append::Off>(
'f')) str_ +=
'\f';
309 else if (accept<Append::Off>(
'n')) str_ +=
'\n';
310 else if (accept<Append::Off>(
'r')) str_ +=
'\r';
311 else if (accept<Append::Off>(
't')) str_ +=
'\t';
312 else if (accept<Append::Off>(
'v')) str_ +=
'\v';
313 else error(loc_.anew_finis(),
"invalid escape character '\\{}'", (
char)ahead());
319 if (utf8::isascii(c))
return c;
320 error(loc_,
"invalid character '{}'", (
char)c);
323void Lexer::eat_comments() {
325 while (ahead() != utf8::EoF && ahead() !=
'*') next();
326 if (accept(utf8::EoF)) {
327 error(loc_,
"non-terminated multiline comment");
331 if (accept(
'/'))
break;
335void Lexer::emit_md(
bool start_of_file) {
336 if (!start_of_file) md_fence();
340 for (
int i = 0; i < 3; ++i) next();
344 while (ahead() != utf8::EoF && ahead() !=
'\n') next();
346 }
while (start_md());
348 if (ahead() == utf8::EoF)
354Sym Lexer::sym() {
return world().
sym(str_); }
Lexer(World &world, std::istream &istream, const fs::path *path=nullptr, std::ostream *md=nullptr)
Creates a lexer to read Thorin files (see Lexical Structure).
The World represents the whole program and manages creation of Thorin nodes (Defs).
Sym sym(std::string_view)
const Lit * lit_int(nat_t width, u64 val)
Constructs a Lit of type Idx of size $2^width$.
const Lit * lit_idx_mod(nat_t mod, u64 val)
Constructs a Lit of type Idx of size mod.
void error(const Def *def, const char *fmt, Args &&... args)