5using namespace std::literals;
9namespace utf8 = fe::utf8;
13 : Super(istream,
path)
16#define CODE(t, str) keywords_[ast.sym(str)] = Tag::t;
21 if (Tag::t != Tag::Nil) keywords_[ast.sym(str)] = Tag::t;
35 if (accept(utf8::EoF))
return tok(Tag::EoF);
36 if (accept(utf8::isspace))
continue;
37 if (accept(utf8::Null)) {
38 ast().
error(loc_,
"invalid UTF-8 character");
44 if (accept(
'('))
return tok(Tag::D_paren_l);
45 if (accept(
')'))
return tok(Tag::D_paren_r);
46 if (accept(
'['))
return tok(Tag::D_brckt_l);
47 if (accept(
']'))
return tok(Tag::D_brckt_r);
48 if (accept(
'{'))
return tok(Tag::D_brace_l);
49 if (accept(
'}'))
return tok(Tag::D_brace_r);
50 if (accept(U
'⦃'))
return tok(Tag::D_curly_l);
51 if (accept(U
'⦄'))
return tok(Tag::D_curly_r);
52 if (accept(U
'«'))
return tok(Tag::D_quote_l);
53 if (accept(U
'»'))
return tok(Tag::D_quote_r);
54 if (accept(U
'⟪'))
return tok(Tag::D_quote_l);
55 if (accept(U
'⟫'))
return tok(Tag::D_quote_r);
56 if (accept(U
'‹'))
return tok(Tag::D_angle_l);
57 if (accept(U
'›'))
return tok(Tag::D_angle_r);
58 if (accept(U
'⟨'))
return tok(Tag::D_angle_l);
59 if (accept(U
'⟩'))
return tok(Tag::D_angle_r);
61 if (accept(
'<'))
return tok(Tag::D_quote_l);
62 return tok(Tag::D_angle_l);
65 if (accept(
'>'))
return tok(Tag::D_quote_r);
66 return tok(Tag::D_angle_r);
69 if (accept(U
'→'))
return tok(Tag::T_arrow);
70 if (accept(
'@'))
return tok(Tag::T_at);
72 if (accept(
'>'))
return tok(Tag::T_fat_arrow);
73 return tok(Tag::T_assign);
75 if (accept(U
'⊥'))
return tok(Tag::T_bot);
76 if (accept(U
'⊤'))
return tok(Tag::T_top);
77 if (accept(U
'□'))
return tok(Tag::T_box);
78 if (accept(
','))
return tok(Tag::T_comma);
79 if (accept(
'$'))
return tok(Tag::T_dollar);
80 if (accept(
'#'))
return tok(Tag::T_extract);
81 if (accept(U
'λ'))
return tok(Tag::T_lm);
82 if (accept(
';'))
return tok(Tag::T_semicolon);
83 if (accept(U
'★'))
return tok(Tag::T_star);
84 if (accept(
'*'))
return tok(Tag::T_star);
85 if (accept(
':'))
return tok(Tag::T_colon);
86 if (accept(U
'∪'))
return tok(Tag::T_union);
90 if (lex_id())
return {loc_, Tag::M_anx, sym()};
91 ast().
error(loc_,
"invalid axm name '{}'", str_);
96 if (accept(utf8::isdigit)) {
99 return {loc_,
f64(std::strtod(str_.c_str(),
nullptr))};
102 return tok(Tag::T_dot);
107 if (accept(
'\''))
return {loc_, c};
108 ast().
error(loc_,
"invalid character literal {}", str_);
112 if (accept<Append::Off>(
'\"')) {
113 while (lex_char() !=
'"') {}
115 return {loc_, Tag::L_str, sym()};
119 if (
auto i = keywords_.find(sym()); i != keywords_.end())
return tok(i->second);
120 return {loc_, Tag::M_id, sym()};
123 if (utf8::isdigit(ahead()) || utf8::any(
'+',
'-')(ahead())) {
124 if (
auto lit = parse_lit())
return *lit;
140 while (ahead() != utf8::EoF && ahead() !=
'\n') next();
144 ast().
error({loc_.path, peek_},
"invalid input char '/'; maybe you wanted to start a comment?");
148 ast().
error({loc_.path, peek_},
"invalid input char '{}'", utf8::Char32(ahead()));
153bool Lexer::lex_id() {
154 if (accept([](
char32_t c) {
return c ==
'_' || utf8::isalpha(c); })) {
155 while (accept([](
char32_t c) {
return c ==
'_' || c ==
'.' || utf8::isalnum(c); })) {}
162std::optional<Tok> Lexer::parse_lit() {
164 std::optional<bool> sign;
166 if (accept<Append::Off>(
'+')) {
168 }
else if (accept<Append::Off>(
'-')) {
169 if (accept(
'>'))
return tok(Tag::T_arrow);
174 if (accept<Append::Off>(
'0')) {
175 if (accept<Append::Off>(
'b')) base = 2;
176 else if (accept<Append::Off>(
'B')) base = 2;
177 else if (accept<Append::Off>(
'o')) base = 8;
178 else if (accept<Append::Off>(
'O')) base = 8;
179 else if (accept<Append::Off>(
'x')) base = 16;
180 else if (accept<Append::Off>(
'X')) base = 16;
185 if (accept(utf8::any(
'i',
'I'))) {
186 if (sign) str_.insert(0,
"-"sv);
187 auto val = std::strtoull(str_.c_str(),
nullptr, base);
190 auto width = std::strtoull(str_.c_str(),
nullptr, 10);
194 if (!sign && base == 10) {
195 if (utf8::isrange(ahead(), U
'₀', U
'₉')) {
196 auto i = std::strtoull(str_.c_str(),
nullptr, 10);
198 while (utf8::isrange(ahead(), U
'₀', U
'₉')) mod += next() - U
'₀' +
'0';
199 auto m = std::strtoull(mod.c_str(),
nullptr, 10);
201 }
else if (accept<Append::Off>(
'_')) {
202 auto i = std::strtoull(str_.c_str(),
nullptr, 10);
204 if (accept(utf8::isdigit)) {
206 auto m = std::strtoull(str_.c_str(),
nullptr, 10);
209 ast().
error(loc_,
"stray underscore in Idx literal; size is missing");
210 auto i = std::strtoull(str_.c_str(),
nullptr, 10);
211 return Tok{loc_,
u64(i)};
216 bool is_float =
false;
217 if (base == 10 || base == 16) {
224 bool has_exp = parse_exp(base);
225 if (base == 16 && is_float && !has_exp)
ast().
error(loc_,
"hexadecimal floating constants require an exponent");
229 if (sign && str_.empty()) {
230 ast().
error(loc_,
"stray '{}'", *sign ?
"-" :
"+");
234 if (is_float && base == 16) str_.insert(0,
"0x"sv);
235 if (sign && *sign) str_.insert(0,
"-"sv);
237 if (is_float)
return Tok{loc_,
f64(std::strtod (str_.c_str(),
nullptr ))};
238 if (sign)
return Tok{loc_,
u64(std::strtoll (str_.c_str(),
nullptr, base))};
239 else return Tok{loc_,
u64(std::strtoull(str_.c_str(),
nullptr, base))};
242void Lexer::parse_digits(
int base ) {
245 case 2:
while (accept(utf8::isbdigit)) {}
break;
246 case 8:
while (accept(utf8::isodigit)) {}
break;
247 case 10:
while (accept(utf8::isdigit)) {}
break;
248 case 16:
while (accept(utf8::isxdigit)) {}
break;
250 default: fe::unreachable();
254bool Lexer::parse_exp(
int base ) {
255 if (accept(base == 10 ? utf8::any(
'e',
'E') : utf8::any(
'p',
'P'))) {
256 accept(utf8::any(
'+',
'-'));
257 if (!utf8::isdigit(ahead()))
ast().
error(loc_,
"exponent has no digits");
265char8_t Lexer::lex_char() {
266 if (accept<Append::Off>(
'\\')) {
269 else if (accept<Append::Off>(
'\'')) str_ +=
'\'';
270 else if (accept<Append::Off>(
'\\')) str_ +=
'\\';
271 else if (accept<Append::Off>(
'"')) str_ +=
'\"';
272 else if (accept<Append::Off>(
'0')) str_ +=
'\0';
273 else if (accept<Append::Off>(
'a')) str_ +=
'\a';
274 else if (accept<Append::Off>(
'b')) str_ +=
'\b';
275 else if (accept<Append::Off>(
'f')) str_ +=
'\f';
276 else if (accept<Append::Off>(
'n')) str_ +=
'\n';
277 else if (accept<Append::Off>(
'r')) str_ +=
'\r';
278 else if (accept<Append::Off>(
't')) str_ +=
'\t';
279 else if (accept<Append::Off>(
'v')) str_ +=
'\v';
280 else ast().
error(loc_.anew_finis(),
"invalid escape character '\\{}'", (
char)ahead());
286 if (utf8::isascii(c))
return c;
287 ast().
error(loc_,
"invalid character '{}'", (
char)c);
291void Lexer::eat_comments() {
293 while (ahead() != utf8::EoF && ahead() !=
'*') next();
294 if (accept(utf8::EoF)) {
295 ast().
error(loc_,
"non-terminated multiline comment");
299 if (accept(
'/'))
break;
303void Lexer::emit_md(
bool start_of_file) {
304 if (!start_of_file) md_fence();
308 for (
int i = 0; i < 3; ++i) next();
312 while (ahead() != utf8::EoF && ahead() !=
'\n') next();
314 }
while (start_md());
316 if (ahead() == utf8::EoF)
322Sym Lexer::sym() {
return ast().
sym(str_); }
const Lit * lit_idx_mod(nat_t mod, u64 val)
Constructs a Lit of type Idx of size mod.
const Lit * lit_int(nat_t width, u64 val)
Constructs a Lit of type Idx of size 2^width.
Lexer(AST &, std::istream &, const fs::path *path=nullptr, std::ostream *md=nullptr)
Creates a lexer to read *.mim files (see Lexical Structure).
const fs::path * path() const