5using namespace std::literals;
9namespace utf8 = fe::utf8;
12Lexer::Lexer(
AST& ast, std::istream& istream,
const fs::path* path , std::ostream* md )
13 : Super(istream, path)
16#define CODE(t, str) keywords_[ast.sym(str)] = Tag::t;
21 if (Tag::t != Tag::Nil) keywords_[ast.sym(str)] = Tag::t;
35 if (accept(utf8::EoF))
return tok(Tag::EoF);
36 if (accept(utf8::isspace))
continue;
37 if (accept(utf8::Null)) {
38 ast().
error(loc_,
"invalid UTF-8 character");
44 if (accept(
'('))
return tok(Tag::D_paren_l);
45 if (accept(
')'))
return tok(Tag::D_paren_r);
46 if (accept(
'['))
return tok(Tag::D_brckt_l);
47 if (accept(
']'))
return tok(Tag::D_brckt_r);
48 if (accept(
'{'))
return tok(Tag::D_brace_l);
49 if (accept(
'}'))
return tok(Tag::D_brace_r);
50 if (accept(U
'⦃'))
return tok(Tag::D_curly_l);
51 if (accept(U
'⦄'))
return tok(Tag::D_curly_r);
52 if (accept(U
'«'))
return tok(Tag::D_quote_l);
53 if (accept(U
'»'))
return tok(Tag::D_quote_r);
54 if (accept(U
'⟪'))
return tok(Tag::D_quote_l);
55 if (accept(U
'⟫'))
return tok(Tag::D_quote_r);
56 if (accept(U
'‹'))
return tok(Tag::D_angle_l);
57 if (accept(U
'›'))
return tok(Tag::D_angle_r);
58 if (accept(U
'⟨'))
return tok(Tag::D_angle_l);
59 if (accept(U
'⟩'))
return tok(Tag::D_angle_r);
61 if (accept(
'<'))
return tok(Tag::D_quote_l);
62 return tok(Tag::D_angle_l);
65 if (accept(
'>'))
return tok(Tag::D_quote_r);
66 return tok(Tag::D_angle_r);
69 if (accept(U
'→'))
return tok(Tag::T_arrow);
70 if (accept(
'@'))
return tok(Tag::T_at);
71 if (accept(
'='))
return tok(Tag::T_assign);
72 if (accept(U
'⊥'))
return tok(Tag::T_bot);
73 if (accept(U
'⊤'))
return tok(Tag::T_top);
74 if (accept(U
'□'))
return tok(Tag::T_box);
75 if (accept(
','))
return tok(Tag::T_comma);
76 if (accept(
'$'))
return tok(Tag::T_dollar);
77 if (accept(
'#'))
return tok(Tag::T_extract);
78 if (accept(U
'λ'))
return tok(Tag::T_lm);
79 if (accept(
';'))
return tok(Tag::T_semicolon);
80 if (accept(U
'★'))
return tok(Tag::T_star);
81 if (accept(
'*'))
return tok(Tag::T_star);
82 if (accept(
':'))
return tok(Tag::T_colon);
86 if (lex_id())
return {loc_, Tag::M_anx, sym()};
87 ast().
error(loc_,
"invalid axiom name '{}'", str_);
92 if (accept(utf8::isdigit)) {
95 return {loc_,
f64(std::strtod(str_.c_str(),
nullptr))};
98 return tok(Tag::T_dot);
103 if (accept(
'\''))
return {loc_, c};
104 ast().
error(loc_,
"invalid character literal {}", str_);
108 if (accept<Append::Off>(
'\"')) {
109 while (lex_char() !=
'"') {}
111 return {loc_, Tag::L_str, sym()};
115 if (
auto i = keywords_.find(sym()); i != keywords_.end())
return tok(i->second);
116 return {loc_, Tag::M_id, sym()};
119 if (utf8::isdigit(ahead()) || utf8::any(
'+',
'-')(ahead())) {
120 if (
auto lit = parse_lit())
return *lit;
136 while (ahead() != utf8::EoF && ahead() !=
'\n') next();
140 ast().
error({loc_.path, peek_},
"invalid input char '/'; maybe you wanted to start a comment?");
144 ast().
error({loc_.path, peek_},
"invalid input char '{}'", utf8::Char32(ahead()));
149bool Lexer::lex_id() {
150 if (accept([](
char32_t c) {
return c ==
'_' || utf8::isalpha(c); })) {
151 while (accept([](
char32_t c) {
return c ==
'_' || c ==
'.' || utf8::isalnum(c); })) {}
158std::optional<Tok> Lexer::parse_lit() {
160 std::optional<bool> sign;
162 if (accept<Append::Off>(
'+')) {
164 }
else if (accept<Append::Off>(
'-')) {
165 if (accept(
'>'))
return tok(Tag::T_arrow);
170 if (accept<Append::Off>(
'0')) {
171 if (accept<Append::Off>(
'b')) base = 2;
172 else if (accept<Append::Off>(
'B')) base = 2;
173 else if (accept<Append::Off>(
'o')) base = 8;
174 else if (accept<Append::Off>(
'O')) base = 8;
175 else if (accept<Append::Off>(
'x')) base = 16;
176 else if (accept<Append::Off>(
'X')) base = 16;
181 if (accept(utf8::any(
'i',
'I'))) {
182 if (sign) str_.insert(0,
"-"sv);
183 auto val = std::strtoull(str_.c_str(),
nullptr, base);
186 auto width = std::strtoull(str_.c_str(),
nullptr, 10);
190 if (!sign && base == 10) {
191 if (utf8::isrange(ahead(), U
'₀', U
'₉')) {
192 auto i = std::strtoull(str_.c_str(),
nullptr, 10);
194 while (utf8::isrange(ahead(), U
'₀', U
'₉')) mod += next() - U
'₀' +
'0';
195 auto m = std::strtoull(mod.c_str(),
nullptr, 10);
197 }
else if (accept<Append::Off>(
'_')) {
198 auto i = std::strtoull(str_.c_str(),
nullptr, 10);
200 if (accept(utf8::isdigit)) {
202 auto m = std::strtoull(str_.c_str(),
nullptr, 10);
205 ast().
error(loc_,
"stray underscore in Idx literal; size is missing");
206 auto i = std::strtoull(str_.c_str(),
nullptr, 10);
207 return Tok{loc_,
u64(i)};
212 bool is_float =
false;
213 if (base == 10 || base == 16) {
220 bool has_exp = parse_exp(base);
221 if (base == 16 && is_float && !has_exp)
ast().
error(loc_,
"hexadecimal floating constants require an exponent");
225 if (sign && str_.empty()) {
226 ast().
error(loc_,
"stray '{}'", *sign ?
"-" :
"+");
230 if (is_float && base == 16) str_.insert(0,
"0x"sv);
231 if (sign && *sign) str_.insert(0,
"-"sv);
233 if (is_float)
return Tok{loc_,
f64(std::strtod (str_.c_str(),
nullptr ))};
234 if (sign)
return Tok{loc_,
u64(std::strtoll (str_.c_str(),
nullptr, base))};
235 else return Tok{loc_,
u64(std::strtoull(str_.c_str(),
nullptr, base))};
238void Lexer::parse_digits(
int base ) {
241 case 2:
while (accept(utf8::isbdigit)) {}
break;
242 case 8:
while (accept(utf8::isodigit)) {}
break;
243 case 10:
while (accept(utf8::isdigit)) {}
break;
244 case 16:
while (accept(utf8::isxdigit)) {}
break;
246 default: fe::unreachable();
250bool Lexer::parse_exp(
int base ) {
251 if (accept(base == 10 ? utf8::any(
'e',
'E') : utf8::any(
'p',
'P'))) {
252 accept(utf8::any(
'+',
'-'));
253 if (!utf8::isdigit(ahead()))
ast().
error(loc_,
"exponent has no digits");
261char8_t Lexer::lex_char() {
262 if (accept<Append::Off>(
'\\')) {
265 else if (accept<Append::Off>(
'\'')) str_ +=
'\'';
266 else if (accept<Append::Off>(
'\\')) str_ +=
'\\';
267 else if (accept<Append::Off>(
'"')) str_ +=
'\"';
268 else if (accept<Append::Off>(
'0')) str_ +=
'\0';
269 else if (accept<Append::Off>(
'a')) str_ +=
'\a';
270 else if (accept<Append::Off>(
'b')) str_ +=
'\b';
271 else if (accept<Append::Off>(
'f')) str_ +=
'\f';
272 else if (accept<Append::Off>(
'n')) str_ +=
'\n';
273 else if (accept<Append::Off>(
'r')) str_ +=
'\r';
274 else if (accept<Append::Off>(
't')) str_ +=
'\t';
275 else if (accept<Append::Off>(
'v')) str_ +=
'\v';
276 else ast().
error(loc_.anew_finis(),
"invalid escape character '\\{}'", (
char)ahead());
282 if (utf8::isascii(c))
return c;
283 ast().
error(loc_,
"invalid character '{}'", (
char)c);
287void Lexer::eat_comments() {
289 while (ahead() != utf8::EoF && ahead() !=
'*') next();
290 if (accept(utf8::EoF)) {
291 ast().
error(loc_,
"non-terminated multiline comment");
295 if (accept(
'/'))
break;
299void Lexer::emit_md(
bool start_of_file) {
300 if (!start_of_file) md_fence();
304 for (
int i = 0; i < 3; ++i) next();
308 while (ahead() != utf8::EoF && ahead() !=
'\n') next();
310 }
while (start_md());
312 if (ahead() == utf8::EoF)
/// Returns the `Sym` obtained by passing the lexer's current text buffer `str_` to `ast().sym()`.
/// Within this file the result is used for the keyword-table lookup and as the payload of
/// identifier (`Tag::M_id`), annex (`Tag::M_anx`) and string-literal (`Tag::L_str`) tokens.
318Sym Lexer::sym() {
return ast().
sym(str_); }
const Lit * lit_idx_mod(nat_t mod, u64 val)
Constructs a Lit of type Idx of size mod.
const Lit * lit_int(nat_t width, u64 val)
Constructs a Lit of type Idx of size $2^{\text{width}}$.
Lexer(AST &, std::istream &, const fs::path *path=nullptr, std::ostream *md=nullptr)
Creates a lexer to read *.mim files (see Lexical Structure).