5using namespace std::literals;
9namespace utf8 = fe::utf8;
// Constructs a lexer over `istream`; `istream` and `path` are forwarded to the Super base.
// NOTE(review): this listing is fragmentary — how `md` is stored and where the macros below are
// expanded is not visible here.
12Lexer::Lexer(
AST& ast, std::istream& istream,
const fs::path* path , std::ostream* md )
13 : Super(istream, path)
// Registers the symbol that `ast` produces for `str` in `keywords_`, mapped to Tag::t.
16#define CODE(t, str) keywords_[ast.sym(str)] = Tag::t;
// Same registration, but guarded so that entries whose tag is Tag::Nil are not inserted.
21 if (Tag::t != Tag::Nil) keywords_[ast.sym(str)] = Tag::t;
// NOTE(review): fragment of what is presumably the main tokenizing routine; its signature, the
// enclosing loop header and several intermediate lines are not visible in this listing.
// End of input produces an EoF token.
35 if (accept(utf8::EoF))
return tok(Tag::EoF);
// Whitespace is consumed and the scan continues.
36 if (accept(utf8::isspace))
continue;
// A Null code point is reported as an invalid UTF-8 character.
37 if (accept(utf8::Null)) {
38 ast().
error(loc_,
"invalid UTF-8 character");
// ASCII delimiters: parentheses, brackets, braces.
44 if (accept(
'('))
return tok(Tag::D_paren_l);
45 if (accept(
')'))
return tok(Tag::D_paren_r);
46 if (accept(
'['))
return tok(Tag::D_brckt_l);
47 if (accept(
']'))
return tok(Tag::D_brckt_r);
48 if (accept(
'{'))
return tok(Tag::D_brace_l);
49 if (accept(
'}'))
return tok(Tag::D_brace_r);
// Unicode delimiters: two spellings each map to the same quote tag (« / ⟪, » / ⟫)
// and to the same angle tag (‹ / ⟨, › / ⟩).
50 if (accept(U
'«'))
return tok(Tag::D_quote_l);
51 if (accept(U
'»'))
return tok(Tag::D_quote_r);
52 if (accept(U
'⟪'))
return tok(Tag::D_quote_l);
53 if (accept(U
'⟫'))
return tok(Tag::D_quote_r);
54 if (accept(U
'‹'))
return tok(Tag::D_angle_l);
55 if (accept(U
'›'))
return tok(Tag::D_angle_r);
56 if (accept(U
'⟨'))
return tok(Tag::D_angle_l);
57 if (accept(U
'⟩'))
return tok(Tag::D_angle_r);
// A following '<' yields D_quote_l, otherwise D_angle_l
// (presumably reached after a first '<' was accepted on an omitted line — TODO confirm).
59 if (accept(
'<'))
return tok(Tag::D_quote_l);
60 return tok(Tag::D_angle_l);
// Mirror case for '>': D_quote_r if another '>' follows, else D_angle_r.
63 if (accept(
'>'))
return tok(Tag::D_quote_r);
64 return tok(Tag::D_angle_r);
// Single-character tokens.
67 if (accept(
'`'))
return tok(Tag::T_backtick);
68 if (accept(U
'→'))
return tok(Tag::T_arrow);
69 if (accept(
'@'))
return tok(Tag::T_at);
70 if (accept(
'='))
return tok(Tag::T_assign);
71 if (accept(U
'⊥'))
return tok(Tag::T_bot);
72 if (accept(U
'⊤'))
return tok(Tag::T_top);
73 if (accept(U
'□'))
return tok(Tag::T_box);
74 if (accept(
','))
return tok(Tag::T_comma);
75 if (accept(
'$'))
return tok(Tag::T_dollar);
76 if (accept(
'#'))
return tok(Tag::T_extract);
77 if (accept(U
'λ'))
return tok(Tag::T_lm);
78 if (accept(
';'))
return tok(Tag::T_semicolon);
// Both U'★' and ASCII '*' produce T_star.
79 if (accept(U
'★'))
return tok(Tag::T_star);
80 if (accept(
'*'))
return tok(Tag::T_star);
// A following ':' yields T_colon_colon, otherwise T_colon.
82 if (accept(
':'))
return tok(Tag::T_colon_colon);
83 return tok(Tag::T_colon);
// If an identifier can be lexed here, it becomes an M_anx token; otherwise the text is
// reported as an invalid axiom name.
88 if (lex_id())
return {loc_, Tag::M_anx, sym()};
89 ast().
error(loc_,
"invalid axiom name '{}'", str_);
// A digit here leads to an f64 literal converted from str_ with strtod; the fall-through is T_dot
// (presumably this follows an accepted '.' on an omitted line — TODO confirm).
94 if (accept(utf8::isdigit)) {
97 return {loc_,
f64(std::strtod(str_.c_str(),
nullptr))};
100 return tok(Tag::T_dot);
// Character literal: a closing '\'' returns a token built from `c` (defined on an omitted line);
// otherwise the literal is reported as invalid.
105 if (accept(
'\''))
return {loc_, c};
106 ast().
error(loc_,
"invalid character literal {}", str_);
// String literal: the opening quote is accepted with Append::Off, then lex_char() is called
// until it returns '"'; the result is an L_str token.
110 if (accept<Append::Off>(
'\"')) {
111 while (lex_char() !=
'"') {}
113 return {loc_, Tag::L_str, sym()};
// Keyword lookup: a symbol found in keywords_ yields its registered tag, anything else is an M_id.
117 if (
auto i = keywords_.find(sym()); i != keywords_.end())
return tok(i->second);
118 return {loc_, Tag::M_id, sym()};
// A digit, '+' or '-' ahead: try to parse a literal and return it if parse_lit() produced one.
121 if (utf8::isdigit(ahead()) || utf8::any(
'+',
'-')(ahead())) {
122 if (
auto lit = parse_lit())
return *lit;
// Skip everything up to (not including) the next newline or EoF.
138 while (ahead() != utf8::EoF && ahead() !=
'\n') next();
// A '/' that did not start a comment is reported with a hint.
142 ast().
error({loc_.path, peek_},
"invalid input char '/'; maybe you wanted to start a comment?");
// Any other unexpected character is reported as invalid input.
146 ast().
error({loc_.path, peek_},
"invalid input char '{}'", utf8::Char32(ahead()));
// Lexes an identifier: the first character must be '_' or alphabetic; subsequent characters may be
// '_', '.', or alphanumeric.
// NOTE(review): the return statements are not visible in this listing.
151bool Lexer::lex_id() {
152 if (accept([](
char32_t c) {
return c ==
'_' || utf8::isalpha(c); })) {
// Greedily consume the remaining identifier characters.
153 while (accept([](
char32_t c) {
return c ==
'_' || c ==
'.' || utf8::isalnum(c); })) {}
// Parses a numeric literal (or the ASCII arrow "->") and returns the resulting token.
// NOTE(review): several lines of this function are omitted from this listing; comments below only
// describe the visible statements.
160std::optional<Tok> Lexer::parse_lit() {
// Unset when no explicit sign was lexed; per the "stray" diagnostic below, true stands for '-'.
162 std::optional<bool> sign;
// The sign character is consumed without being appended to str_.
164 if (accept<Append::Off>(
'+')) {
166 }
else if (accept<Append::Off>(
'-')) {
// "->" is the ASCII spelling of T_arrow.
167 if (accept(
'>'))
return tok(Tag::T_arrow);
// A leading '0' followed by b/B, o/O or x/X selects base 2, 8 or 16; the prefix is not appended.
172 if (accept<Append::Off>(
'0')) {
173 if (accept<Append::Off>(
'b')) base = 2;
174 else if (accept<Append::Off>(
'B')) base = 2;
175 else if (accept<Append::Off>(
'o')) base = 8;
176 else if (accept<Append::Off>(
'O')) base = 8;
177 else if (accept<Append::Off>(
'x')) base = 16;
178 else if (accept<Append::Off>(
'X')) base = 16;
// 'i'/'I' suffix: the value is converted in `base`, then a width is converted in base 10.
183 if (accept(utf8::any(
'i',
'I'))) {
// NOTE(review): this inserts "-" whenever a sign was lexed, including '+', unlike the
// `sign && *sign` check further down — confirm intent.
184 if (sign) str_.insert(0,
"-"sv);
185 auto val = std::strtoull(str_.c_str(),
nullptr, base);
188 auto width = std::strtoull(str_.c_str(),
nullptr, 10);
// Unsigned decimal only: a value may be followed by a second number, given either as
// subscript digits (₀..₉) or after an underscore.
192 if (!sign && base == 10) {
193 if (utf8::isrange(ahead(), U
'₀', U
'₉')) {
194 auto i = std::strtoull(str_.c_str(),
nullptr, 10);
// Translate each subscript digit to its ASCII digit and collect it in `mod`.
196 while (utf8::isrange(ahead(), U
'₀', U
'₉')) mod += next() - U
'₀' +
'0';
197 auto m = std::strtoull(mod.c_str(),
nullptr, 10);
199 }
else if (accept<Append::Off>(
'_')) {
200 auto i = std::strtoull(str_.c_str(),
nullptr, 10);
202 if (accept(utf8::isdigit)) {
204 auto m = std::strtoull(str_.c_str(),
nullptr, 10);
// Underscore without a following digit: report it and fall back to a plain u64 token.
207 ast().
error(loc_,
"stray underscore in Idx literal; size is missing");
208 auto i = std::strtoull(str_.c_str(),
nullptr, 10);
209 return Tok{loc_,
u64(i)};
214 bool is_float =
false;
// Only base 10 and base 16 are considered for the floating-point path.
215 if (base == 10 || base == 16) {
222 bool has_exp = parse_exp(base);
223 if (base == 16 && is_float && !has_exp)
ast().
error(loc_,
"hexadecimal floating constants require an exponent");
// A sign with no digits after it is reported as stray.
227 if (sign && str_.empty()) {
228 ast().
error(loc_,
"stray '{}'", *sign ?
"-" :
"+");
// Re-attach the "0x" prefix (hex floats) and the minus sign so the strto* calls below see them.
232 if (is_float && base == 16) str_.insert(0,
"0x"sv);
233 if (sign && *sign) str_.insert(0,
"-"sv);
// Final conversion: floats via strtod, explicitly signed integers via strtoll, the rest via strtoull.
235 if (is_float)
return Tok{loc_,
f64(std::strtod (str_.c_str(),
nullptr ))};
236 if (sign)
return Tok{loc_,
u64(std::strtoll (str_.c_str(),
nullptr, base))};
237 else return Tok{loc_,
u64(std::strtoull(str_.c_str(),
nullptr, base))};
// Consumes as many consecutive digits of the given base as are available.
// Supported bases are 2, 8, 10 and 16; any other value reaches fe::unreachable().
240void Lexer::parse_digits(
int base ) {
243 case 2:
while (accept(utf8::isbdigit)) {}
break;
244 case 8:
while (accept(utf8::isodigit)) {}
break;
245 case 10:
while (accept(utf8::isdigit)) {}
break;
246 case 16:
while (accept(utf8::isxdigit)) {}
break;
248 default: fe::unreachable();
// Tries to lex an exponent part. The exponent marker is 'e'/'E' when base == 10 and 'p'/'P' otherwise.
// NOTE(review): the return statements are not visible in this listing.
252bool Lexer::parse_exp(
int base ) {
253 if (accept(base == 10 ? utf8::any(
'e',
'E') : utf8::any(
'p',
'P'))) {
// Optional exponent sign; the result of accept is deliberately ignored.
254 accept(utf8::any(
'+',
'-'));
// At least one digit must follow the marker/sign.
255 if (!utf8::isdigit(ahead()))
ast().
error(loc_,
"exponent has no digits");
// Lexes one character of a character/string literal, translating backslash escapes.
// NOTE(review): some lines (including the definition of `c`) are omitted from this listing.
263char8_t Lexer::lex_char() {
// The backslash and the escape letter are consumed with Append::Off; the character the escape
// denotes is appended to str_ instead.
264 if (accept<Append::Off>(
'\\')) {
267 else if (accept<Append::Off>(
'\'')) str_ +=
'\'';
268 else if (accept<Append::Off>(
'\\')) str_ +=
'\\';
269 else if (accept<Append::Off>(
'"')) str_ +=
'\"';
270 else if (accept<Append::Off>(
'0')) str_ +=
'\0';
271 else if (accept<Append::Off>(
'a')) str_ +=
'\a';
272 else if (accept<Append::Off>(
'b')) str_ +=
'\b';
273 else if (accept<Append::Off>(
'f')) str_ +=
'\f';
274 else if (accept<Append::Off>(
'n')) str_ +=
'\n';
275 else if (accept<Append::Off>(
'r')) str_ +=
'\r';
276 else if (accept<Append::Off>(
't')) str_ +=
'\t';
277 else if (accept<Append::Off>(
'v')) str_ +=
'\v';
// Any other character after the backslash is reported as an invalid escape.
278 else ast().
error(loc_.anew_finis(),
"invalid escape character '\\{}'", (
char)ahead());
// ASCII characters are returned as-is; anything else is reported as invalid.
284 if (utf8::isascii(c))
return c;
285 ast().
error(loc_,
"invalid character '{}'", (
char)c);
// Skips the remainder of a multiline comment.
// NOTE(review): the enclosing loop and the handling of '*' itself are on lines omitted from this listing.
289void Lexer::eat_comments() {
// Advance until the next '*' or the end of input.
291 while (ahead() != utf8::EoF && ahead() !=
'*') next();
// Reaching EoF here means the comment was never closed.
292 if (accept(utf8::EoF)) {
293 ast().
error(loc_,
"non-terminated multiline comment");
// A '/' at this point leaves the loop.
297 if (accept(
'/'))
break;
// Handles a run of markdown lines embedded in the input.
// NOTE(review): the lines that actually write to the markdown stream are not visible in this listing.
301void Lexer::emit_md(
bool start_of_file) {
// md_fence() is called unless this is the start of the file.
302 if (!start_of_file) md_fence();
// Skip three characters (presumably the marker that introduces a markdown line — TODO confirm).
306 for (
int i = 0; i < 3; ++i) next();
// Advance to the end of the current line (or EoF).
310 while (ahead() != utf8::EoF && ahead() !=
'\n') next();
// Repeat for as long as start_md() holds.
312 }
while (start_md());
314 if (ahead() == utf8::EoF)
// Returns the Sym that the AST produces for the current text in str_.
320Sym Lexer::sym() {
return ast().
sym(str_); }
const Lit * lit_idx_mod(nat_t mod, u64 val)
Constructs a Lit of type Idx of size mod.
const Lit * lit_int(nat_t width, u64 val)
Constructs a Lit of type Idx of size $2^{width}$.
Lexer(AST &, std::istream &, const fs::path *path=nullptr, std::ostream *md=nullptr)
Creates a lexer to read *.mim files (see Lexical Structure).