5using namespace std::literals;
9namespace utf8 = fe::utf8;
12Lexer::Lexer(
AST& ast, std::istream& istream,
const fs::path* path , std::ostream* md )
13 : Super(istream, path)
16#define CODE(t, str) keywords_[ast.sym(str)] = Tag::t;
21 if (Tag::t != Tag::Nil) keywords_[ast.sym(str)] = Tag::t;
35 if (accept(utf8::EoF))
return tok(Tag::EoF);
36 if (accept(utf8::isspace))
continue;
37 if (accept(utf8::Null)) {
38 ast().
error(loc_,
"invalid UTF-8 character");
44 if (accept(
'('))
return tok(Tag::D_paren_l);
45 if (accept(
')'))
return tok(Tag::D_paren_r);
46 if (accept(
'['))
return tok(Tag::D_brckt_l);
47 if (accept(
']'))
return tok(Tag::D_brckt_r);
48 if (accept(
'{'))
return tok(Tag::D_brace_l);
49 if (accept(
'}'))
return tok(Tag::D_brace_r);
50 if (accept(U
'«'))
return tok(Tag::D_quote_l);
51 if (accept(U
'»'))
return tok(Tag::D_quote_r);
52 if (accept(U
'⟪'))
return tok(Tag::D_quote_l);
53 if (accept(U
'⟫'))
return tok(Tag::D_quote_r);
54 if (accept(U
'‹'))
return tok(Tag::D_angle_l);
55 if (accept(U
'›'))
return tok(Tag::D_angle_r);
56 if (accept(U
'⟨'))
return tok(Tag::D_angle_l);
57 if (accept(U
'⟩'))
return tok(Tag::D_angle_r);
59 if (accept(
'<'))
return tok(Tag::D_quote_l);
60 return tok(Tag::D_angle_l);
63 if (accept(
'>'))
return tok(Tag::D_quote_r);
64 return tok(Tag::D_angle_r);
67 if (accept(U
'→'))
return tok(Tag::T_arrow);
68 if (accept(
'@'))
return tok(Tag::T_at);
69 if (accept(
'='))
return tok(Tag::T_assign);
70 if (accept(U
'⊥'))
return tok(Tag::T_bot);
71 if (accept(U
'⊤'))
return tok(Tag::T_top);
72 if (accept(U
'□'))
return tok(Tag::T_box);
73 if (accept(
','))
return tok(Tag::T_comma);
74 if (accept(
'$'))
return tok(Tag::T_dollar);
75 if (accept(
'#'))
return tok(Tag::T_extract);
76 if (accept(U
'λ'))
return tok(Tag::T_lm);
77 if (accept(
';'))
return tok(Tag::T_semicolon);
78 if (accept(U
'★'))
return tok(Tag::T_star);
79 if (accept(
'*'))
return tok(Tag::T_star);
80 if (accept(
':'))
return tok(Tag::T_colon);
84 if (lex_id())
return {loc_, Tag::M_anx, sym()};
85 ast().
error(loc_,
"invalid axiom name '{}'", str_);
90 if (accept(utf8::isdigit)) {
93 return {loc_,
f64(std::strtod(str_.c_str(),
nullptr))};
96 return tok(Tag::T_dot);
101 if (accept(
'\''))
return {loc_, c};
102 ast().
error(loc_,
"invalid character literal {}", str_);
106 if (accept<Append::Off>(
'\"')) {
107 while (lex_char() !=
'"') {}
109 return {loc_, Tag::L_str, sym()};
113 if (
auto i = keywords_.find(sym()); i != keywords_.end())
return tok(i->second);
114 return {loc_, Tag::M_id, sym()};
117 if (utf8::isdigit(ahead()) || utf8::any(
'+',
'-')(ahead())) {
118 if (
auto lit = parse_lit())
return *lit;
134 while (ahead() != utf8::EoF && ahead() !=
'\n') next();
138 ast().
error({loc_.path, peek_},
"invalid input char '/'; maybe you wanted to start a comment?");
142 ast().
error({loc_.path, peek_},
"invalid input char '{}'", utf8::Char32(ahead()));
147bool Lexer::lex_id() {
148 if (accept([](
char32_t c) {
return c ==
'_' || utf8::isalpha(c); })) {
149 while (accept([](
char32_t c) {
return c ==
'_' || c ==
'.' || utf8::isalnum(c); })) {}
156std::optional<Tok> Lexer::parse_lit() {
158 std::optional<bool> sign;
160 if (accept<Append::Off>(
'+')) {
162 }
else if (accept<Append::Off>(
'-')) {
163 if (accept(
'>'))
return tok(Tag::T_arrow);
168 if (accept<Append::Off>(
'0')) {
169 if (accept<Append::Off>(
'b')) base = 2;
170 else if (accept<Append::Off>(
'B')) base = 2;
171 else if (accept<Append::Off>(
'o')) base = 8;
172 else if (accept<Append::Off>(
'O')) base = 8;
173 else if (accept<Append::Off>(
'x')) base = 16;
174 else if (accept<Append::Off>(
'X')) base = 16;
179 if (accept(utf8::any(
'i',
'I'))) {
180 if (sign) str_.insert(0,
"-"sv);
181 auto val = std::strtoull(str_.c_str(),
nullptr, base);
184 auto width = std::strtoull(str_.c_str(),
nullptr, 10);
188 if (!sign && base == 10) {
189 if (utf8::isrange(ahead(), U
'₀', U
'₉')) {
190 auto i = std::strtoull(str_.c_str(),
nullptr, 10);
192 while (utf8::isrange(ahead(), U
'₀', U
'₉')) mod += next() - U
'₀' +
'0';
193 auto m = std::strtoull(mod.c_str(),
nullptr, 10);
195 }
else if (accept<Append::Off>(
'_')) {
196 auto i = std::strtoull(str_.c_str(),
nullptr, 10);
198 if (accept(utf8::isdigit)) {
200 auto m = std::strtoull(str_.c_str(),
nullptr, 10);
203 ast().
error(loc_,
"stray underscore in Idx literal; size is missing");
204 auto i = std::strtoull(str_.c_str(),
nullptr, 10);
205 return Tok{loc_,
u64(i)};
210 bool is_float =
false;
211 if (base == 10 || base == 16) {
218 bool has_exp = parse_exp(base);
219 if (base == 16 && is_float && !has_exp)
ast().
error(loc_,
"hexadecimal floating constants require an exponent");
223 if (sign && str_.empty()) {
224 ast().
error(loc_,
"stray '{}'", *sign ?
"-" :
"+");
228 if (is_float && base == 16) str_.insert(0,
"0x"sv);
229 if (sign && *sign) str_.insert(0,
"-"sv);
231 if (is_float)
return Tok{loc_,
f64(std::strtod (str_.c_str(),
nullptr ))};
232 if (sign)
return Tok{loc_,
u64(std::strtoll (str_.c_str(),
nullptr, base))};
233 else return Tok{loc_,
u64(std::strtoull(str_.c_str(),
nullptr, base))};
236void Lexer::parse_digits(
int base ) {
239 case 2:
while (accept(utf8::isbdigit)) {}
break;
240 case 8:
while (accept(utf8::isodigit)) {}
break;
241 case 10:
while (accept(utf8::isdigit)) {}
break;
242 case 16:
while (accept(utf8::isxdigit)) {}
break;
244 default: fe::unreachable();
248bool Lexer::parse_exp(
int base ) {
249 if (accept(base == 10 ? utf8::any(
'e',
'E') : utf8::any(
'p',
'P'))) {
250 accept(utf8::any(
'+',
'-'));
251 if (!utf8::isdigit(ahead()))
ast().
error(loc_,
"exponent has no digits");
259char8_t Lexer::lex_char() {
260 if (accept<Append::Off>(
'\\')) {
263 else if (accept<Append::Off>(
'\'')) str_ +=
'\'';
264 else if (accept<Append::Off>(
'\\')) str_ +=
'\\';
265 else if (accept<Append::Off>(
'"')) str_ +=
'\"';
266 else if (accept<Append::Off>(
'0')) str_ +=
'\0';
267 else if (accept<Append::Off>(
'a')) str_ +=
'\a';
268 else if (accept<Append::Off>(
'b')) str_ +=
'\b';
269 else if (accept<Append::Off>(
'f')) str_ +=
'\f';
270 else if (accept<Append::Off>(
'n')) str_ +=
'\n';
271 else if (accept<Append::Off>(
'r')) str_ +=
'\r';
272 else if (accept<Append::Off>(
't')) str_ +=
'\t';
273 else if (accept<Append::Off>(
'v')) str_ +=
'\v';
274 else ast().
error(loc_.anew_finis(),
"invalid escape character '\\{}'", (
char)ahead());
280 if (utf8::isascii(c))
return c;
281 ast().
error(loc_,
"invalid character '{}'", (
char)c);
285void Lexer::eat_comments() {
287 while (ahead() != utf8::EoF && ahead() !=
'*') next();
288 if (accept(utf8::EoF)) {
289 ast().
error(loc_,
"non-terminated multiline comment");
293 if (accept(
'/'))
break;
297void Lexer::emit_md(
bool start_of_file) {
298 if (!start_of_file) md_fence();
302 for (
int i = 0; i < 3; ++i) next();
306 while (ahead() != utf8::EoF && ahead() !=
'\n') next();
308 }
while (start_md());
310 if (ahead() == utf8::EoF)
/// Returns the symbol obtained by passing the current contents of `str_` to `ast().sym(...)`.
316Sym Lexer::sym() {
return ast().
sym(str_); }
const Lit * lit_idx_mod(nat_t mod, u64 val)
Constructs a Lit of type Idx of size mod.
const Lit * lit_int(nat_t width, u64 val)
Constructs a Lit of type Idx of size $2^{width}$.
Lexer(AST &, std::istream &, const fs::path *path=nullptr, std::ostream *md=nullptr)
Creates a lexer to read *.mim files (see Lexical Structure).