thorin2/lexer_8cpp_source.html

#include "thorin/fe/lexer.h"


#include "thorin/world.h"


using namespace std::literals;


namespace thorin {


namespace utf8 = fe::utf8;

using Tag      = Tok::Tag;


Lexer::Lexer(World& world, std::istream& istream, const fs::path* path /*= nullptr*/, std::ostream* md /*= nullptr*/)

    : Super(istream, path)

    , world_(world)

    , md_(md) {

#define CODE(t, str) keywords_[world.sym(str)] = Tag::t;

    THORIN_KEY(CODE)

#undef CODE


#define CODE(str, t) \

    if (Tag::t != Tag::Nil) keywords_[world.sym(str)] = Tag::t;

    THORIN_SUBST(CODE)

#undef CODE


    if (start_md())

        emit_md(true);

    else

        md_fence();

}


Tok Lexer::lex() {

    while (true) {

        if (auto cache = cache_) {

            cache_.reset();

            return *cache;

        }


        start();


        if (accept(utf8::EoF)) return tok(Tag::EoF);

        if (accept(utf8::isspace)) continue;

        if (accept(utf8::Null)) error(loc_, "invalid UTF-8 character");


        // clang-format off

        // delimiters

        if (accept( '(')) return tok(Tag::D_paren_l);

        if (accept( ')')) return tok(Tag::D_paren_r);

        if (accept( '[')) return tok(Tag::D_brckt_l);

        if (accept( ']')) return tok(Tag::D_brckt_r);

        if (accept( '{')) return tok(Tag::D_brace_l);

        if (accept( '}')) return tok(Tag::D_brace_r);

        if (accept(U'«')) return tok(Tag::D_quote_l);

        if (accept(U'»')) return tok(Tag::D_quote_r);

        if (accept(U'⟪')) return tok(Tag::D_quote_l);

        if (accept(U'⟫')) return tok(Tag::D_quote_r);

        if (accept(U'‹')) return tok(Tag::D_angle_l);

        if (accept(U'›')) return tok(Tag::D_angle_r);

        if (accept(U'⟨')) return tok(Tag::D_angle_l);

        if (accept(U'⟩')) return tok(Tag::D_angle_r);

        if (accept( '<')) {

            if (accept( '<')) return tok(Tag::D_quote_l);

            return tok(Tag::D_angle_l);

        }

        if (accept( '>')) {

            if (accept( '>')) return tok(Tag::D_quote_r);

            return tok(Tag::D_angle_r);

        }

        // further tokens

        if (accept('`'))  return tok(Tag::T_backtick);

        if (accept(U'→')) return tok(Tag::T_arrow);

        if (accept( '@')) return tok(Tag::T_at);

        if (accept( '=')) return tok(Tag::T_assign);

        if (accept( '!')) return tok(Tag::T_bang);

        if (accept(U'⊥')) return tok(Tag::T_bot);

        if (accept(U'⊤')) return tok(Tag::T_top);

        if (accept(U'□')) return tok(Tag::T_box);

        if (accept( ',')) return tok(Tag::T_comma);

        if (accept( '$')) return tok(Tag::T_dollar);

        if (accept( '#')) return tok(Tag::T_extract);

        if (accept(U'λ')) return tok(Tag::T_lm);

        if (accept(U'Π')) return tok(Tag::T_Pi);

        if (accept( ';')) return tok(Tag::T_semicolon);

        if (accept(U'★')) return tok(Tag::T_star);

        if (accept( '*')) return tok(Tag::T_star);

        if (accept( ':')) {

            if (accept( ':')) return tok(Tag::T_colon_colon);

            return tok(Tag::T_colon);

        }

        if (accept( '|')) {

            if (accept('~')) {

                if (accept('|')) return tok(Tag::T_Pi);

            }

            error(loc_, "invalid input char '{}'; maybe you wanted to use '|~|'?", str_);

            continue;

        }

        // clang-format on


        if (accept('%')) {

            if (lex_id()) {

                auto loc = cache_trailing_dot();

                return {loc, Tag::M_anx, sym()};

            }

            error(loc_, "invalid axiom name '{}'", str_);

        }


        if (accept('.')) {

            if (lex_id()) {

                if (auto i = keywords_.find(sym()); i != keywords_.end()) return tok(i->second);

                // Split non-keyword into T_dot and M_id; M_id goes into cache_ for next lex().

                assert(!cache_.has_value());

                auto id_loc = loc();

                ++id_loc.begin.col;

                cache_.emplace(id_loc, Tag::M_id, world().sym(str_.substr(1)));

                return {loc().anew_begin(), Tag::T_dot};

            }


            if (accept(utf8::isdigit)) {

                parse_digits();

                parse_exp();

                return {loc_, f64(std::strtod(str_.c_str(), nullptr))};

            }


            return tok(Tag::T_dot);

        }


        if (accept('\'')) {

            auto c = lex_char();

            if (accept('\'')) return {loc(), c};

            error(loc_, "invalid character literal {}", str_);

            continue;

        }


        if (accept<Append::Off>('\"')) {

            while (lex_char() != '"') {}

            str_.pop_back(); // remove final '"'

            return {loc_, Tag::M_str, sym()};

        }


        if (lex_id()) {

            auto loc = cache_trailing_dot();

            return {loc, Tag::M_id, sym()};

        }


        if (utf8::isdigit(ahead()) || utf8::any('+', '-')(ahead())) {

            if (auto lit = parse_lit()) return *lit;

            continue;

        }


        if (start_md()) {

            emit_md();

            continue;

        }


        // comments

        if (accept('/')) {

            if (accept('*')) {

                eat_comments();

                continue;

            }

            if (accept('/')) {

                while (ahead() != utf8::EoF && ahead() != '\n') next();

                continue;

            }


            error({loc_.path, peek_}, "invalid input char '/'; maybe you wanted to start a comment?");

            continue;

        }


        error({loc_.path, peek_}, "invalid input char '{}'", utf8::Char32(ahead()));

        next();

    }

}


// A trailing T_dot does not belong to an annex name or identifier and goes into cache_ for next lex().

Loc Lexer::cache_trailing_dot() {

    auto l = loc();

    if (str_.back() == '.') {

        str_.pop_back();

        assert(!cache_.has_value());

        cache_.emplace(l.anew_finis(), Tag::T_dot);

        --l.finis.col;

    }

    return l;

}


bool Lexer::lex_id() {

    if (accept([](char32_t c) { return c == '_' || utf8::isalpha(c); })) {

        while (accept([](char32_t c) { return c == '_' || c == '.' || utf8::isalnum(c); })) {}

        return true;

    }

    return false;

}


// clang-format off

std::optional<Tok> Lexer::parse_lit() {

    int base = 10;

    std::optional<bool> sign;


    if (accept<Append::Off>('+')) {

        sign = false;

    } else if (accept<Append::Off>('-')) {

        if (accept('>')) return tok(Tag::T_arrow);

        sign = true;

    }


    // prefix starting with '0'

    if (accept<Append::Off>('0')) {

        if      (accept<Append::Off>('b')) base =  2;

        else if (accept<Append::Off>('B')) base =  2;

        else if (accept<Append::Off>('o')) base =  8;

        else if (accept<Append::Off>('O')) base =  8;

        else if (accept<Append::Off>('x')) base = 16;

        else if (accept<Append::Off>('X')) base = 16;

    }


    parse_digits(base);


    if (accept<Append::Off>('I')) {

        if (sign) str_.insert(0, "-"sv);

        auto val = std::strtoull(str_.c_str(), nullptr, base);

        str_.clear();

        parse_digits();

        auto width = std::strtoull(str_.c_str(), nullptr, 10);

        return Tok{loc_, world().lit_int(width, val)};

    }


    if (!sign && base == 10) {

        if (utf8::isrange(ahead(), U'₀', U'₉')) {

            auto i = std::strtoull(str_.c_str(), nullptr, 10);

            std::string mod;

            while (utf8::isrange(ahead(), U'₀', U'₉')) mod += next() - U'₀' + '0';

            auto m = std::strtoull(mod.c_str(), nullptr, 10);

            return Tok{loc_, world().lit_idx_mod(m, i)};

        } else if (accept<Append::Off>('_')) {

            auto i = std::strtoull(str_.c_str(), nullptr, 10);

            str_.clear();

            if (accept(utf8::isdigit)) {

                parse_digits(10);

                auto m = std::strtoull(str_.c_str(), nullptr, 10);

                return Tok{loc_, world().lit_idx_mod(m, i)};

            } else {

                error(loc_, "stray underscore in unsigned literal");

                auto i = std::strtoull(str_.c_str(), nullptr, 10);

                return Tok{loc_, u64(i)};

            }

        }

    }


    bool is_float = false;

    if (base == 10 || base == 16) {

        // parse fractional part

        if (accept('.')) {

            is_float = true;

            parse_digits(base);

        }


        bool has_exp = parse_exp(base);

        if (base == 16 && is_float && !has_exp) error(loc_, "hexadecimal floating constants require an exponent");

        is_float |= has_exp;

    }


    if (sign && str_.empty()) {

        error(loc_, "stray '{}'", *sign ? "-" : "+");

        return {};

    }


    if (is_float && base == 16) str_.insert(0, "0x"sv);

    if (sign && *sign) str_.insert(0, "-"sv);


    if (is_float) return Tok{loc_, f64(std::strtod  (str_.c_str(), nullptr      ))};

    if (sign)     return Tok{loc_, u64(std::strtoll (str_.c_str(), nullptr, base))};

    else          return Tok{loc_, u64(std::strtoull(str_.c_str(), nullptr, base))};

}


void Lexer::parse_digits(int base /*= 10*/) {

    switch (base) {

        // clang-format off

        case  2: while (accept(utf8::isbdigit)) {} break;

        case  8: while (accept(utf8::isodigit)) {} break;

        case 10: while (accept(utf8::isdigit))  {} break;

        case 16: while (accept(utf8::isxdigit)) {} break;

        // clang-format on

        default: fe::unreachable();

    }

}


bool Lexer::parse_exp(int base /*= 10*/) {

    if (accept(base == 10 ? utf8::any('e', 'E') : utf8::any('p', 'P'))) {

        accept(utf8::any('+', '-'));

        if (!utf8::isdigit(ahead())) error(loc_, "exponent has no digits");

        parse_digits();

        return true;

    }

    return false;

}

// clang-format on


char8_t Lexer::lex_char() {

    if (accept<Append::Off>('\\')) {

        // clang-format off

        if (false) {}

        else if (accept<Append::Off>('\'')) str_ += '\'';

        else if (accept<Append::Off>('\\')) str_ += '\\';

        else if (accept<Append::Off>( '"')) str_ += '\"';

        else if (accept<Append::Off>( '0')) str_ += '\0';

        else if (accept<Append::Off>( 'a')) str_ += '\a';

        else if (accept<Append::Off>( 'b')) str_ += '\b';

        else if (accept<Append::Off>( 'f')) str_ += '\f';

        else if (accept<Append::Off>( 'n')) str_ += '\n';

        else if (accept<Append::Off>( 'r')) str_ += '\r';

        else if (accept<Append::Off>( 't')) str_ += '\t';

        else if (accept<Append::Off>( 'v')) str_ += '\v';

        else error(loc_.anew_finis(), "invalid escape character '\\{}'", (char)ahead());

        // clang-format on

        return str_.back();

    }

    auto c = next();

    str_ += c;

    if (utf8::isascii(c)) return c;

    error(loc_, "invalid character '{}'", (char)c);

}


void Lexer::eat_comments() {

    while (true) {

        while (ahead() != utf8::EoF && ahead() != '*') next();

        if (accept(utf8::EoF)) {

            error(loc_, "non-terminated multiline comment");

            return;

        }

        next();

        if (accept('/')) break;

    }

}


void Lexer::emit_md(bool start_of_file) {

    if (!start_of_file) md_fence();


    do {

        out_ = false;

        for (int i = 0; i < 3; ++i) next();

        accept(' ');

        out_ = true;


        while (ahead() != utf8::EoF && ahead() != '\n') next();

        accept('\n');

    } while (start_md());


    if (ahead() == utf8::EoF)

        out_ = false;

    else

        md_fence();

}


Sym Lexer::sym() { return world().sym(str_); }


} // namespace thorin

thorin::Lexer::loc
Loc loc() const
Definition lexer.h:23

thorin::Lexer::lex
Tok lex()
Definition lexer.cpp:31

thorin::Lexer::Lexer
Lexer(World &world, std::istream &istream, const fs::path *path=nullptr, std::ostream *md=nullptr)
Creates a lexer to read Thorin files (see Lexical Structure).
Definition lexer.cpp:12

thorin::Lexer::world
World & world()
Definition lexer.h:21

thorin::Tok
Definition tok.h:115

thorin::Tok::Tag
Tag
Definition tok.h:139

thorin::World
The World represents the whole program and manages creation of Thorin nodes (Defs).
Definition world.h:35

thorin::World::sym
Sym sym(std::string_view)
Definition world.cpp:77

thorin::World::lit_int
const Lit * lit_int(nat_t width, u64 val)
Constructs a Lit of type Idx of size $2^width$.
Definition world.h:389

thorin::World::lit_idx_mod
const Lit * lit_idx_mod(nat_t mod, u64 val)
Constructs a Lit of type Idx of size mod.
Definition world.h:402

CODE
#define CODE(node, name)
Definition def.h:40

lexer.h

thorin::plug::math::c
@ c
Definition autogen.h:390

thorin::plug::mem::m
m
Definition autogen.h:151

thorin::plug::regex::any
any
Definition autogen.h:92

thorin
Definition cfg.h:11

thorin::error
void error(const Def *def, const char *fmt, Args &&... args)
Definition def.h:622

thorin::u64
uint64_t u64
Definition types.h:35

thorin::f64
double f64
Definition types.h:42

THORIN_SUBST
#define THORIN_SUBST(m)
Definition tok.h:98

THORIN_KEY
#define THORIN_KEY(m)
Definition tok.h:14

world.h