MimIR 0.1
MimIR is my Intermediate Representation
Loading...
Searching...
No Matches
regex2nfa.cpp
Go to the documentation of this file.
2
3#include <algorithm>
4
5#include <automaton/nfa.h>
6
7#include <mim/lam.h>
8#include <mim/world.h>
9
11
12using namespace mim;
13
14namespace mim::plug::regex {
15
16namespace {
17
18std::vector<std::pair<std::uint8_t, std::uint8_t>> gather_ranges(const Def* regex) {
19 auto args = detail::flatten_in_arg<regex::disj>(regex);
20 std::vector<std::pair<std::uint8_t, std::uint8_t>> inner_ranges;
21 for (auto& arg : args) {
22 if (auto not_ = Axm::isa<regex::not_>(arg)) {
23 auto not_ranges = gather_ranges(not_->arg());
24 inner_ranges.insert(inner_ranges.end(), not_ranges.begin(), not_ranges.end());
25 } else if (auto any = Axm::isa<regex::any>(arg)) {
26 inner_ranges.emplace_back(0, 255);
27 } else {
28 assert(Axm::isa<regex::range>(arg)
29 && "as per normalizer, if we're in a 'not_' argument, we must only have disjs, not_ and ranges!");
30
31 auto rng_match = Axm::isa<regex::range, false>(arg);
32 inner_ranges.emplace_back(Lit::as<std::uint8_t>(rng_match->arg(0)),
33 Lit::as<std::uint8_t>(rng_match->arg(1)));
34 }
35 }
36 std::sort(inner_ranges.begin(), inner_ranges.end());
37 std::uint8_t last{0};
38 std::vector<std::pair<std::uint8_t, std::uint8_t>> ranges;
39 for (auto rng : inner_ranges) {
40 if (rng.first > last) ranges.push_back({last, static_cast<std::uint8_t>(rng.first - 1)});
41 // ranges.push_back({rng.first, rng.second});
42 last = std::min(rng.second + 1_u16, 255);
43 }
44 if (last < 255) ranges.push_back({last, 255});
45 return ranges;
46}
47
48struct Regex2NfaConverter {
49 Regex2NfaConverter()
50 : nfa_(std::make_unique<automaton::NFA>()) {}
51
52 void add_range_transitions(automaton::NFANode* from, automaton::NFANode* to, std::uint16_t lb, std::uint16_t ub) {
53 for (auto i = lb; i <= ub; ++i)
54 from->add_transition(to, i);
55 }
56
57 void add_range_transitions(automaton::NFANode* from, automaton::NFANode* to, const Def* lit0, const Def* lit1) {
58 auto lb = lit0->as<Lit>()->get();
59 auto ub = lit1->as<Lit>()->get();
60 add_range_transitions(from, to, lb, ub);
61 }
62
63 void
64 convert(const Def* regex, automaton::NFANode* start, automaton::NFANode* end, automaton::NFANode* error = nullptr) {
65 if (auto conj = Axm::isa<regex::conj>(regex)) {
66 auto middle = nfa_->add_state();
67 convert(conj->arg(0), start, middle, error);
68 convert(conj->arg(1), middle, end, error);
69 } else if (auto any = Axm::isa<regex::any>(regex)) {
70 add_range_transitions(start, end, 0_u16, 255);
71 } else if (auto range = Axm::isa<regex::range>(regex)) {
72 add_range_transitions(start, end, range->arg(0), range->arg(1));
73 if (error) add_range_transitions(start, error, 0, 255);
74 } else if (auto not_ = Axm::isa<regex::not_>(regex)) {
75 auto first = nfa_->add_state();
76
78 auto ranges = gather_ranges(not_->arg());
79
80 for (auto rng : ranges)
81 add_range_transitions(first, end, rng.first, rng.second);
82 } else if (auto neg = Axm::isa<regex::neg_lookahead>(regex)) {
83 auto first = nfa_->add_state();
84 auto error = nfa_->add_state();
85 error->set_erroring(true);
86
88
89 convert(neg->arg(), first, error, end);
91 } else if (auto disj = Axm::isa<regex::disj>(regex)) {
92 convert(disj->arg(0), start, end, error);
93 convert(disj->arg(1), start, end, error);
94 } else if (auto opt = Axm::isa(quant::optional, regex)) {
96 convert(opt->arg(), start, end);
97 } else if (auto star = Axm::isa(quant::star, regex)) {
98 auto m1 = nfa_->add_state();
99 auto m2 = nfa_->add_state();
102 m2->add_transition(m1, automaton::NFA::SpecialTransitons::EPSILON);
103 m2->add_transition(end, automaton::NFA::SpecialTransitons::EPSILON);
104 convert(star->arg(), m1, m2);
105 } else if (auto plus = Axm::isa(quant::plus, regex)) {
106 auto m0 = nfa_->add_state();
107 auto m1 = nfa_->add_state();
108 auto m2 = nfa_->add_state();
109 convert(plus->arg(), start, m0, error);
110 m0->add_transition(m1, automaton::NFA::SpecialTransitons::EPSILON);
111 m2->add_transition(m1, automaton::NFA::SpecialTransitons::EPSILON);
112 m2->add_transition(end, automaton::NFA::SpecialTransitons::EPSILON);
113 m0->add_transition(end, automaton::NFA::SpecialTransitons::EPSILON);
114 convert(plus->arg(), m1, m2);
115 } else if (auto empty = Axm::isa<regex::empty>(regex)) {
117 }
118 }
119
120 void convert(const Def* regex) {
121 auto start = nfa_->add_state();
122 nfa_->set_start(start);
123 auto end = nfa_->add_state();
124 end->set_accepting(true);
125 convert(regex, start, end);
126 }
127
128 std::unique_ptr<automaton::NFA> nfa() { return std::move(nfa_); }
129
130private:
131 std::unique_ptr<automaton::NFA> nfa_;
132};
133
134} // namespace
135
136std::unique_ptr<automaton::NFA> regex2nfa(const Def* regex) {
137 Regex2NfaConverter converter;
138 converter.convert(regex);
139 return converter.nfa();
140}
141
142} // namespace mim::plug::regex
143
144extern "C" automaton::NFA* regex2nfa(const Def* regex) { return mim::plug::regex::regex2nfa(regex).release(); }
void set_accepting(bool accepting)
Definition nfa.h:36
void add_transition(const NFANode *to, std::uint16_t c)
Definition nfa.cpp:7
static auto isa(const Def *def)
Definition axm.h:107
Base class for all Defs.
Definition def.h:251
static T as(const Def *def)
Definition def.h:830
const Sigma * convert(const TBound< up > *b)
Definition core.cpp:18
The regex Plugin
Definition lower_regex.h:5
std::unique_ptr< automaton::NFA > regex2nfa(const Def *regex)
Definition ast.h:14
void error(Loc loc, const char *f, Args &&... args)
Definition dbg.h:125
constexpr decltype(auto) get(Span< T, N > span) noexcept
Definition span.h:115
@ Lit
Definition def.h:114
automaton::NFA * regex2nfa(const Def *regex)
You can dl::get this function.