Thorin 1.9.0
The Higher ORder INtermediate representation
Loading...
Searching...
No Matches
dfa2matcher.cpp
Go to the documentation of this file.
2
3#include <algorithm>
4#include <sstream>
5
6#include <absl/container/flat_hash_map.h>
7#include <automaton/dfa.h>
9
11#include <thorin/plug/mem/mem.h>
12
13using namespace thorin;
14using namespace automaton;
15
18
19// nomenclature:
20// c is the character from the string we want to match
21
22// see lit/regex/match_manual.thorin for a hand written state machine impl that this is based on
23
24// the main idea is:
25// for every state in the DFA, we create a lam that checks if the char c at pos is the end of the string \0
26// if not, jump to a checker lam, that consists of a few bit-wise or-ed in-range checks to verify if we can transition
27// to a certain other state with c if any of the checks is true, we jump to the corresponding state lam with the updated
28// position if the check fails, we jump to the next checker lam or the exit lam if we checked all possible transitions
29// Note, since the DFA stores the transitions as chars, not ranges, we have to merge the transitions to ranges first
30// (cf. transitions_to_ranges)
31
32namespace {
33
34namespace core = plug::core;
35namespace mem = plug::mem;
36
37std::string state_to_name(const DFANode* state) {
38 std::stringstream ss;
39 ss << "state_" << state;
40 return ss.str();
41}
42
43DFAMap<Ranges> transitions_to_ranges(World& w, const DFANode* state) {
44 DFAMap<Ranges> state2ranges;
45 state->for_transitions([&](std::uint16_t transition, const DFANode* next_state) {
46 if (!state2ranges.contains(next_state))
47 state2ranges.emplace(next_state, Ranges{
48 {transition, transition}
49 });
50 else
51 state2ranges[next_state].emplace_back(transition, transition);
52 });
53 Range any_range{0, 255};
54 for (auto& [state, ranges] : state2ranges) {
55 if (std::find(ranges.cbegin(), ranges.cend(), any_range) != ranges.cend()) {
56 ranges = {any_range};
57 continue;
58 }
59
60 std::sort(ranges.begin(), ranges.end(), RangeCompare{});
61 ranges = merge_ranges(ranges, [&w](auto&&... args) { w.DLOG(std::forward<decltype(args)>(args)...); });
62 }
63 return state2ranges;
64}
65
66Ref match_range(Ref c, nat_t lo, nat_t hi) {
67 World& w = c->world();
68 if (lo == 0 && hi == 255) return w.lit_tt();
69
70 // .let in_range = %core.bit2.and_ 0 (%core.icmp.uge (char, lower), %core.icmp.ule (char, upper));
71 auto below_hi = w.call(core::icmp::ule, w.tuple({c, w.lit_i8(hi)}));
72 auto above_lo = w.call(core::icmp::uge, w.tuple({c, w.lit_i8(lo)}));
73 return w.call(core::bit2::and_, w.lit_nat(2), w.tuple({below_hi, above_lo}));
74}
75
76DFAMap<Ref> create_check_match_transitions_from(Ref c, const DFANode* state) {
77 World& w = c->world();
78 DFAMap<Ref> state2check;
79
80 auto state2ranges = transitions_to_ranges(w, state);
81
82 for (auto& [state, ranges] : state2ranges) {
83 for (auto& [lo, hi] : ranges)
84 if (!state2check.contains(state))
85 state2check.emplace(state, match_range(c, lo, hi));
86 else
87 state2check[state]
88 = w.call(core::bit2::or_, w.lit_nat(2), w.tuple({state2check[state], match_range(c, lo, hi)}));
89 }
90 return state2check;
91}
92
93} // namespace
94
95extern "C" const Def* dfa2matcher(World& w, const DFA& dfa, Ref n) {
96 w.DLOG("dfa to match: {}", dfa);
97
98 auto states = dfa.get_reachable_states();
99 DFAMap<Lam*> state2matcher;
100
101 // ((mem: %mem.M, string: Str n, pos: .Idx n), .Cn [%mem.M, .Bool, .Idx n])
102 auto matcher = w.mut_fun({w.annex<mem::M>(), w.call<mem::Ptr0>(w.arr(n, w.type_i8())), w.type_idx(n)},
103 {w.annex<mem::M>(), w.type_bool(), w.type_idx(n)});
104 matcher->debug_prefix(std::string("match_regex"));
105 auto [args, exit] = matcher->vars<2>();
106 exit->debug_prefix(std::string("exit"));
107 auto [mem, string, pos] = args->projs<3>();
108 mem->debug_prefix(std::string("mem"));
109 string->debug_prefix(std::string("string"));
110 pos->debug_prefix(std::string("pos"));
111
112 auto error = mem::mut_con(w.type_idx(n));
113 error->debug_prefix("error");
114 {
115 auto [mem, pos] = error->vars<2>();
116 mem->debug_prefix(std::string("mem"));
117 pos->debug_prefix(std::string("pos"));
118 error->app(false, exit, {mem, w.lit_ff(), pos});
119 }
120
121 auto accept = mem::mut_con(w.type_idx(n));
122 accept->debug_prefix("accept");
123 {
124 auto [mem, pos] = accept->vars<2>();
125 mem->debug_prefix(std::string("mem"));
126 pos->debug_prefix(std::string("pos"));
127 accept->app(false, exit, {mem, w.lit_tt(), pos});
128 }
129
130 auto exiting = [error, accept](const DFANode* state) { return state->is_accepting() ? accept : error; };
131
132 for (auto state : states) {
133 auto lam = mem::mut_con(w.type_idx(n));
134 lam->debug_prefix(state_to_name(state));
135 state2matcher.emplace(state, lam);
136 }
137
138 for (auto [state, lam] : state2matcher) {
139 auto [mem, i] = lam->vars<2>();
140
141 if (state->is_erroring()) {
142 lam->app(true, error, {mem, i});
143 continue;
144 }
145
146 auto lea = w.call<mem::lea>(w.tuple({n, w.pack(n, w.type_i8()), w.lit_nat(0)}), w.tuple({string, i}));
147 auto [mem2, c] = w.call<mem::load>(w.tuple({mem, lea}))->projs<2>();
148
149 auto is_end = w.call(core::icmp::e, w.tuple({c, w.lit_i8(0)}));
150 auto not_end = mem::mut_con(w.type_idx(n));
151 not_end->debug_prefix("not_end_" + state_to_name(state));
152
153 auto new_i = w.call(core::wrap::add, core::Mode::nusw, w.tuple({i, w.call(core::conv::u, n, w.lit_i64(1))}));
154 lam->app(false, w.select(is_end, exiting(state), not_end), {mem2, i});
155
156 auto transitions = create_check_match_transitions_from(c, state);
157 auto next_check = exiting(state); // if we want to check full string only, use error instead of exiting(state)c
158 for (auto [next_state, check] : transitions) {
159 auto next_lam = state2matcher[next_state];
160 auto checker = mem::mut_con(w.type_idx(n));
161 checker->debug_prefix("check_" + state_to_name(state) + "_to_" + state_to_name(next_state));
162 auto [mem3, pos] = checker->vars<2>();
163 checker->app(false, w.select(check, next_lam, next_check), {mem3, w.select(check, new_i, pos)});
164 next_check = checker;
165 }
166 {
167 auto [mem, pos] = not_end->vars<2>();
168 not_end->app(true, next_check, {mem, pos});
169 }
170 }
171
172 matcher->app(false, state2matcher[dfa.get_start()], {mem, pos});
173 return matcher;
174}
std::set< const NodeType * > get_reachable_states() const
Definition automaton.h:35
bool is_accepting() const noexcept
Definition dfa.h:32
bool is_erroring() const noexcept
Definition dfa.h:38
void for_transitions(F &&f, std::uint16_t c) const
Definition dfa.h:21
Base class for all Defs.
Definition def.h:222
Helper class to retrieve Infer::arg if present.
Definition def.h:87
This is a thin wrapper for absl::InlinedVector<T, N, A> which in turn is a drop-in replacement for st...
Definition vector.h:16
The World represents the whole program and manages creation of Thorin nodes (Defs).
Definition world.h:35
const Def * dfa2matcher(World &w, const DFA &dfa, Ref n)
You can dl::get this function.
std::pair< std::uint64_t, std::uint64_t > Range
absl::flat_hash_map< const DFANode *, To > DFAMap
Definition dfa.h:63
std::optional< Range > merge_ranges(Range a, Range b) noexcept
The core Plugin
Definition core.h:8
The mem Plugin
Definition mem.h:11
Lam * mut_con(World &w)
Yields .con[mem.M].
Definition mem.h:16
Definition cfg.h:11
void error(const Def *def, const char *fmt, Args &&... args)
Definition def.h:622
u64 nat_t
Definition types.h:44