MimIR
MimIR is my Intermediate Representation
Loading...
Searching...
No Matches
dfa2matcher.cpp
Go to the documentation of this file.
2
3#include <algorithm>
4#include <sstream>
5
6#include <absl/container/flat_hash_map.h>
7#include <automaton/dfa.h>
9
10#include <mim/plug/core/core.h>
11#include <mim/plug/mem/mem.h>
12
13template<>
14struct std::formatter<automaton::DFA> : fe::ostream_formatter {};
15
16using namespace mim;
17using namespace automaton;
18
21
22// nomenclature:
23// c is the character from the string we want to match
24
25// see lit/regex/match_manual.mim for a hand written state machine impl that this is based on
26
27// the main idea is:
28// for every state in the DFA, we create a lam that checks if the char c at pos is the end of the string \0
29// if not, jump to a checker lam, that consists of a few bit-wise or-ed in-range checks to verify if we can transition
30// to a certain other state with c if any of the checks is true, we jump to the corresponding state lam with the updated
31// position if the check fails, we jump to the next checker lam or the exit lam if we checked all possible transitions
32// Note, since the DFA stores the transitions as chars, not ranges, we have to merge the transitions to ranges first
33// (cf. transitions_to_ranges)
34
35namespace {
36
37namespace core = plug::core;
38namespace mem = plug::mem;
39
40std::string state_to_name(const DFANode* state) {
41 std::stringstream ss;
42 ss << "state_" << state;
43 return ss.str();
44}
45
46DFAMap<Ranges> transitions_to_ranges(World& w, const DFANode* state) {
47 DFAMap<Ranges> state2ranges;
48 state->for_transitions([&](std::uint16_t transition, const DFANode* next_state) {
49 if (!state2ranges.contains(next_state))
50 state2ranges.try_emplace(next_state, Ranges{
51 {transition, transition}
52 });
53 else
54 state2ranges[next_state].emplace_back(transition, transition);
55 });
56 Range any_range{0, 255};
57 for (auto& [state, ranges] : state2ranges) {
58 if (std::ranges::contains(ranges, any_range)) {
59 ranges = {any_range};
60 continue;
61 }
62
63 std::sort(ranges.begin(), ranges.end(), RangeCompare{});
64 ranges = merge_ranges(ranges, [&w](std::string_view msg) { w.DLOG("{}", msg); });
65 }
66 return state2ranges;
67}
68
69const Def* match_range(const Def* c, nat_t lo, nat_t hi) {
70 World& w = c->world();
71 if (lo == 0 && hi == 255) return w.lit_tt();
72
73 // let in_range = %core.bit2.and_ 0 (%core.icmp.uge (char, lower), %core.icmp.ule (char, upper));
74 auto below_hi = w.call(core::icmp::ule, w.tuple({c, w.lit_i8(hi)}));
75 auto above_lo = w.call(core::icmp::uge, w.tuple({c, w.lit_i8(lo)}));
76 return w.call(core::bit2::and_, w.lit_nat(2), w.tuple({below_hi, above_lo}));
77}
78
79DFAMap<const Def*> create_check_match_transitions_from(const Def* c, const DFANode* state) {
80 World& w = c->world();
81 DFAMap<const Def*> state2check;
82
83 auto state2ranges = transitions_to_ranges(w, state);
84
85 for (auto& [state, ranges] : state2ranges) {
86 for (auto& [lo, hi] : ranges)
87 if (!state2check.contains(state))
88 state2check.try_emplace(state, match_range(c, lo, hi));
89 else
90 state2check[state]
91 = w.call(core::bit2::or_, w.lit_nat(2), w.tuple({state2check[state], match_range(c, lo, hi)}));
92 }
93 return state2check;
94}
95
96} // namespace
97
98extern "C" const Def* dfa2matcher(World& w, const DFA& dfa, const Def* n) {
99 w.DLOG("dfa to match: {}", dfa);
100
101 auto states = dfa.get_reachable_states();
102 DFAMap<Lam*> state2matcher;
103
104 // ((mem: %mem.M 0, string: Str n, pos: Idx n), Cn [%mem.M 0, Bool, Idx n])
105 auto matcher = w.mut_fun({w.call<mem::M>(0), w.call<mem::Ptr0>(w.arr(n, w.type_i8())), w.type_idx(n)},
106 {w.call<mem::M>(0), w.type_bool(), w.type_idx(n)});
107 matcher->debug_prefix(std::string("match_regex"));
108 auto [args, exit] = matcher->vars<2>();
109 exit->debug_prefix(std::string("exit"));
110 auto [mem, string, pos] = args->projs<3>();
111 mem->debug_prefix(std::string("mem"));
112 string->debug_prefix(std::string("string"));
113 pos->debug_prefix(std::string("pos"));
114
115 auto error = mem::mut_con(w.type_idx(n));
116 error->debug_prefix("error");
117 {
118 auto [mem, pos] = error->vars<2>();
119 mem->debug_prefix(std::string("mem"));
120 pos->debug_prefix(std::string("pos"));
121 error->app(false, exit, {mem, w.lit_ff(), pos});
122 }
123
124 auto accept = mem::mut_con(w.type_idx(n));
125 accept->debug_prefix("accept");
126 {
127 auto [mem, pos] = accept->vars<2>();
128 mem->debug_prefix(std::string("mem"));
129 pos->debug_prefix(std::string("pos"));
130 accept->app(false, exit, {mem, w.lit_tt(), pos});
131 }
132
133 auto exiting = [error, accept](const DFANode* state) { return state->is_accepting() ? accept : error; };
134
135 for (auto state : states) {
136 auto lam = mem::mut_con(w.type_idx(n));
137 lam->debug_prefix(state_to_name(state));
138 state2matcher.emplace(state, lam);
139 }
140
141 for (auto [state, lam] : state2matcher) {
142 auto [mem, i] = lam->vars<2>();
143
144 if (state->is_erroring()) {
145 lam->app(true, error, {mem, i});
146 continue;
147 }
148
149 auto lea = w.call<mem::lea>(Defs{string, i});
150 auto [mem2, c] = w.call<mem::load>(Defs{mem, lea})->projs<2>();
151
152 auto is_end = w.call(core::icmp::e, Defs({c, w.lit_i8(0)}));
153 auto not_end = mem::mut_con(w.type_idx(n));
154 not_end->debug_prefix("not_end_" + state_to_name(state));
155
156 auto new_i = w.call(core::wrap::add, core::Mode::nsuw, w.tuple({i, w.call(core::conv::u, n, w.lit_i64(1))}));
157 lam->app(false, w.select(is_end, exiting(state), not_end), {mem2, i});
158
159 auto transitions = create_check_match_transitions_from(c, state);
160 auto next_check = exiting(state); // if we want to check full string only, use error instead of exiting(state)c
161 for (auto [next_state, check] : transitions) {
162 auto next_lam = state2matcher[next_state];
163 auto checker = mem::mut_con(w.type_idx(n));
164 checker->debug_prefix("check_" + state_to_name(state) + "_to_" + state_to_name(next_state));
165 auto [mem3, pos] = checker->vars<2>();
166 checker->app(false, w.select(check, next_lam, next_check), {mem3, w.select(check, new_i, pos)});
167 next_check = checker;
168 }
169 {
170 auto [mem, pos] = not_end->vars<2>();
171 not_end->app(true, next_check, {mem, pos});
172 }
173 }
174
175 matcher->app(false, state2matcher[dfa.get_start()], {mem, pos});
176 return matcher;
177}
const NodeType * get_start() const
Definition automaton.h:31
std::set< const NodeType * > get_reachable_states() const
Definition automaton.h:33
void for_transitions(F &&f, std::uint16_t c) const
Definition dfa.h:25
Base class for all Defs.
Definition def.h:246
This is a thin wrapper for absl::InlinedVector<T, N, A> which is a drop-in replacement for std::vecto...
Definition vector.h:18
The World represents the whole program and manages creation of MimIR nodes (Defs).
Definition world.h:36
Vector< Range > Ranges
const Def * dfa2matcher(World &w, const DFA &dfa, const Def *n)
You can dl::get this function.
std::pair< std::uint64_t, std::uint64_t > Range
absl::btree_map< const DFANode *, To, DFANode::Lt > DFAMap
Definition dfa.h:68
std::optional< Range > merge_ranges(Range a, Range b) noexcept
The core Plugin
Definition core.h:8
The mem Plugin
Definition mem.h:11
Definition ast.h:14
View< const Def * > Defs
Definition def.h:78
u64 nat_t
Definition types.h:37
void error(std::format_string< Args... > fmt, Args &&... args)
Wraps std::format to throw T with a formatted message.
Definition dbg.h:17