MimIR/lower__matrix__mediumlevel_8cpp_source.html

#include "mim/plug/matrix/pass/lower_matrix_mediumlevel.h"


#include <iostream>


#include <mim/lam.h>


#include "mim/def.h"


#include "mim/plug/affine/affine.h"

#include "mim/plug/core/core.h"

#include "mim/plug/direct/direct.h"

#include "mim/plug/matrix/matrix.h"

#include "mim/plug/mem/mem.h"


using namespace std::string_literals;


namespace mim::plug::matrix {


/// Memoizing front end for rewrite_(): every Def is lowered at most once and the
/// cached result is handed out on subsequent requests.
const Def* LowerMatrixMediumLevel::rewrite(const Def* def) {
    // Already processed? Reuse the stored replacement.
    auto hit = rewritten.find(def);
    if (hit != rewritten.end()) return hit->second;

    // Compute first, then store: rewrite_() runs before the map slot is touched.
    const Def* result = rewrite_(def);
    rewritten[def]    = result;
    return result;
}


/// Creates an `affine::For` call whose operands are: start `0 : I32`, @p bound, step `1 : I32`,
/// the tuple of @p acc as initial accumulator, a fresh loop-body continuation, and @p exit.
///
/// @param bound end operand of the loop (its World is used for all constructions).
/// @param acc   initial accumulator components; they are packed into one tuple.
/// @param exit  continuation passed as the last operand of the `affine::For` call.
/// @param name  debug name attached to the body continuation.
/// @return the (still body-less) mutable body continuation taking `(iter, acc, return)` and the loop call.
std::pair<Lam*, const Def*> counting_for(const Def* bound, DefVec acc, const Def* exit, const char* name = "for_body") {
    auto& w = bound->world();

    // Pack the accumulator once; its type determines the body's signature.
    auto init    = w.tuple(acc);
    auto init_ty = init->type();

    // body : Cn [iter: I32, acc: init_ty, return: Cn init_ty]
    auto loop_body = w.mut_con({/* iter */ w.type_i32(), /* acc */ init_ty, /* return */ w.cn(init_ty)});
    loop_body->set(name);

    auto begin = w.lit_i32(0);
    auto step  = w.lit_i32(1);
    auto loop  = w.call<affine::For>(Defs{begin, bound, step, init, loop_body, exit});

    return {loop_body, loop};
}


// TODO: compare with other impala version (why is one easier than the other?)

// TODO: replace sum_ptr by using sum as accumulator

// TODO: extract inner loop into function (for read normalizer)


/// Lowers a `matrix::map_reduce` application into an explicit loop nest built from `affine::For`.
///
/// Any @p def that is not a `matrix::map_reduce` application is returned unchanged. The same happens
/// (after a debug log message) when the lowering cannot be performed statically, i.e. when
/// * the number of output dimensions `n` or the number of inputs `m` is not a literal,
/// * the dimension count `NI#i` of some input matrix is not a literal, or
/// * some index annotation of an input matrix is not a literal.
///
/// Otherwise a new function `mapRed` is created whose body consists of
/// 1. one counting loop per output index (accumulator: memory and output matrix),
/// 2. one counting loop per remaining (input-only) index (accumulator: memory and element),
/// 3. reads of all input matrices followed by a call of the combination function, and
/// 4. a `matrixWriteBack` continuation that inserts the reduced element into the output matrix.
///
/// @return the direct-style call of `mapRed` applied to the incoming memory, or @p def if nothing was lowered.
const Def* LowerMatrixMediumLevel::rewrite_(const Def* def) {
    if (auto map_reduce_ax = Axm::isa<matrix::map_reduce>(def); map_reduce_ax) {
        // meta arguments:
        // * n = out-count, (nat)
        // * S = out-dim, (n*nat)
        // * T = out-type (*)
        // * m = in-count (nat)
        // * NI = in-dim-count (m*nat)
        // * TI = types (m**)
        // * SI = dimensions (m*NI#i)
        // arguments:
        // * mem
        // * zero = accumulator init (T)
        // * combination function (mem, acc, inputs) -> (mem, acc)
        // * input matrixes
        auto [mem, zero, comb, inputs] = map_reduce_ax->args<4>();
        auto [n, S, T, m, NI, TI, SI]  = map_reduce_ax->callee()->as<App>()->args<7>();
        world().DLOG("map_reduce_ax {} : {}", map_reduce_ax, map_reduce_ax->type());
        world().DLOG("meta variables:");
        world().DLOG("  n = {}", n);
        world().DLOG("  S = {}", S);
        world().DLOG("  T = {}", T);
        world().DLOG("  m = {}", m);
        world().DLOG("  NI = {} : {}", NI, NI->type());
        world().DLOG("  TI = {} : {}", TI, TI->type());
        world().DLOG("  SI = {} : {}", SI, SI->type());
        world().DLOG("arguments:");
        world().DLOG("  mem = {}", mem);
        world().DLOG("  zero = {}", zero);
        world().DLOG("  comb = {} : {}", comb, comb->type());
        world().DLOG("  inputs = {} : {}", inputs, inputs->type());

        // Our goal is to generate a call to a function that performs:
        // ```
        // matrix = new matrix (n, S, T)
        // for out_idx { // n for loops
        //     acc = zero
        //     for in_idx { // remaining loops
        //         inps = read from matrices // m-tuple
        //         acc = comb(mem, acc, inps)
        //     }
        //     write acc to output matrix
        // }
        // return matrix
        // ```

        absl::flat_hash_map<u64, const Def*> dims;     // idx ↦ nat (size bound = dimension)
        absl::flat_hash_map<u64, const Def*> iterator; // idx ↦ %Idx (S/NI#i)
        Vector<u64> out_indices;                       // output indices 0..n-1
        Vector<u64> in_indices;                        // input indices ≥ n

        Vector<DefVec> input_dims; // i<m ↦ j<NI#i ↦ nat (dimension SI#i#j)
        Vector<u64> n_input;       // i<m ↦ nat (number of dimensions of SI#i)

        // The loop structure is generated at compile time, so both counts must be known.
        auto n_lit = n->isa<Lit>();
        auto m_lit = m->isa<Lit>();
        if (!n_lit || !m_lit) {
            world().DLOG("n or m is not a literal");
            return def;
        }

        auto n_nat = n_lit->get<u64>(); // number of output dimensions (in S)
        auto m_nat = m_lit->get<u64>(); // number of input matrices

        // collect output dimensions
        world().DLOG("out dims (n) = {}", n_nat);
        for (u64 i = 0; i < n_nat; ++i) {
            auto dim = S->proj(n_nat, i);
            world().DLOG("dim {} = {}", i, dim);
            dims[i] = dim;
        }

        // collect other (input) dimensions
        world().DLOG("matrix count (m) = {}", m_nat);

        for (u64 i = 0; i < m_nat; ++i) {
            auto ni     = NI->proj(m_nat, i);
            auto ni_lit = Lit::isa(ni);
            if (!ni_lit) {
                world().DLOG("matrix {} has non-constant dimension count", i);
                return def;
            }
            u64 ni_nat = *ni_lit;
            world().DLOG("  dims({i}) = {}", i, ni_nat);
            auto SI_i = SI->proj(m_nat, i);
            DefVec input_dims_i;
            for (u64 j = 0; j < ni_nat; ++j) {
                auto dim = SI_i->proj(ni_nat, j);
                world().DLOG("    dim {} {} = {}", i, j, dim);
                // dims[i * n_nat + j] = dim;
                input_dims_i.push_back(dim);
            }
            input_dims.push_back(input_dims_i);
            n_input.push_back(ni_nat);
        }

        // extracts bounds for each index (in, out)
        for (u64 i = 0; i < m_nat; ++i) {
            world().DLOG("investigate {} / {}", i, m_nat);
            auto [indices, mat] = inputs->proj(m_nat, i)->projs<2>();
            world().DLOG("  indices {} = {}", i, indices);
            world().DLOG("  matrix {} = {}", i, mat);
            for (u64 j = 0; j < n_input[i]; ++j) {
                // world().DLOG("    dimension {} / {}", j, n_input[i]);
                auto idx     = indices->proj(n_input[i], j);
                auto idx_lit = Lit::isa(idx);
                if (!idx_lit) {
                    world().DLOG("    index {} {} is not a literal", i, j);
                    return def;
                }
                u64 idx_nat = *idx_lit;
                auto dim    = input_dims[i][j];
                world().DLOG("      index {} = {}", j, idx);
                world().DLOG("        dim {} = {}", idx, dim);
                if (!dims.contains(idx_nat)) {
                    // First time this index is seen: its bound is this matrix dimension.
                    dims[idx_nat] = dim;
                    world().DLOG("        {} ↦ {}", idx_nat, dim);
                } else {
                    // assert(dims[idx_nat] == dim);
                    auto prev_dim = dims[idx_nat];
                    world().DLOG("        prev dim {} = {}", idx_nat, prev_dim);
                    // override with more precise information
                    if (auto dim_lit = dim->isa<Lit>()) {
                        if (auto prev_dim_lit = prev_dim->isa<Lit>())
                            assert(dim_lit->get<u64>() == prev_dim_lit->get<u64>() && "dimensions must be equal");
                        else
                            dims[idx_nat] = dim;
                    }
                }
            }
        }

        // Partition all known indices: those below n address the output, the rest are reduced over.
        for (auto [idx, dim] : dims) {
            world().ILOG("dim {} = {}", idx, dim);
            if (idx < n_nat)
                out_indices.push_back(idx);
            else
                in_indices.push_back(idx);
        }
        // sort indices to make checks easier later.
        std::sort(out_indices.begin(), out_indices.end());
        std::sort(in_indices.begin(), in_indices.end());

        // create function `%mem.M -> [%mem.M, %matrix.Mat (n,S,T)]` to replace axm call

        auto mem_type = world().annex<mem::M>();
        auto fun      = world().mut_fun(mem_type, map_reduce_ax->type())->set("mapRed");

        auto ds_fun = direct::op_cps2ds_dep(fun);
        world().DLOG("ds_fun {} : {}", ds_fun, ds_fun->type());
        auto call = world().app(ds_fun, mem);
        world().DLOG("call {} : {}", call, call->type());

        // flowchart:
        // ```
        // -> init
        // -> forOut1 with yieldOut1
        //    => exitOut1 = return_cont
        // -> forOut2 with yieldOut2
        //    => exitOut2 = yieldOut1
        // -> ...
        // -> accumulator init
        // -> forIn1 with yieldIn1
        //    => exitIn1 = writeCont
        // -> forIn2 with yieldIn2
        //    => exitIn2 = yieldIn1
        // -> ...
        // -> read matrices
        // -> fun
        //    => exitFun = yieldInM
        //
        // (return path)
        // -> ...
        // -> write
        // -> yieldOutN
        // -> ...
        // ```

        // First create the output matrix.
        // NOTE(review): this threads the *outer* `mem` (also passed to `ds_fun` above) into the body of
        // `fun` instead of `fun`'s own memory variable — confirm that this is intended.
        auto current_mem      = mem;
        auto [mem2, init_mat] = world().app(world().annex<matrix::init>(), {n, S, T, current_mem})->projs<2>();
        current_mem           = mem2;

        // The function on where to continue -- return after all output loops.
        auto cont        = fun->var(1);
        auto current_mut = fun;

        // Each of the outer loops contains the memory and matrix as accumulator (in an inner monad).
        DefVec acc = {current_mem, init_mat};

        // Opens one counting loop per entry of `indices`, each nested inside the previous one.
        // For every loop this
        // * uses the current `cont` as the loop's exit and then redirects `cont` to the loop's yield,
        // * records the loop counter (bitcast to the index type of its dimension) in `iterator`,
        // * replaces `acc` by the two accumulator components of the loop body, and
        // * sets the loop call as body of `current_mut` and descends into the loop body.
        // `prefix` is only used for the debug name of the loop body.
        auto open_loops = [&](const Vector<u64>& indices, const std::string& prefix) {
            for (auto idx : indices) {
                auto for_name    = world().sym(prefix + std::to_string(idx));
                auto dim_nat_def = dims[idx];
                auto dim         = world().call<core::bitcast>(world().type_i32(), dim_nat_def);

                auto [body, for_call]       = counting_for(dim, acc, cont, for_name);
                auto [iter, new_acc, yield] = body->vars<3>();
                cont                        = yield;
                iterator[idx]               = world().call<core::bitcast>(world().type_idx(dim_nat_def), iter);
                auto [new_mem, new_value]   = new_acc->projs<2>();
                acc                         = {new_mem, new_value};
                current_mut->set(true, for_call);
                current_mut = body;
            }
        };

        // Outer loops over the output indices (named `forOut_<idx>` to match the flowchart above).
        open_loops(out_indices, "forOut_");

        // Now the inner loops for the inputs:
        // Each of the inner loops contains the element accumulator and memory as accumulator (in an inner monad).
        world().DLOG("acc at inner: {;}", acc);

        // First create the accumulator.
        auto element_acc = zero;
        element_acc->set("acc");
        current_mem    = acc[0];
        auto wb_matrix = acc[1];
        assert(wb_matrix);
        world().DLOG("wb_matrix {} : {}", wb_matrix, wb_matrix->type());

        // Write back element to matrix. Set this as return after all inner loops.
        auto write_back = mem::mut_con(T)->set("matrixWriteBack");
        world().DLOG("write_back {} : {}", write_back, write_back->type());
        auto [wb_mem, element_final] = write_back->vars<2>();

        // Position in the output matrix: the iterators of the output loops in index order.
        auto output_iterators = DefVec((size_t)n_nat, [&](u64 i) {
            auto idx = out_indices[i];
            if (idx != i) world().ELOG("output indices must be consecutive 0..n-1 but {} != {}", idx, i);
            assert(idx == i && "output indices must be consecutive 0..n-1");
            auto iter_idx_def = iterator[idx];
            return iter_idx_def;
        });
        auto output_it_tuple  = world().tuple(output_iterators);
        world().DLOG("output tuple: {} : {}", output_it_tuple, output_it_tuple->type());

        auto [wb_mem2, written_matrix] = world()
                                             .app(world().app(world().annex<matrix::insert>(), {n, S, T}),
                                                  {wb_mem, wb_matrix, output_it_tuple, element_final})
                                             ->projs<2>();

        // After writing, continue with the yield of the innermost output loop
        // (or the return continuation of `fun` if there is no output loop).
        write_back->app(true, cont, {wb_mem2, written_matrix});

        // From here on the continuations take the element and memory.
        acc  = {current_mem, element_acc};
        cont = write_back;

        // Inner loops over the indices that only occur in the inputs.
        open_loops(in_indices, "forIn_");

        // For testing: id in innermost loop instead of read, fun:
        // current_mut->app(true, cont, acc);

        current_mem = acc[0];
        element_acc = acc[1];

        // Read element from input matrix.
        DefVec input_elements((size_t)m_nat);
        for (u64 i = 0; i < m_nat; i++) {
            // TODO: case m_nat == 1
            auto input_i                       = inputs->proj(m_nat, i);
            auto [input_idx_tup, input_matrix] = input_i->projs<2>();

            world().DLOG("input matrix {} is {} : {}", i, input_matrix, input_matrix->type());

            // Translate the literal index annotations of this matrix into the matching loop iterators.
            auto indices         = input_idx_tup->projs(n_input[i]);
            auto input_iterators = DefVec(n_input[i], [&](u64 j) {
                auto idx     = indices[j];
                auto idx_lit = idx->as<Lit>()->get<u64>();
                world().DLOG("  idx {} {} = {}", i, j, idx_lit);
                return iterator[idx_lit];
            });
            auto input_it_tuple  = world().tuple(input_iterators);

            auto read_entry = op_read(current_mem, input_matrix, input_it_tuple);
            world().DLOG("read_entry {} : {}", read_entry, read_entry->type());
            auto [new_mem, element_i] = read_entry->projs<2>();
            current_mem               = new_mem;
            input_elements[i]         = element_i;
        }

        world().DLOG("  read elements {,}", input_elements);
        world().DLOG("  fun {} : {}", fun, fun->type());

        // TODO: make non-scalar or completely scalar?
        // Innermost body: combine the accumulator with the read elements and continue with `cont`.
        current_mut->app(true, comb, {world().tuple({current_mem, element_acc, world().tuple(input_elements)}), cont});

        return call;
    }

    return def;
}


} // namespace mim::plug::matrix

affine.h

mim::App
Definition lam.h:210

mim::Axm::isa
static auto isa(const Def *def)
Definition axm.h:107

mim::Def
Base class for all Defs.
Definition def.h:216

mim::Def::world
World & world() const noexcept
Definition def.cpp:413

mim::Def::var
const Def * var(nat_t a, nat_t i) noexcept
Definition def.h:394

mim::Def::type
const Def * type() const noexcept
Yields the "raw" type of this Def (maybe nullptr).
Definition def.h:260

mim::Lam::set
Lam * set(Filter filter, const Def *body)
Definition lam.h:166

mim::Lit
Definition def.h:749

mim::Lit::isa
static std::optional< T > isa(const Def *def)
Definition def.h:773

mim::Lit::get
T get() const
Definition def.h:760

mim::Pass::world
World & world()
Definition pass.h:310

mim::Vector
This is a thin wrapper for absl::InlinedVector<T, N, A> which is a drop-in replacement for std::vecto...
Definition vector.h:18

mim::World::app
const Def * app(const Def *callee, const Def *arg)
Definition world.cpp:195

mim::World::type_idx
const Idx * type_idx()
Definition world.h:481

mim::World::annex
const Def * annex(Id id)
Lookup annex by Axm::id.
Definition world.h:177

mim::World::mut_fun
Lam * mut_fun(const Def *dom, const Def *codom)
Definition world.h:307

mim::World::tuple
const Def * tuple(Defs ops)
Definition world.cpp:287

mim::World::sym
Sym sym(std::string_view)
Definition world.cpp:74

mim::World::call
const Def * call(Id id, Args &&... args)
Complete curried call of annexes obeying implicits.
Definition world.h:525

mim::World::vars
auto & vars()
Definition world.h:533

mim::World::type_i32
const Def * type_i32()
Definition world.h:497

mim::plug::matrix::LowerMatrixMediumLevel::rewrite_
const Def * rewrite_(const Def *)
Definition lower_matrix_mediumlevel.cpp:38

mim::plug::matrix::LowerMatrixMediumLevel::rewrite
const Def * rewrite(const Def *) override
custom rewrite function memoized version of rewrite_
Definition lower_matrix_mediumlevel.cpp:19

core.h

def.h

direct.h

lam.h

lower_matrix_mediumlevel.h

matrix.h

mem.h

mim::plug::affine::For
For
Definition autogen.h:14

mim::plug::core::bitcast
bitcast
Definition autogen.h:262

mim::plug::direct::op_cps2ds_dep
const Def * op_cps2ds_dep(const Def *k)
Definition direct.h:15

mim::plug::matrix
The matrix Plugin
Definition matrix.h:9

mim::plug::matrix::op_read
const Def * op_read(const Def *mem, const Def *matrix, const Def *idx)
Definition matrix.h:19

mim::plug::matrix::counting_for
std::pair< Lam *, const Def * > counting_for(const Def *bound, DefVec acc, const Def *exit, const char *name="for_body")
Definition lower_matrix_mediumlevel.cpp:26

mim::plug::mem
The mem Plugin
Definition mem.h:11

mim::plug::mem::M
M
Definition autogen.h:14

mim::plug::mem::mut_con
Lam * mut_con(World &w)
Yields con[mem.M].
Definition mem.h:16

mim::Defs
View< const Def * > Defs
Definition def.h:51

mim::DefVec
Vector< const Def * > DefVec
Definition def.h:52

mim::get
constexpr decltype(auto) get(Span< T, N > span) noexcept
Definition span.h:115

mim::u64
uint64_t u64
Definition types.h:34