thorin2/lower__matrix__mediumlevel_8cpp_source.html

#include "thorin/plug/matrix/pass/lower_matrix_mediumlevel.h"


#include <iostream>


#include <thorin/lam.h>


#include "thorin/def.h"


#include "thorin/plug/affine/affine.h"

#include "thorin/plug/core/core.h"

#include "thorin/plug/direct/direct.h"

#include "thorin/plug/matrix/matrix.h"

#include "thorin/plug/mem/mem.h"


using namespace std::string_literals;


namespace thorin::plug::matrix {


Ref LowerMatrixMediumLevel::rewrite(Ref def) {

    if (auto i = rewritten.find(def); i != rewritten.end()) return i->second;

    auto new_def   = rewrite_(def);

    rewritten[def] = new_def;

    return rewritten[def];

}


std::pair<Lam*, Ref> counting_for(Ref bound, DefVec acc, Ref exit, const char* name = "for_body") {

    auto& world = bound->world();

    auto acc_ty = world.tuple(acc)->type();

    auto body

        = world.mut_con({/* iter */ world.type_i32(), /* acc */ acc_ty, /* return */ world.cn(acc_ty)})->set(name);

    auto for_loop = affine::op_for(world, world.lit_i32(0), bound, world.lit_i32(1), acc, body, exit);

    return {body, for_loop};

}


// TODO: compare with other impala version (why is one easier than the other?)

// TODO: replace sum_ptr by using sum as accumulator

// TODO: extract inner loop into function (for read normalizer)


Ref LowerMatrixMediumLevel::rewrite_(Ref def) {

    auto& world = def->world();


    if (auto map_reduce_ax = match<matrix::map_reduce>(def); map_reduce_ax) {

        // meta arguments:

        // * n = out-count, (nat)

        // * S = out-dim, (n*nat)

        // * T = out-type (*)

        // * m = in-count (nat)

        // * NI = in-dim-count (m*nat)

        // * TI = types (m**)

        // * SI = dimensions (m*NI#i)

        // arguments:

        // * mem

        // * zero = accumulator init (T)

        // * combination function (mem, acc, inputs) -> (mem, acc)

        // * input matrixes

        auto [mem, zero, comb, inputs] = map_reduce_ax->args<4>();

        auto [n, S, T, m, NI, TI, SI]  = map_reduce_ax->callee()->as<App>()->args<7>();

        world.DLOG("map_reduce_ax {} : {}", map_reduce_ax, map_reduce_ax->type());

        world.DLOG("meta variables:");

        world.DLOG("  n = {}", n);

        world.DLOG("  S = {}", S);

        world.DLOG("  T = {}", T);

        world.DLOG("  m = {}", m);

        world.DLOG("  NI = {} : {}", NI, NI->type());

        world.DLOG("  TI = {} : {}", TI, TI->type());

        world.DLOG("  SI = {} : {}", SI, SI->type());

        world.DLOG("arguments:");

        world.DLOG("  mem = {}", mem);

        world.DLOG("  zero = {}", zero);

        world.DLOG("  comb = {} : {}", comb, comb->type());

        world.DLOG("  inputs = {} : {}", inputs, inputs->type());


        // Our goal is to generate a call to a function that performs:

        // ```

        // matrix = new matrix (n, S, T)

        // for out_idx { // n for loops

        //     acc = zero

        //     for in_idx { // remaining loops

        //         inps = read from matrices // m-tuple

        //         acc = comb(mem, acc, inps)

        //     }

        //     write acc to output matrix

        // }

        // return matrix

        // ```


        absl::flat_hash_map<u64, Ref> dims;         // idx ↦ nat (size bound = dimension)

        absl::flat_hash_map<u64, Ref> raw_iterator; // idx ↦ I32

        absl::flat_hash_map<u64, Ref> iterator;     // idx ↦ %Idx (S/NI#i)

        Vector<u64> out_indices;                    // output indices 0..n-1

        Vector<u64> in_indices;                     // input indices ≥ n


        Vector<Ref> output_dims;   // i<n ↦ nat (dimension S#i)

        Vector<DefVec> input_dims; // i<m ↦ j<NI#i ↦ nat (dimension SI#i#j)

        Vector<u64> n_input;       // i<m ↦ nat (number of dimensions of SI#i)


        auto n_lit = n->isa<Lit>();

        auto m_lit = m->isa<Lit>();

        if (!n_lit || !m_lit) {

            world.DLOG("n or m is not a literal");

            return def;

        }


        auto n_nat = n_lit->get<u64>(); // number of output dimensions (in S)

        auto m_nat = m_lit->get<u64>(); // number of input matrices


        // collect output dimensions

        world.DLOG("out dims (n) = {}", n_nat);

        for (u64 i = 0; i < n_nat; ++i) {

            auto dim = S->proj(n_nat, i);

            world.DLOG("dim {} = {}", i, dim);

            dims[i] = dim;

            output_dims.push_back(dim);

        }


        // collect other (input) dimensions

        world.DLOG("matrix count (m) = {}", m_nat);


        for (u64 i = 0; i < m_nat; ++i) {

            auto ni     = NI->proj(m_nat, i);

            auto ni_lit = Lit::isa(ni);

            if (!ni_lit) {

                world.DLOG("matrix {} has non-constant dimension count", i);

                return def;

            }

            u64 ni_nat = *ni_lit;

            world.DLOG("  dims({i}) = {}", i, ni_nat);

            auto SI_i = SI->proj(m_nat, i);

            DefVec input_dims_i;

            for (u64 j = 0; j < ni_nat; ++j) {

                auto dim = SI_i->proj(ni_nat, j);

                world.DLOG("    dim {} {} = {}", i, j, dim);

                // dims[i * n_nat + j] = dim;

                input_dims_i.push_back(dim);

            }

            input_dims.push_back(input_dims_i);

            n_input.push_back(ni_nat);

        }


        // extracts bounds for each index (in, out)

        for (u64 i = 0; i < m_nat; ++i) {

            world.DLOG("investigate {} / {}", i, m_nat);

            auto [indices, mat] = inputs->proj(m_nat, i)->projs<2>();

            world.DLOG("  indices {} = {}", i, indices);

            world.DLOG("  matrix {} = {}", i, mat);

            for (u64 j = 0; j < n_input[i]; ++j) {

                // world.DLOG("    dimension {} / {}", j, n_input[i]);

                auto idx     = indices->proj(n_input[i], j);

                auto idx_lit = Lit::isa(idx);

                if (!idx_lit) {

                    world.DLOG("    index {} {} is not a literal", i, j);

                    return def;

                }

                u64 idx_nat = *idx_lit;

                auto dim    = input_dims[i][j];

                world.DLOG("      index {} = {}", j, idx);

                world.DLOG("        dim {} = {}", idx, dim);

                if (!dims.contains(idx_nat)) {

                    dims[idx_nat] = dim;

                    world.DLOG("        {} ↦ {}", idx_nat, dim);

                } else {

                    // assert(dims[idx_nat] == dim);

                    auto prev_dim = dims[idx_nat];

                    world.DLOG("        prev dim {} = {}", idx_nat, prev_dim);

                    // override with more precise information

                    if (auto dim_lit = dim->isa<Lit>()) {

                        if (auto prev_dim_lit = prev_dim->isa<Lit>())

                            assert(dim_lit->get<u64>() == prev_dim_lit->get<u64>() && "dimensions must be equal");

                        else

                            dims[idx_nat] = dim;

                    }

                }

            }

        }


        for (auto [idx, dim] : dims) {

            world.ILOG("dim {} = {}", idx, dim);

            if (idx < n_nat)

                out_indices.push_back(idx);

            else

                in_indices.push_back(idx);

        }

        // sort indices to make checks easier later.

        std::sort(out_indices.begin(), out_indices.end());

        std::sort(in_indices.begin(), in_indices.end());


        // create function `%mem.M -> [%mem.M, %matrix.Mat (n,S,T)]` to replace axiom call


        auto mem_type = world.annex<mem::M>();

        auto fun      = world.mut_fun(mem_type, map_reduce_ax->type())->set("mapRed");


        // assert(0);

        auto ds_fun = direct::op_cps2ds_dep(fun);

        world.DLOG("ds_fun {} : {}", ds_fun, ds_fun->type());

        auto call = world.app(ds_fun, mem);

        world.DLOG("call {} : {}", call, call->type());


        // flowchart:

        // ```

        // -> init

        // -> forOut1 with yieldOut1

        //    => exitOut1 = return_cont

        // -> forOut2 with yieldOut2

        //    => exitOut2 = yieldOut1

        // -> ...

        // -> accumulator init

        // -> forIn1 with yieldIn1

        //    => exitIn1 = writeCont

        // -> forIn2 with yieldIn2

        //    => exitIn2 = yieldIn1

        // -> ...

        // -> read matrices

        // -> fun

        //    => exitFun = yieldInM

        //

        // (return path)

        // -> ...

        // -> write

        // -> yieldOutN

        // -> ...

        // ```


        // First create the output matrix.

        auto current_mem      = mem;

        auto [mem2, init_mat] = world.app(world.annex<matrix::init>(), {n, S, T, current_mem})->projs<2>();

        current_mem           = mem2;


        // The function on where to continue -- return after all output loops.

        auto cont        = fun->var(1);

        auto current_mut = fun;


        // Each of the outer loops contains the memory and matrix as accumulator (in an inner monad).

        DefVec acc = {current_mem, init_mat};


        for (auto idx : out_indices) {

            auto for_name    = world.sym("forIn_"s + std::to_string(idx));

            auto dim_nat_def = dims[idx];

            auto dim         = world.call<core::bitcast>(world.type_i32(), dim_nat_def);


            auto [body, for_call]       = counting_for(dim, acc, cont, for_name);

            auto [iter, new_acc, yield] = body->vars<3>();

            cont                        = yield;

            raw_iterator[idx]           = iter;

            iterator[idx]               = world.call<core::bitcast>(world.type_idx(dim_nat_def), iter);

            auto [new_mem, new_mat]     = new_acc->projs<2>();

            acc                         = {new_mem, new_mat};

            current_mut->set(true, for_call);

            current_mut = body;

        }


        // Now the inner loops for the inputs:

        // Each of the inner loops contains the element accumulator and memory as accumulator (in an inner monad).

        world.DLOG("acc at inner: {;}", acc);


        // First create the accumulator.

        auto element_acc = zero;

        element_acc->set("acc");

        current_mem    = acc[0];

        auto wb_matrix = acc[1];

        assert(wb_matrix);

        world.DLOG("wb_matrix {} : {}", wb_matrix, wb_matrix->type());


        // Write back element to matrix. Set this as return after all inner loops.

        auto write_back = mem::mut_con(T)->set("matrixWriteBack");

        world.DLOG("write_back {} : {}", write_back, write_back->type());

        auto [wb_mem, element_final] = write_back->vars<2>();


        auto output_iterators = DefVec((size_t)n_nat, [&](u64 i) {

            auto idx = out_indices[i];

            if (idx != i) world.ELOG("output indices must be consecutive 0..n-1 but {} != {}", idx, i);

            assert(idx == i && "output indices must be consecutive 0..n-1");

            auto iter_idx_def = iterator[idx];

            return iter_idx_def;

        });

        auto output_it_tuple  = world.tuple(output_iterators);

        world.DLOG("output tuple: {} : {}", output_it_tuple, output_it_tuple->type());


        auto [wb_mem2, written_matrix] = world

                                             .app(world.app(world.annex<matrix::insert>(), {n, S, T}),

                                                  {wb_mem, wb_matrix, output_it_tuple, element_final})

                                             ->projs<2>();


        write_back->app(true, cont, {wb_mem2, written_matrix});


        // From here on the continuations take the element and memory.

        acc  = {current_mem, element_acc};

        cont = write_back;


        // TODO this is copy&paste code from above

        for (auto idx : in_indices) {

            auto for_name    = world.sym("forIn_"s + std::to_string(idx));

            auto dim_nat_def = dims[idx];

            auto dim         = world.call<core::bitcast>(world.type_i32(), dim_nat_def);


            auto [body, for_call]       = counting_for(dim, acc, cont, for_name);

            auto [iter, new_acc, yield] = body->vars<3>();

            cont                        = yield;

            raw_iterator[idx]           = iter;

            iterator[idx]               = world.call<core::bitcast>(world.type_idx(dim_nat_def), iter);

            auto [new_mem, new_element] = new_acc->projs<2>();

            acc                         = {new_mem, new_element};

            current_mut->set(true, for_call);

            current_mut = body;

        }


        // For testing: id in innermost loop instead of read, fun:

        // current_mut->app(true, cont, acc);


        current_mem = acc[0];

        element_acc = acc[1];


        // Read element from input matrix.

        DefVec input_elements((size_t)m_nat);

        for (u64 i = 0; i < m_nat; i++) {

            // TODO: case m_nat == 1

            auto input_i                       = inputs->proj(m_nat, i);

            auto [input_idx_tup, input_matrix] = input_i->projs<2>();


            world.DLOG("input matrix {} is {} : {}", i, input_matrix, input_matrix->type());


            auto indices         = input_idx_tup->projs(n_input[i]);

            auto input_iterators = DefVec(n_input[i], [&](u64 j) {

                auto idx     = indices[j];

                auto idx_lit = idx->as<Lit>()->get<u64>();

                world.DLOG("  idx {} {} = {}", i, j, idx_lit);

                return iterator[idx_lit];

            });

            auto input_it_tuple  = world.tuple(input_iterators);


            auto read_entry = op_read(current_mem, input_matrix, input_it_tuple);

            world.DLOG("read_entry {} : {}", read_entry, read_entry->type());

            auto [new_mem, element_i] = read_entry->projs<2>();

            current_mem               = new_mem;

            input_elements[i]         = element_i;

        }


        world.DLOG("  read elements {,}", input_elements);

        world.DLOG("  fun {} : {}", fun, fun->type());


        // TODO: make non-scalar or completely scalar?

        current_mut->app(true, comb, {world.tuple({current_mem, element_acc, world.tuple(input_elements)}), cont});


        return call;

    }


    return def;

}


} // namespace thorin::plug::matrix

affine.h

thorin::App
Definition lam.h:194

thorin::Def::projs
auto projs(F f) const
Splits this Def via Def::projections into an Array (if A == -1_n) or std::array (otherwise).
Definition def.h:369

thorin::Def::type
const Def * type() const
Yields the raw type of this Def, i.e. maybe nullptr.
Definition def.h:248

thorin::Def::world
World & world() const
Definition def.cpp:421

thorin::Lam::set
Lam * set(Filter filter, const Def *body)
Definition lam.h:159

thorin::Lit
Definition def.h:706

thorin::Lit::isa
static std::optional< T > isa(Ref def)
Definition def.h:726

thorin::Lit::get
T get() const
Definition def.h:714

thorin::Pass::world
World & world()
Definition pass.h:296

thorin::Ref
Helper class to retrieve Infer::arg if present.
Definition def.h:87

thorin::Vector< const Def * >

thorin::World::type_i32
Ref type_i32()
Definition world.h:463

thorin::World::sym
Sym sym(std::string_view)
Definition world.cpp:77

thorin::World::vars
Vars vars(const Var *var)
Definition world.h:484

thorin::World::annex
const Def * annex(Id id)
Lookup annex by Axiom::id.
Definition world.h:167

thorin::World::type_idx
const Idx * type_idx()
Definition world.h:447

thorin::World::call
const Def * call(Id id, Args &&... args)
Definition world.h:497

thorin::World::app
Ref app(Ref callee, Ref arg)
Definition world.cpp:183

thorin::World::tuple
Ref tuple(Defs ops)
Definition world.cpp:226

thorin::World::mut_fun
Lam * mut_fun(Ref dom, Ref codom)
Definition world.h:281

thorin::plug::matrix::LowerMatrixMediumLevel::rewrite_
Ref rewrite_(Ref)
Definition lower_matrix_mediumlevel.cpp:38

thorin::plug::matrix::LowerMatrixMediumLevel::rewrite
Ref rewrite(Ref) override
custom rewrite function memoized version of rewrite_
Definition lower_matrix_mediumlevel.cpp:19

core.h

def.h

direct.h

lam.h

lower_matrix_mediumlevel.h

matrix.h

mem.h

thorin::plug::affine::op_for
const Def * op_for(World &w, Ref begin, Ref end, Ref step, Defs inits, Ref body, Ref brk)
Returns a fully applied affine_for axiom.
Definition affine.h:20

thorin::plug::core::bitcast
bitcast
Definition autogen.h:322

thorin::plug::direct::op_cps2ds_dep
const Def * op_cps2ds_dep(const Def *f)
Definition direct.h:11

thorin::plug::matrix
The matrix Plugin
Definition matrix.h:9

thorin::plug::matrix::init
init
Definition autogen.h:73

thorin::plug::matrix::insert
insert
Definition autogen.h:61

thorin::plug::matrix::counting_for
std::pair< Lam *, Ref > counting_for(Ref bound, DefVec acc, Ref exit, const char *name="for_body")
Definition lower_matrix_mediumlevel.cpp:26

thorin::plug::matrix::op_read
const Def * op_read(Ref mem, Ref matrix, Ref idx)
Definition matrix.h:19

thorin::plug::mem
The mem Plugin
Definition mem.h:11

thorin::plug::mem::M
M
Definition autogen.h:15

thorin::plug::mem::mut_con
Lam * mut_con(World &w)
Yields .con[mem.M].
Definition mem.h:16

thorin::u64
uint64_t u64
Definition types.h:35

thorin::DefVec
Vector< const Def * > DefVec
Definition def.h:63