t32_t smem) {{
        desc |= (static_cast<uint64_t>(smem & 0xFFFFFF) >> 4);

        max_desc = desc + (BYTES_PER_SMEM_A_{p_id} >> 4) * (STAGES_{p_id} - 1);
    }}

    uint64_t desc;
    uint64_t max_desc;
}};

// Wrapper around the 64-bit GMMA descriptor used for the B operand.
// The descriptor starts from the gmma_desc_b value for this tile and gets the
// buffer address merged in by set_smem().
class Gmma_descriptor_b_{guid} {{
public:
    // Seeds desc from the gmma_desc_b value; max_desc is left unset until
    // set_smem() is called.
    inline __device__ Gmma_descriptor_b_{guid}() : desc(gmma_desc_b_{guid}) {{}}

    // Merges an address into the descriptor and derives max_desc from it.
    //   smem: 32-bit address (presumably a shared-memory address - confirm
    //         against the caller). Only its low 24 bits are used, and they are
    //         shifted right by 4, i.e. expressed in 16-byte units.
    inline __device__ void set_smem(uint32_t smem) {{
        const uint64_t addr_field = static_cast<uint64_t>(smem & 0xFFFFFF) >> 4;
        desc |= addr_field;

        // Distance, in 16-byte units, covered by (STAGES - 1) buffers of
        // BYTES_PER_SMEM_B bytes each.
        const auto last_stage_offset = (BYTES_PER_SMEM_B_{p_id} >> 4) * (STAGES_{p_id} - 1);
        max_desc = desc + last_stage_offset;
    }}

    // Current descriptor value.
    uint64_t desc;
    // Descriptor value offset by (STAGES - 1) buffers; written by set_smem().
    uint64_t max_desc;
}};

// Accumulator state and helpers for one generated compute tile.
// Holds two accumulator arrays (acc and acc_tmp) plus the counters/flags that
// track them. The methods visible here reset the accumulators and fold acc_tmp
// into acc.
class Compute_tile_{guid} {{
public:{compute_tile_ctor_code}
    inline __device__ void increment_gmma_smem_buffer() {{{increment_gmma_smem_b_buffer_code}{increment_gmma_smem_a_buffer_code}
    }}{compute_code}{load_a_code}{load_b_code}
    // Resets the tile: zeroes the GMMA counter and both flags, then overwrites
    // every element of acc with the value returned by __wgmma_fence_operand(0).
    // NOTE(review): __wgmma_fence_operand is defined elsewhere; presumably it
    // returns its argument while acting as a fence on the register - confirm.
    inline __device__ void clear_acc() {{
        this->gmma_count_       = 0;
        this->clear_acc_tmp_    = 0;
        this->acc_needs_update_ = 0;
        #pragma unroll
        for(int m = 0; m < XMMAS_M_{p_id}; ++m) {{
            #pragma unroll
            for(int i = 0; i < ACC_COUNT_{guid}; ++i) {{
                acc[m][i] = __wgmma_fence_operand(0);
            }}
        }}
    }}
    // Overwrites every element of acc_tmp with __wgmma_fence_operand(0).
    // Unlike clear_acc(), this does not touch the counter or the flags.
    inline __device__ void clear_acc_tmp() {{
        #pragma unroll
        for(int m = 0; m < XMMAS_M_{p_id}; ++m) {{
            #pragma unroll
            for(int i = 0; i < ACC_COUNT_{guid}; ++i) {{
                acc_tmp[m][i] = __wgmma_fence_operand(0);
            }}
        }}
    }}
    // Folds acc_tmp into acc: acc[m][i] += acc_tmp[m][i] for every element,
    // with both operands reinterpreted as float. Clears acc_needs_update_ once
    // the addition is done. acc_tmp itself is not cleared here.
    inline __device__ void update_accumulators() {{
        // Must come before reading acc_tmp.
        // NOTE(review): presumably waits for all outstanding GMMA groups - confirm.
        fort::warpgroup_wait<0>();
        #pragma unroll
        for (int m = 0; m < XMMAS_M_{p_id}; ++m) {{
            #pragma unroll
            for(int i = 0; i < ACC_COUNT_{guid}; ++i) {{
                reinterpret_cast<float&>(acc[m][i]) += reinterpret_cast<float&>(acc_tmp[m][i]);
            }}
        }}
        this->acc_needs_update_ = 0;
    }}

    // Main accumulators, XMMAS_M x ACC_COUNT elements, read/written as float
    // by update_accumulators().
    r32 acc[XMMAS_M_{p_id}][ACC_COUNT_{guid}];
    r32 acc_tmp[XMMAS_M_{p_id}][ACC_COUNT_{guid}];  // For 2xacc feature{a_rf_decl}{b_rf_decl}
    // Current GMMA counter; reset to 0 by clear_acc().
    uint32_t gmma_count_ = 0;
    // Predicate to say if a clear of acc_tmp is needed; reset to 0 by clear_acc().
    uint32_t clear_acc_tmp_ = 0;
    // Predicate to say if acc_tmp still has to be added into acc; reset to 0 by
    // clear_acc() and by update_accumulators().
    uint32_t acc_needs_update_ = 0;
}};