Tensorium
Loading...
Searching...
No Matches
tensorium::GemmKernelBigger< T > Class Template Reference

#include <GemmKernel_bigger.hpp>

Collaboration diagram for tensorium::GemmKernelBigger< T >:

Public Types

using Simd = simd::SimdTraits<T, DefaultISA>
 
using reg = typename Simd::reg
 

Public Member Functions

void fma_loop_00 (T *blockA_packed, T *blockB_packed, reg *C_accum_00, reg *C_accum_01, reg *a0_packFloat8, reg *a1_packFloat8, reg *b_packFloat8, int kc)
 
void fma_loop_01 (T *blockA_packed, T *blockB_packed, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *a0_packFloat8, reg *a1_packFloat8, reg *b_packFloat8, int kc)
 
void fma_loop_02 (T *blockA_packed, T *blockB_packed, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *a0_packFloat8, reg *a1_packFloat8, reg *b_packFloat8, int kc)
 
void fma_loop_03 (T *blockA_packed, T *blockB_packed, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, reg *a0_packFloat8, reg *a1_packFloat8, reg *b_packFloat8, int kc)
 
void fma_loop_04 (T *blockA_packed, T *blockB_packed, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, reg *C_accum_40, reg *C_accum_41, reg *a0_packFloat8, reg *a1_packFloat8, reg *b_packFloat8, int kc)
 
void fma_loop_05 (T *blockA_packed, T *blockB_packed, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, reg *C_accum_40, reg *C_accum_41, reg *C_accum_50, reg *C_accum_51, reg *a0_packFloat8, reg *a1_packFloat8, reg *b_packFloat8, int kc)
 
void maskload_accum_00 (T *C, reg *C_accum_00, reg *C_accum_01, __m256i packed_mask_0, __m256i packed_mask_1, int M)
 
void maskload_accum_01 (T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, __m256i packed_mask_0, __m256i packed_mask_1, int M)
 
void maskload_accum_02 (T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, __m256i packed_mask_0, __m256i packed_mask_1, int M)
 
void maskload_accum_03 (T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, __m256i packed_mask_0, __m256i packed_mask_1, int M)
 
void maskload_accum_04 (T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, reg *C_accum_40, reg *C_accum_41, __m256i packed_mask_0, __m256i packed_mask_1, int M)
 
void maskload_accum_05 (T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, reg *C_accum_40, reg *C_accum_41, reg *C_accum_50, reg *C_accum_51, __m256i packed_mask_0, __m256i packed_mask_1, int M)
 
void load_accum_00 (T *C, reg *C_accum_00, reg *C_accum_01, int M)
 
void load_accum_01 (T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, int M)
 
void load_accum_02 (T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, int M)
 
void load_accum_03 (T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, int M)
 
void load_accum_04 (T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, reg *C_accum_40, reg *C_accum_41, int M)
 
void load_accum_05 (T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, reg *C_accum_40, reg *C_accum_41, reg *C_accum_50, reg *C_accum_51, int M)
 
void store_accum_00 (T *C, reg *C_accum_00, reg *C_accum_01, int M)
 
void store_accum_01 (T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, int M)
 
void store_accum_02 (T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, int M)
 
void store_accum_03 (T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, int M)
 
void store_accum_04 (T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, reg *C_accum_40, reg *C_accum_41, int M)
 
void store_accum_05 (T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, reg *C_accum_40, reg *C_accum_41, reg *C_accum_50, reg *C_accum_51, int M)
 
void maskstore_accum_00 (T *C, reg *C_accum_00, reg *C_accum_01, __m256i packed_mask_0, __m256i packed_mask_1, int M)
 
void maskstore_accum_01 (T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, __m256i packed_mask_0, __m256i packed_mask_1, int M)
 
void maskstore_accum_02 (T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, __m256i packed_mask_0, __m256i packed_mask_1, int M)
 
void maskstore_accum_03 (T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, __m256i packed_mask_0, __m256i packed_mask_1, int M)
 
void maskstore_accum_04 (T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, reg *C_accum_40, reg *C_accum_41, __m256i packed_mask_0, __m256i packed_mask_1, int M)
 
void maskstore_accum_05 (T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, reg *C_accum_40, reg *C_accum_41, reg *C_accum_50, reg *C_accum_51, __m256i packed_mask_0, __m256i packed_mask_1, int M)
 
void kernel_16x6_load_accum (T *blockA_packed, T *blockB_packed, T *C, int mr, int nr, int kc, int M)
 
void kernel_16x6_zero_init_accum (T *blockA_packed, T *blockB_packed, T *C, int mr, int nr, int kc, int M)
 
void pack_panelB (T *B, T *blockB_packed, int nr, int kc, int K)
 
void pack_blockB (T *B, T *blockB_packed, int nc, int kc, int K)
 
void pack_panelA (T *A, T *blockA_packed, int mr, int kc, int M)
 
void pack_blockA (T *A, T *blockA_packed, int mc, int kc, int M)
 
void matmul (T *A, T *B, T *C, int M, int N, int K)
 

Static Public Member Functions

static int8_t mask[32] __attribute__ ((aligned(64)))
 
static void build_masks (__m256i *packed_mask_0, __m256i *packed_mask_1, int mr)
 
static T blockA_packed[MC * KC] __attribute__ ((aligned(64)))
 
static T blockB_packed[NC * KC] __attribute__ ((aligned(64)))
 

Static Public Attributes

static constexpr int SimdWidth = Simd::width
 
static constexpr int TileRows = SimdWidth * 2
 
static constexpr int TileCols = 6
 
static constexpr int NThreads = 16
 
static constexpr int BlockDepth = 256
 
static constexpr int BlockRows = 384
 
static constexpr int BlockCols = 512
 

Member Typedef Documentation

◆ reg

template<typename T >
using tensorium::GemmKernelBigger< T >::reg = typename Simd::reg

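◆ Simd

template<typename T >
using tensorium::GemmKernelBigger< T >::Simd = simd::SimdTraits<T, DefaultISA>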

Member Function Documentation

◆ mask

template<typename T >
static int8_t tensorium::GemmKernelBigger< T >::mask[32] __attribute__ ((aligned(64)))
inline static

◆ blockA_packed

template<typename T >
static T tensorium::GemmKernelBigger< T >::blockA_packed[MC * KC] __attribute__ ((aligned(64)))
static

◆ blockB_packed

template<typename T >
static T tensorium::GemmKernelBigger< T >::blockB_packed[NC * KC] __attribute__ ((aligned(64)))
static

◆ build_masks()

template<typename T >
static void tensorium::GemmKernelBigger< T >::build_masks ( __m256i * packed_mask_0,
__m256i * packed_mask_1,
int mr )
inline static

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum(), and tensorium::GemmKernelBigger< T >::kernel_16x6_zero_init_accum().

Here is the caller graph for this function:

◆ fma_loop_00()

template<typename T >
void tensorium::GemmKernelBigger< T >::fma_loop_00 ( T * blockA_packed,
T * blockB_packed,
reg * C_accum_00,
reg * C_accum_01,
reg * a0_packFloat8,
reg * a1_packFloat8,
reg * b_packFloat8,
int kc )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum(), and tensorium::GemmKernelBigger< T >::kernel_16x6_zero_init_accum().

Here is the caller graph for this function:

◆ fma_loop_01()

template<typename T >
void tensorium::GemmKernelBigger< T >::fma_loop_01 ( T * blockA_packed,
T * blockB_packed,
reg * C_accum_00,
reg * C_accum_01,
reg * C_accum_10,
reg * C_accum_11,
reg * a0_packFloat8,
reg * a1_packFloat8,
reg * b_packFloat8,
int kc )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum(), and tensorium::GemmKernelBigger< T >::kernel_16x6_zero_init_accum().

Here is the caller graph for this function:

◆ fma_loop_02()

template<typename T >
void tensorium::GemmKernelBigger< T >::fma_loop_02 ( T * blockA_packed,
T * blockB_packed,
reg * C_accum_00,
reg * C_accum_01,
reg * C_accum_10,
reg * C_accum_11,
reg * C_accum_20,
reg * C_accum_21,
reg * a0_packFloat8,
reg * a1_packFloat8,
reg * b_packFloat8,
int kc )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum(), and tensorium::GemmKernelBigger< T >::kernel_16x6_zero_init_accum().

Here is the caller graph for this function:

◆ fma_loop_03()

template<typename T >
void tensorium::GemmKernelBigger< T >::fma_loop_03 ( T * blockA_packed,
T * blockB_packed,
reg * C_accum_00,
reg * C_accum_01,
reg * C_accum_10,
reg * C_accum_11,
reg * C_accum_20,
reg * C_accum_21,
reg * C_accum_30,
reg * C_accum_31,
reg * a0_packFloat8,
reg * a1_packFloat8,
reg * b_packFloat8,
int kc )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum(), and tensorium::GemmKernelBigger< T >::kernel_16x6_zero_init_accum().

Here is the caller graph for this function:

◆ fma_loop_04()

template<typename T >
void tensorium::GemmKernelBigger< T >::fma_loop_04 ( T * blockA_packed,
T * blockB_packed,
reg * C_accum_00,
reg * C_accum_01,
reg * C_accum_10,
reg * C_accum_11,
reg * C_accum_20,
reg * C_accum_21,
reg * C_accum_30,
reg * C_accum_31,
reg * C_accum_40,
reg * C_accum_41,
reg * a0_packFloat8,
reg * a1_packFloat8,
reg * b_packFloat8,
int kc )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum(), and tensorium::GemmKernelBigger< T >::kernel_16x6_zero_init_accum().

Here is the caller graph for this function:

◆ fma_loop_05()

template<typename T >
void tensorium::GemmKernelBigger< T >::fma_loop_05 ( T * blockA_packed,
T * blockB_packed,
reg * C_accum_00,
reg * C_accum_01,
reg * C_accum_10,
reg * C_accum_11,
reg * C_accum_20,
reg * C_accum_21,
reg * C_accum_30,
reg * C_accum_31,
reg * C_accum_40,
reg * C_accum_41,
reg * C_accum_50,
reg * C_accum_51,
reg * a0_packFloat8,
reg * a1_packFloat8,
reg * b_packFloat8,
int kc )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum(), and tensorium::GemmKernelBigger< T >::kernel_16x6_zero_init_accum().

Here is the caller graph for this function:

◆ kernel_16x6_load_accum()

template<typename T >
void tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum ( T * blockA_packed,
T * blockB_packed,
T * C,
int mr,
int nr,
int kc,
int M )
inline

References tensorium::GemmKernelBigger< T >::build_masks(), tensorium::GemmKernelBigger< T >::fma_loop_00(), tensorium::GemmKernelBigger< T >::fma_loop_01(), tensorium::GemmKernelBigger< T >::fma_loop_02(), tensorium::GemmKernelBigger< T >::fma_loop_03(), tensorium::GemmKernelBigger< T >::fma_loop_04(), tensorium::GemmKernelBigger< T >::fma_loop_05(), tensorium::GemmKernelBigger< T >::load_accum_00(), tensorium::GemmKernelBigger< T >::load_accum_01(), tensorium::GemmKernelBigger< T >::load_accum_02(), tensorium::GemmKernelBigger< T >::load_accum_03(), tensorium::GemmKernelBigger< T >::load_accum_04(), tensorium::GemmKernelBigger< T >::load_accum_05(), tensorium::GemmKernelBigger< T >::maskload_accum_00(), tensorium::GemmKernelBigger< T >::maskload_accum_01(), tensorium::GemmKernelBigger< T >::maskload_accum_02(), tensorium::GemmKernelBigger< T >::maskload_accum_03(), tensorium::GemmKernelBigger< T >::maskload_accum_04(), tensorium::GemmKernelBigger< T >::maskload_accum_05(), tensorium::GemmKernelBigger< T >::maskstore_accum_00(), tensorium::GemmKernelBigger< T >::maskstore_accum_01(), tensorium::GemmKernelBigger< T >::maskstore_accum_02(), tensorium::GemmKernelBigger< T >::maskstore_accum_03(), tensorium::GemmKernelBigger< T >::maskstore_accum_04(), tensorium::GemmKernelBigger< T >::maskstore_accum_05(), tensorium::GemmKernelBigger< T >::store_accum_00(), tensorium::GemmKernelBigger< T >::store_accum_01(), tensorium::GemmKernelBigger< T >::store_accum_02(), tensorium::GemmKernelBigger< T >::store_accum_03(), tensorium::GemmKernelBigger< T >::store_accum_04(), and tensorium::GemmKernelBigger< T >::store_accum_05().

Referenced by tensorium::GemmKernelBigger< T >::matmul().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ kernel_16x6_zero_init_accum()

template<typename T >
void tensorium::GemmKernelBigger< T >::kernel_16x6_zero_init_accum ( T * blockA_packed,
T * blockB_packed,
T * C,
int mr,
int nr,
int kc,
int M )
inline

◆ load_accum_00()

template<typename T >
void tensorium::GemmKernelBigger< T >::load_accum_00 ( T * C,
reg * C_accum_00,
reg * C_accum_01,
int M )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum().

Here is the caller graph for this function:

◆ load_accum_01()

template<typename T >
void tensorium::GemmKernelBigger< T >::load_accum_01 ( T * C,
reg * C_accum_00,
reg * C_accum_01,
reg * C_accum_10,
reg * C_accum_11,
int M )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum().

Here is the caller graph for this function:

◆ load_accum_02()

template<typename T >
void tensorium::GemmKernelBigger< T >::load_accum_02 ( T * C,
reg * C_accum_00,
reg * C_accum_01,
reg * C_accum_10,
reg * C_accum_11,
reg * C_accum_20,
reg * C_accum_21,
int M )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum().

Here is the caller graph for this function:

◆ load_accum_03()

template<typename T >
void tensorium::GemmKernelBigger< T >::load_accum_03 ( T * C,
reg * C_accum_00,
reg * C_accum_01,
reg * C_accum_10,
reg * C_accum_11,
reg * C_accum_20,
reg * C_accum_21,
reg * C_accum_30,
reg * C_accum_31,
int M )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum().

Here is the caller graph for this function:

◆ load_accum_04()

template<typename T >
void tensorium::GemmKernelBigger< T >::load_accum_04 ( T * C,
reg * C_accum_00,
reg * C_accum_01,
reg * C_accum_10,
reg * C_accum_11,
reg * C_accum_20,
reg * C_accum_21,
reg * C_accum_30,
reg * C_accum_31,
reg * C_accum_40,
reg * C_accum_41,
int M )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum().

Here is the caller graph for this function:

◆ load_accum_05()

template<typename T >
void tensorium::GemmKernelBigger< T >::load_accum_05 ( T * C,
reg * C_accum_00,
reg * C_accum_01,
reg * C_accum_10,
reg * C_accum_11,
reg * C_accum_20,
reg * C_accum_21,
reg * C_accum_30,
reg * C_accum_31,
reg * C_accum_40,
reg * C_accum_41,
reg * C_accum_50,
reg * C_accum_51,
int M )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum().

Here is the caller graph for this function:

◆ maskload_accum_00()

template<typename T >
void tensorium::GemmKernelBigger< T >::maskload_accum_00 ( T * C,
reg * C_accum_00,
reg * C_accum_01,
__m256i packed_mask_0,
__m256i packed_mask_1,
int M )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum().

Here is the caller graph for this function:

◆ maskload_accum_01()

template<typename T >
void tensorium::GemmKernelBigger< T >::maskload_accum_01 ( T * C,
reg * C_accum_00,
reg * C_accum_01,
reg * C_accum_10,
reg * C_accum_11,
__m256i packed_mask_0,
__m256i packed_mask_1,
int M )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum().

Here is the caller graph for this function:

◆ maskload_accum_02()

template<typename T >
void tensorium::GemmKernelBigger< T >::maskload_accum_02 ( T * C,
reg * C_accum_00,
reg * C_accum_01,
reg * C_accum_10,
reg * C_accum_11,
reg * C_accum_20,
reg * C_accum_21,
__m256i packed_mask_0,
__m256i packed_mask_1,
int M )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum().

Here is the caller graph for this function:

◆ maskload_accum_03()

template<typename T >
void tensorium::GemmKernelBigger< T >::maskload_accum_03 ( T * C,
reg * C_accum_00,
reg * C_accum_01,
reg * C_accum_10,
reg * C_accum_11,
reg * C_accum_20,
reg * C_accum_21,
reg * C_accum_30,
reg * C_accum_31,
__m256i packed_mask_0,
__m256i packed_mask_1,
int M )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum().

Here is the caller graph for this function:

◆ maskload_accum_04()

template<typename T >
void tensorium::GemmKernelBigger< T >::maskload_accum_04 ( T * C,
reg * C_accum_00,
reg * C_accum_01,
reg * C_accum_10,
reg * C_accum_11,
reg * C_accum_20,
reg * C_accum_21,
reg * C_accum_30,
reg * C_accum_31,
reg * C_accum_40,
reg * C_accum_41,
__m256i packed_mask_0,
__m256i packed_mask_1,
int M )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum().

Here is the caller graph for this function:

◆ maskload_accum_05()

template<typename T >
void tensorium::GemmKernelBigger< T >::maskload_accum_05 ( T * C,
reg * C_accum_00,
reg * C_accum_01,
reg * C_accum_10,
reg * C_accum_11,
reg * C_accum_20,
reg * C_accum_21,
reg * C_accum_30,
reg * C_accum_31,
reg * C_accum_40,
reg * C_accum_41,
reg * C_accum_50,
reg * C_accum_51,
__m256i packed_mask_0,
__m256i packed_mask_1,
int M )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum().

Here is the caller graph for this function:

◆ maskstore_accum_00()

template<typename T >
void tensorium::GemmKernelBigger< T >::maskstore_accum_00 ( T * C,
reg * C_accum_00,
reg * C_accum_01,
__m256i packed_mask_0,
__m256i packed_mask_1,
int M )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum(), and tensorium::GemmKernelBigger< T >::kernel_16x6_zero_init_accum().

Here is the caller graph for this function:

◆ maskstore_accum_01()

template<typename T >
void tensorium::GemmKernelBigger< T >::maskstore_accum_01 ( T * C,
reg * C_accum_00,
reg * C_accum_01,
reg * C_accum_10,
reg * C_accum_11,
__m256i packed_mask_0,
__m256i packed_mask_1,
int M )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum(), and tensorium::GemmKernelBigger< T >::kernel_16x6_zero_init_accum().

Here is the caller graph for this function:

◆ maskstore_accum_02()

template<typename T >
void tensorium::GemmKernelBigger< T >::maskstore_accum_02 ( T * C,
reg * C_accum_00,
reg * C_accum_01,
reg * C_accum_10,
reg * C_accum_11,
reg * C_accum_20,
reg * C_accum_21,
__m256i packed_mask_0,
__m256i packed_mask_1,
int M )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum(), and tensorium::GemmKernelBigger< T >::kernel_16x6_zero_init_accum().

Here is the caller graph for this function:

◆ maskstore_accum_03()

template<typename T >
void tensorium::GemmKernelBigger< T >::maskstore_accum_03 ( T * C,
reg * C_accum_00,
reg * C_accum_01,
reg * C_accum_10,
reg * C_accum_11,
reg * C_accum_20,
reg * C_accum_21,
reg * C_accum_30,
reg * C_accum_31,
__m256i packed_mask_0,
__m256i packed_mask_1,
int M )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum(), and tensorium::GemmKernelBigger< T >::kernel_16x6_zero_init_accum().

Here is the caller graph for this function:

◆ maskstore_accum_04()

template<typename T >
void tensorium::GemmKernelBigger< T >::maskstore_accum_04 ( T * C,
reg * C_accum_00,
reg * C_accum_01,
reg * C_accum_10,
reg * C_accum_11,
reg * C_accum_20,
reg * C_accum_21,
reg * C_accum_30,
reg * C_accum_31,
reg * C_accum_40,
reg * C_accum_41,
__m256i packed_mask_0,
__m256i packed_mask_1,
int M )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum(), and tensorium::GemmKernelBigger< T >::kernel_16x6_zero_init_accum().

Here is the caller graph for this function:

◆ maskstore_accum_05()

template<typename T >
void tensorium::GemmKernelBigger< T >::maskstore_accum_05 ( T * C,
reg * C_accum_00,
reg * C_accum_01,
reg * C_accum_10,
reg * C_accum_11,
reg * C_accum_20,
reg * C_accum_21,
reg * C_accum_30,
reg * C_accum_31,
reg * C_accum_40,
reg * C_accum_41,
reg * C_accum_50,
reg * C_accum_51,
__m256i packed_mask_0,
__m256i packed_mask_1,
int M )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum(), and tensorium::GemmKernelBigger< T >::kernel_16x6_zero_init_accum().

Here is the caller graph for this function:

◆ matmul()

template<typename T >
void tensorium::GemmKernelBigger< T >::matmul ( T * A,
T * B,
T * C,
int M,
int N,
int K )
inline

References _min, KC, tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum(), tensorium::GemmKernelBigger< T >::kernel_16x6_zero_init_accum(), MC, NC, tensorium::GemmKernelBigger< T >::pack_blockA(), tensorium::GemmKernelBigger< T >::pack_blockB(), and PRAGMA_OMP_PARALLEL_FOR.

Referenced by tensorium::Matrix< K, RowMajor >::_mul_mat().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ pack_blockA()

template<typename T >
void tensorium::GemmKernelBigger< T >::pack_blockA ( T * A,
T * blockA_packed,
int mc,
int kc,
int M )
inline

References _min, tensorium::GemmKernelBigger< T >::pack_panelA(), and PRAGMA_OMP_PARALLEL_FOR.

Referenced by tensorium::GemmKernelBigger< T >::matmul().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ pack_blockB()

template<typename T >
void tensorium::GemmKernelBigger< T >::pack_blockB ( T * B,
T * blockB_packed,
int nc,
int kc,
int K )
inline

References _min, and tensorium::GemmKernelBigger< T >::pack_panelB().

Referenced by tensorium::GemmKernelBigger< T >::matmul().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ pack_panelA()

template<typename T >
void tensorium::GemmKernelBigger< T >::pack_panelA ( T * A,
T * blockA_packed,
int mr,
int kc,
int M )
inline

Referenced by tensorium::GemmKernelBigger< T >::pack_blockA().

Here is the caller graph for this function:

◆ pack_panelB()

template<typename T >
void tensorium::GemmKernelBigger< T >::pack_panelB ( T * B,
T * blockB_packed,
int nr,
int kc,
int K )
inline

Referenced by tensorium::GemmKernelBigger< T >::pack_blockB().

Here is the caller graph for this function:

◆ store_accum_00()

template<typename T >
void tensorium::GemmKernelBigger< T >::store_accum_00 ( T * C,
reg * C_accum_00,
reg * C_accum_01,
int M )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum(), and tensorium::GemmKernelBigger< T >::kernel_16x6_zero_init_accum().

Here is the caller graph for this function:

◆ store_accum_01()

template<typename T >
void tensorium::GemmKernelBigger< T >::store_accum_01 ( T * C,
reg * C_accum_00,
reg * C_accum_01,
reg * C_accum_10,
reg * C_accum_11,
int M )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum(), and tensorium::GemmKernelBigger< T >::kernel_16x6_zero_init_accum().

Here is the caller graph for this function:

◆ store_accum_02()

template<typename T >
void tensorium::GemmKernelBigger< T >::store_accum_02 ( T * C,
reg * C_accum_00,
reg * C_accum_01,
reg * C_accum_10,
reg * C_accum_11,
reg * C_accum_20,
reg * C_accum_21,
int M )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum(), and tensorium::GemmKernelBigger< T >::kernel_16x6_zero_init_accum().

Here is the caller graph for this function:

◆ store_accum_03()

template<typename T >
void tensorium::GemmKernelBigger< T >::store_accum_03 ( T * C,
reg * C_accum_00,
reg * C_accum_01,
reg * C_accum_10,
reg * C_accum_11,
reg * C_accum_20,
reg * C_accum_21,
reg * C_accum_30,
reg * C_accum_31,
int M )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum(), and tensorium::GemmKernelBigger< T >::kernel_16x6_zero_init_accum().

Here is the caller graph for this function:

◆ store_accum_04()

template<typename T >
void tensorium::GemmKernelBigger< T >::store_accum_04 ( T * C,
reg * C_accum_00,
reg * C_accum_01,
reg * C_accum_10,
reg * C_accum_11,
reg * C_accum_20,
reg * C_accum_21,
reg * C_accum_30,
reg * C_accum_31,
reg * C_accum_40,
reg * C_accum_41,
int M )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum(), and tensorium::GemmKernelBigger< T >::kernel_16x6_zero_init_accum().

Here is the caller graph for this function:

◆ store_accum_05()

template<typename T >
void tensorium::GemmKernelBigger< T >::store_accum_05 ( T * C,
reg * C_accum_00,
reg * C_accum_01,
reg * C_accum_10,
reg * C_accum_11,
reg * C_accum_20,
reg * C_accum_21,
reg * C_accum_30,
reg * C_accum_31,
reg * C_accum_40,
reg * C_accum_41,
reg * C_accum_50,
reg * C_accum_51,
int M )
inline

Referenced by tensorium::GemmKernelBigger< T >::kernel_16x6_load_accum(), and tensorium::GemmKernelBigger< T >::kernel_16x6_zero_init_accum().

Here is the caller graph for this function:

Member Data Documentation

◆ BlockCols

template<typename T >
constexpr int tensorium::GemmKernelBigger< T >::BlockCols = 512
static constexpr

◆ BlockDepth

template<typename T >
constexpr int tensorium::GemmKernelBigger< T >::BlockDepth = 256
static constexpr

◆ BlockRows

template<typename T >
constexpr int tensorium::GemmKernelBigger< T >::BlockRows = 384
static constexpr

◆ NThreads

template<typename T >
constexpr int tensorium::GemmKernelBigger< T >::NThreads = 16
static constexpr

◆ SimdWidth

template<typename T >
constexpr int tensorium::GemmKernelBigger< T >::SimdWidth = Simd::width
static constexpr

◆ TileCols

template<typename T >
constexpr int tensorium::GemmKernelBigger< T >::TileCols = 6
static constexpr

◆ TileRows

template<typename T >
constexpr int tensorium::GemmKernelBigger< T >::TileRows = SimdWidth * 2
static constexpr

The documentation for this class was generated from the following file: