Tensorium
|
This is the complete list of members for tensorium::GemmKernelBigger< T >, including all inherited members.
__attribute__((aligned(64))) | tensorium::GemmKernelBigger< T > | inline static |
__attribute__((aligned(64))) | tensorium::GemmKernelBigger< T > | static |
__attribute__((aligned(64))) | tensorium::GemmKernelBigger< T > | static |
BlockCols | tensorium::GemmKernelBigger< T > | static |
BlockDepth | tensorium::GemmKernelBigger< T > | static |
BlockRows | tensorium::GemmKernelBigger< T > | static |
build_masks(__m256i *packed_mask_0, __m256i *packed_mask_1, int mr) | tensorium::GemmKernelBigger< T > | inline static |
fma_loop_00(T *blockA_packed, T *blockB_packed, reg *C_accum_00, reg *C_accum_01, reg *a0_packFloat8, reg *a1_packFloat8, reg *b_packFloat8, int kc) | tensorium::GemmKernelBigger< T > | inline |
fma_loop_01(T *blockA_packed, T *blockB_packed, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *a0_packFloat8, reg *a1_packFloat8, reg *b_packFloat8, int kc) | tensorium::GemmKernelBigger< T > | inline |
fma_loop_02(T *blockA_packed, T *blockB_packed, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *a0_packFloat8, reg *a1_packFloat8, reg *b_packFloat8, int kc) | tensorium::GemmKernelBigger< T > | inline |
fma_loop_03(T *blockA_packed, T *blockB_packed, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, reg *a0_packFloat8, reg *a1_packFloat8, reg *b_packFloat8, int kc) | tensorium::GemmKernelBigger< T > | inline |
fma_loop_04(T *blockA_packed, T *blockB_packed, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, reg *C_accum_40, reg *C_accum_41, reg *a0_packFloat8, reg *a1_packFloat8, reg *b_packFloat8, int kc) | tensorium::GemmKernelBigger< T > | inline |
fma_loop_05(T *blockA_packed, T *blockB_packed, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, reg *C_accum_40, reg *C_accum_41, reg *C_accum_50, reg *C_accum_51, reg *a0_packFloat8, reg *a1_packFloat8, reg *b_packFloat8, int kc) | tensorium::GemmKernelBigger< T > | inline |
kernel_16x6_load_accum(T *blockA_packed, T *blockB_packed, T *C, int mr, int nr, int kc, int M) | tensorium::GemmKernelBigger< T > | inline |
kernel_16x6_zero_init_accum(T *blockA_packed, T *blockB_packed, T *C, int mr, int nr, int kc, int M) | tensorium::GemmKernelBigger< T > | inline |
load_accum_00(T *C, reg *C_accum_00, reg *C_accum_01, int M) | tensorium::GemmKernelBigger< T > | inline |
load_accum_01(T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, int M) | tensorium::GemmKernelBigger< T > | inline |
load_accum_02(T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, int M) | tensorium::GemmKernelBigger< T > | inline |
load_accum_03(T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, int M) | tensorium::GemmKernelBigger< T > | inline |
load_accum_04(T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, reg *C_accum_40, reg *C_accum_41, int M) | tensorium::GemmKernelBigger< T > | inline |
load_accum_05(T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, reg *C_accum_40, reg *C_accum_41, reg *C_accum_50, reg *C_accum_51, int M) | tensorium::GemmKernelBigger< T > | inline |
maskload_accum_00(T *C, reg *C_accum_00, reg *C_accum_01, __m256i packed_mask_0, __m256i packed_mask_1, int M) | tensorium::GemmKernelBigger< T > | inline |
maskload_accum_01(T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, __m256i packed_mask_0, __m256i packed_mask_1, int M) | tensorium::GemmKernelBigger< T > | inline |
maskload_accum_02(T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, __m256i packed_mask_0, __m256i packed_mask_1, int M) | tensorium::GemmKernelBigger< T > | inline |
maskload_accum_03(T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, __m256i packed_mask_0, __m256i packed_mask_1, int M) | tensorium::GemmKernelBigger< T > | inline |
maskload_accum_04(T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, reg *C_accum_40, reg *C_accum_41, __m256i packed_mask_0, __m256i packed_mask_1, int M) | tensorium::GemmKernelBigger< T > | inline |
maskload_accum_05(T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, reg *C_accum_40, reg *C_accum_41, reg *C_accum_50, reg *C_accum_51, __m256i packed_mask_0, __m256i packed_mask_1, int M) | tensorium::GemmKernelBigger< T > | inline |
maskstore_accum_00(T *C, reg *C_accum_00, reg *C_accum_01, __m256i packed_mask_0, __m256i packed_mask_1, int M) | tensorium::GemmKernelBigger< T > | inline |
maskstore_accum_01(T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, __m256i packed_mask_0, __m256i packed_mask_1, int M) | tensorium::GemmKernelBigger< T > | inline |
maskstore_accum_02(T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, __m256i packed_mask_0, __m256i packed_mask_1, int M) | tensorium::GemmKernelBigger< T > | inline |
maskstore_accum_03(T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, __m256i packed_mask_0, __m256i packed_mask_1, int M) | tensorium::GemmKernelBigger< T > | inline |
maskstore_accum_04(T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, reg *C_accum_40, reg *C_accum_41, __m256i packed_mask_0, __m256i packed_mask_1, int M) | tensorium::GemmKernelBigger< T > | inline |
maskstore_accum_05(T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, reg *C_accum_40, reg *C_accum_41, reg *C_accum_50, reg *C_accum_51, __m256i packed_mask_0, __m256i packed_mask_1, int M) | tensorium::GemmKernelBigger< T > | inline |
matmul(T *A, T *B, T *C, int M, int N, int K) | tensorium::GemmKernelBigger< T > | inline |
NThreads | tensorium::GemmKernelBigger< T > | static |
pack_blockA(T *A, T *blockA_packed, int mc, int kc, int M) | tensorium::GemmKernelBigger< T > | inline |
pack_blockB(T *B, T *blockB_packed, int nc, int kc, int K) | tensorium::GemmKernelBigger< T > | inline |
pack_panelA(T *A, T *blockA_packed, int mr, int kc, int M) | tensorium::GemmKernelBigger< T > | inline |
pack_panelB(T *B, T *blockB_packed, int nr, int kc, int K) | tensorium::GemmKernelBigger< T > | inline |
reg typedef | tensorium::GemmKernelBigger< T > | |
Simd typedef | tensorium::GemmKernelBigger< T > | |
SimdWidth | tensorium::GemmKernelBigger< T > | static |
store_accum_00(T *C, reg *C_accum_00, reg *C_accum_01, int M) | tensorium::GemmKernelBigger< T > | inline |
store_accum_01(T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, int M) | tensorium::GemmKernelBigger< T > | inline |
store_accum_02(T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, int M) | tensorium::GemmKernelBigger< T > | inline |
store_accum_03(T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, int M) | tensorium::GemmKernelBigger< T > | inline |
store_accum_04(T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, reg *C_accum_40, reg *C_accum_41, int M) | tensorium::GemmKernelBigger< T > | inline |
store_accum_05(T *C, reg *C_accum_00, reg *C_accum_01, reg *C_accum_10, reg *C_accum_11, reg *C_accum_20, reg *C_accum_21, reg *C_accum_30, reg *C_accum_31, reg *C_accum_40, reg *C_accum_41, reg *C_accum_50, reg *C_accum_51, int M) | tensorium::GemmKernelBigger< T > | inline |
TileCols | tensorium::GemmKernelBigger< T > | static |
TileRows | tensorium::GemmKernelBigger< T > | static |