Skip to content

Commit

Permalink
Merge commit '81e10346' into amd-main
Browse files Browse the repository at this point in the history
* commit '81e10346':
  Alloc at least 1 elem in pool_t block_ptrs. (flame#560)
  Fix insufficient pool-growing logic in bli_pool.c. (flame#559)
  Arm SVE C/ZGEMM Fix FMOV 0 Mistake
  SH Kernel Unused Either
  Arm SVE C/ZGEMM Support *beta==0
  Arm SVE Config armsve Use ZGEMM/CGEMM
  Arm SVE: Update Perf. Graph
  Arm SVE CGEMM 2Vx10 Unindex Process Alpha=1.0
  Arm SVE ZGEMM 2Vx10 Unindex Process Alpha=1.0
  A64FX Config Use ZGEMM/CGEMM
  Arm SVE Typo Fix ZGEMM/CGEMM C Prefetch Reg
  Arm SVE Add SGEMM 2Vx10 Unindexed
  Arm SVE ZGEMM Support Gather Load / Scatt. St.
  Arm SVE Add ZGEMM 2Vx10 Unindexed
  Arm SVE Add ZGEMM 2Vx7 Unindexed
  Arm SVE Add ZGEMM 2Vx8 Unindexed
  Update Travis CI badge
  Armv8 Trash New Bulk Kernels
  Enable testing 1m in `make check`.
  Config ArmSVE Unregister 12xk. Move 12xk to Old
  Revert __has_include(). Distinguish w/ BLIS_FAMILY_**
  Register firestorm into arm64 Metaconfig
  Armv8 DGEMMSUP Fix Edge 6x4 Switch Case Typo
  Armv8 DGEMMSUP Fix 8x4m Store Inst. Typo
  Add test for Apple M1 (firestorm)
  Firestorm CPUID Dispatcher
  Armv8 GEMMSUP Edge Cases Require Signed Ints
  Make error checking level a thread-local variable.
  Fix data race in testsuite.
  Update .appveyor.yml
  Firestorm Block Size Fixes
  Armv8 Handle *beta == 0 for GEMMSUP ??r Case.
  Move unused ARM SVE kernels to "old" directory.
  Add an option to control whether or not to use @rpath.
  Fix $ORIGIN usage on linux.
  Arm micro-architecture dispatch (flame#344)
  Use @rpath-based install name on MacOS and use relocatable RPATH entries for testsuite binaries.
  Armv8 Handle *beta == 0 for GEMMSUP ?rc Case.
  Armv8 Fix 6x8 Row-Maj Ukr
  Apply patch from @xrq-phys.
  Add explicit handling for beta == 0 in armsve sd and armv7a d gemm ukrs.
  bli_error: more cleanup on the error strings array
  Arm SVE Exclude SVE-Intrinsic Kernels for GCC 8-9
  Arm SVE: Correct PACKM Ker Name: Intrinsic Kers
  Fix config_name in bli_arch.c
  Arm Whole GEMMSUP Call Route is Asm/Int Optimized
  Arm: DGEMMSUP `Macro' Edge Cases Stop Calling Ref
  Header Typo
  Arm: DGEMMSUP ??r(rv) Invoke Edge Size
  Arm: DGEMMSUP ?rc(rd) Invoke Edge Size
  Arm: Implement GEMMSUP Fallback Method
  Arm64 Fix: Support Alpha/Beta in GEMMSUP Intrin
  Added Apple Firestorm (A14/M1) Subconfig
  Arm64 8x4 Kernel Use Less Regs
  Armv8-A Supplementary GEMMSUP Sizes for RD
  Armv8-A Fix GEMMSUP-RD Kernels on GNU Asm
  Armv8-A Adjust Types for PACKM Kernels
  Armv8-A GEMMSUP-RD 6x8m
  Armv8-A GEMMSUP-RD 6x8n
  Armv8-A s/d Packing Kernels Fix Typo
  Armv8-A Introduced s/d Packing Kernels
  Armv8-A DGEMMSUP 6x8m Kernel
  Armv8-A DGEMMSUP Adjustments
  Armv8-A Add More DGEMMSUP
  Armv8-A Add GEMMSUP 4x8n Kernel
  Armv8-A Add Part of GEMMSUP 8x4m Kernel
  Armv8A DGEMM 4x4 Kernel WIP. Slow
  Armv8-A Add 8x4 Kernel WIP

AMD-Internal: [CPUPL-2698]
Change-Id: I194ff69356740bb36ca189fd1bf9fef02eec3803
  • Loading branch information
edwsmyth committed Jun 25, 2024
2 parents 43d36b9 + 81e1034 commit 8de8dc2
Show file tree
Hide file tree
Showing 80 changed files with 11,427 additions and 283 deletions.
2 changes: 2 additions & 0 deletions .appveyor.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
skip_branch_with_pr: true

environment:
matrix:
- LIB_TYPE: shared
Expand Down
7 changes: 7 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,13 @@ matrix:
CC=aarch64-linux-gnu-gcc CXX=aarch64-linux-gnu-g++ \
PACKAGES="gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \
TESTSUITE_WRAPPER="qemu-aarch64 -L /usr/aarch64-linux-gnu/"
# Apple M1 (firestorm) build and fast testsuite (qemu)
- os: linux
compiler: aarch64-linux-gnu-gcc
env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="firestorm" \
CC=aarch64-linux-gnu-gcc CXX=aarch64-linux-gnu-g++ \
PACKAGES="gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \
TESTSUITE_WRAPPER="qemu-aarch64 -L /usr/aarch64-linux-gnu/"
# armsve build and fast testsuite (qemu)
- os: linux
compiler: aarch64-linux-gnu-gcc-10
Expand Down
16 changes: 5 additions & 11 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -869,20 +869,14 @@ else
@$(RANLIB) $@
endif

# first argument: the base name of the BLAS test driver.
define make-blat-rule
$(BASE_EXE_BLASTEST_PATH)/$(1).x: $(BASE_OBJ_BLASTEST_PATH)/$(1).o $(BLASTEST_F2C_LIB) $(LIBBLIS_LINK)
$(BASE_EXE_BLASTEST_PATH)/%.x: $(BASE_OBJ_BLASTEST_PATH)/%.o $(BLASTEST_F2C_LIB) $(LIBBLIS_LINK)
@mkdir -p $(BASE_EXE_BLASTEST_PATH)
ifeq ($(ENABLE_VERBOSE),yes)
$(LINKER) $(BASE_OBJ_BLASTEST_PATH)/$(1).o $(BLASTEST_F2C_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $$@
$(LINKER) $< $(BLASTEST_F2C_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
else
@echo "Linking $$(@F) against '$(notdir $(BLASTEST_F2C_LIB)) $(LIBBLIS_LINK) $(LDFLAGS)'"
@$(LINKER) $(BASE_OBJ_BLASTEST_PATH)/$(1).o $(BLASTEST_F2C_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $$@
@echo "Linking $@ against '$(notdir $(BLASTEST_F2C_LIB)) $(LIBBLIS_LINK) "$(LDFLAGS)"'"
@$(LINKER) $< $(BLASTEST_F2C_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
endif
endef

# Instantiate the rule above for each driver file.
$(foreach name, $(BLASTEST_DRV_BASES), $(eval $(call make-blat-rule,$(name))))

# A rule to run ?blat1.x driver files.
define make-run-blat1-rule
Expand Down Expand Up @@ -952,7 +946,7 @@ $(TESTSUITE_BIN): $(MK_TESTSUITE_OBJS) $(LIBBLIS_LINK)
ifeq ($(ENABLE_VERBOSE),yes)
$(LINKER) $(MK_TESTSUITE_OBJS) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
else
@echo "Linking $@ against '$(LIBBLIS_LINK) $(LDFLAGS)'"
@echo "Linking $@ against '$(LIBBLIS_LINK) "$(LDFLAGS)"'"
@$(LINKER) $(MK_TESTSUITE_OBJS) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
endif

Expand Down
3 changes: 3 additions & 0 deletions build/config.mk.in
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,9 @@ ARG_MAX_HACK := @enable_arg_max_hack@
MK_ENABLE_STATIC := @enable_static@
MK_ENABLE_SHARED := @enable_shared@

# Whether to use an install_name based on @rpath.
MK_ENABLE_RPATH := @enable_rpath@

# Whether to export all symbols within the shared library, even those symbols
# that are considered to be for internal use only.
EXPORT_SHARED := @export_shared@
Expand Down
16 changes: 15 additions & 1 deletion common.mk
Original file line number Diff line number Diff line change
Expand Up @@ -591,7 +591,11 @@ endif
ifeq ($(OS_NAME),Darwin)
# OS X shared library link flags.
SOFLAGS := -dynamiclib
ifeq ($(MK_ENABLE_RPATH),yes)
SOFLAGS += -Wl,-install_name,@rpath/$(LIBBLIS_SONAME)
else
SOFLAGS += -Wl,-install_name,$(libdir)/$(LIBBLIS_SONAME)
endif
else
SOFLAGS := -shared
ifeq ($(IS_WIN),yes)
Expand Down Expand Up @@ -627,7 +631,17 @@ ifeq ($(MK_ENABLE_SHARED),yes)
LIBBLIS_LINK := $(LIBBLIS_SO_PATH)
ifeq ($(IS_WIN),no)
# For Linux and OS X: set rpath property of shared object.
LDFLAGS += -Wl,-rpath,$(BASE_LIB_PATH)
ifeq ($(OS_NAME),Darwin)
# rpath for any executables generated in the top level directory
LDFLAGS += -Wl,-rpath,@executable_path/$(BASE_LIB_PATH)
# rpath for BLAS tests and test_libblis.x
LDFLAGS += -Wl,-rpath,@executable_path/../../../$(BASE_LIB_PATH)
else
# rpath for any executables generated in the top level directory
LDFLAGS += -Wl,-rpath,'$$ORIGIN/$(BASE_LIB_PATH)'
# rpath for BLAS tests and test_libblis.x
LDFLAGS += -Wl,-rpath,'$$ORIGIN/../../../../$(BASE_LIB_PATH)'
endif
endif
endif
# On windows, use the shared library even if static is created.
Expand Down
23 changes: 13 additions & 10 deletions config/a64fx/bli_cntx_init_a64fx.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,29 +49,32 @@ void bli_cntx_init_a64fx( cntx_t* cntx )
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
(
2,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
4,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE,
cntx
);

// Set SVE-512 packing routine.
bli_cntx_set_packm_kers
(
3,
2,
BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_12xk,
// 12xk is not used and disabled for GCC 8-9 compatibility.
// BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_int_12xk,
BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
cntx
);

// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 10, 10, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 256, 128, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 2048, 2048, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 23040, 26880, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, 16, 8 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 10, 10, 10, 10 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 256, 128, 192, 96 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 2048, 2048, 1536, 1536 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 23040, 26880, 11520, 11760 );

// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
Expand Down
6 changes: 5 additions & 1 deletion config/arm64/make_defs.mk
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,11 @@ CKOPTFLAGS := $(COPTFLAGS) -O3
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -march=armv8-a
else
$(error gcc is required for this configuration.)
ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -march=armv8-a
else
$(error gcc or clang is required for this configuration.)
endif
endif

# Flags specific to reference kernels.
Expand Down
2 changes: 2 additions & 0 deletions config/armsve/bli_armsve_config_utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,4 +89,6 @@ void PASTEMAC(ch, _blksz_armsve) (dim_t *m_r_, dim_t *n_r_, \

EXPANDMAC_BLKSZ_ARMSVE( s, 4 )
EXPANDMAC_BLKSZ_ARMSVE( d, 8 )
EXPANDMAC_BLKSZ_ARMSVE( c, 8 )
EXPANDMAC_BLKSZ_ARMSVE( z, 16 )

2 changes: 2 additions & 0 deletions config/armsve/bli_armsve_config_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,6 @@ dim_t bli_vl_bits_armsve(void);

void bli_s_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);
void bli_d_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);
void bli_c_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);
void bli_z_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);

27 changes: 16 additions & 11 deletions config/armsve/bli_cntx_init_armsve.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,45 +50,50 @@ void bli_cntx_init_armsve( cntx_t* cntx )
// Block size.
dim_t m_r_s, n_r_s, k_c_s, m_c_s, n_c_s;
dim_t m_r_d, n_r_d, k_c_d, m_c_d, n_c_d;
dim_t m_r_c, n_r_c, k_c_c, m_c_c, n_c_c;
dim_t m_r_z, n_r_z, k_c_z, m_c_z, n_c_z;
bli_s_blksz_armsve(&m_r_s, &n_r_s, &k_c_s, &m_c_s, &n_c_s);
bli_d_blksz_armsve(&m_r_d, &n_r_d, &k_c_d, &m_c_d, &n_c_d);
bli_c_blksz_armsve(&m_r_c, &n_r_c, &k_c_c, &m_c_c, &n_c_c);
bli_z_blksz_armsve(&m_r_z, &n_r_z, &k_c_z, &m_c_z, &n_c_z);

// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
(
2,
4,
// These are vector-length agnostic kernels. Yet knowing mr is required at runtime.
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE,
cntx
);

// Set VL-specific packing routines if applicable.
if (m_r_d==16)
bli_cntx_set_packm_kers
(
3,
2,
BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_12xk,
BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
cntx
);
else if (m_r_d==8)
bli_cntx_set_packm_kers
(
1,
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_asm_8xk,
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_int_8xk,
cntx
);

// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], m_r_s, m_r_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], n_r_s, n_r_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], m_c_s, m_c_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], k_c_s, k_c_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], n_c_s, n_c_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MR ], m_r_s, m_r_d, m_r_c, m_r_z );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], n_r_s, n_r_d, n_r_c, n_r_z );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], m_c_s, m_c_d, m_c_c, m_c_z );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], k_c_s, k_c_d, k_c_c, k_c_z );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], n_c_s, n_c_d, n_c_c, n_c_z );

// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
Expand Down
6 changes: 5 additions & 1 deletion config/cortexa53/make_defs.mk
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,11 @@ CKOPTFLAGS := $(COPTFLAGS) -O3 -ftree-vectorize
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mcpu=cortex-a53
else
$(error gcc is required for this configuration.)
ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mcpu=cortex-a53
else
$(error gcc or clang is required for this configuration.)
endif
endif

# Flags specific to reference kernels.
Expand Down
6 changes: 5 additions & 1 deletion config/cortexa57/make_defs.mk
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,11 @@ CKOPTFLAGS := $(COPTFLAGS) -O3 -ftree-vectorize
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mcpu=cortex-a57
else
$(error gcc is required for this configuration.)
ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mcpu=cortex-a57
else
$(error gcc or clang is required for this configuration.)
endif
endif

# Flags specific to reference kernels.
Expand Down
144 changes: 144 additions & 0 deletions config/firestorm/bli_cntx_init_firestorm.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "blis.h"

void bli_cntx_init_firestorm( cntx_t* cntx )
{
    // Scratch tables: filled in below and then handed to the context setters.
    blksz_t blksz_tab[ BLIS_NUM_BLKSZS ];
    blksz_t thresh_tab[ BLIS_NUM_THRESH ];

    // Named shortcuts for the table slots this function touches.
    blksz_t* const bs_mr = &blksz_tab[ BLIS_MR ];
    blksz_t* const bs_nr = &blksz_tab[ BLIS_NR ];
    blksz_t* const bs_mc = &blksz_tab[ BLIS_MC ];
    blksz_t* const bs_kc = &blksz_tab[ BLIS_KC ];
    blksz_t* const bs_nc = &blksz_tab[ BLIS_NC ];

    blksz_t* const th_mt = &thresh_tab[ BLIS_MT ];
    blksz_t* const th_nt = &thresh_tab[ BLIS_NT ];
    blksz_t* const th_kt = &thresh_tab[ BLIS_KT ];

    // Begin from the reference defaults (kernel blocksizes and functions);
    // everything below overrides selected entries.
    bli_cntx_init_firestorm_ref( cntx );

    // ---------------------------------------------------------------------
    // Native (packed) level-3 path.
    // ---------------------------------------------------------------------

    // Install the optimized native gemm micro-kernels for float and double,
    // together with their storage preferences.
    bli_cntx_set_l3_nat_ukrs
    (
      2,
      BLIS_GEMM_UKR, BLIS_FLOAT,  bli_sgemm_armv8a_asm_8x12, FALSE,
      BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8,  FALSE,
      cntx
    );

    // Install the optimized packm kernels (two per real datatype).
    bli_cntx_set_packm_kers
    (
      4,
      BLIS_PACKM_8XK_KER,  BLIS_FLOAT,  bli_spackm_armv8a_int_8xk,
      BLIS_PACKM_12XK_KER, BLIS_FLOAT,  bli_spackm_armv8a_int_12xk,
      BLIS_PACKM_6XK_KER,  BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk,
      BLIS_PACKM_8XK_KER,  BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk,
      cntx
    );

    // Architecture-specific level-3 blocksizes.
    //                            s      d      c      z
    bli_blksz_init_easy( bs_mr,    8,     6,    -1,    -1 );
    bli_blksz_init_easy( bs_nr,   12,     8,    -1,    -1 );
    bli_blksz_init_easy( bs_mc,  120,   252,    -1,    -1 );
    bli_blksz_init_easy( bs_kc,  640,  3072,    -1,    -1 );
    bli_blksz_init_easy( bs_nc, 3072,  8192,    -1,    -1 );

    // Hand the register and cache blocksizes (and their multiples) to the
    // context for native execution.
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 5,
      BLIS_NC, bs_nc, BLIS_NR,
      BLIS_KC, bs_kc, BLIS_KR,
      BLIS_MC, bs_mc, BLIS_MR,
      BLIS_NR, bs_nr, BLIS_NR,
      BLIS_MR, bs_mr, BLIS_MR,
      cntx
    );

    // ---------------------------------------------------------------------
    // Small/unpacked (sup) level-3 path.
    // ---------------------------------------------------------------------

    // Sup thresholds; only the double-precision column is given a value.
    //                            s      d      c      z
    bli_blksz_init_easy( th_mt,   -1,    99,    -1,    -1 );
    bli_blksz_init_easy( th_nt,   -1,    99,    -1,    -1 );
    bli_blksz_init_easy( th_kt,   -1,    99,    -1,    -1 );

    // Register the sup thresholds with the context.
    bli_cntx_set_l3_sup_thresh
    (
      3,
      BLIS_MT, th_mt,
      BLIS_NT, th_nt,
      BLIS_KT, th_kt,
      cntx
    );

    // Install the optimized small/unpacked gemm kernels, one per storage
    // combination, for double precision.
    bli_cntx_set_l3_sup_kers
    (
      8,
      BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE,
      BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8m, TRUE,
      BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE,
      BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE,
      BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE,
      BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8n, TRUE,
      BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE,
      BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE,
      cntx
    );

    // Sup blocksizes. The same scratch slots are overwritten here; the
    // native values were already passed to bli_cntx_set_blkszs() above.
    // NOTE(review): this relies on that call having copied the values into
    // the context rather than keeping the pointers -- confirm.
    //                            s      d      c      z
    bli_blksz_init_easy( bs_mr,   -1,     6,    -1,    -1 );
    bli_blksz_init_easy( bs_nr,   -1,     8,    -1,    -1 );
    bli_blksz_init_easy( bs_mc,   -1,   240,    -1,    -1 );
    bli_blksz_init_easy( bs_kc,   -1,  1024,    -1,    -1 );
    bli_blksz_init_easy( bs_nc,   -1,  3072,    -1,    -1 );

    // Hand the register and cache blocksizes for small/unpacked level-3
    // problems to the context.
    bli_cntx_set_l3_sup_blkszs
    (
      5,
      BLIS_NC, bs_nc,
      BLIS_KC, bs_kc,
      BLIS_MC, bs_mc,
      BLIS_NR, bs_nr,
      BLIS_MR, bs_mr,
      cntx
    );
}

Loading

0 comments on commit 8de8dc2

Please sign in to comment.