Performance fix for 5.1 and updated changelog (#391)
* Getrf tuning3 (tuning for npvt and batch cases) (#383)
* update changelog
jzuniga-amd authored Mar 16, 2022
1 parent 55761c3 commit 0e9e2c9
Showing 8 changed files with 980 additions and 187 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
@@ -2,13 +2,17 @@

Full documentation for rocSOLVER is available at [rocsolver.readthedocs.io](https://rocsolver.readthedocs.io/en/latest/).

## (Unreleased) rocSOLVER
## rocSOLVER 3.17.0 for ROCm 5.1.0
### Optimized
- Optimized non-pivoting and batch cases of the LU factorization

### Fixed
- Fixed missing synchronization in SYTRF with `rocblas_fill_lower` that could potentially
  result in incorrect pivot values.
- Fixed multi-level logging output to file with the `ROCSOLVER_LOG_PATH`,
  `ROCSOLVER_LOG_TRACE_PATH`, `ROCSOLVER_LOG_BENCH_PATH` and `ROCSOLVER_LOG_PROFILE_PATH`
  environment variables.
- Fixed performance regression in the batched LU factorization of tiny matrices


## rocSOLVER 3.16.0 for ROCm 5.0.0
6 changes: 3 additions & 3 deletions clients/gtest/memory_model_gtest.cpp
@@ -56,13 +56,13 @@ class checkin_misc_MEMORY_MODEL : public ::testing::Test

const rocblas_int m = 1500;
const rocblas_int n = 1500;
const rocblas_int m_small = 750;
const rocblas_int n_small = 750;
const rocblas_int m_small = 65;
const rocblas_int n_small = 65;
const rocblas_int lda = m;
const rocblas_stride stA = lda * n;
const rocblas_stride stP = n;
const rocblas_int bc = 8;
const rocblas_int bc_small = 8;
const rocblas_int bc_small = 5;
};

/*************************************/
43 changes: 29 additions & 14 deletions library/src/include/ideal_sizes.hpp
@@ -184,22 +184,37 @@

/**************************** getf2/getrf *************************************
*******************************************************************************/
#define GETF2_MAX_COLS 64 //always <= wavefront size
#define GETF2_MAX_THDS 64
#define GETF2_SPKER_MAX_M 1024 //always <= 1024
#define GETF2_SPKER_MAX_N 256 //always <= 256
#define GETF2_SSKER_MAX_M 512 //always <= 512 and <= GETF2_SPKER_MAX_M
#define GETF2_SSKER_MAX_N 64 //always <= wavefront and <= GETF2_SPKER_MAX_N
#define GETF2_OPTIM_NGRP \
16, 15, 8, 8, 8, 8, 8, 8, 6, 6, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
#define GETRF_NUM_INTERVALS 4
#define GETRF_INTERVALS 64, 512, 1536, 4096
#define GETRF_BLKSIZES 0, 1, 32, 128, 384
#define GETRF_BATCH_NUM_INTERVALS 3
#define GETRF_BATCH_INTERVALS 52, 148, 1376
#define GETRF_BATCH_BLKSIZES 0, 16, 32, 288
#define GETRF_NPVT_NUM_INTERVALS 2
#define GETRF_NPVT_INTERVALS 65, 1536
#define GETRF_NPVT_BLKSIZES 0, 32, 256
#define GETRF_NPVT_BATCH_NUM_INTERVALS 3
#define GETRF_NPVT_BATCH_INTERVALS 33, 148, 1216
#define GETRF_NPVT_BATCH_BLKSIZES 0, 16, 32, 256
#define GETRF_NUM_INTERVALS_REAL 4
#define GETRF_INTERVALS_REAL 64, 512, 1856, 2944
#define GETRF_BLKSIZES_REAL 0, 1, 32, 256, 512
#define GETRF_BATCH_NUM_INTERVALS_REAL 9
#define GETRF_BATCH_INTERVALS_REAL 40, 42, 46, 49, 52, 58, 112, 800, 1024
#define GETRF_BATCH_BLKSIZES_REAL 0, 32, 0, 16, 0, 32, 1, 32, 64, 160
#define GETRF_NPVT_NUM_INTERVALS_REAL 2
#define GETRF_NPVT_INTERVALS_REAL 64, 512
#define GETRF_NPVT_BLKSIZES_REAL 0, -1, 512
#define GETRF_NPVT_BATCH_NUM_INTERVALS_REAL 6
#define GETRF_NPVT_BATCH_INTERVALS_REAL 40, 168, 448, 512, 896, 1408
#define GETRF_NPVT_BATCH_BLKSIZES_REAL 0, -24, -32, -64, 32, 96, 512

#define GETRF_NUM_INTERVALS_COMPLEX 4
#define GETRF_INTERVALS_COMPLEX 64, 512, 1024, 2944
#define GETRF_BLKSIZES_COMPLEX 0, 1, 32, 96, 512
#define GETRF_BATCH_NUM_INTERVALS_COMPLEX 10
#define GETRF_BATCH_INTERVALS_COMPLEX 23, 28, 30, 32, 40, 48, 56, 64, 768, 1024
#define GETRF_BATCH_BLKSIZES_COMPLEX 0, 16, 0, 1, 24, 16, 24, 16, 48, 64, 160
#define GETRF_NPVT_NUM_INTERVALS_COMPLEX 2
#define GETRF_NPVT_INTERVALS_COMPLEX 64, 512
#define GETRF_NPVT_BLKSIZES_COMPLEX 0, -1, 512
#define GETRF_NPVT_BATCH_NUM_INTERVALS_COMPLEX 5
#define GETRF_NPVT_BATCH_INTERVALS_COMPLEX 20, 32, 42, 512, 1408
#define GETRF_NPVT_BATCH_BLKSIZES_COMPLEX 0, -16, -32, -48, 64, 128

/****************************** getri *****************************************
*******************************************************************************/
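The GETRF_*_INTERVALS_* and GETRF_*_BLKSIZES_* pairs above are tuning tables: each block-size list has one more entry than its interval list, and the entry used depends on which interval the matrix dimension falls into. The following is only a minimal sketch of that lookup, not rocSOLVER's actual code; the function names, the strict '>' boundary test, and the direct return of the raw table value (0 and negative entries have solver-internal meanings) are assumptions for illustration.

// Minimal sketch (assumed behavior, not rocSOLVER's implementation): select a
// tuning entry for a given matrix dimension from an interval table and a
// block-size table holding one more entry than the interval table.
#include <cstdio>
#include <initializer_list>

static int pick_block_size(const int* intervals, const int* blksizes, int num_intervals, int dim)
{
    int k = 0;
    while(k < num_intervals && dim > intervals[k]) // '>' boundary test is an assumption
        ++k;
    return blksizes[k]; // blksizes has num_intervals + 1 entries
}

int main()
{
    // values copied from GETRF_INTERVALS_REAL / GETRF_BLKSIZES_REAL above
    const int intervals[] = {64, 512, 1856, 2944};
    const int blksizes[] = {0, 1, 32, 256, 512};
    for(int dim : {32, 300, 1024, 2048, 4096})
        std::printf("dim %4d -> table entry %d\n", dim, pick_block_size(intervals, blksizes, 4, dim));
    return 0;
}

Under this reading, small problems (dim <= 64) select the first entry and the largest problems select the last; the batched, non-pivoting, and complex tables would be consumed the same way with their own boundaries.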
81 changes: 72 additions & 9 deletions library/src/include/lapack_device_functions.hpp
@@ -1,5 +1,5 @@
/* ************************************************************************
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc.
* Copyright (c) 2019-2022 Advanced Micro Devices, Inc.
* ************************************************************************ */

#pragma once
@@ -882,7 +882,7 @@ ROCSOLVER_KERNEL void gemm_kernel(const rocblas_int m,
}

/** Optimized kernel that solves a simple triangular system B <- Ax=B
with A unit matrix. A and B are sub blocks of the same matrix MM with
with A unit lower triangular matrix. A and B are sub blocks of the same matrix MM with
leading dimension ldim and stride. A and B are
located in MM by their respective shifts.
@@ -892,13 +892,13 @@ ROCSOLVER_KERNEL void gemm_kernel(const rocblas_int m,
Size of shared memory per group should be:
lmemsize = hipBlockDim_y * sizeof(T); **/
template <typename T, typename U>
ROCSOLVER_KERNEL void trsm2_kernel(const rocblas_int m,
                                   const rocblas_int n,
                                   U MM,
                                   const rocblas_int shiftA,
                                   const rocblas_int shiftB,
                                   const rocblas_int ldim,
                                   const rocblas_stride stride)
ROCSOLVER_KERNEL void trsm2_lower_kernel(const rocblas_int m,
                                         const rocblas_int n,
                                         U MM,
                                         const rocblas_int shiftA,
                                         const rocblas_int shiftB,
                                         const rocblas_int ldim,
                                         const rocblas_stride stride)
{
    int id = hipBlockIdx_z;
    int i = hipThreadIdx_x;
@@ -934,3 +934,66 @@ ROCSOLVER_KERNEL void trsm2_kernel(const rocblas_int m,
        B[i + j * ldim] = c;
    }
}
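For reference, the operation described above (B <- A\B with A unit lower triangular, applied to each right-hand side of B) is plain forward substitution; the device kernel parallelizes it across right-hand sides, rows, and batch instances. Below is a minimal host-side sketch of the same update, with illustrative names and the same column-major i + j * ld indexing as the kernel; it is not library code.

// Host-side reference sketch of the unit-lower-triangular solve B <- A\B
// (column-major storage; A has an implicit unit diagonal, so no division occurs).
template <typename T>
void trsm_unit_lower_reference(int m, int n, const T* A, int lda, T* B, int ldb)
{
    for(int j = 0; j < n; ++j)        // each column of B is an independent system
        for(int k = 0; k < m; ++k)    // forward substitution: row k is now solved
            for(int i = k + 1; i < m; ++i)
                B[i + j * ldb] -= A[i + k * lda] * B[k + j * ldb];
}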

/** Optimized kernel that solves a simple triangular system B <- xA=B
with A non-unit upper triangular matrix. A and B are sub blocks of the same matrix MM with
leading dimension ldim and stride. A and B are
located in MM by their respective shifts.
Call this kernel with 'batch_count' groups in z, and enough
groups in x to cover all the 'm' right-hand-sides (rows of B).
There should be only one group in y with hipBlockDim_y = n.
Size of shared memory per group should be:
lmemsize = hipBlockDim_x * sizeof(T); **/
template <typename T, typename U>
ROCSOLVER_KERNEL void trsm2_upper_kernel(const rocblas_int m,
                                         const rocblas_int n,
                                         U MM,
                                         const rocblas_int shiftA,
                                         const rocblas_int shiftB,
                                         const rocblas_int ldim,
                                         const rocblas_stride stride)
{
    int id = hipBlockIdx_z;
    int j = hipThreadIdx_y;
    int tx = hipThreadIdx_x;
    int bdx = hipBlockDim_x;
    int i = hipBlockIdx_x * bdx + tx;

    // batch instance
    T* A = load_ptr_batch(MM, id, shiftA, stride);
    T* B = load_ptr_batch(MM, id, shiftB, stride);

    // shared mem setup
    extern __shared__ double lmem[];
    T* b = reinterpret_cast<T*>(lmem);
    T c, d;

    if(i < m)
    {
        // read data
        c = B[i + j * ldim];

        // solve for right-hand sides
        for(int k = 0; k < n - 1; ++k)
        {
            __syncthreads();
            if(j == k)
            {
                d = A[j + j * ldim];
                c = d != 0 ? c / d : c;
                b[tx] = c;
            }
            __syncthreads();
            c -= (j > k) ? A[k + j * ldim] * b[tx] : 0;
        }
        if(j == n - 1)
        {
            d = A[j + j * ldim];
            c = d != 0 ? c / d : c;
        }

        // move results back to global
        B[i + j * ldim] = c;
    }
}
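Per the doc comment above, trsm2_upper_kernel expects batch_count groups in z, enough groups in x to cover the m rows of B, a single group in y with hipBlockDim_y = n, and hipBlockDim_x * sizeof(T) bytes of dynamic shared memory. The wrapper below is only a sketch of such a launch: the helper name, the choice of 32 threads in x, and the stream argument are illustrative assumptions; the kernel name, its argument order, and hipLaunchKernelGGL come from the surrounding code and the HIP runtime.

// Hedged launch sketch (illustrative wrapper, not part of this change):
//   grid  = (ceil(m / bdx), 1, batch_count)   x covers the rows of B, z the batch
//   block = (bdx, n, 1)                       y must equal n
//   lmem  = bdx * sizeof(T)                   one shared value per x-thread
// Assumes bdx * n stays within the device's threads-per-block limit.
template <typename T, typename U>
void launch_trsm2_upper(rocblas_int m, rocblas_int n, U MM,
                        rocblas_int shiftA, rocblas_int shiftB,
                        rocblas_int ldim, rocblas_stride stride,
                        rocblas_int batch_count, hipStream_t stream)
{
    constexpr rocblas_int bdx = 32; // illustrative block width in x
    dim3 grid((m - 1) / bdx + 1, 1, batch_count);
    dim3 block(bdx, n, 1);
    size_t lmemsize = bdx * sizeof(T);

    hipLaunchKernelGGL((trsm2_upper_kernel<T>), grid, block, lmemsize, stream, m, n, MM,
                       shiftA, shiftB, ldim, stride);
}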