Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added vmaf_cuda_fex_synchronize, fixed cuda fex flush functions #1332

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 27 additions & 9 deletions libvmaf/src/feature/cuda/integer_adm_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include "picture_cuda.h"
#include <unistd.h>
#include <assert.h>
#include "nvtx3/nvToolsExt.h"
MorkTheOrk marked this conversation as resolved.
Show resolved Hide resolved

#define RES_BUFFER_SIZE 4 * 3 * 2

Expand All @@ -54,7 +55,7 @@ typedef struct AdmStateCuda {
int dst_stride, CUstream c_stream);
CUstream str, host_stream;
void* write_score_parameters;
CUevent ref_event, dis_event, finished;
CUevent ref_event, dis_event, finished, write_scores;
VmafDictionary *feature_name_dict;

// adm_dwt kernels
Expand Down Expand Up @@ -641,7 +642,7 @@ typedef struct write_score_parameters_adm {

static int write_scores(write_score_parameters_adm* params)
{

nvtxRangePushA("write_scores ADM");
VmafFeatureCollector *feature_collector = params->feature_collector;
AdmStateCuda *s = params->s;
unsigned index = params->index;
Expand Down Expand Up @@ -714,7 +715,12 @@ static int write_scores(write_score_parameters_adm* params)
s->feature_name_dict, "integer_adm_scale3", scores[6] / scores[7],
index);

if (!s->debug) return err;
if (!s->debug) {

nvtxRangePop();
return err;
}


err |= vmaf_feature_collector_append_with_dict(feature_collector,
s->feature_name_dict, "integer_adm", score, index);
Expand Down Expand Up @@ -748,7 +754,7 @@ static int write_scores(write_score_parameters_adm* params)

err |= vmaf_feature_collector_append_with_dict(feature_collector,
s->feature_name_dict, "integer_adm_den_scale3", scores[7], index);

nvtxRangePop();
return err;
}

Expand Down Expand Up @@ -1014,9 +1020,10 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt
CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
CHECK_CUDA(cuStreamCreateWithPriority(&s->str, CU_STREAM_NON_BLOCKING, 0));
CHECK_CUDA(cuStreamCreateWithPriority(&s->host_stream, CU_STREAM_NON_BLOCKING, 0));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));
CHECK_CUDA(cuEventCreate(&s->ref_event, CU_EVENT_DEFAULT));
CHECK_CUDA(cuEventCreate(&s->dis_event, CU_EVENT_DEFAULT));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuEventCreate(&s->ref_event, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuEventCreate(&s->dis_event, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuEventCreate(&s->write_scores, CU_EVENT_DISABLE_TIMING));


CUmodule adm_cm_module, adm_csf_den_module, adm_csf_module, adm_decouple_module, adm_dwt_module;
Expand Down Expand Up @@ -1157,7 +1164,9 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex,
// CHECK_CUDA(cuEventSynchronize(s->finished));
CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
CHECK_CUDA(cuEventDestroy(s->finished));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));
CHECK_CUDA(cuEventDestroy(s->write_scores));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuEventCreate(&s->write_scores, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuCtxPopCurrent(NULL));

// current implementation is limited by the 16-bit data pipeline, thus
Expand All @@ -1178,6 +1187,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex,
data->w = ref_pic->w[0];
CHECK_CUDA(cuStreamWaitEvent(s->host_stream, s->finished, CU_EVENT_WAIT_DEFAULT));
CHECK_CUDA(cuLaunchHostFunc(s->host_stream, (CUhostFn)write_scores, data));
CHECK_CUDA(cuEventRecord(s->write_scores, s->host_stream));
return 0;
}

Expand Down Expand Up @@ -1220,10 +1230,18 @@ static int close_fex_cuda(VmafFeatureExtractor *fex)
static int flush_fex_cuda(VmafFeatureExtractor *fex,
VmafFeatureCollector *feature_collector)
{
nvtxRangePushA("flush ADM");
AdmStateCuda *s = fex->priv;
int ret = 0;
CHECK_CUDA(cuStreamSynchronize(s->str));
CHECK_CUDA(cuStreamSynchronize(s->host_stream));
return 0;
while (cuEventQuery(s->write_scores) != CUDA_SUCCESS)
{
continue;
}
CHECK_CUDA(cuEventSynchronize(s->write_scores));
nvtxRangePop();
return (ret < 0) ? ret : !ret;
}

static const char *provided_features[] = {
Expand Down
71 changes: 48 additions & 23 deletions libvmaf/src/feature/cuda/integer_motion_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,10 @@
#include "picture.h"
#include "picture_cuda.h"
#include "cuda_helper.cuh"
#include "nvtx3/nvToolsExt.h"

typedef struct MotionStateCuda {
CUevent event, finished;
CUevent event, finished, scores_written;
CUfunction funcbpc8, funcbpc16;
CUstream str, host_stream;
VmafCudaBuffer* blur[2];
Expand All @@ -44,6 +45,8 @@ typedef struct MotionStateCuda {
double score;
bool debug;
bool motion_force_zero;
bool flushed;
bool closed;
void (*calculate_motion_score)(const VmafPicture* src, VmafCudaBuffer* src_blurred,
const VmafCudaBuffer* prev_blurred, VmafCudaBuffer* sad,
unsigned width, unsigned height,
Expand Down Expand Up @@ -136,12 +139,15 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt
unsigned bpc, unsigned w, unsigned h)
{
MotionStateCuda *s = fex->priv;
s->flushed = true;
s->closed = false;

CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
CHECK_CUDA(cuStreamCreateWithPriority(&s->str, CU_STREAM_NON_BLOCKING, 0));
CHECK_CUDA(cuStreamCreateWithPriority(&s->host_stream, CU_STREAM_NON_BLOCKING, 0));
CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));
CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuEventCreate(&s->scores_written, CU_EVENT_BLOCKING_SYNC));

CUmodule module;
CHECK_CUDA(cuModuleLoadData(&module, src_motion_score_ptx));
Expand Down Expand Up @@ -202,22 +208,34 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt
return -ENOMEM;
}

// if called twice in a row, finalize FEX and close
static int flush_fex_cuda(VmafFeatureExtractor *fex,
VmafFeatureCollector *feature_collector)
{
nvtxRangePushA("FLUSH MOT");

MotionStateCuda *s = fex->priv;
int ret = 0;
CHECK_CUDA(cuStreamSynchronize(s->str));
CHECK_CUDA(cuStreamSynchronize(s->host_stream));

// Not required, write_scores takes care of this
// if (s->index > 0) {
// ret = vmaf_feature_collector_append(feature_collector,
// "VMAF_integer_feature_motion2_score",
// s->score, s->index);
// }

return 0;
if(!s->flushed) {
CHECK_CUDA(cuStreamSynchronize(s->str));
CHECK_CUDA(cuStreamSynchronize(s->host_stream));
while (cuEventQuery(s->scores_written) != CUDA_SUCCESS)
{
continue;
}
CHECK_CUDA(cuEventSynchronize(s->scores_written));
nvtxRangePop();
}
else {
if (s->index > 0 && !s->closed) {
ret = vmaf_feature_collector_append(feature_collector,
"VMAF_integer_feature_motion2_score",
s->score, s->index);
}
s->closed = true;
}
s->flushed = true;
return (ret < 0) ? ret : !ret;
}

static inline double normalize_and_scale_sad(uint64_t sad,
Expand All @@ -243,7 +261,7 @@ static int write_scores(write_score_parameters_moco* params)
}
if (err) return err;

if (params->index == 1)
if (params->index == 1)
return 0;

err = vmaf_feature_collector_append(feature_collector,
Expand All @@ -258,13 +276,16 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, VmafPicture *ref_pic,
VmafFeatureCollector *feature_collector)
{
MotionStateCuda *s = fex->priv;

if(s->closed) {
return -ESHUTDOWN; // TODO: proper error code here
}
s->flushed = false;
// this is done to ensure that the CPU does not overwrite the buffer params for 'write_scores
CHECK_CUDA(cuStreamSynchronize(s->str));
// CHECK_CUDA(cuEventSynchronize(s->finished));
CHECK_CUDA(cuEventSynchronize(s->finished));
CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
CHECK_CUDA(cuEventDestroy(s->finished));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));
// CHECK_CUDA(cuEventDestroy(s->finished));
// CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuCtxPopCurrent(NULL));

int err = 0;
Expand All @@ -287,10 +308,12 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, VmafPicture *ref_pic,
CHECK_CUDA(cuEventRecord(s->event, vmaf_cuda_picture_get_stream(ref_pic)));
// This event ensures the input buffer is consumed
CHECK_CUDA(cuStreamWaitEvent(s->str, s->event, CU_EVENT_WAIT_DEFAULT));
CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
CHECK_CUDA(cuEventDestroy(s->event));
CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT));
CHECK_CUDA(cuCtxPopCurrent(NULL));
// CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
// CHECK_CUDA(cuEventDestroy(s->event));
// CHECK_CUDA(cuEventDestroy(s->scores_written));
// CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DISABLE_TIMING));
// CHECK_CUDA(cuEventCreate(&s->scores_written, CU_EVENT_BLOCKING_SYNC));
// CHECK_CUDA(cuCtxPopCurrent(NULL));

if (index == 0) {
err = vmaf_feature_collector_append(feature_collector,
Expand All @@ -312,11 +335,13 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, VmafPicture *ref_pic,
CHECK_CUDA(cuStreamWaitEvent(s->host_stream, s->finished, CU_EVENT_WAIT_DEFAULT));

write_score_parameters_moco* params = s->write_score_parameters;
cuEventSynchronize(s->scores_written);
params->feature_collector = feature_collector;
params->h = ref_pic->h[0];
params->w = ref_pic->w[0];
params->index = index;
CHECK_CUDA(cuLaunchHostFunc(s->host_stream, write_scores, s->write_score_parameters));
CHECK_CUDA(cuEventRecord(s->scores_written, s->host_stream));
return 0;
}

Expand Down
34 changes: 27 additions & 7 deletions libvmaf/src/feature/cuda/integer_vif_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@
#include "cuda/integer_vif_cuda.h"
#include "picture_cuda.h"


#include "nvtx3/nvToolsExt.h"

#if ARCH_X86
#include "x86/vif_avx2.h"
#if HAVE_AVX512
Expand All @@ -42,7 +45,7 @@

typedef struct VifStateCuda {
VifBufferCuda buf;
CUevent event, finished;
CUevent event, finished, write_scores;
CUstream str, host_stream;
bool debug;
double vif_enhn_gain_limit;
Expand Down Expand Up @@ -101,7 +104,7 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt
CHECK_CUDA(cuStreamCreateWithPriority(&s->host_stream, CU_STREAM_NON_BLOCKING, 0));
CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));

CHECK_CUDA(cuEventCreate(&s->write_scores, CU_EVENT_DISABLE_TIMING));
// make this static
CUmodule filter1d_module;
CHECK_CUDA(cuModuleLoadData(&filter1d_module, src_filter1d_ptx));
Expand Down Expand Up @@ -346,6 +349,7 @@ typedef struct VifScore {

static int write_scores(write_score_parameters_vif* data)
{
nvtxRangePushA("write_scoes VIF");
VmafFeatureCollector *feature_collector = data->feature_collector;
VifStateCuda *s = data->s;
unsigned index = data->index;
Expand Down Expand Up @@ -380,7 +384,11 @@ static int write_scores(write_score_parameters_vif* data)
s->feature_name_dict, "VMAF_integer_feature_vif_scale3_score",
vif.scale[3].num / vif.scale[3].den, index);

if (!s->debug) return err;
if (!s->debug) {

nvtxRangePop();
return err;
}

const double score_num =
(double)vif.scale[0].num + (double)vif.scale[1].num +
Expand Down Expand Up @@ -433,7 +441,7 @@ static int write_scores(write_score_parameters_vif* data)
err |= vmaf_feature_collector_append_with_dict(feature_collector,
s->feature_name_dict, "integer_vif_den_scale3", vif.scale[3].den,
index);

nvtxRangePop();
return err;
}

Expand All @@ -454,7 +462,9 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex,
CHECK_CUDA(cuStreamSynchronize(s->str));
CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
CHECK_CUDA(cuEventDestroy(s->finished));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));
CHECK_CUDA(cuEventDestroy(s->write_scores));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuEventCreate(&s->write_scores, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuCtxPopCurrent(NULL));

CHECK_CUDA(cuMemsetD8Async(s->buf.accum_data->data, 0, sizeof(vif_accums) * 4, s->str));
Expand Down Expand Up @@ -496,7 +506,8 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex,
write_score_parameters_vif *data = s->buf.cpu_param_buf;
data->feature_collector = feature_collector;
data->index = index;
CHECK_CUDA(cuLaunchHostFunc(s->host_stream, write_scores, data));
CHECK_CUDA(cuLaunchHostFunc(s->host_stream, (CUhostFn)write_scores, data));
CHECK_CUDA(cuEventRecord(s->write_scores, s->host_stream));
return 0;
}

Expand Down Expand Up @@ -524,11 +535,20 @@ static int close_fex_cuda(VmafFeatureExtractor *fex)
static int flush_fex_cuda(VmafFeatureExtractor *fex,
VmafFeatureCollector *feature_collector)
{
nvtxRangePushA("flush VIF");
VifStateCuda *s = fex->priv;

int ret = 0;
CHECK_CUDA(cuStreamSynchronize(s->str));
CHECK_CUDA(cuStreamSynchronize(s->host_stream));
return 0;
while (cuEventQuery(s->write_scores) != CUDA_SUCCESS)
{
continue;
}
CHECK_CUDA(cuEventSynchronize(s->write_scores));
nvtxRangePop();

return (ret < 0) ? ret : !ret;
}

static const char *provided_features[] = {
Expand Down
14 changes: 14 additions & 0 deletions libvmaf/src/libvmaf.c
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,8 @@ static int flush_context(VmafContext *vmaf)
}

#ifdef HAVE_CUDA
vmaf_cuda_fex_synchronize(vmaf);
MorkTheOrk marked this conversation as resolved.
Show resolved Hide resolved
vmaf_cuda_fex_synchronize(vmaf);
if (vmaf->cuda.state.ctx) {
RegisteredFeatureExtractors rfe = vmaf->registered_feature_extractors;
for (unsigned i = 0; i < rfe.cnt; i++) {
Expand Down Expand Up @@ -761,6 +763,16 @@ int vmaf_score_at_index(VmafContext *vmaf, VmafModel *model, double *score,
if (err) {
err = vmaf_predict_score_at_index(model, vmaf->feature_collector, index,
score, true, 0);
// if(err) {
// // Error? Sync and try again
// vmaf_cuda_fex_synchronize(vmaf);
// err = vmaf_predict_score_at_index(model, vmaf->feature_collector, index,
// score, true, 0);
// if(err == 0) {
// // No error - got score
// return 0;
// }
// }
}

return err;
Expand Down Expand Up @@ -789,6 +801,8 @@ int vmaf_feature_score_pooled(VmafContext *vmaf, const char *feature_name,
if (index_low > index_high) return -EINVAL;
if (!pool_method) return -EINVAL;

// vmaf_cuda_fex_synchronize(vmaf);

unsigned pic_cnt = 0;
double min = 0., max = 0., sum = 0., i_sum = 0.;
for (unsigned i = index_low; i <= index_high; i++) {
Expand Down