Skip to content

Commit

Permalink
rendersub: port the subsample code to metal blend
Browse files Browse the repository at this point in the history
  • Loading branch information
galad87 committed Jan 29, 2025
1 parent 88b548a commit 992c94c
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 76 deletions.
4 changes: 2 additions & 2 deletions libhb/common.c
Original file line number Diff line number Diff line change
Expand Up @@ -6296,7 +6296,7 @@ hb_csp_convert_f hb_get_rgb2yuv_function(int color_matrix)
return hb_rgb2yuv_bt709;
}

void hb_compute_chroma_smoothing_coefficient(unsigned chroma_coeffs[2][4], int pix_fmt, int chroma_location)
void hb_compute_chroma_smoothing_coefficient(uint32_t chroma_coeffs[2][4], int pix_fmt, int chroma_location)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);

Expand Down Expand Up @@ -6324,7 +6324,7 @@ void hb_compute_chroma_smoothing_coefficient(unsigned chroma_coeffs[2][4], int p
break;
}

const unsigned base_coefficients[] = {1, 3, 9, 27, 9, 3, 1};
const uint32_t base_coefficients[] = {1, 3, 9, 27, 9, 3, 1};
// If wZ is even, an intermediate value is interpolated for symmetry.
for (int x = 0; x < 4; x++)
{
Expand Down
67 changes: 46 additions & 21 deletions libhb/platform/macosx/blend_vt.m
Original file line number Diff line number Diff line change
Expand Up @@ -16,23 +16,28 @@

struct mtl_blend_params
{
uint x;
uint y;
uint16_t x;
uint16_t y;
uint16_t xc;
uint16_t yc;
uint16_t width;
uint16_t height;
};

struct hb_blend_private_s
{
hb_metal_context_t *mtl;

id<MTLTexture> overlays[4];
id<MTLBuffer> chroma_coeffs;

const AVPixFmtDescriptor *in_desc;
const AVPixFmtDescriptor *overlay_desc;

int subw;
int subh;
int overlay_subw;
int overlay_subh;
uint16_t subw;
uint16_t subh;
uint16_t overlay_subw;
uint16_t overlay_subh;
};

static int hb_blend_vt_init(hb_blend_object_t *object,
Expand Down Expand Up @@ -77,9 +82,9 @@ static int hb_blend_vt_init(hb_blend_object_t *object,
pv->in_desc = av_pix_fmt_desc_get(in_pix_fmt);
pv->overlay_desc = av_pix_fmt_desc_get(overlay_pix_fmt);

int depth = pv->in_desc->comp[0].depth == 8 ? 8 : 16;
int shift = depth - 8;
int max_value = (1 << depth) - 1;
uint16_t depth = pv->in_desc->comp[0].depth == 8 ? 8 : 16;
uint16_t shift = depth - 8;
uint32_t max_value = (1 << depth) - 1;

pv->subw = pv->in_desc->log2_chroma_w;
pv->subh = pv->in_desc->log2_chroma_h;
Expand All @@ -99,17 +104,17 @@ static int hb_blend_vt_init(hb_blend_object_t *object,
return -1;
}

uint32_t plane = 0;
uint32_t channels = 1;
uint16_t plane = 0;
uint16_t channels = 1;
MTLFunctionConstantValues *constant_values = [MTLFunctionConstantValues new];

[constant_values setConstantValue:&plane type:MTLDataTypeUInt withName:@"plane"];
[constant_values setConstantValue:&channels type:MTLDataTypeUInt withName:@"channels"];
[constant_values setConstantValue:&pv->subw type:MTLDataTypeUInt withName:@"subw"];
[constant_values setConstantValue:&pv->subh type:MTLDataTypeUInt withName:@"subh"];
[constant_values setConstantValue:&pv->overlay_subw type:MTLDataTypeUInt withName:@"osubw"];
[constant_values setConstantValue:&pv->overlay_subh type:MTLDataTypeUInt withName:@"osubh"];
[constant_values setConstantValue:&shift type:MTLDataTypeUInt withName:@"shift"];
[constant_values setConstantValue:&plane type:MTLDataTypeUShort withName:@"plane"];
[constant_values setConstantValue:&channels type:MTLDataTypeUShort withName:@"channels"];
[constant_values setConstantValue:&pv->subw type:MTLDataTypeUShort withName:@"subw"];
[constant_values setConstantValue:&pv->subh type:MTLDataTypeUShort withName:@"subh"];
[constant_values setConstantValue:&pv->overlay_subw type:MTLDataTypeUShort withName:@"osubw"];
[constant_values setConstantValue:&pv->overlay_subh type:MTLDataTypeUShort withName:@"osubh"];
[constant_values setConstantValue:&shift type:MTLDataTypeUShort withName:@"shift"];
[constant_values setConstantValue:&max_value type:MTLDataTypeUInt withName:@"maxv"];
[constant_values setConstantValue:&needs_subsample type:MTLDataTypeBool withName:@"subsample"];

Expand All @@ -122,8 +127,8 @@ static int hb_blend_vt_init(hb_blend_object_t *object,

plane = 1;
channels = 2;
[constant_values setConstantValue:&plane type:MTLDataTypeUInt withName:@"plane"];
[constant_values setConstantValue:&channels type:MTLDataTypeUInt withName:@"channels"];
[constant_values setConstantValue:&plane type:MTLDataTypeUShort withName:@"plane"];
[constant_values setConstantValue:&channels type:MTLDataTypeUShort withName:@"channels"];

if (hb_metal_add_pipeline(pv->mtl, "blend", constant_values, pv->mtl->pipelines_count))
{
Expand Down Expand Up @@ -153,6 +158,19 @@ static int hb_blend_vt_init(hb_blend_object_t *object,
[descriptor release];
}

// Metal buffer holding the chroma smoothing weights: 2 rows x 4 uint32_t values,
// the same layout as the chroma_coeffs[2][4] parameter of
// hb_compute_chroma_smoothing_coefficient().
NSUInteger length = 4 * 2 * sizeof(uint32_t);
pv->chroma_coeffs = [pv->mtl->device newBufferWithLength:length
                                                 options:MTLResourceStorageModeManaged];
if (pv->chroma_coeffs == nil)
{
    hb_error("blend_vt: failed to create Metal buffers");
    // Fail the init like the other error paths of this function: messaging a
    // nil buffer yields a NULL contents pointer, which must not be handed to
    // the coefficient computation below.
    return -1;
}

hb_compute_chroma_smoothing_coefficient(pv->chroma_coeffs.contents,
                                        in_pix_fmt,
                                        in_chroma_location);
// The buffer uses managed storage, so report the CPU-side write to Metal.
[pv->chroma_coeffs didModifyRange:NSMakeRange(0, length)];

return 0;
}

Expand Down Expand Up @@ -187,13 +205,18 @@ static void call_kernel(hb_blend_private_t *pv,
struct mtl_blend_params *params = (struct mtl_blend_params *)pv->mtl->params_buffer.contents;
params->x = x;
params->y = y;
params->xc = x & ~((1 << pv->subw) - 1);
params->yc = y & ~((1 << pv->subh) - 1);
params->width = width;
params->height = height;

[encoder setTexture:dst atIndex:0];
for (int i = 0; i < pv->overlay_desc->nb_components; i++)
{
[encoder setTexture:pv->overlays[i] atIndex:i + 1];
}
[encoder setBuffer:pv->mtl->params_buffer offset:0 atIndex:0];
[encoder setBuffer:pv->chroma_coeffs offset:0 atIndex:0];
[encoder setBuffer:pv->mtl->params_buffer offset:0 atIndex:1];

if (plane)
{
Expand Down Expand Up @@ -289,6 +312,8 @@ static void hb_blend_vt_close(hb_blend_object_t *metric)
[pv->overlays[i] release];
}

[pv->chroma_coeffs release];

hb_metal_context_close(&pv->mtl);

free(pv);
Expand Down
2 changes: 1 addition & 1 deletion libhb/platform/macosx/metal_utils.m
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ int hb_metal_add_pipeline(hb_metal_context_t *ctx, const char *function_name,
}
if (!ctx->functions[index])
{
hb_error("metal: failed to create Metal function");
hb_error("metal: failed to create Metal function: %s", err.description.UTF8String);
return -1;
}
ctx->pipelines[index] = [ctx->device newComputePipelineStateWithFunction:ctx->functions[index] error:&err];
Expand Down
158 changes: 106 additions & 52 deletions libhb/platform/macosx/shaders/blend_vt.metal
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,23 @@
* Parameters
*/

constant uint plane [[function_constant(0)]];
constant uint channels [[function_constant(1)]];
constant uint subw [[function_constant(2)]];
constant uint subh [[function_constant(3)]];
constant uint osubw [[function_constant(4)]];
constant uint osubh [[function_constant(5)]];
constant uint shift [[function_constant(6)]];
constant uint maxv [[function_constant(7)]];
constant bool subsample [[function_constant(8)]];
//constant uint chroma_loc [[function_constant(9)]];
// Function constants, fixed when a pipeline is compiled.
//   plane      0 selects the luma path in the blend kernel, any other value the chroma path
//   channels   multiplier applied to the destination chroma x coordinate
//              (the V sample is read/written at that position + 1)
//   subw/subh  right shifts applied to x/y to address the destination chroma samples
//   osubw/osubh right shifts applied to x/y to address the overlay chroma samples
//   shift      left shift applied to every sample read from the overlay
//   maxv       value treated as full opacity; also the divisor of the blend
//   subsample  when true the chroma path uses blend_subsample_pixel_uv
// NOTE(review): the 16-bit/32-bit widths here have to match the data types the
// host passes when it sets each constant - confirm against the host code.
constant uint16_t plane [[function_constant(0)]];
constant uint16_t channels [[function_constant(1)]];
constant uint16_t subw [[function_constant(2)]];
constant uint16_t subh [[function_constant(3)]];
constant uint16_t osubw [[function_constant(4)]];
constant uint16_t osubh [[function_constant(5)]];
constant uint16_t shift [[function_constant(6)]];
constant uint32_t maxv [[function_constant(7)]];
constant bool subsample [[function_constant(8)]];

struct params {
uint x;
uint y;
uint16_t x;
uint16_t y;
uint16_t xc;
uint16_t yc;
uint16_t width;
uint16_t height;
};

using namespace metal;
Expand All @@ -38,47 +41,44 @@ using namespace metal;
* Blend helpers
*/

#define accesstype access::sample
constexpr sampler s(coord::pixel);

// Alpha-blends one overlay sample over one destination sample.
// y_out is the current destination value, y_in the overlay value and a_in the
// overlay alpha, where maxv stands for fully opaque. The result is
// (y_out * (maxv - a_in) + y_in * a_in) / maxv, converted back to T.
template <typename T>
T blend_pixel(T y_out, T y_in, T a_in)
{
    // Contribution kept from the destination, weighted by the inverse alpha.
    const auto kept_dst = (uint32_t)y_out * (maxv - a_in);
    // Contribution taken from the overlay, weighted by the alpha.
    const auto added_src = (uint32_t)y_in * a_in;

    return (kept_dst + added_src) / maxv;
}

template <typename T>
T pos_dst_y(T pos, uint x, uint y)
T pos_dst_y(T pos, uint16_t x, uint16_t y)
{
return ushort2(pos.x + x, pos.y + y);
return T(pos.x + x, pos.y + y);
}

template <typename T>
T pos_dst_u(T pos, uint x, uint y)
T pos_dst_u(T pos, uint16_t x, uint16_t y)
{
return ushort2((pos.x + (x >> subw)) * channels,
pos.y + (y >> subh));
return T((pos.x + (x >> subw)) * channels,
pos.y + (y >> subh));
}

template <typename T>
T pos_dst_v(T pos, uint x, uint y)
T pos_dst_v(T pos, uint16_t x, uint16_t y)
{
return ushort2((pos.x + (x >> subw)) * channels + 1,
pos.y + (y >> subh));
return T((pos.x + (x >> subw)) * channels + 1,
pos.y + (y >> subh));
}

template <typename T>
float2 pos_uv_subsample(T pos, uint x, uint y)
T pos_uv_subsample(T pos, uint16_t x, uint16_t y)
{
uint uvsubw = subw - osubw;
uint uvsubh = subh - osubh;
uint16_t uvsubw = subw - osubw;
uint16_t uvsubh = subh - osubh;

return float2((pos.x << uvsubw) + (x >> osubw),
(pos.y << uvsubh) + (y >> osubh));
return T((pos.x << uvsubw) + (x >> osubw),
(pos.y << uvsubh) + (y >> osubh));
}

template <typename T>
T pos_a(T pos, uint x, uint y)
T pos_a(T pos, uint16_t x, uint16_t y)
{
return ushort2((pos.x << subw) + x, (pos.y << subh) + y);
}
Expand All @@ -103,25 +103,15 @@ T blend_pixel_y(
template <typename T, typename V>
V blend_pixel_uv(
texture2d<T, access::read_write> dst,
texture2d<T, access::sample> overlay_u,
texture2d<T, access::sample> overlay_v,
texture2d<T, access::read> overlay_u,
texture2d<T, access::read> overlay_v,
texture2d<T, access::read> overlay_a,
constant params& p,
ushort2 pos)
{
T u_in;
T v_in;

if (subsample) {
float2 pos_uv = pos_uv_subsample(pos, p.x, p.y);
u_in = overlay_u.sample(s, pos_uv).x << shift;
v_in = overlay_v.sample(s, pos_uv).x << shift;
}
else {
ushort2 pos_uv = ushort2(pos.x + (p.x >> osubw), pos.y + (p.y >> osubh));
u_in = overlay_u.read(pos_uv).x << shift;
v_in = overlay_v.read(pos_uv).x << shift;
}
ushort2 pos_uv = ushort2(pos.x + (p.x >> osubw), pos.y + (p.y >> osubh));
T u_in = overlay_u.read(pos_uv).x << shift;
T v_in = overlay_v.read(pos_uv).x << shift;

T u_out = dst.read(pos_dst_u(pos, p.x, p.y)).x;
T v_out = dst.read(pos_dst_v(pos, p.x, p.y)).x;
Expand All @@ -131,17 +121,73 @@ V blend_pixel_uv(
blend_pixel(v_out, v_in, a_in));
}

// Blends one destination chroma sample (a U/V pair) with the overlay.
//
// For each of the (1 << subw) x (1 << subh) positions visited by the loops
// below, the destination U/V value is blended with the overlay U/V sample
// using the overlay alpha; the per-position results are then combined into a
// weighted average.
//
// dst            destination texture; only read here, the caller writes the result
// overlay_u/v/a  overlay chroma and alpha textures; samples are shifted left by `shift`
// chroma_coeffs  weights: entries [0..3] are indexed by xz, entries [4..7] by yz
// p              blend parameters; xc/yc are used as the origin for all reads,
//                x/y/width/height only in the overlap test
//                (presumably xc/yc are x/y aligned to the chroma grid by the host - TODO confirm)
// pos            thread position in the grid
//
// Returns V(u, v): the blended pair, or the unmodified destination pair when
// the accumulated weight is zero.
template <typename T, typename V>
V blend_subsample_pixel_uv(
texture2d<T, access::read_write> dst,
texture2d<T, access::read> overlay_u,
texture2d<T, access::read> overlay_v,
texture2d<T, access::read> overlay_a,
constant uint *chroma_coeffs,
constant params& p,
ushort2 pos)
{
// Current destination chroma values, read once and reused for every position.
T u_out = dst.read(pos_dst_u(pos, p.xc, p.yc)).x;
T v_out = dst.read(pos_dst_v(pos, p.xc, p.yc)).x;

// Perform chromaloc-aware subsampling and blending
// accu_a / accu_b accumulate the weighted U / V results, accu_c the total weight.
uint32_t accu_a = 0, accu_b = 0, accu_c = 0;
ushort2 pos_uv = pos_uv_subsample(pos, p.xc, p.yc);

for (uint16_t yz = 0; yz < 1 << subh; yz++) {
for (uint16_t xz = 0; xz < 1 << subw; xz++) {
// Weight of the current chroma sample
// (product of the xz-indexed and the yz-indexed coefficient)
uint32_t coeff = *(chroma_coeffs + xz) * *(chroma_coeffs + 4 + yz);
// Start from the destination value; it is kept as is when the test below fails.
uint32_t res_u = u_out;
uint32_t res_v = v_out;

// Chroma sampled area overlap with bitmap
// NOTE(review): && binds tighter than ||, so this evaluates as
// (x-test) || (y-test), and it does not depend on xz/yz.
// Confirm whether both axes were meant to be required (&&).
if ((pos.x > 0 || p.xc == p.x) && pos.x < p.width ||
(pos.y > 0 || p.yc == p.y) && pos.y < p.height) {
ushort2 offset = ushort2(xz, yz);
T a_in = overlay_a.read(pos_a(pos, p.xc, p.yc) + offset).x << shift;
T u_in = overlay_u.read(pos_uv + offset).x << shift;
T v_in = overlay_v.read(pos_uv + offset).x << shift;

// Alpha blend; adding maxv >> 1 before the division rounds to nearest.
res_u *= (maxv - a_in);
res_u = (res_u + ((uint32_t)u_in) * a_in + (maxv >> 1)) / maxv;

res_v *= (maxv - a_in);
res_v = (res_v + ((uint32_t)v_in) * a_in + (maxv >> 1)) / maxv;
}

// Accumulate
accu_a += coeff * res_u;
accu_b += coeff * res_v;
accu_c += coeff;
}
}

// Weighted average, rounded to nearest; skipped when the total weight is zero
// to avoid dividing by zero.
if (accu_c) {
return V((accu_a + (accu_c >> 1)) / accu_c,
(accu_b + (accu_c >> 1)) / accu_c);
}
else {
return V(u_out, v_out);
}
}

/*
* Kernel dispatch
*/

kernel void blend(
texture2d<ushort, access::read_write> dst [[texture(0)]],
texture2d<ushort, access::read> overlay_y [[texture(1)]],
texture2d<ushort, access::sample> overlay_u [[texture(2)]],
texture2d<ushort, access::sample> overlay_v [[texture(3)]],
texture2d<ushort, access::read> overlay_a [[texture(4)]],
constant params& p [[buffer(0)]],
texture2d<ushort, access::read> overlay_y [[texture(1)]],
texture2d<ushort, access::read> overlay_u [[texture(2)]],
texture2d<ushort, access::read> overlay_v [[texture(3)]],
texture2d<ushort, access::read> overlay_a [[texture(4)]],
constant uint *chroma_coeffs [[buffer(0)]],
constant params& p [[buffer(1)]],
ushort2 pos [[thread_position_in_grid]])
{
if (plane == 0) {
Expand All @@ -150,9 +196,17 @@ kernel void blend(
dst.write(value, pos_dst_y(pos, p.x, p.y));
}
else {
ushort2 value = blend_pixel_uv<ushort, ushort2>(dst, overlay_u,
overlay_v, overlay_a,
p, pos);
ushort2 value;
if (subsample) {
value = blend_subsample_pixel_uv<ushort, ushort2>(dst, overlay_u,
overlay_v, overlay_a,
chroma_coeffs, p, pos);
}
else {
value = blend_pixel_uv<ushort, ushort2>(dst, overlay_u,
overlay_v, overlay_a,
p, pos);
}
dst.write(value.x, pos_dst_u(pos, p.x, p.y));
dst.write(value.y, pos_dst_v(pos, p.x, p.y));
}
Expand Down

0 comments on commit 992c94c

Please sign in to comment.