x264 source for verification 2026-05-22
This commit is contained in:
265
common/opencl/bidir.cl
Normal file
265
common/opencl/bidir.cl
Normal file
@@ -0,0 +1,265 @@
|
||||
/* Mode selection routines, select the least SATD cost mode for each lowres
|
||||
* macroblock. When measuring B slices, this includes measuring the cost of
|
||||
* three bidir modes. */
|
||||
|
||||
/* Four threads cooperatively measure 8x8 BIDIR cost with SATD */
|
||||
int bidir_satd_8x8_ii_coop4( read_only image2d_t fenc_lowres,
|
||||
int2 fencpos,
|
||||
read_only image2d_t fref0_planes,
|
||||
int2 qpos0,
|
||||
read_only image2d_t fref1_planes,
|
||||
int2 qpos1,
|
||||
int weight,
|
||||
local sum2_t *tmpp,
|
||||
int idx )
|
||||
{
|
||||
volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp;
|
||||
sum2_t b0, b1, b2, b3;
|
||||
sum2_t sum = 0;
|
||||
|
||||
// fencpos is full-pel position of original MB
|
||||
// qpos0 is qpel position within reference frame 0
|
||||
// qpos1 is qpel position within reference frame 1
|
||||
|
||||
int2 fref0Apos = (int2)(qpos0.x>>2, qpos0.y>>2);
|
||||
int hpel0A = ((qpos0.x&2)>>1) + (qpos0.y&2);
|
||||
|
||||
int2 qpos0B = (int2)qpos0 + (int2)(((qpos0.x&1)<<1), ((qpos0.y&1)<<1));
|
||||
int2 fref0Bpos = (int2)(qpos0B.x>>2, qpos0B.y>>2);
|
||||
int hpel0B = ((qpos0B.x&2)>>1) + (qpos0B.y&2);
|
||||
|
||||
int2 fref1Apos = (int2)(qpos1.x>>2, qpos1.y>>2);
|
||||
int hpel1A = ((qpos1.x&2)>>1) + (qpos1.y&2);
|
||||
|
||||
int2 qpos1B = (int2)qpos1 + (int2)(((qpos1.x&1)<<1), ((qpos1.y&1)<<1));
|
||||
int2 fref1Bpos = (int2)(qpos1B.x>>2, qpos1B.y>>2);
|
||||
int hpel1B = ((qpos1B.x&2)>>1) + (qpos1B.y&2);
|
||||
|
||||
uint mask_shift0A = 8 * hpel0A, mask_shift0B = 8 * hpel0B;
|
||||
uint mask_shift1A = 8 * hpel1A, mask_shift1B = 8 * hpel1B;
|
||||
|
||||
uint vA, vB;
|
||||
uint enc, ref0, ref1;
|
||||
uint a0, a1;
|
||||
const int weight2 = 64 - weight;
|
||||
|
||||
#define READ_BIDIR_DIFF( OUT, X )\
|
||||
enc = read_imageui( fenc_lowres, sampler, fencpos + (int2)(X, idx) ).s0;\
|
||||
vA = (read_imageui( fref0_planes, sampler, fref0Apos + (int2)(X, idx) ).s0 >> mask_shift0A) & 0xFF;\
|
||||
vB = (read_imageui( fref0_planes, sampler, fref0Bpos + (int2)(X, idx) ).s0 >> mask_shift0B) & 0xFF;\
|
||||
ref0 = rhadd( vA, vB );\
|
||||
vA = (read_imageui( fref1_planes, sampler, fref1Apos + (int2)(X, idx) ).s0 >> mask_shift1A) & 0xFF;\
|
||||
vB = (read_imageui( fref1_planes, sampler, fref1Bpos + (int2)(X, idx) ).s0 >> mask_shift1B) & 0xFF;\
|
||||
ref1 = rhadd( vA, vB );\
|
||||
OUT = enc - ((ref0 * weight + ref1 * weight2 + (1 << 5)) >> 6);
|
||||
|
||||
#define READ_DIFF_EX( OUT, a, b )\
|
||||
READ_BIDIR_DIFF( a0, a );\
|
||||
READ_BIDIR_DIFF( a1, b );\
|
||||
OUT = a0 + (a1<<BITS_PER_SUM);
|
||||
|
||||
#define ROW_8x4_SATD( a, b, c )\
|
||||
fencpos.y += a;\
|
||||
fref0Apos.y += b;\
|
||||
fref0Bpos.y += b;\
|
||||
fref1Apos.y += c;\
|
||||
fref1Bpos.y += c;\
|
||||
READ_DIFF_EX( b0, 0, 4 );\
|
||||
READ_DIFF_EX( b1, 1, 5 );\
|
||||
READ_DIFF_EX( b2, 2, 6 );\
|
||||
READ_DIFF_EX( b3, 3, 7 );\
|
||||
HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\
|
||||
HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\
|
||||
sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );
|
||||
|
||||
ROW_8x4_SATD( 0, 0, 0 );
|
||||
ROW_8x4_SATD( 4, 4, 4 );
|
||||
|
||||
#undef READ_BIDIR_DIFF
|
||||
#undef READ_DIFF_EX
|
||||
#undef ROW_8x4_SATD
|
||||
|
||||
return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* mode selection - pick the least cost partition type for each 8x8 macroblock.
|
||||
* Intra, list0 or list1. When measuring a B slice, also test three bidir
|
||||
* possibilities.
|
||||
*
|
||||
* fenc_lowres_mvs[0|1] and fenc_lowres_mv_costs[0|1] are large buffers that
|
||||
* hold many frames worth of motion vectors. We must offset into the correct
|
||||
* location for this frame's vectors:
|
||||
*
|
||||
* CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1]
|
||||
* GPU equivalent: fenc_lowres_mvs0[(b - p0 - 1) * mb_count]
|
||||
*
|
||||
* global launch dimensions for P slice estimate: [mb_width, mb_height]
|
||||
* global launch dimensions for B slice estimate: [mb_width * 4, mb_height]
|
||||
*/
|
||||
kernel void mode_selection( read_only image2d_t fenc_lowres,
|
||||
read_only image2d_t fref0_planes,
|
||||
read_only image2d_t fref1_planes,
|
||||
const global short2 *fenc_lowres_mvs0,
|
||||
const global short2 *fenc_lowres_mvs1,
|
||||
const global short2 *fref1_lowres_mvs0,
|
||||
const global int16_t *fenc_lowres_mv_costs0,
|
||||
const global int16_t *fenc_lowres_mv_costs1,
|
||||
const global uint16_t *fenc_intra_cost,
|
||||
global uint16_t *lowres_costs,
|
||||
global int *frame_stats,
|
||||
local int16_t *cost_local,
|
||||
local sum2_t *satd_local,
|
||||
int mb_width,
|
||||
int bipred_weight,
|
||||
int dist_scale_factor,
|
||||
int b,
|
||||
int p0,
|
||||
int p1,
|
||||
int lambda )
|
||||
{
|
||||
int mb_x = get_global_id( 0 );
|
||||
int b_bidir = b < p1;
|
||||
if( b_bidir )
|
||||
{
|
||||
/* when mode_selection is run for B frames, it must perform BIDIR SATD
|
||||
* measurements, so it is launched with four times as many threads in
|
||||
* order to spread the work around more of the GPU. And it can add
|
||||
* padding threads in the X direction. */
|
||||
mb_x >>= 2;
|
||||
if( mb_x >= mb_width )
|
||||
return;
|
||||
}
|
||||
int mb_y = get_global_id( 1 );
|
||||
int mb_height = get_global_size( 1 );
|
||||
int mb_count = mb_width * mb_height;
|
||||
int mb_xy = mb_x + mb_y * mb_width;
|
||||
|
||||
/* Initialize int frame_stats[4] for next kernel (sum_inter_cost) */
|
||||
if( mb_x < 4 && mb_y == 0 )
|
||||
frame_stats[mb_x] = 0;
|
||||
|
||||
int bcost = COST_MAX;
|
||||
int list_used = 0;
|
||||
|
||||
if( !b_bidir )
|
||||
{
|
||||
int icost = fenc_intra_cost[mb_xy];
|
||||
COPY2_IF_LT( bcost, icost, list_used, 0 );
|
||||
}
|
||||
if( b != p0 )
|
||||
{
|
||||
int mv_cost0 = fenc_lowres_mv_costs0[(b - p0 - 1) * mb_count + mb_xy];
|
||||
COPY2_IF_LT( bcost, mv_cost0, list_used, 1 );
|
||||
}
|
||||
if( b != p1 )
|
||||
{
|
||||
int mv_cost1 = fenc_lowres_mv_costs1[(p1 - b - 1) * mb_count + mb_xy];
|
||||
COPY2_IF_LT( bcost, mv_cost1, list_used, 2 );
|
||||
}
|
||||
|
||||
if( b_bidir )
|
||||
{
|
||||
int2 coord = (int2)(mb_x, mb_y) << 3;
|
||||
int mb_i = get_global_id( 0 ) & 3;
|
||||
int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
|
||||
cost_local += mb_in_group * 4;
|
||||
satd_local += mb_in_group * 16;
|
||||
|
||||
#define TRY_BIDIR( mv0, mv1, penalty )\
|
||||
{\
|
||||
int2 qpos0 = (int2)((coord.x<<2) + mv0.x, (coord.y<<2) + mv0.y);\
|
||||
int2 qpos1 = (int2)((coord.x<<2) + mv1.x, (coord.y<<2) + mv1.y);\
|
||||
cost_local[mb_i] = bidir_satd_8x8_ii_coop4( fenc_lowres, coord, fref0_planes, qpos0, fref1_planes, qpos1, bipred_weight, satd_local, mb_i );\
|
||||
int cost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];\
|
||||
COPY2_IF_LT( bcost, penalty * lambda + cost, list_used, 3 );\
|
||||
}
|
||||
|
||||
/* temporal prediction */
|
||||
short2 dmv0, dmv1;
|
||||
short2 mvr = fref1_lowres_mvs0[mb_xy];
|
||||
dmv0 = (mvr * (short) dist_scale_factor + (short) 128) >> (short) 8;
|
||||
dmv1 = dmv0 - mvr;
|
||||
TRY_BIDIR( dmv0, dmv1, 0 )
|
||||
|
||||
if( as_uint( dmv0 ) || as_uint( dmv1 ) )
|
||||
{
|
||||
/* B-direct prediction */
|
||||
dmv0 = 0; dmv1 = 0;
|
||||
TRY_BIDIR( dmv0, dmv1, 0 );
|
||||
}
|
||||
|
||||
/* L0+L1 prediction */
|
||||
dmv0 = fenc_lowres_mvs0[(b - p0 - 1) * mb_count + mb_xy];
|
||||
dmv1 = fenc_lowres_mvs1[(p1 - b - 1) * mb_count + mb_xy];
|
||||
TRY_BIDIR( dmv0, dmv1, 5 );
|
||||
#undef TRY_BIDIR
|
||||
}
|
||||
|
||||
lowres_costs[mb_xy] = min( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT);
|
||||
}
|
||||
|
||||
/*
|
||||
* parallel sum inter costs
|
||||
*
|
||||
* global launch dimensions: [256, mb_height]
|
||||
*/
|
||||
kernel void sum_inter_cost( const global uint16_t *fenc_lowres_costs,
|
||||
const global uint16_t *inv_qscale_factor,
|
||||
global int *fenc_row_satds,
|
||||
global int *frame_stats,
|
||||
int mb_width,
|
||||
int bframe_bias,
|
||||
int b,
|
||||
int p0,
|
||||
int p1 )
|
||||
{
|
||||
int y = get_global_id( 1 );
|
||||
int mb_height = get_global_size( 1 );
|
||||
|
||||
int row_satds = 0;
|
||||
int cost_est = 0;
|
||||
int cost_est_aq = 0;
|
||||
int intra_mbs = 0;
|
||||
|
||||
for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 ))
|
||||
{
|
||||
int mb_xy = x + y * mb_width;
|
||||
int cost = fenc_lowres_costs[mb_xy] & LOWRES_COST_MASK;
|
||||
int list = fenc_lowres_costs[mb_xy] >> LOWRES_COST_SHIFT;
|
||||
int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2;
|
||||
|
||||
if( list == 0 && b_frame_score_mb )
|
||||
intra_mbs++;
|
||||
|
||||
int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8;
|
||||
|
||||
row_satds += cost_aq;
|
||||
|
||||
if( b_frame_score_mb )
|
||||
{
|
||||
cost_est += cost;
|
||||
cost_est_aq += cost_aq;
|
||||
}
|
||||
}
|
||||
|
||||
local int buffer[256];
|
||||
int x = get_global_id( 0 );
|
||||
|
||||
row_satds = parallel_sum( row_satds, x, buffer );
|
||||
cost_est = parallel_sum( cost_est, x, buffer );
|
||||
cost_est_aq = parallel_sum( cost_est_aq, x, buffer );
|
||||
intra_mbs = parallel_sum( intra_mbs, x, buffer );
|
||||
|
||||
if( b != p1 )
|
||||
// Use floating point math to avoid 32bit integer overflow conditions
|
||||
cost_est = (int)((float)cost_est * 100.0f / (120.0f + (float)bframe_bias));
|
||||
|
||||
if( get_global_id( 0 ) == 0 )
|
||||
{
|
||||
fenc_row_satds[y] = row_satds;
|
||||
atomic_add( frame_stats + COST_EST, cost_est );
|
||||
atomic_add( frame_stats + COST_EST_AQ, cost_est_aq );
|
||||
atomic_add( frame_stats + INTRA_MBS, intra_mbs );
|
||||
}
|
||||
}
|
||||
135
common/opencl/downscale.cl
Normal file
135
common/opencl/downscale.cl
Normal file
@@ -0,0 +1,135 @@
|
||||
/*
|
||||
* downscale lowres luma: full-res buffer to down scale image, and to packed hpel image
|
||||
*
|
||||
* --
|
||||
*
|
||||
* fenc_img is an output image (area of memory referenced through a texture
|
||||
* cache). A read of any pixel location (x,y) returns four pixel values:
|
||||
*
|
||||
* val.s0 = P(x,y)
|
||||
* val.s1 = P(x+1,y)
|
||||
* val.s2 = P(x+2,y)
|
||||
* val.s3 = P(x+3,y)
|
||||
*
|
||||
* This is a 4x replication of the lowres pixels, a trade-off between memory
|
||||
* size and read latency.
|
||||
*
|
||||
* --
|
||||
*
|
||||
* hpel_planes is an output image that contains the four HPEL planes used for
|
||||
* subpel refinement. A read of any pixel location (x,y) returns a UInt32 with
|
||||
* the four planar values C | V | H | F
|
||||
*
|
||||
* launch dimensions: [lowres-width, lowres-height]
|
||||
*/
|
||||
kernel void downscale_hpel( const global pixel *fenc,
|
||||
write_only image2d_t fenc_img,
|
||||
write_only image2d_t hpel_planes,
|
||||
int stride )
|
||||
{
|
||||
int x = get_global_id( 0 );
|
||||
int y = get_global_id( 1 );
|
||||
uint4 values;
|
||||
|
||||
fenc += y * stride * 2;
|
||||
const global pixel *src1 = fenc + stride;
|
||||
const global pixel *src2 = (y == get_global_size( 1 )-1) ? src1 : src1 + stride;
|
||||
int2 pos = (int2)(x, y);
|
||||
pixel right, left;
|
||||
|
||||
right = rhadd( fenc[x*2], src1[x*2] );
|
||||
left = rhadd( fenc[x*2+1], src1[x*2+1] );
|
||||
values.s0 = rhadd( right, left ); // F
|
||||
|
||||
right = rhadd( fenc[2*x+1], src1[2*x+1] );
|
||||
left = rhadd( fenc[2*x+2], src1[2*x+2] );
|
||||
values.s1 = rhadd( right, left ); // H
|
||||
|
||||
right = rhadd( src1[2*x], src2[2*x] );
|
||||
left = rhadd( src1[2*x+1], src2[2*x+1] );
|
||||
values.s2 = rhadd( right, left ); // V
|
||||
|
||||
right = rhadd( src1[2*x+1], src2[2*x+1] );
|
||||
left = rhadd( src1[2*x+2], src2[2*x+2] );
|
||||
values.s3 = rhadd( right, left ); // C
|
||||
|
||||
uint4 val = (uint4) ((values.s3 & 0xff) << 24) | ((values.s2 & 0xff) << 16) | ((values.s1 & 0xff) << 8) | (values.s0 & 0xff);
|
||||
write_imageui( hpel_planes, pos, val );
|
||||
|
||||
x = select( x, x+1, x+1 < get_global_size( 0 ) );
|
||||
right = rhadd( fenc[x*2], src1[x*2] );
|
||||
left = rhadd( fenc[x*2+1], src1[x*2+1] );
|
||||
values.s1 = rhadd( right, left );
|
||||
|
||||
x = select( x, x+1, x+1 < get_global_size( 0 ) );
|
||||
right = rhadd( fenc[x*2], src1[x*2] );
|
||||
left = rhadd( fenc[x*2+1], src1[x*2+1] );
|
||||
values.s2 = rhadd( right, left );
|
||||
|
||||
x = select( x, x+1, x+1 < get_global_size( 0 ) );
|
||||
right = rhadd( fenc[x*2], src1[x*2] );
|
||||
left = rhadd( fenc[x*2+1], src1[x*2+1] );
|
||||
values.s3 = rhadd( right, left );
|
||||
|
||||
write_imageui( fenc_img, pos, values );
|
||||
}
|
||||
|
||||
/*
|
||||
* downscale lowres hierarchical motion search image, copy from one image to
|
||||
* another decimated image. This kernel is called iteratively to generate all
|
||||
* of the downscales.
|
||||
*
|
||||
* launch dimensions: [lower_res width, lower_res height]
|
||||
*/
|
||||
kernel void downscale1( read_only image2d_t higher_res, write_only image2d_t lower_res )
|
||||
{
|
||||
int x = get_global_id( 0 );
|
||||
int y = get_global_id( 1 );
|
||||
int2 pos = (int2)(x, y);
|
||||
int gs = get_global_size( 0 );
|
||||
uint4 top, bot, values;
|
||||
top = read_imageui( higher_res, sampler, (int2)(x*2, 2*y) );
|
||||
bot = read_imageui( higher_res, sampler, (int2)(x*2, 2*y+1) );
|
||||
values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) );
|
||||
|
||||
/* these select statements appear redundant, and they should be, but tests break when
|
||||
* they are not here. I believe this was caused by a driver bug
|
||||
*/
|
||||
values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) );
|
||||
top = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y) );
|
||||
bot = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y+1) );
|
||||
values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) );
|
||||
values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) );
|
||||
write_imageui( lower_res, pos, (uint4)(values) );
|
||||
}
|
||||
|
||||
/*
|
||||
* Second copy of downscale kernel, no differences. This is a (no perf loss)
|
||||
* workaround for a scheduling bug in current Tahiti drivers. This bug has
|
||||
* theoretically been fixed in the July 2012 driver release from AMD.
|
||||
*/
|
||||
kernel void downscale2( read_only image2d_t higher_res, write_only image2d_t lower_res )
|
||||
{
|
||||
int x = get_global_id( 0 );
|
||||
int y = get_global_id( 1 );
|
||||
int2 pos = (int2)(x, y);
|
||||
int gs = get_global_size( 0 );
|
||||
uint4 top, bot, values;
|
||||
top = read_imageui( higher_res, sampler, (int2)(x*2, 2*y) );
|
||||
bot = read_imageui( higher_res, sampler, (int2)(x*2, 2*y+1) );
|
||||
values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) );
|
||||
|
||||
// see comment in above function copy
|
||||
values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) );
|
||||
top = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y) );
|
||||
bot = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y+1) );
|
||||
values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) );
|
||||
values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) );
|
||||
write_imageui( lower_res, pos, (uint4)(values) );
|
||||
}
|
||||
|
||||
/* OpenCL 1.2 finally added a memset command, but we're not targeting 1.2 */
|
||||
kernel void memset_int16( global int16_t *buf, int16_t value )
|
||||
{
|
||||
buf[get_global_id( 0 )] = value;
|
||||
}
|
||||
1072
common/opencl/intra.cl
Normal file
1072
common/opencl/intra.cl
Normal file
File diff suppressed because it is too large
Load Diff
249
common/opencl/motionsearch.cl
Normal file
249
common/opencl/motionsearch.cl
Normal file
@@ -0,0 +1,249 @@
|
||||
/* Hierarchical (iterative) OpenCL lowres motion search */
|
||||
|
||||
inline int find_downscale_mb_xy( int x, int y, int mb_width, int mb_height )
|
||||
{
|
||||
/* edge macroblocks might not have a direct descendant, use nearest */
|
||||
x = select( x >> 1, (x - (mb_width&1)) >> 1, x == mb_width-1 );
|
||||
y = select( y >> 1, (y - (mb_height&1)) >> 1, y == mb_height-1 );
|
||||
return (mb_width>>1) * y + x;
|
||||
}
|
||||
|
||||
/* Four threads calculate an 8x8 SAD. Each does two rows */
|
||||
int sad_8x8_ii_coop4( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos, int idx, local int16_t *costs )
|
||||
{
|
||||
frefpos.y += idx << 1;
|
||||
fencpos.y += idx << 1;
|
||||
int cost = 0;
|
||||
if( frefpos.x < 0 )
|
||||
{
|
||||
/* slow path when MV goes past left edge. The GPU clamps reads from
|
||||
* (-1, 0) to (0,0), so you get pixels [0, 1, 2, 3] when what you really
|
||||
* want are [0, 0, 1, 2]
|
||||
*/
|
||||
for( int y = 0; y < 2; y++ )
|
||||
{
|
||||
for( int x = 0; x < 8; x++ )
|
||||
{
|
||||
pixel enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
|
||||
pixel ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
|
||||
cost += abs_diff( enc, ref );
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
uint4 enc, ref, costs = 0;
|
||||
enc = read_imageui( fenc, sampler, fencpos );
|
||||
ref = read_imageui( fref, sampler, frefpos );
|
||||
costs += abs_diff( enc, ref );
|
||||
enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 0) );
|
||||
ref = read_imageui( fref, sampler, frefpos + (int2)(4, 0) );
|
||||
costs += abs_diff( enc, ref );
|
||||
enc = read_imageui( fenc, sampler, fencpos + (int2)(0, 1) );
|
||||
ref = read_imageui( fref, sampler, frefpos + (int2)(0, 1) );
|
||||
costs += abs_diff( enc, ref );
|
||||
enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 1) );
|
||||
ref = read_imageui( fref, sampler, frefpos + (int2)(4, 1) );
|
||||
costs += abs_diff( enc, ref );
|
||||
cost = costs.s0 + costs.s1 + costs.s2 + costs.s3;
|
||||
}
|
||||
costs[idx] = cost;
|
||||
return costs[0] + costs[1] + costs[2] + costs[3];
|
||||
}
|
||||
|
||||
/* One thread performs 8x8 SAD */
|
||||
int sad_8x8_ii( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos )
|
||||
{
|
||||
if( frefpos.x < 0 )
|
||||
{
|
||||
/* slow path when MV goes past left edge */
|
||||
int cost = 0;
|
||||
for( int y = 0; y < 8; y++ )
|
||||
{
|
||||
for( int x = 0; x < 8; x++ )
|
||||
{
|
||||
uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
|
||||
uint ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
|
||||
cost += abs_diff( enc, ref );
|
||||
}
|
||||
}
|
||||
return cost;
|
||||
}
|
||||
else
|
||||
{
|
||||
uint4 enc, ref, cost = 0;
|
||||
for( int y = 0; y < 8; y++ )
|
||||
{
|
||||
for( int x = 0; x < 8; x += 4 )
|
||||
{
|
||||
enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) );
|
||||
ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) );
|
||||
cost += abs_diff( enc, ref );
|
||||
}
|
||||
}
|
||||
return cost.s0 + cost.s1 + cost.s2 + cost.s3;
|
||||
}
|
||||
}
|
||||
/*
|
||||
* hierarchical motion estimation
|
||||
*
|
||||
* Each kernel launch is a single iteration
|
||||
*
|
||||
* MB per work group is determined by lclx / 4 * lcly
|
||||
*
|
||||
* global launch dimensions: [mb_width * 4, mb_height]
|
||||
*/
|
||||
kernel void hierarchical_motion( read_only image2d_t fenc,
|
||||
read_only image2d_t fref,
|
||||
const global short2 *in_mvs,
|
||||
global short2 *out_mvs,
|
||||
global int16_t *out_mv_costs,
|
||||
global short2 *mvp_buffer,
|
||||
local int16_t *cost_local,
|
||||
local short2 *mvc_local,
|
||||
int mb_width,
|
||||
int lambda,
|
||||
int me_range,
|
||||
int scale,
|
||||
int b_shift_index,
|
||||
int b_first_iteration,
|
||||
int b_reverse_references )
|
||||
{
|
||||
int mb_x = get_global_id( 0 ) >> 2;
|
||||
if( mb_x >= mb_width )
|
||||
return;
|
||||
int mb_height = get_global_size( 1 );
|
||||
int mb_i = get_global_id( 0 ) & 3;
|
||||
int mb_y = get_global_id( 1 );
|
||||
int mb_xy = mb_y * mb_width + mb_x;
|
||||
const int mb_size = 8;
|
||||
int2 coord = (int2)(mb_x, mb_y) * mb_size;
|
||||
|
||||
const int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
|
||||
cost_local += 4 * mb_in_group;
|
||||
|
||||
int i_mvc = 0;
|
||||
mvc_local += 4 * mb_in_group;
|
||||
mvc_local[mb_i] = 0;
|
||||
int2 mvp =0;
|
||||
|
||||
if( !b_first_iteration )
|
||||
{
|
||||
#define MVC( DX, DY )\
|
||||
{\
|
||||
int px = mb_x + DX;\
|
||||
int py = mb_y + DY;\
|
||||
mvc_local[i_mvc] = b_shift_index ? in_mvs[find_downscale_mb_xy( px, py, mb_width, mb_height )] : \
|
||||
in_mvs[mb_width * py + px];\
|
||||
mvc_local[i_mvc] >>= (short) scale;\
|
||||
i_mvc++;\
|
||||
}
|
||||
/* Find MVP from median of MVCs */
|
||||
if( b_reverse_references )
|
||||
{
|
||||
/* odd iterations: derive MVP from down and right */
|
||||
if( mb_x < mb_width - 1 )
|
||||
MVC( 1, 0 );
|
||||
if( mb_y < mb_height - 1 )
|
||||
{
|
||||
MVC( 0, 1 );
|
||||
if( mb_x > b_shift_index )
|
||||
MVC( -1, 1 );
|
||||
if( mb_x < mb_width - 1 )
|
||||
MVC( 1, 1 );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* even iterations: derive MVP from up and left */
|
||||
if( mb_x > 0 )
|
||||
MVC( -1, 0 );
|
||||
if( mb_y > 0 )
|
||||
{
|
||||
MVC( 0, -1 );
|
||||
if( mb_x < mb_width - 1 )
|
||||
MVC( 1, -1 );
|
||||
if( mb_x > b_shift_index )
|
||||
MVC( -1, -1 );
|
||||
}
|
||||
}
|
||||
#undef MVC
|
||||
mvp = (i_mvc <= 1) ? convert_int2_sat(mvc_local[0]) : x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );
|
||||
}
|
||||
/* current mvp matches the previous mvp and we have not changed scale. We know
|
||||
* we're going to arrive at the same MV again, so just copy the previous
|
||||
* result to our output. */
|
||||
if( !b_shift_index && mvp.x == mvp_buffer[mb_xy].x && mvp.y == mvp_buffer[mb_xy].y )
|
||||
{
|
||||
out_mvs[mb_xy] = in_mvs[mb_xy];
|
||||
return;
|
||||
}
|
||||
mvp_buffer[mb_xy] = convert_short2_sat(mvp);
|
||||
int2 mv_min = -mb_size * (int2)(mb_x, mb_y) - 4;
|
||||
int2 mv_max = mb_size * ((int2)(mb_width, mb_height) - (int2)(mb_x, mb_y) - 1) + 4;
|
||||
|
||||
int2 bestmv = clamp(mvp, mv_min, mv_max);
|
||||
int2 refcrd = coord + bestmv;
|
||||
|
||||
/* measure cost at bestmv */
|
||||
int bcost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +
|
||||
lambda * mv_cost( abs_diff( bestmv, mvp ) << (2 + scale) );
|
||||
|
||||
do
|
||||
{
|
||||
/* measure costs at offsets from bestmv */
|
||||
refcrd = coord + bestmv + dia_offs[mb_i];
|
||||
int2 trymv = bestmv + dia_offs[mb_i];
|
||||
int cost = sad_8x8_ii( fenc, coord, fref, refcrd ) +
|
||||
lambda * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) );
|
||||
|
||||
cost_local[mb_i] = (cost<<2) | mb_i;
|
||||
cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );
|
||||
|
||||
if( (cost >> 2) >= bcost )
|
||||
break;
|
||||
|
||||
bestmv += dia_offs[cost&3];
|
||||
bcost = cost>>2;
|
||||
|
||||
if( bestmv.x >= mv_max.x || bestmv.x <= mv_min.x || bestmv.y >= mv_max.y || bestmv.y <= mv_min.y )
|
||||
break;
|
||||
}
|
||||
while( --me_range > 0 );
|
||||
|
||||
int2 trymv = 0, diff = 0;
|
||||
|
||||
#define COST_MV_NO_PAD( L )\
|
||||
trymv = clamp( trymv, mv_min, mv_max );\
|
||||
diff = convert_int2_sat(abs_diff( mvp, trymv ));\
|
||||
if( diff.x > 1 || diff.y > 1 ) {\
|
||||
int2 refcrd = coord + trymv;\
|
||||
int cost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +\
|
||||
L * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) );\
|
||||
if( cost < bcost ) { bcost = cost; bestmv = trymv; } }
|
||||
|
||||
COST_MV_NO_PAD( 0 );
|
||||
|
||||
if( !b_first_iteration )
|
||||
{
|
||||
/* try cost at previous iteration's MV, if MVP was too far away */
|
||||
int2 prevmv = b_shift_index ? convert_int2_sat(in_mvs[find_downscale_mb_xy( mb_x, mb_y, mb_width, mb_height )]) : convert_int2_sat(in_mvs[mb_xy]);
|
||||
prevmv >>= scale;
|
||||
trymv = prevmv;
|
||||
COST_MV_NO_PAD( lambda );
|
||||
}
|
||||
|
||||
for( int i = 0; i < i_mvc; i++ )
|
||||
{
|
||||
/* try cost at each candidate MV, if MVP was too far away */
|
||||
trymv = convert_int2_sat( mvc_local[i] );
|
||||
COST_MV_NO_PAD( lambda );
|
||||
}
|
||||
|
||||
if( mb_i == 0 )
|
||||
{
|
||||
bestmv <<= scale;
|
||||
out_mvs[mb_xy] = convert_short2_sat(bestmv);
|
||||
out_mv_costs[mb_xy] = min( bcost, LOWRES_COST_MASK );
|
||||
}
|
||||
}
|
||||
242
common/opencl/subpel.cl
Normal file
242
common/opencl/subpel.cl
Normal file
@@ -0,0 +1,242 @@
|
||||
/* OpenCL lowres subpel Refine */
|
||||
|
||||
/* Each thread performs 8x8 SAD. 4 threads per MB, so the 4 DIA HPEL offsets are
|
||||
* calculated simultaneously */
|
||||
int sad_8x8_ii_hpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos )
|
||||
{
|
||||
int2 frefpos = qpos >> 2;
|
||||
int hpel_idx = ((qpos.x & 2) >> 1) + (qpos.y & 2);
|
||||
uint mask_shift = 8 * hpel_idx;
|
||||
|
||||
uint4 cost4 = 0;
|
||||
|
||||
for( int y = 0; y < 8; y++ )
|
||||
{
|
||||
uint4 enc, val4;
|
||||
enc = read_imageui( fenc, sampler, fencpos + (int2)(0, y));
|
||||
val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(0, y)).s0 >> mask_shift) & 0xFF;
|
||||
val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(1, y)).s0 >> mask_shift) & 0xFF;
|
||||
val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(2, y)).s0 >> mask_shift) & 0xFF;
|
||||
val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(3, y)).s0 >> mask_shift) & 0xFF;
|
||||
cost4 += abs_diff( enc, val4 );
|
||||
|
||||
enc = read_imageui( fenc, sampler, fencpos + (int2)(4, y));
|
||||
val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(4, y)).s0 >> mask_shift) & 0xFF;
|
||||
val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(5, y)).s0 >> mask_shift) & 0xFF;
|
||||
val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(6, y)).s0 >> mask_shift) & 0xFF;
|
||||
val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(7, y)).s0 >> mask_shift) & 0xFF;
|
||||
cost4 += abs_diff( enc, val4 );
|
||||
}
|
||||
|
||||
return cost4.s0 + cost4.s1 + cost4.s2 + cost4.s3;
|
||||
}
|
||||
|
||||
/* One thread measures 8x8 SAD cost at a QPEL offset into an HPEL plane */
|
||||
int sad_8x8_ii_qpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos )
|
||||
{
|
||||
int2 frefApos = qpos >> 2;
|
||||
int hpelA = ((qpos.x & 2) >> 1) + (qpos.y & 2);
|
||||
|
||||
int2 qposB = qpos + ((qpos & 1) << 1);
|
||||
int2 frefBpos = qposB >> 2;
|
||||
int hpelB = ((qposB.x & 2) >> 1) + (qposB.y & 2);
|
||||
|
||||
uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB;
|
||||
|
||||
int cost = 0;
|
||||
|
||||
for( int y = 0; y < 8; y++ )
|
||||
{
|
||||
for( int x = 0; x < 8; x++ )
|
||||
{
|
||||
uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y)).s0;
|
||||
uint vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(x, y)).s0 >> mask_shift0) & 0xFF;
|
||||
uint vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(x, y)).s0 >> mask_shift1) & 0xFF;
|
||||
cost += abs_diff( enc, rhadd( vA, vB ) );
|
||||
}
|
||||
}
|
||||
|
||||
return cost;
|
||||
}
|
||||
|
||||
/* Four threads measure 8x8 SATD cost at a QPEL offset into an HPEL plane
|
||||
*
|
||||
* Each thread collects 1/4 of the rows of diffs and processes one quarter of
|
||||
* the transforms
|
||||
*/
|
||||
int satd_8x8_ii_qpel_coop4( read_only image2d_t fenc,
|
||||
int2 fencpos,
|
||||
read_only image2d_t fref_planes,
|
||||
int2 qpos,
|
||||
local sum2_t *tmpp,
|
||||
int idx )
|
||||
{
|
||||
volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp;
|
||||
sum2_t b0, b1, b2, b3;
|
||||
|
||||
// fencpos is full-pel position of original MB
|
||||
// qpos is qpel position within reference frame
|
||||
int2 frefApos = qpos >> 2;
|
||||
int hpelA = ((qpos.x&2)>>1) + (qpos.y&2);
|
||||
|
||||
int2 qposB = qpos + (int2)(((qpos.x&1)<<1), ((qpos.y&1)<<1));
|
||||
int2 frefBpos = qposB >> 2;
|
||||
int hpelB = ((qposB.x&2)>>1) + (qposB.y&2);
|
||||
|
||||
uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB;
|
||||
|
||||
uint vA, vB;
|
||||
uint a0, a1;
|
||||
uint enc;
|
||||
sum2_t sum = 0;
|
||||
|
||||
#define READ_DIFF( OUT, X )\
|
||||
enc = read_imageui( fenc, sampler, fencpos + (int2)(X, idx) ).s0;\
|
||||
vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(X, idx) ).s0 >> mask_shift0) & 0xFF;\
|
||||
vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(X, idx) ).s0 >> mask_shift1) & 0xFF;\
|
||||
OUT = enc - rhadd( vA, vB );
|
||||
|
||||
#define READ_DIFF_EX( OUT, a, b )\
|
||||
{\
|
||||
READ_DIFF( a0, a );\
|
||||
READ_DIFF( a1, b );\
|
||||
OUT = a0 + (a1<<BITS_PER_SUM);\
|
||||
}
|
||||
#define ROW_8x4_SATD( a, b )\
|
||||
{\
|
||||
fencpos.y += a;\
|
||||
frefApos.y += b;\
|
||||
frefBpos.y += b;\
|
||||
READ_DIFF_EX( b0, 0, 4 );\
|
||||
READ_DIFF_EX( b1, 1, 5 );\
|
||||
READ_DIFF_EX( b2, 2, 6 );\
|
||||
READ_DIFF_EX( b3, 3, 7 );\
|
||||
HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\
|
||||
HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\
|
||||
sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );\
|
||||
}
|
||||
ROW_8x4_SATD( 0, 0 );
|
||||
ROW_8x4_SATD( 4, 4 );
|
||||
|
||||
#undef READ_DIFF
|
||||
#undef READ_DIFF_EX
|
||||
#undef ROW_8x4_SATD
|
||||
return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
|
||||
}
|
||||
|
||||
constant int2 hpoffs[4] =
|
||||
{
|
||||
{0, -2}, {-2, 0}, {2, 0}, {0, 2}
|
||||
};
|
||||
|
||||
/* sub pixel refinement of motion vectors, output MVs and costs are moved from
|
||||
* temporary buffers into final per-frame buffer
|
||||
*
|
||||
* global launch dimensions: [mb_width * 4, mb_height]
|
||||
*
|
||||
* With X being the source 16x16 pixels, F is the lowres pixel used by the
|
||||
* motion search. We will now utilize the H V and C pixels (stored in separate
|
||||
* planes) to search at half-pel increments.
|
||||
*
|
||||
* X X X X X X
|
||||
* F H F H F
|
||||
* X X X X X X
|
||||
* V C V C V
|
||||
* X X X X X X
|
||||
* F H F H F
|
||||
* X X X X X X
|
||||
*
|
||||
* The YX HPEL bits of the motion vector selects the plane we search in. The
|
||||
* four planes are packed in the fref_planes 2D image buffer. Each sample
|
||||
* returns: s0 = F, s1 = H, s2 = V, s3 = C */
|
||||
kernel void subpel_refine( read_only image2d_t fenc,
|
||||
read_only image2d_t fref_planes,
|
||||
const global short2 *in_mvs,
|
||||
const global int16_t *in_sad_mv_costs,
|
||||
local int16_t *cost_local,
|
||||
local sum2_t *satd_local,
|
||||
local short2 *mvc_local,
|
||||
global short2 *fenc_lowres_mv,
|
||||
global int16_t *fenc_lowres_mv_costs,
|
||||
int mb_width,
|
||||
int lambda,
|
||||
int b,
|
||||
int ref,
|
||||
int b_islist1 )
|
||||
{
|
||||
int mb_x = get_global_id( 0 ) >> 2;
|
||||
if( mb_x >= mb_width )
|
||||
return;
|
||||
int mb_height = get_global_size( 1 );
|
||||
|
||||
int mb_i = get_global_id( 0 ) & 3;
|
||||
int mb_y = get_global_id( 1 );
|
||||
int mb_xy = mb_y * mb_width + mb_x;
|
||||
|
||||
/* fenc_lowres_mv and fenc_lowres_mv_costs are large buffers that
|
||||
* hold many frames worth of motion vectors. We must offset into the correct
|
||||
* location for this frame's vectors. The kernel will be passed the correct
|
||||
* directional buffer for the direction of the search: list1 or list0
|
||||
*
|
||||
* CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1]
|
||||
* GPU equivalent: fenc_lowres_mvs[(b - p0 - 1) * mb_count] */
|
||||
fenc_lowres_mv += (b_islist1 ? (ref-b-1) : (b-ref-1)) * mb_width * mb_height;
|
||||
fenc_lowres_mv_costs += (b_islist1 ? (ref-b-1) : (b-ref-1)) * mb_width * mb_height;
|
||||
|
||||
/* Adjust pointers into local memory buffers for this thread's data */
|
||||
int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
|
||||
cost_local += mb_in_group * 4;
|
||||
satd_local += mb_in_group * 16;
|
||||
mvc_local += mb_in_group * 4;
|
||||
|
||||
int i_mvc = 0;
|
||||
|
||||
mvc_local[0] = mvc_local[1] = mvc_local[2] = mvc_local[3] = 0;
|
||||
|
||||
#define MVC( DX, DY ) mvc_local[i_mvc++] = in_mvs[mb_width * (mb_y + DY) + (mb_x + DX)];
|
||||
if( mb_x > 0 )
|
||||
MVC( -1, 0 );
|
||||
if( mb_y > 0 )
|
||||
{
|
||||
MVC( 0, -1 );
|
||||
if( mb_x < mb_width - 1 )
|
||||
MVC( 1, -1 );
|
||||
if( mb_x > 0 )
|
||||
MVC( -1, -1 );
|
||||
}
|
||||
#undef MVC
|
||||
int2 mvp = (i_mvc <= 1) ? convert_int2_sat(mvc_local[0]) : x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );
|
||||
|
||||
int bcost = in_sad_mv_costs[mb_xy];
|
||||
int2 coord = (int2)(mb_x, mb_y) << 3;
|
||||
int2 bmv = convert_int2_sat( in_mvs[mb_xy] );
|
||||
|
||||
/* Make mvp and bmv QPEL MV */
|
||||
mvp <<= 2; bmv <<= 2;
|
||||
|
||||
#define HPEL_QPEL( ARR, FUNC )\
|
||||
{\
|
||||
int2 trymv = bmv + ARR[mb_i];\
|
||||
int2 qpos = (coord << 2) + trymv;\
|
||||
int cost = FUNC( fenc, coord, fref_planes, qpos ) + lambda * mv_cost( abs_diff( trymv, mvp ) );\
|
||||
cost_local[mb_i] = (cost<<2) + mb_i;\
|
||||
cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );\
|
||||
if( (cost>>2) < bcost )\
|
||||
{\
|
||||
bmv += ARR[cost&3];\
|
||||
bcost = cost>>2;\
|
||||
}\
|
||||
}
|
||||
|
||||
HPEL_QPEL( hpoffs, sad_8x8_ii_hpel );
|
||||
HPEL_QPEL( dia_offs, sad_8x8_ii_qpel );
|
||||
fenc_lowres_mv[mb_xy] = convert_short2_sat( bmv );
|
||||
|
||||
/* remeasure cost of bmv using SATD */
|
||||
int2 qpos = (coord << 2) + bmv;
|
||||
cost_local[mb_i] = satd_8x8_ii_qpel_coop4( fenc, coord, fref_planes, qpos, satd_local, mb_i );
|
||||
bcost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];
|
||||
bcost += lambda * mv_cost( abs_diff( bmv, mvp ) );
|
||||
|
||||
fenc_lowres_mv_costs[mb_xy] = min( bcost, LOWRES_COST_MASK );
|
||||
}
|
||||
48
common/opencl/weightp.cl
Normal file
48
common/opencl/weightp.cl
Normal file
@@ -0,0 +1,48 @@
|
||||
/* Weightp filter a downscaled image into a temporary output buffer.
|
||||
* This kernel is launched once for each scale.
|
||||
*
|
||||
* Launch dimensions: width x height (in pixels)
|
||||
*/
|
||||
kernel void weightp_scaled_images( read_only image2d_t in_plane,
|
||||
write_only image2d_t out_plane,
|
||||
uint offset,
|
||||
uint scale,
|
||||
uint denom )
|
||||
{
|
||||
int gx = get_global_id( 0 );
|
||||
int gy = get_global_id( 1 );
|
||||
uint4 input_val;
|
||||
uint4 output_val;
|
||||
|
||||
input_val = read_imageui( in_plane, sampler, (int2)(gx, gy));
|
||||
output_val = (uint4)(offset) + ( ( ((uint4)(scale)) * input_val ) >> ((uint4)(denom)) );
|
||||
write_imageui( out_plane, (int2)(gx, gy), output_val );
|
||||
}
|
||||
|
||||
/* Weightp filter for the half-pel interpolated image
|
||||
*
|
||||
* Launch dimensions: width x height (in pixels)
|
||||
*/
|
||||
kernel void weightp_hpel( read_only image2d_t in_plane,
|
||||
write_only image2d_t out_plane,
|
||||
uint offset,
|
||||
uint scale,
|
||||
uint denom )
|
||||
{
|
||||
int gx = get_global_id( 0 );
|
||||
int gy = get_global_id( 1 );
|
||||
uint input_val;
|
||||
uint output_val;
|
||||
|
||||
input_val = read_imageui( in_plane, sampler, (int2)(gx, gy)).s0;
|
||||
//Unpack
|
||||
uint4 temp;
|
||||
temp.s0 = input_val & 0x00ff; temp.s1 = (input_val >> 8) & 0x00ff;
|
||||
temp.s2 = (input_val >> 16) & 0x00ff; temp.s3 = (input_val >> 24) & 0x00ff;
|
||||
|
||||
temp = (uint4)(offset) + ( ( ((uint4)(scale)) * temp ) >> ((uint4)(denom)) );
|
||||
|
||||
//Pack
|
||||
output_val = temp.s0 | (temp.s1 << 8) | (temp.s2 << 16) | (temp.s3 << 24);
|
||||
write_imageui( out_plane, (int2)(gx, gy), output_val );
|
||||
}
|
||||
132
common/opencl/x264-cl.h
Normal file
132
common/opencl/x264-cl.h
Normal file
@@ -0,0 +1,132 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
|
||||
|
||||
constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
|
||||
|
||||
/* 7.18.1.1 Exact-width integer types */
|
||||
typedef signed char int8_t;
|
||||
typedef unsigned char uint8_t;
|
||||
typedef short int16_t;
|
||||
typedef unsigned short uint16_t;
|
||||
typedef int int32_t;
|
||||
typedef unsigned uint32_t;
|
||||
|
||||
typedef uint8_t pixel;
|
||||
typedef uint16_t sum_t;
|
||||
typedef uint32_t sum2_t;
|
||||
|
||||
#define LOWRES_COST_MASK ((1<<14)-1)
|
||||
#define LOWRES_COST_SHIFT 14
|
||||
#define COST_MAX (1<<28)
|
||||
|
||||
#define PIXEL_MAX 255
|
||||
#define BITS_PER_SUM (8 * sizeof(sum_t))
|
||||
|
||||
/* Constants for offsets into frame statistics buffer */
|
||||
#define COST_EST 0
|
||||
#define COST_EST_AQ 1
|
||||
#define INTRA_MBS 2
|
||||
|
||||
#define COPY2_IF_LT( x, y, a, b )\
|
||||
if( (y) < (x) )\
|
||||
{\
|
||||
(x) = (y);\
|
||||
(a) = (b);\
|
||||
}
|
||||
|
||||
constant int2 dia_offs[4] =
|
||||
{
|
||||
{0, -1}, {-1, 0}, {1, 0}, {0, 1},
|
||||
};
|
||||
|
||||
inline pixel x264_clip_pixel( int x )
|
||||
{
|
||||
return (pixel) clamp( x, (int) 0, (int) PIXEL_MAX );
|
||||
}
|
||||
|
||||
inline int2 x264_median_mv( short2 a, short2 b, short2 c )
|
||||
{
|
||||
short2 t1 = min(a, b);
|
||||
short2 t2 = min(max(a, b), c);
|
||||
return convert_int2(max(t1, t2));
|
||||
}
|
||||
|
||||
inline sum2_t abs2( sum2_t a )
|
||||
{
|
||||
sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1);
|
||||
return (a + s) ^ s;
|
||||
}
|
||||
|
||||
#define HADAMARD4( d0, d1, d2, d3, s0, s1, s2, s3 ) {\
|
||||
sum2_t t0 = s0 + s1;\
|
||||
sum2_t t1 = s0 - s1;\
|
||||
sum2_t t2 = s2 + s3;\
|
||||
sum2_t t3 = s2 - s3;\
|
||||
d0 = t0 + t2;\
|
||||
d2 = t0 - t2;\
|
||||
d1 = t1 + t3;\
|
||||
d3 = t1 - t3;\
|
||||
}
|
||||
|
||||
#define HADAMARD4V( d0, d1, d2, d3, s0, s1, s2, s3 ) {\
|
||||
int2 t0 = s0 + s1;\
|
||||
int2 t1 = s0 - s1;\
|
||||
int2 t2 = s2 + s3;\
|
||||
int2 t3 = s2 - s3;\
|
||||
d0 = t0 + t2;\
|
||||
d2 = t0 - t2;\
|
||||
d1 = t1 + t3;\
|
||||
d3 = t1 - t3;\
|
||||
}
|
||||
|
||||
#define SATD_C_8x4_Q( name, q1, q2 )\
|
||||
int name( q1 pixel *pix1, int i_pix1, q2 pixel *pix2, int i_pix2 )\
|
||||
{\
|
||||
sum2_t tmp[4][4];\
|
||||
sum2_t a0, a1, a2, a3;\
|
||||
sum2_t sum = 0;\
|
||||
for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )\
|
||||
{\
|
||||
a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);\
|
||||
a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);\
|
||||
a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);\
|
||||
a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);\
|
||||
HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3 );\
|
||||
}\
|
||||
for( int i = 0; i < 4; i++ )\
|
||||
{\
|
||||
HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );\
|
||||
sum += abs2( a0 ) + abs2( a1 ) + abs2( a2 ) + abs2( a3 );\
|
||||
}\
|
||||
return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;\
|
||||
}
|
||||
|
||||
/*
|
||||
* Utility function to perform a parallel sum reduction of an array of integers
|
||||
*/
|
||||
int parallel_sum( int value, int x, volatile local int *array )
|
||||
{
|
||||
array[x] = value;
|
||||
barrier( CLK_LOCAL_MEM_FENCE );
|
||||
|
||||
int dim = get_local_size( 0 );
|
||||
|
||||
while( dim > 1 )
|
||||
{
|
||||
dim >>= 1;
|
||||
|
||||
if( x < dim )
|
||||
array[x] += array[x + dim];
|
||||
|
||||
if( dim > 32 )
|
||||
barrier( CLK_LOCAL_MEM_FENCE );
|
||||
}
|
||||
|
||||
return array[0];
|
||||
}
|
||||
|
||||
int mv_cost( uint2 mvd )
|
||||
{
|
||||
float2 mvdf = (float2)(mvd.x, mvd.y) + 1.0f;
|
||||
float2 cost = round( log2(mvdf) * 2.0f + 0.718f + (float2)(!!mvd.x, !!mvd.y) );
|
||||
return (int) (cost.x + cost.y);
|
||||
}
|
||||
Reference in New Issue
Block a user