x264 source for verification 2026-05-22

This commit is contained in:
2026-05-22 16:45:04 +08:00
commit 4647f166e5
270 changed files with 166522 additions and 0 deletions

3895
encoder/analyse.c Normal file

File diff suppressed because it is too large Load Diff

55
encoder/analyse.h Normal file
View File

@@ -0,0 +1,55 @@
/*****************************************************************************
* analyse.h: macroblock analysis
*****************************************************************************
* Copyright (C) 2003-2025 x264 project
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
* Loren Merritt <lorenm@u.washington.edu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
/* Include guard for the analysis / slice-type / lookahead prototypes. */
#ifndef X264_ENCODER_ANALYSE_H
#define X264_ENCODER_ANALYSE_H
/* Every function below is declared through an x264_template() alias.
 * NOTE(review): x264_template() is defined elsewhere; presumably it mangles
 * the symbol per build variant -- confirm before relying on final symbol names. */

/* --- analysis cost tables: set up / tear down --- */
#define x264_analyse_init_costs x264_template(analyse_init_costs)
int x264_analyse_init_costs( x264_t *h );
#define x264_analyse_free_costs x264_template(analyse_free_costs)
void x264_analyse_free_costs( x264_t *h );
/* --- per-frame and per-macroblock analysis entry points --- */
#define x264_analyse_weight_frame x264_template(analyse_weight_frame)
void x264_analyse_weight_frame( x264_t *h, int end );
#define x264_macroblock_analyse x264_template(macroblock_analyse)
void x264_macroblock_analyse( x264_t *h );
/* --- slice type decision --- */
#define x264_slicetype_decide x264_template(slicetype_decide)
void x264_slicetype_decide( x264_t *h );
#define x264_slicetype_analyse x264_template(slicetype_analyse)
void x264_slicetype_analyse( x264_t *h, int intra_minigop );
/* --- lookahead queue: create, query, feed, drain, destroy --- */
#define x264_lookahead_init x264_template(lookahead_init)
int x264_lookahead_init( x264_t *h, int i_slicetype_length );
#define x264_lookahead_is_empty x264_template(lookahead_is_empty)
int x264_lookahead_is_empty( x264_t *h );
#define x264_lookahead_put_frame x264_template(lookahead_put_frame)
void x264_lookahead_put_frame( x264_t *h, x264_frame_t *frame );
#define x264_lookahead_get_frames x264_template(lookahead_get_frames)
void x264_lookahead_get_frames( x264_t *h );
#define x264_lookahead_delete x264_template(lookahead_delete)
void x264_lookahead_delete( x264_t *h );
#endif

199
encoder/api.c Normal file
View File

@@ -0,0 +1,199 @@
/*****************************************************************************
* api.c: bit depth independent interface
*****************************************************************************
* Copyright (C) 2003-2025 x264 project
*
* Authors: Vittorio Giovara <vittorio.giovara@gmail.com>
* Luca Barbato <lu_zero@gentoo.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common/base.h"
/****************************************************************************
* global symbols
****************************************************************************/
/* Exported constant whose value is the build-time X264_CHROMA_FORMAT macro. */
const int x264_chroma_format = X264_CHROMA_FORMAT;
/* Prototypes of the 8-bit encoder entry points (x264_8_ prefix).
 * They are declared here unconditionally; x264_encoder_open() only references
 * them when HAVE_BITDEPTH8 is enabled. The open function takes an extra
 * void * argument, which x264_encoder_open() fills with its dispatch table. */
x264_t *x264_8_encoder_open( x264_param_t *, void * );
void x264_8_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal );
int x264_8_encoder_reconfig( x264_t *, x264_param_t * );
void x264_8_encoder_parameters( x264_t *, x264_param_t * );
int x264_8_encoder_headers( x264_t *, x264_nal_t **pp_nal, int *pi_nal );
int x264_8_encoder_encode( x264_t *, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_in, x264_picture_t *pic_out );
void x264_8_encoder_close( x264_t * );
int x264_8_encoder_delayed_frames( x264_t * );
int x264_8_encoder_maximum_delayed_frames( x264_t * );
void x264_8_encoder_intra_refresh( x264_t * );
int x264_8_encoder_invalidate_reference( x264_t *, int64_t pts );
/* Prototypes of the 10-bit encoder entry points (x264_10_ prefix); same
 * signatures as the 8-bit set, referenced only when HAVE_BITDEPTH10 is enabled. */
x264_t *x264_10_encoder_open( x264_param_t *, void * );
void x264_10_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal );
int x264_10_encoder_reconfig( x264_t *, x264_param_t * );
void x264_10_encoder_parameters( x264_t *, x264_param_t * );
int x264_10_encoder_headers( x264_t *, x264_nal_t **pp_nal, int *pi_nal );
int x264_10_encoder_encode( x264_t *, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_in, x264_picture_t *pic_out );
void x264_10_encoder_close( x264_t * );
int x264_10_encoder_delayed_frames( x264_t * );
int x264_10_encoder_maximum_delayed_frames( x264_t * );
void x264_10_encoder_intra_refresh( x264_t * );
int x264_10_encoder_invalidate_reference( x264_t *, int64_t pts );
/* Dispatch table behind the public x264_t handle.
 * x264_encoder_open() allocates one of these, fills the function pointers with
 * the entry points of the selected bit depth, and returns it cast to x264_t *.
 * Every public wrapper in this file casts the handle back and forwards the
 * call to the matching pointer, passing the real encoder stored in `x264`. */
typedef struct x264_api_t
{
/* Internal reference to x264_t data */
x264_t *x264;
/* API entry points */
void (*nal_encode)( x264_t *h, uint8_t *dst, x264_nal_t *nal );
int (*encoder_reconfig)( x264_t *, x264_param_t * );
void (*encoder_parameters)( x264_t *, x264_param_t * );
int (*encoder_headers)( x264_t *, x264_nal_t **pp_nal, int *pi_nal );
int (*encoder_encode)( x264_t *, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_in, x264_picture_t *pic_out );
void (*encoder_close)( x264_t * );
int (*encoder_delayed_frames)( x264_t * );
int (*encoder_maximum_delayed_frames)( x264_t * );
void (*encoder_intra_refresh)( x264_t * );
int (*encoder_invalidate_reference)( x264_t *, int64_t pts );
} x264_api_t;
/* Opens an encoder for the bit depth requested in param->i_bitdepth.
 *
 * A zeroed dispatch table is allocated, populated with the entry points of the
 * matching bit-depth build, and the real encoder is opened through it.
 * Returns the dispatch table disguised as an x264_t *, or NULL when the
 * allocation fails, the bit depth is not compiled in (an error is logged),
 * or the underlying open call returns NULL. */
REALIGN_STACK x264_t *x264_encoder_open( x264_param_t *param )
{
    x264_api_t *dispatch = calloc( 1, sizeof( x264_api_t ) );
    if( dispatch == NULL )
        return NULL;

    switch( param->i_bitdepth )
    {
#if HAVE_BITDEPTH8
        case 8:
            dispatch->nal_encode = x264_8_nal_encode;
            dispatch->encoder_reconfig = x264_8_encoder_reconfig;
            dispatch->encoder_parameters = x264_8_encoder_parameters;
            dispatch->encoder_headers = x264_8_encoder_headers;
            dispatch->encoder_encode = x264_8_encoder_encode;
            dispatch->encoder_close = x264_8_encoder_close;
            dispatch->encoder_delayed_frames = x264_8_encoder_delayed_frames;
            dispatch->encoder_maximum_delayed_frames = x264_8_encoder_maximum_delayed_frames;
            dispatch->encoder_intra_refresh = x264_8_encoder_intra_refresh;
            dispatch->encoder_invalidate_reference = x264_8_encoder_invalidate_reference;
            /* Open last, so the table is complete when the encoder receives it. */
            dispatch->x264 = x264_8_encoder_open( param, dispatch );
            break;
#endif
#if HAVE_BITDEPTH10
        case 10:
            dispatch->nal_encode = x264_10_nal_encode;
            dispatch->encoder_reconfig = x264_10_encoder_reconfig;
            dispatch->encoder_parameters = x264_10_encoder_parameters;
            dispatch->encoder_headers = x264_10_encoder_headers;
            dispatch->encoder_encode = x264_10_encoder_encode;
            dispatch->encoder_close = x264_10_encoder_close;
            dispatch->encoder_delayed_frames = x264_10_encoder_delayed_frames;
            dispatch->encoder_maximum_delayed_frames = x264_10_encoder_maximum_delayed_frames;
            dispatch->encoder_intra_refresh = x264_10_encoder_intra_refresh;
            dispatch->encoder_invalidate_reference = x264_10_encoder_invalidate_reference;
            dispatch->x264 = x264_10_encoder_open( param, dispatch );
            break;
#endif
        default:
            x264_log_internal( X264_LOG_ERROR, "not compiled with %d bit depth support\n", param->i_bitdepth );
            break;
    }

    /* calloc left x264 NULL, so this also covers the unsupported-depth path. */
    if( dispatch->x264 == NULL )
    {
        free( dispatch );
        return NULL;
    }

    /* x264_t is opaque */
    return (x264_t *)dispatch;
}
/* Closes the underlying encoder, then releases the dispatch table that
 * x264_encoder_open() handed out as the public handle. */
REALIGN_STACK void x264_encoder_close( x264_t *h )
{
    x264_api_t *dispatch = (x264_api_t *)h;
    x264_t *encoder = dispatch->x264;

    dispatch->encoder_close( encoder );
    free( dispatch );
}
/* Forwards to the nal_encode entry point of the bit depth chosen at open time. */
REALIGN_STACK void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal )
{
    x264_api_t *dispatch = (x264_api_t *)h;
    x264_t *encoder = dispatch->x264;

    dispatch->nal_encode( encoder, dst, nal );
}
/* Forwards to the encoder_reconfig entry point and returns its result unchanged. */
REALIGN_STACK int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
{
    x264_api_t *dispatch = (x264_api_t *)h;
    x264_t *encoder = dispatch->x264;

    return dispatch->encoder_reconfig( encoder, param );
}
/* Forwards to the encoder_parameters entry point of the selected bit depth. */
REALIGN_STACK void x264_encoder_parameters( x264_t *h, x264_param_t *param )
{
    x264_api_t *dispatch = (x264_api_t *)h;
    x264_t *encoder = dispatch->x264;

    dispatch->encoder_parameters( encoder, param );
}
/* Forwards to the encoder_headers entry point and returns its result unchanged. */
REALIGN_STACK int x264_encoder_headers( x264_t *h, x264_nal_t **pp_nal, int *pi_nal )
{
    x264_api_t *dispatch = (x264_api_t *)h;
    x264_t *encoder = dispatch->x264;

    return dispatch->encoder_headers( encoder, pp_nal, pi_nal );
}
/* Forwards to the encoder_encode entry point and returns its result unchanged. */
REALIGN_STACK int x264_encoder_encode( x264_t *h, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_in, x264_picture_t *pic_out )
{
    x264_api_t *dispatch = (x264_api_t *)h;
    x264_t *encoder = dispatch->x264;

    return dispatch->encoder_encode( encoder, pp_nal, pi_nal, pic_in, pic_out );
}
/* Forwards to the encoder_delayed_frames entry point and returns its result. */
REALIGN_STACK int x264_encoder_delayed_frames( x264_t *h )
{
    x264_api_t *dispatch = (x264_api_t *)h;
    x264_t *encoder = dispatch->x264;

    return dispatch->encoder_delayed_frames( encoder );
}
/* Forwards to the encoder_maximum_delayed_frames entry point and returns its result. */
REALIGN_STACK int x264_encoder_maximum_delayed_frames( x264_t *h )
{
    x264_api_t *dispatch = (x264_api_t *)h;
    x264_t *encoder = dispatch->x264;

    return dispatch->encoder_maximum_delayed_frames( encoder );
}
/* Forwards to the encoder_intra_refresh entry point of the selected bit depth. */
REALIGN_STACK void x264_encoder_intra_refresh( x264_t *h )
{
    x264_api_t *dispatch = (x264_api_t *)h;
    x264_t *encoder = dispatch->x264;

    dispatch->encoder_intra_refresh( encoder );
}
/* Forwards to the encoder_invalidate_reference entry point and returns its result. */
REALIGN_STACK int x264_encoder_invalidate_reference( x264_t *h, int64_t pts )
{
    x264_api_t *dispatch = (x264_api_t *)h;
    x264_t *encoder = dispatch->x264;

    return dispatch->encoder_invalidate_reference( encoder, pts );
}

1239
encoder/cabac.c Normal file

File diff suppressed because it is too large Load Diff

722
encoder/cavlc.c Normal file
View File

@@ -0,0 +1,722 @@
/*****************************************************************************
* cavlc.c: cavlc bitstream writing
*****************************************************************************
* Copyright (C) 2003-2025 x264 project
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
* Loren Merritt <lorenm@u.washington.edu>
* Fiona Glaser <fiona@x264.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common/common.h"
#include "macroblock.h"
/* RDO_SKIP_BS selects between the two builds of this file: 0 (the default
 * when nothing defined it beforehand) writes the real bitstream and updates
 * statistics; non-zero compiles the bit-counting variants further down, which
 * accumulate into bs_t::i_bits_encoded instead. */
#ifndef RDO_SKIP_BS
#define RDO_SKIP_BS 0
#endif
/* Coded-block-pattern to ue(v) code number, indexed as
 * [chroma][is_intra][(i_cbp_chroma << 4) | i_cbp_luma].
 * The first index is 0 for the no-separate-chroma case (original label "400")
 * and 1 for the case with chroma (original label "420"); the second index is
 * 0 for inter and 1 for intra macroblocks. The no-chroma rows only list 16
 * entries; the remaining elements of each 48-wide row are zero-initialised. */
static const uint8_t cbp_to_golomb[2][2][48] =
{
{{ 0, 1, 2, 5, 3, 6, 14, 10, 4, 15, 7, 11, 8, 12, 13, 9 },
{ 1, 10, 11, 6, 12, 7, 14, 2, 13, 15, 8, 3, 9, 4, 5, 0 }},
{{ 0, 2, 3, 7, 4, 8, 17, 13, 5, 18, 9, 14, 10, 15, 16, 11,
1, 32, 33, 36, 34, 37, 44, 40, 35, 45, 38, 41, 39, 42, 43, 19,
6, 24, 25, 20, 26, 21, 46, 28, 27, 47, 22, 29, 23, 30, 31, 12 },
{ 3, 29, 30, 17, 31, 18, 37, 8, 32, 38, 19, 9, 20, 10, 11, 2,
16, 33, 34, 21, 35, 22, 39, 4, 36, 40, 23, 5, 24, 6, 7, 1,
41, 42, 43, 25, 44, 26, 46, 12, 45, 47, 27, 13, 28, 14, 15, 0 }}
};
/* B macroblock type to ue(v) code number, indexed as
 * [i_partition - D_16x8][i_mb_type - B_L0_L0].
 * Slots marked 0xff are placeholders (presumably combinations that never occur
 * for a 16x16 partition -- confirm against the callers). They were previously
 * written as -1, which silently wrapped to the same value in a uint8_t table;
 * spelling out 0xff keeps the stored bytes identical without relying on an
 * implicit negative-to-unsigned conversion. */
static const uint8_t mb_type_b_to_golomb[3][9] =
{
    { 4,    8,   12,   10, 6,   14,   16,   18, 20 }, /* D_16x8 */
    { 5,    9,   13,   11, 7,   15,   17,   19, 21 }, /* D_8x16 */
    { 1, 0xff, 0xff, 0xff, 2, 0xff, 0xff, 0xff,  3 }  /* D_16x16 */
};
/* ue(v) code number written for each P sub-macroblock partition, indexed by
 * the value stored in h->mb.i_sub_partition[]. */
static const uint8_t subpartition_p_to_golomb[4] =
{
    [0] = 3,
    [1] = 1,
    [2] = 2,
    [3] = 0
};
/* ue(v) code number written for each B sub-macroblock partition, indexed by
 * the value stored in h->mb.i_sub_partition[]. */
static const uint8_t subpartition_b_to_golomb[13] =
{
    [0]  = 10, [1]  = 4, [2]  = 5, [3]  = 1,
    [4]  = 11, [5]  = 6, [6]  = 7, [7]  = 2,
    [8]  = 12, [9]  = 8, [10] = 9, [11] = 3,
    [12] = 0
};
/* Writes one VLC table entry: the entry's i_size field is passed as the bit
 * count and its i_bits field as the code word. */
#define bs_write_vlc(s,v) bs_write( s, (v).i_size, (v).i_bits )
/****************************************************************************
* x264_cavlc_block_residual:
****************************************************************************/
/* Writes one coefficient level that the caller could not code from the
 * x264_level_token table, and returns the suffix length to use for the next
 * level.
 *   h               encoder context (bitstream, SPS profile, overflow flag)
 *   i_suffix_length current suffix length
 *   level           signed level value to code */
static inline int cavlc_block_residual_escape( x264_t *h, int i_suffix_length, int level )
{
bs_t *s = &h->out.bs;
/* Magnitude thresholds above which the suffix length grows; indexed by the
 * current suffix length. 0xffff stops growth at the last entry. */
static const uint16_t next_suffix[7] = { 0, 3, 6, 12, 24, 48, 0xffff };
int i_level_prefix = 15;
/* mask is all-ones for a negative level and 0 otherwise (relies on an
 * arithmetic right shift of a negative int). */
int mask = level >> 31;
/* Branchless absolute value. */
int abs_level = (level^mask)-mask;
/* 2*|level|-2 for positive levels, 2*|level|-1 for negative ones. */
int i_level_code = abs_level*2-mask-2;
if( ( i_level_code >> i_suffix_length ) < 15 )
{
/* Regular code: prefix (zeros followed by a 1) and suffix emitted in a
 * single write; the leading 1 of the value terminates the prefix. */
bs_write( s, (i_level_code >> i_suffix_length) + 1 + i_suffix_length,
(1<<i_suffix_length) + (i_level_code & ((1<<i_suffix_length)-1)) );
}
else
{
/* Escape code: remove the range already covered by prefixes below 15. */
i_level_code -= 15 << i_suffix_length;
if( i_suffix_length == 0 )
i_level_code -= 15;
/* If the prefix size exceeds 15, High Profile is required. */
if( i_level_code >= 1<<12 )
{
if( h->sps->i_profile_idc >= PROFILE_HIGH )
{
/* Grow the prefix until the remainder fits in prefix-3 suffix bits. */
while( i_level_code >= 1<<(i_level_prefix-3) )
{
i_level_code -= 1<<(i_level_prefix-3);
i_level_prefix++;
}
}
else
{
#if RDO_SKIP_BS
/* Weight highly against overflows. */
s->i_bits_encoded += 2000;
#else
/* We've had an overflow; note it down and re-encode the MB later. */
h->mb.b_overflow = 1;
#endif
}
}
/* Prefix: i_level_prefix zero bits followed by a 1. */
bs_write( s, i_level_prefix + 1, 1 );
/* Suffix: the low (i_level_prefix - 3) bits of the remaining code. */
bs_write( s, i_level_prefix - 3, i_level_code & ((1<<(i_level_prefix-3))-1) );
}
/* Suffix length adaptation: 0 always becomes 1, then grows by one more when
 * the magnitude exceeds the threshold for the (updated) length. */
if( i_suffix_length == 0 )
i_suffix_length++;
if( abs_level > next_suffix[i_suffix_length] )
i_suffix_length++;
return i_suffix_length;
}
/* Writes the CAVLC residual of one block: coeff token, trailing-one signs,
 * remaining levels, total zeros and the run codes.
 *   h             encoder context
 *   ctx_block_cat block category; selects the coeff_level_run function and
 *                 the count_cat limit
 *   l             coefficient array handed to coeff_level_run
 *   nC            first index into x264_coeff_token
 * Returns the coefficient count reported by coeff_level_run; the
 * x264_cavlc_block_residual macro stores that value back into the
 * non-zero-count cache. The macro only calls this when the cached count is
 * non-zero, and the i_total-1 table indices rely on i_total >= 1. */
static int cavlc_block_residual_internal( x264_t *h, int ctx_block_cat, dctcoef *l, int nC )
{
bs_t *s = &h->out.bs;
/* Maps the 3-bit "magnitude > 1" mask to the number of low zero bits, i.e.
 * the count of leading levels with magnitude <= 1 (capped at 3). */
static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0};
/* Per-category limit on i_total below which total_zeros is written. */
static const uint8_t count_cat[14] = {16, 15, 16, 0, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64};
x264_run_level_t runlevel;
int i_total, i_trailing, i_total_zero, i_suffix_length;
unsigned int i_sign;
/* level and run and total */
i_total = h->quantf.coeff_level_run[ctx_block_cat]( l, &runlevel );
x264_prefetch( &x264_run_before[runlevel.mask] );
i_total_zero = runlevel.last + 1 - i_total;
/* branchless i_trailing calculation */
/* Two sentinel levels of 2 after the real ones make the test below stop at
 * the end of the list without checking i_total. */
runlevel.level[i_total+0] = 2;
runlevel.level[i_total+1] = 2;
i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1
| ((((runlevel.level[1]+1) | (1-runlevel.level[1])) >> 31) & 2)
| ((((runlevel.level[2]+1) | (1-runlevel.level[2])) >> 31) & 4);
i_trailing = ctz_index[i_trailing];
/* Sign bits of the first three levels, level[0] in the highest position;
 * the shift drops the positions that are not trailing ones. */
i_sign = ((runlevel.level[2] >> 31) & 1)
| ((runlevel.level[1] >> 31) & 2)
| ((runlevel.level[0] >> 31) & 4);
i_sign >>= 3-i_trailing;
/* total/trailing */
bs_write_vlc( s, x264_coeff_token[nC][i_total-1][i_trailing] );
/* Initial suffix length is 1 only for more than 10 coefficients with fewer
 * than 3 trailing ones, otherwise 0. */
i_suffix_length = i_total > 10 && i_trailing < 3;
bs_write( s, i_trailing, i_sign );
if( i_trailing < i_total )
{
int val = runlevel.level[i_trailing];
/* Unmodified level, biased into table range: used for the range check and
 * for the next-suffix lookup. */
int val_original = runlevel.level[i_trailing]+LEVEL_TABLE_SIZE/2;
/* ((val>>31)|1) is the sign (+1/-1); -(cond) is an all-ones or zero mask, so
 * the magnitude is reduced by one only when i_trailing < 3. */
val -= ((val>>31)|1) & -(i_trailing < 3); /* as runlevel.level[i] can't be 1 for the first one if i_trailing < 3 */
val += LEVEL_TABLE_SIZE/2;
if( (unsigned)val_original < LEVEL_TABLE_SIZE )
{
bs_write_vlc( s, x264_level_token[i_suffix_length][val] );
i_suffix_length = x264_level_token[i_suffix_length][val_original].i_next;
}
else
i_suffix_length = cavlc_block_residual_escape( h, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
/* Remaining levels: table lookup when in range, escape coding otherwise. */
for( int i = i_trailing+1; i < i_total; i++ )
{
val = runlevel.level[i] + LEVEL_TABLE_SIZE/2;
if( (unsigned)val < LEVEL_TABLE_SIZE )
{
bs_write_vlc( s, x264_level_token[i_suffix_length][val] );
i_suffix_length = x264_level_token[i_suffix_length][val].i_next;
}
else
i_suffix_length = cavlc_block_residual_escape( h, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
}
}
/* total_zeros: chroma DC uses the 2x2 or 2x4 table depending on the chroma
 * format; every other category uses the generic table. Nothing is written
 * once i_total reaches the category's limit. */
if( ctx_block_cat == DCT_CHROMA_DC )
{
if( i_total < 8>>CHROMA_V_SHIFT )
{
vlc_t total_zeros = CHROMA_FORMAT == CHROMA_420 ? x264_total_zeros_2x2_dc[i_total-1][i_total_zero]
: x264_total_zeros_2x4_dc[i_total-1][i_total_zero];
bs_write_vlc( s, total_zeros );
}
}
else if( (uint8_t)i_total < count_cat[ctx_block_cat] )
bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );
/* Run codes come packed from a table keyed by the coefficient mask:
 * low 5 bits = bit count, remaining high bits = code word. */
int zero_run_code = x264_run_before[runlevel.mask];
bs_write( s, zero_run_code&0x1f, zero_run_code>>5 );
return i_total;
}
/* Maps the predicted non-zero coefficient count (0..16) to the nC index used
 * to select a coeff token table. */
static const uint8_t ct_index[17] =
{
    0, 0,                       /*  0..1  */
    1, 1,                       /*  2..3  */
    2, 2, 2, 2,                 /*  4..7  */
    3, 3, 3, 3, 3, 3, 3, 3, 3   /*  8..16 */
};
/* Writes the residual of block `idx` of category `cat` from coefficients `l`.
 * nC is a fixed value derived from CHROMA_V_SHIFT for chroma DC, otherwise it
 * is looked up via ct_index from the predicted non-zero count. When the cached
 * non-zero count of the block is 0 only the "no coefficients" token is
 * written; otherwise the block is coded and the cache entry is overwritten
 * with the count returned by cavlc_block_residual_internal().
 * The expansion is a bare { } block and `idx` is evaluated more than once, so
 * use it only as a full statement and pass side-effect-free arguments. */
#define x264_cavlc_block_residual(h,cat,idx,l)\
{\
int nC = cat == DCT_CHROMA_DC ? 5 - CHROMA_V_SHIFT\
: ct_index[x264_mb_predict_non_zero_code( h, cat == DCT_LUMA_DC ? (idx - LUMA_DC)*16 : idx )];\
uint8_t *nnz = &h->mb.cache.non_zero_count[x264_scan8[idx]];\
if( !*nnz )\
bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] );\
else\
*nnz = cavlc_block_residual_internal(h,cat,l,nC);\
}
/* Writes the macroblock QP delta (current QP minus the last coded QP) as a
 * signed Exp-Golomb value, wrapped into the range allowed around zero. */
static void cavlc_qp_delta( x264_t *h )
{
    bs_t *s = &h->out.bs;
    int i_dqp = h->mb.i_qp - h->mb.i_last_qp;

    /* An I16x16 block with no coded coefficients at all (e.g. a completely
     * flat background area) does not need a delta. Only suppress it when the
     * QP would otherwise go up, since raising the quantizer that way could
     * cause unexpected deblocking artifacts. */
    const int b_empty_i16x16 =
        h->mb.i_type == I_16x16
        && (h->mb.i_cbp_luma | h->mb.i_cbp_chroma) == 0
        && h->mb.cache.non_zero_count[x264_scan8[LUMA_DC]] == 0
        && h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] == 0
        && h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] == 0;

    if( b_empty_i16x16 && h->mb.i_qp > h->mb.i_last_qp )
    {
#if !RDO_SKIP_BS
        h->mb.i_qp = h->mb.i_last_qp;
#endif
        i_dqp = 0;
    }

    if( i_dqp != 0 )
    {
        /* Wrap the delta modulo QP_MAX_SPEC+1. */
        if( i_dqp < -(QP_MAX_SPEC+1)/2 )
            i_dqp += QP_MAX_SPEC+1;
        else if( i_dqp > QP_MAX_SPEC/2 )
            i_dqp -= QP_MAX_SPEC+1;
    }

    bs_write_se( s, i_dqp );
}
/* Writes the motion vector difference of one partition: the cached vector of
 * block `idx` in list `i_list` minus its prediction, x component first and
 * then y, each as a signed Exp-Golomb value. `width` is forwarded to the
 * motion vector predictor. */
static void cavlc_mvd( x264_t *h, int i_list, int idx, int width )
{
    bs_t *s = &h->out.bs;
    const int pos = x264_scan8[idx];
    ALIGNED_4( int16_t mvp[2] );

    x264_mb_predict_mv( h, i_list, idx, width, mvp );

    for( int comp = 0; comp < 2; comp++ )
        bs_write_se( s, h->mb.cache.mv[i_list][pos][comp] - mvp[comp] );
}
/* Writes the list-0 motion vector differences of 8x8 block `i` according to
 * its sub-partition type. Sub-partition types not listed write nothing. */
static inline void cavlc_8x8_mvd( x264_t *h, int i )
{
    const int part = h->mb.i_sub_partition[i];
    const int base = 4*i;

    if( part == D_L0_8x8 )
    {
        cavlc_mvd( h, 0, base, 2 );
    }
    else if( part == D_L0_8x4 )
    {
        /* two 8-wide halves: blocks 0 and 2 of the 8x8 */
        cavlc_mvd( h, 0, base+0, 2 );
        cavlc_mvd( h, 0, base+2, 2 );
    }
    else if( part == D_L0_4x8 )
    {
        /* two 4-wide halves: blocks 0 and 1 of the 8x8 */
        cavlc_mvd( h, 0, base+0, 1 );
        cavlc_mvd( h, 0, base+1, 1 );
    }
    else if( part == D_L0_4x4 )
    {
        for( int sub = 0; sub < 4; sub++ )
            cavlc_mvd( h, 0, base+sub, 1 );
    }
}
/* Writes the luma residual of the whole macroblock for `plane_count` planes
 * (callers pass 3 for CHROMA444 and 1 otherwise). */
static ALWAYS_INLINE void cavlc_macroblock_luma_residual( x264_t *h, int plane_count )
{
if( h->mb.b_transform_8x8 )
{
/* shuffle 8x8 dct coeffs into 4x4 lists */
/* Only 8x8 blocks whose first cached non-zero count is set are shuffled;
 * the interleave function also receives a pointer to that cache entry. */
for( int p = 0; p < plane_count; p++ )
for( int i8 = 0; i8 < 4; i8++ )
if( h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4]] )
h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[p*16+i8*4], h->dct.luma8x8[p*4+i8],
&h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4]] );
}
/* NOTE(review): FOREACH_BIT is defined elsewhere; presumably it iterates i8
 * over the set bits of i_cbp_luma starting at bit 0 -- confirm. For each such
 * 8x8 block its four 4x4 blocks are written. */
for( int p = 0; p < plane_count; p++ )
FOREACH_BIT( i8, 0, h->mb.i_cbp_luma )
for( int i4 = 0; i4 < 4; i4++ )
x264_cavlc_block_residual( h, DCT_LUMA_4x4, i4+i8*4+p*16, h->dct.luma4x4[i4+i8*4+p*16] );
}
#if RDO_SKIP_BS
/* RD-only helper: codes the luma residual of 8x8 block `i8` in plane `p`.
 * With the 8x8 transform active and a non-zero cached count, the 8x8
 * coefficients are first interleaved into the four 4x4 lists. The 4x4 blocks
 * are then written only if the block's bit is set in i_cbp_luma. */
static ALWAYS_INLINE void cavlc_partition_luma_residual( x264_t *h, int i8, int p )
{
    const int first_blk = i8*4 + p*16;
    uint8_t *count_8x8 = &h->mb.cache.non_zero_count[x264_scan8[first_blk]];

    if( h->mb.b_transform_8x8 && *count_8x8 )
        h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[first_blk], h->dct.luma8x8[i8+p*4], count_8x8 );

    if( !(h->mb.i_cbp_luma & (1 << i8)) )
        return;

    for( int sub = 0; sub < 4; sub++ )
    {
        const int blk = first_blk + sub;
        x264_cavlc_block_residual( h, DCT_LUMA_4x4, blk, h->dct.luma4x4[blk] );
    }
}
#endif
/* Writes the header of an intra macroblock.
 *   i_mb_type      I_16x16, or one of the per-block predicted intra types
 *   i_mb_i_offset  code number offset of the intra types in the current
 *                  slice type (callers pass 0, 5 or 23)
 *   chroma         non-zero when a chroma prediction mode must be written */
static void cavlc_mb_header_i( x264_t *h, int i_mb_type, int i_mb_i_offset, int chroma )
{
    bs_t *s = &h->out.bs;

    if( i_mb_type != I_16x16 )
    {
        /* I_4x4 or I_8x8: one prediction mode per 4x4 block, or per 8x8 block
         * (every fourth index) for I_8x8. */
        const int step = (i_mb_type == I_8x8) ? 4 : 1;

        bs_write_ue( s, i_mb_i_offset + 0 );
        if( h->pps->b_transform_8x8_mode )
            bs_write1( s, h->mb.b_transform_8x8 );

        /* Prediction: Luma */
        for( int blk = 0; blk < 16; blk += step )
        {
            const int predicted = x264_mb_predict_intra4x4_mode( h, blk );
            const int actual = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[x264_scan8[blk]] );

            if( predicted != actual )
                /* 4 bits: the mode, skipping over the predicted one */
                bs_write( s, 4, actual - (actual > predicted) );
            else
                bs_write1( s, 1 ); /* b_prev_intra4x4_pred_mode */
        }
    }
    else
    {
        /* I_16x16 folds the prediction mode and both CBPs into the type code. */
        const int luma_term = (h->mb.i_cbp_luma == 0) ? 0 : 12;

        bs_write_ue( s, i_mb_i_offset + 1 + x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode] +
                        h->mb.i_cbp_chroma * 4 + luma_term );
    }

    if( chroma )
        bs_write_ue( s, x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode] );
}
/* Writes the macroblock header for a macroblock in a P slice: the type code,
 * reference indices (only when more than one list-0 reference exists, or for
 * P_8x8 when any reference is non-zero) and the motion vector differences.
 * Any type other than P_L0 / P_8x8 is handed to the intra header writer with
 * a code offset of 5. */
static ALWAYS_INLINE void cavlc_mb_header_p( x264_t *h, int i_mb_type, int chroma )
{
bs_t *s = &h->out.bs;
if( i_mb_type == P_L0 )
{
if( h->mb.i_partition == D_16x16 )
{
/* A single 1 bit here (where the other partitions use bs_write_ue). */
bs_write1( s, 1 );
if( h->mb.pic.i_fref[0] > 1 )
bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
cavlc_mvd( h, 0, 0, 4 );
}
else if( h->mb.i_partition == D_16x8 )
{
bs_write_ue( s, 1 );
/* Both reference indices precede both motion vectors. */
if( h->mb.pic.i_fref[0] > 1 )
{
bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
}
cavlc_mvd( h, 0, 0, 4 );
cavlc_mvd( h, 0, 8, 4 );
}
else if( h->mb.i_partition == D_8x16 )
{
bs_write_ue( s, 2 );
if( h->mb.pic.i_fref[0] > 1 )
{
bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
}
cavlc_mvd( h, 0, 0, 2 );
cavlc_mvd( h, 0, 4, 2 );
}
}
else if( i_mb_type == P_8x8 )
{
int b_sub_ref;
/* When all four 8x8 references are 0, type code 4 is used and no reference
 * indices are written; otherwise type code 3 with explicit references. */
if( (h->mb.cache.ref[0][x264_scan8[0]] | h->mb.cache.ref[0][x264_scan8[ 4]] |
h->mb.cache.ref[0][x264_scan8[8]] | h->mb.cache.ref[0][x264_scan8[12]]) == 0 )
{
bs_write_ue( s, 4 );
b_sub_ref = 0;
}
else
{
bs_write_ue( s, 3 );
b_sub_ref = 1;
}
/* sub mb type */
/* Without sub-8x8 analysis four 1 bits are written in one go -- presumably
 * four zero code numbers, one per 8x8 block; confirm against bs_write_ue. */
if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
for( int i = 0; i < 4; i++ )
bs_write_ue( s, subpartition_p_to_golomb[ h->mb.i_sub_partition[i] ] );
else
bs_write( s, 4, 0xf );
/* ref0 */
if( b_sub_ref )
{
bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[12]] );
}
for( int i = 0; i < 4; i++ )
cavlc_8x8_mvd( h, i );
}
else //if( IS_INTRA( i_mb_type ) )
cavlc_mb_header_i( h, i_mb_type, 5, chroma );
}
/* Writes the macroblock header for a macroblock in a B slice: the type code,
 * then reference indices and motion vector differences for every list a
 * partition uses. B_DIRECT is a single 1 bit; any remaining type is handed to
 * the intra header writer with a code offset of 23. */
static ALWAYS_INLINE void cavlc_mb_header_b( x264_t *h, int i_mb_type, int chroma )
{
bs_t *s = &h->out.bs;
if( i_mb_type == B_8x8 )
{
bs_write_ue( s, 22 );
/* sub mb type */
for( int i = 0; i < 4; i++ )
bs_write_ue( s, subpartition_b_to_golomb[ h->mb.i_sub_partition[i] ] );
/* ref */
/* Order is: all list-0 refs, all list-1 refs, all list-0 mvds, all list-1
 * mvds. x264_mb_partition_listX_table[list][sub_partition] tells whether the
 * sub-partition uses that list. */
if( h->mb.pic.i_fref[0] > 1 )
for( int i = 0; i < 4; i++ )
if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[i*4]] );
if( h->mb.pic.i_fref[1] > 1 )
for( int i = 0; i < 4; i++ )
if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
bs_write_te( s, h->mb.pic.i_fref[1] - 1, h->mb.cache.ref[1][x264_scan8[i*4]] );
/* mvd */
for( int i = 0; i < 4; i++ )
if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
cavlc_mvd( h, 0, 4*i, 2 );
for( int i = 0; i < 4; i++ )
if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
cavlc_mvd( h, 1, 4*i, 2 );
}
else if( i_mb_type >= B_L0_L0 && i_mb_type <= B_BI_BI )
{
/* All B mode */
/* Motion Vector */
/* b_list[list][partition] is non-zero when that partition uses the list. */
const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
/* Largest reference index per list; 0 means a single reference, in which
 * case no index is written. */
const int i_ref0_max = h->mb.pic.i_fref[0] - 1;
const int i_ref1_max = h->mb.pic.i_fref[1] - 1;
bs_write_ue( s, mb_type_b_to_golomb[ h->mb.i_partition - D_16x8 ][ i_mb_type - B_L0_L0 ] );
if( h->mb.i_partition == D_16x16 )
{
if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[0]] );
if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[0]] );
if( b_list[0][0] ) cavlc_mvd( h, 0, 0, 4 );
if( b_list[1][0] ) cavlc_mvd( h, 1, 0, 4 );
}
else
{
/* Block 12 supplies the second partition's reference for both the 16x8 and
 * the 8x16 layout (the mvd calls below use blocks 8 and 4 respectively). */
if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[ 0]] );
if( i_ref0_max && b_list[0][1] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[12]] );
if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[ 0]] );
if( i_ref1_max && b_list[1][1] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[12]] );
if( h->mb.i_partition == D_16x8 )
{
if( b_list[0][0] ) cavlc_mvd( h, 0, 0, 4 );
if( b_list[0][1] ) cavlc_mvd( h, 0, 8, 4 );
if( b_list[1][0] ) cavlc_mvd( h, 1, 0, 4 );
if( b_list[1][1] ) cavlc_mvd( h, 1, 8, 4 );
}
else //if( h->mb.i_partition == D_8x16 )
{
if( b_list[0][0] ) cavlc_mvd( h, 0, 0, 2 );
if( b_list[0][1] ) cavlc_mvd( h, 0, 4, 2 );
if( b_list[1][0] ) cavlc_mvd( h, 1, 0, 2 );
if( b_list[1][1] ) cavlc_mvd( h, 1, 4, 2 );
}
}
}
else if( i_mb_type == B_DIRECT )
bs_write1( s, 1 );
else //if( IS_INTRA( i_mb_type ) )
cavlc_mb_header_i( h, i_mb_type, 23, chroma );
}
/*****************************************************************************
* x264_macroblock_write_cavlc:
* Writes the current macroblock (h->mb) to h->out.bs using CAVLC: optional
* MBAFF field flag, macroblock header, coded block pattern, 8x8 transform
* flag, QP delta and residual data.
* With RDO_SKIP_BS == 0 the header and residual bit counts are also added to
* h->stat.frame.i_mv_bits / i_tex_bits, and I_PCM macroblocks are supported.
* With RDO_SKIP_BS != 0 the bit counter is reset instead and the statistics
* and I_PCM path are compiled out.
*****************************************************************************/
void x264_macroblock_write_cavlc( x264_t *h )
{
bs_t *s = &h->out.bs;
const int i_mb_type = h->mb.i_type;
/* 4:4:4 codes three planes through the luma path. */
int plane_count = CHROMA444 ? 3 : 1;
/* Separate chroma syntax exists only for 4:2:0 and 4:2:2. */
int chroma = CHROMA_FORMAT == CHROMA_420 || CHROMA_FORMAT == CHROMA_422;
#if RDO_SKIP_BS
s->i_bits_encoded = 0;
#else
const int i_mb_pos_start = bs_pos( s );
int i_mb_pos_tex;
#endif
/* In MBAFF slices the field flag is written for macroblocks on an even row,
 * or on an odd row when the macroblock one stride earlier was skipped. */
if( SLICE_MBAFF
&& (!(h->mb.i_mb_y & 1) || IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride])) )
{
bs_write1( s, MB_INTERLACED );
#if !RDO_SKIP_BS
h->mb.field_decoding_flag = MB_INTERLACED;
#endif
}
#if !RDO_SKIP_BS
if( i_mb_type == I_PCM )
{
/* Code number offset of the intra types, indexed by h->sh.i_type; these are
 * the same offsets the P, B and I header writers receive below. */
static const uint8_t i_offsets[3] = {5,23,0};
uint8_t *p_start = s->p_start;
bs_write_ue( s, i_offsets[h->sh.i_type] + 25 );
i_mb_pos_tex = bs_pos( s );
h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
bs_align_0( s );
/* Raw samples: 256 per luma-path plane, then the chroma rows. */
for( int p = 0; p < plane_count; p++ )
for( int i = 0; i < 256; i++ )
bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[p][i] );
if( chroma )
for( int ch = 1; ch < 3; ch++ )
for( int i = 0; i < 16>>CHROMA_V_SHIFT; i++ )
for( int j = 0; j < 8; j++ )
bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] );
/* Re-initialise the writer at the current position, then restore the
 * original start pointer that bs_init() replaced. */
bs_init( s, s->p, s->p_end - s->p );
s->p_start = p_start;
h->stat.frame.i_tex_bits += bs_pos(s) - i_mb_pos_tex;
return;
}
#endif
if( h->sh.i_type == SLICE_TYPE_P )
cavlc_mb_header_p( h, i_mb_type, chroma );
else if( h->sh.i_type == SLICE_TYPE_B )
cavlc_mb_header_b( h, i_mb_type, chroma );
else //if( h->sh.i_type == SLICE_TYPE_I )
cavlc_mb_header_i( h, i_mb_type, 0, chroma );
#if !RDO_SKIP_BS
/* Everything written so far is accounted as header/motion bits. */
i_mb_pos_tex = bs_pos( s );
h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
#endif
/* Coded block pattern */
/* Not written for I_16x16: there the header's type code already includes it. */
if( i_mb_type != I_16x16 )
bs_write_ue( s, cbp_to_golomb[chroma][IS_INTRA(i_mb_type)][(h->mb.i_cbp_chroma << 4)|h->mb.i_cbp_luma] );
/* transform size 8x8 flag */
if( x264_mb_transform_8x8_allowed( h ) && h->mb.i_cbp_luma )
bs_write1( s, h->mb.b_transform_8x8 );
if( i_mb_type == I_16x16 )
{
cavlc_qp_delta( h );
/* DC Luma */
for( int p = 0; p < plane_count; p++ )
{
x264_cavlc_block_residual( h, DCT_LUMA_DC, LUMA_DC+p, h->dct.luma16x16_dc[p] );
/* AC Luma */
/* The +1 skips the first coefficient of each 4x4 block (the DC term is
 * coded separately above). */
if( h->mb.i_cbp_luma )
for( int i = p*16; i < p*16+16; i++ )
x264_cavlc_block_residual( h, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1 );
}
}
else if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma )
{
/* QP delta is only present when some residual is coded. */
cavlc_qp_delta( h );
cavlc_macroblock_luma_residual( h, plane_count );
}
if( h->mb.i_cbp_chroma )
{
/* Chroma DC residual present */
x264_cavlc_block_residual( h, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0] );
x264_cavlc_block_residual( h, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1] );
if( h->mb.i_cbp_chroma == 2 ) /* Chroma AC residual present */
{
/* Block indices 16.. and 32.. hold the two chroma planes; step selects how
 * many groups of four blocks each plane contributes. */
int step = 8 << CHROMA_V_SHIFT;
for( int i = 16; i < 3*16; i += step )
for( int j = i; j < i+4; j++ )
x264_cavlc_block_residual( h, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1 );
}
}
#if !RDO_SKIP_BS
h->stat.frame.i_tex_bits += bs_pos(s) - i_mb_pos_tex;
#endif
}
#if RDO_SKIP_BS
/*****************************************************************************
* RD only; doesn't generate a valid bitstream
* doesn't write cbp or chroma dc (I don't know how much this matters)
* doesn't write ref (never varies between calls, so no point in doing so)
* only writes subpartition for p8x8, needed for sub-8x8 mode decision RDO
* works on all partition sizes except 16x16
*****************************************************************************/
/* RD-only bit count for one inter partition.
 *   i8       index of the (first) 8x8 block of the partition
 *   i_pixel  partition size; also decides whether a second 8x8 block is
 *            visited
 * Resets the bit counter, codes the motion data and residual of the
 * partition and returns the accumulated h->out.bs.i_bits_encoded. */
static int partition_size_cavlc( x264_t *h, int i8, int i_pixel )
{
bs_t *s = &h->out.bs;
const int i_mb_type = h->mb.i_type;
int b_8x16 = h->mb.i_partition == D_8x16;
int plane_count = CHROMA444 ? 3 : 1;
int j;
h->out.bs.i_bits_encoded = 0;
if( i_mb_type == P_8x8 )
{
/* Only the total size matters here, so syntax order is not significant. */
cavlc_8x8_mvd( h, i8 );
bs_write_ue( s, subpartition_p_to_golomb[ h->mb.i_sub_partition[i8] ] );
}
else if( i_mb_type == P_L0 )
/* Predictor width: 4 normally, 2 for 8x16 partitions. */
cavlc_mvd( h, 0, 4*i8, 4>>b_8x16 );
else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
{
/* !!i8 selects the partition: 0 for the first, 1 for any later block. */
if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) cavlc_mvd( h, 0, 4*i8, 4>>b_8x16 );
if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) cavlc_mvd( h, 1, 4*i8, 4>>b_8x16 );
}
else //if( i_mb_type == B_8x8 )
{
if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] )
cavlc_mvd( h, 0, 4*i8, 2 );
if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
cavlc_mvd( h, 1, 4*i8, 2 );
}
/* Two iterations when i_pixel compares below PIXEL_8x8 (the partition then
 * spans two 8x8 blocks), otherwise one. */
for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
{
for( int p = 0; p < plane_count; p++ )
cavlc_partition_luma_residual( h, i8, p );
if( h->mb.i_cbp_chroma )
{
if( CHROMA_FORMAT == CHROMA_422 )
{
/* (5*i8) & 0x09 maps i8 = 0,1,2,3 to offsets 0,1,8,9. */
int offset = (5*i8) & 0x09;
x264_cavlc_block_residual( h, DCT_CHROMA_AC, 16+offset, h->dct.luma4x4[16+offset]+1 );
x264_cavlc_block_residual( h, DCT_CHROMA_AC, 18+offset, h->dct.luma4x4[18+offset]+1 );
x264_cavlc_block_residual( h, DCT_CHROMA_AC, 32+offset, h->dct.luma4x4[32+offset]+1 );
x264_cavlc_block_residual( h, DCT_CHROMA_AC, 34+offset, h->dct.luma4x4[34+offset]+1 );
}
else
{
/* One chroma AC block per plane for this 8x8 block. */
x264_cavlc_block_residual( h, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1 );
x264_cavlc_block_residual( h, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1 );
}
}
/* Step to the partition's second 8x8 block: the advance is the partition
 * height in units of 8 rows. */
i8 += x264_pixel_size[i_pixel].h >> 3;
}
return h->out.bs.i_bits_encoded;
}
/* RD-only bit count for one sub-8x8 partition starting at 4x4 block `i4`:
 * the list-0 motion vector difference plus the luma residual of the one or
 * two 4x4 blocks the partition covers. Returns the accumulated bit count. */
static int subpartition_size_cavlc( x264_t *h, int i4, int i_pixel )
{
    const int n_planes = CHROMA444 ? 3 : 1;
    const int b_8x4 = (i_pixel == PIXEL_8x4);
    /* Second block of a two-block partition: +1 for 8x4, +2 otherwise. */
    const int second_step = 2 - b_8x4;

    h->out.bs.i_bits_encoded = 0;
    cavlc_mvd( h, 0, i4, 1+b_8x4 );

    for( int plane = 0; plane < n_planes; plane++ )
    {
        const int first_blk = plane*16 + i4;
        x264_cavlc_block_residual( h, DCT_LUMA_4x4, first_blk, h->dct.luma4x4[first_blk] );

        if( i_pixel != PIXEL_4x4 )
        {
            const int second_blk = first_blk + second_step;
            x264_cavlc_block_residual( h, DCT_LUMA_4x4, second_blk, h->dct.luma4x4[second_blk] );
        }
    }

    return h->out.bs.i_bits_encoded;
}
/* Bit cost of signalling intra prediction mode `i_mode` for block `i4`:
 * 1 bit when it equals the predicted mode, 4 bits otherwise. */
static int cavlc_intra4x4_pred_size( x264_t *h, int i4, int i_mode )
{
    const int b_predicted = x264_mb_predict_intra4x4_mode( h, i4 ) == x264_mb_pred_mode4x4_fix( i_mode );
    return b_predicted ? 1 : 4;
}
/* RD-only bit count for one I8x8 block: prediction mode cost, the coded
 * block pattern code, and the luma residual of 8x8 block `i8` in every plane. */
static int partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode )
{
    bs_t *s = &h->out.bs;
    const int n_planes = CHROMA444 ? 3 : 1;

    s->i_bits_encoded = cavlc_intra4x4_pred_size( h, 4*i8, i_mode );
    bs_write_ue( s, cbp_to_golomb[!CHROMA444][1][(h->mb.i_cbp_chroma << 4)|h->mb.i_cbp_luma] );

    for( int plane = 0; plane < n_planes; plane++ )
        cavlc_partition_luma_residual( h, i8, plane );

    return s->i_bits_encoded;
}
/* RD-only bit count for one I4x4 block: prediction mode cost plus the luma
 * residual of 4x4 block `i4` in every plane. */
static int partition_i4x4_size_cavlc( x264_t *h, int i4, int i_mode )
{
    const int n_planes = CHROMA444 ? 3 : 1;

    h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, i4, i_mode );

    for( int plane = 0; plane < n_planes; plane++ )
    {
        const int blk = plane*16 + i4;
        x264_cavlc_block_residual( h, DCT_LUMA_4x4, blk, h->dct.luma4x4[blk] );
    }

    return h->out.bs.i_bits_encoded;
}
/* RD-only bit count for the chroma part of a macroblock: the chroma
 * prediction mode code, then the DC blocks when any chroma is coded, then the
 * AC blocks when i_cbp_chroma is 2. Returns the accumulated bit count. */
static int chroma_size_cavlc( x264_t *h )
{
    h->out.bs.i_bits_encoded = bs_size_ue( x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode] );

    if( !h->mb.i_cbp_chroma )
        return h->out.bs.i_bits_encoded;

    x264_cavlc_block_residual( h, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0] );
    x264_cavlc_block_residual( h, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1] );

    if( h->mb.i_cbp_chroma != 2 )
        return h->out.bs.i_bits_encoded;

    /* AC blocks: groups of four consecutive block indices, starting at 16 and
     * advancing by a stride that depends on the vertical chroma shift. */
    const int group_stride = 8 << CHROMA_V_SHIFT;
    for( int group = 16; group < 3*16; group += group_stride )
        for( int blk = group; blk < group+4; blk++ )
            x264_cavlc_block_residual( h, DCT_CHROMA_AC, blk, h->dct.luma4x4[blk]+1 );

    return h->out.bs.i_bits_encoded;
}
#endif

4603
encoder/encoder.c Normal file

File diff suppressed because it is too large Load Diff

250
encoder/lookahead.c Normal file
View File

@@ -0,0 +1,250 @@
/*****************************************************************************
* lookahead.c: high-level lookahead functions
*****************************************************************************
* Copyright (C) 2010-2025 Avail Media and x264 project
*
* Authors: Michael Kazmier <mkazmier@availmedia.com>
* Alex Giladi <agiladi@availmedia.com>
* Steven Walters <kemuri9@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
/* LOOKAHEAD (threaded and non-threaded mode)
*
* Lookahead types:
* [1] Slice type / scene cut;
*
* In non-threaded mode, we run the existing slicetype decision code as it was.
* In threaded mode, we run in a separate thread, that lives between the calls
* to x264_encoder_open() and x264_encoder_close(), and performs lookahead for
* the number of frames specified in rc_lookahead. Recommended setting is
* # of bframes + # of threads.
*/
#include "common/common.h"
#include "analyse.h"
/* Move `count` frames from the head of `src` to the tail of `dst`, then wake
 * any thread waiting for `dst` to fill or for `src` to drain.
 * No locking is done here. Capacity of `dst` and availability in `src` are
 * only checked with assert(). */
static void lookahead_shift( x264_sync_frame_list_t *dst, x264_sync_frame_list_t *src, int count )
{
    /* Nothing to move: no list change and no wake-up. */
    if( !count )
        return;

    for( int remaining = count; remaining != 0; remaining-- )
    {
        assert( dst->i_size < dst->i_max_size );
        assert( src->i_size );
        dst->list[dst->i_size] = x264_frame_shift( src->list );
        dst->i_size++;
        src->i_size--;
    }

    x264_pthread_cond_broadcast( &dst->cv_fill );
    x264_pthread_cond_broadcast( &src->cv_empty );
}
/* Replace the lookahead's remembered last non-B frame with new_nonb.
 * The previous one (if any) is handed to x264_frame_push_unused(); the new
 * one has its reference count incremented. */
static void lookahead_update_last_nonb( x264_t *h, x264_frame_t *new_nonb )
{
    x264_frame_t *previous = h->lookahead->last_nonb;

    if( previous )
    {
        x264_frame_push_unused( h, previous );
    }
    new_nonb->i_reference_count++;
    h->lookahead->last_nonb = new_nonb;
}
#if HAVE_THREAD
/* One step of the threaded lookahead: run slice-type decision on the frames
 * queued in `next`, then move the decided group (next.list[0] plus its
 * i_bframes following frames) into the output buffer `ofbuf`. */
static void lookahead_slicetype_decide( x264_t *h )
{
x264_slicetype_decide( h );
/* The head of `next` becomes last_nonb (this takes a reference on it). */
lookahead_update_last_nonb( h, h->lookahead->next.list[0] );
int shift_frames = h->lookahead->next.list[0]->i_bframes + 1;
x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
/* Block while ofbuf is full; ofbuf.cv_empty is broadcast whenever frames are taken out of it. */
while( h->lookahead->ofbuf.i_size == h->lookahead->ofbuf.i_max_size )
x264_pthread_cond_wait( &h->lookahead->ofbuf.cv_empty, &h->lookahead->ofbuf.mutex );
/* Lock order: ofbuf.mutex first, then next.mutex. */
x264_pthread_mutex_lock( &h->lookahead->next.mutex );
lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, shift_frames );
x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
/* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */
/* Note: this analysis runs while ofbuf.mutex is still held. */
if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) )
x264_slicetype_analyse( h, shift_frames );
x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
}
/* Entry point of the lookahead thread (started from x264_lookahead_init).
 * Main loop: pull frames from the input buffer `ifbuf` into `next`, and run
 * slice-type decision once `next` holds more than
 * i_slicetype_length + b_vfr_input frames. When b_exit_thread is seen, the
 * remaining frames are drained and decided, b_thread_active is cleared and
 * the thread returns NULL. */
REALIGN_STACK static void *lookahead_thread( x264_t *h )
{
while( 1 )
{
x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
/* b_exit_thread is checked under ifbuf.mutex, the same mutex under which x264_lookahead_delete sets it. */
if( h->lookahead->b_exit_thread )
{
x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
break;
}
x264_pthread_mutex_lock( &h->lookahead->next.mutex );
/* Move as many input frames as `next` still has room for. */
int shift = X264_MIN( h->lookahead->next.i_max_size - h->lookahead->next.i_size, h->lookahead->ifbuf.i_size );
lookahead_shift( &h->lookahead->next, &h->lookahead->ifbuf, shift );
x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
if( h->lookahead->next.i_size <= h->lookahead->i_slicetype_length + h->param.b_vfr_input )
{
/* Not enough frames queued yet: sleep until more input arrives or exit is requested. */
while( !h->lookahead->ifbuf.i_size && !h->lookahead->b_exit_thread )
x264_pthread_cond_wait( &h->lookahead->ifbuf.cv_fill, &h->lookahead->ifbuf.mutex );
x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
}
else
{
/* ifbuf.mutex is released before the decision step runs. */
x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
lookahead_slicetype_decide( h );
}
} /* end of input frames */
/* Drain whatever is left in ifbuf into `next`; lookahead_shift only assert()s that `next` has room. */
x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
x264_pthread_mutex_lock( &h->lookahead->next.mutex );
lookahead_shift( &h->lookahead->next, &h->lookahead->ifbuf, h->lookahead->ifbuf.i_size );
x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
/* next.i_size is read here without next.mutex.
 * NOTE(review): presumably fine because in threaded mode only this thread moves frames in or out of `next` - confirm. */
while( h->lookahead->next.i_size )
lookahead_slicetype_decide( h );
/* Clear the active flag under ofbuf.mutex and wake any thread waiting on ofbuf.cv_fill (x264_lookahead_get_frames waits on it). */
x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
h->lookahead->b_thread_active = 0;
x264_pthread_cond_broadcast( &h->lookahead->ofbuf.cv_fill );
x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
return NULL;
}
#endif
/* Allocate the shared lookahead context, attach it to the first
 * h->param.i_threads thread contexts and initialise its three frame lists.
 * When h->param.i_sync_lookahead is non-zero, also prepare a dedicated
 * context in h->thread[h->param.i_threads] and start the lookahead thread.
 * Returns 0 on success, -1 on failure. */
int x264_lookahead_init( x264_t *h, int i_slicetype_length )
{
x264_lookahead_t *look;
/* NOTE(review): CHECKED_MALLOCZERO presumably jumps to `fail` when the allocation fails - confirm in the common headers. */
CHECKED_MALLOCZERO( look, sizeof(x264_lookahead_t) );
for( int i = 0; i < h->param.i_threads; i++ )
h->thread[i]->lookahead = look;
look->i_last_keyframe = - h->param.i_keyint_max;
/* Keyframe analysis is enabled for MB-tree, or for VBV combined with rc lookahead, but never when b_stat_read is set. */
look->b_analyse_keyframe = (h->param.rc.b_mb_tree || (h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead))
&& !h->param.rc.b_stat_read;
look->i_slicetype_length = i_slicetype_length;
/* init frame lists */
if( x264_sync_frame_list_init( &look->ifbuf, h->param.i_sync_lookahead+3 ) ||
x264_sync_frame_list_init( &look->next, h->frames.i_delay+3 ) ||
x264_sync_frame_list_init( &look->ofbuf, h->frames.i_delay+3 ) )
goto fail;
/* Without a synchronous lookahead there is no thread to create. */
if( !h->param.i_sync_lookahead )
return 0;
/* The lookahead thread gets its own context, stored one past the regular thread contexts and initialised as a struct copy of h. */
x264_t *look_h = h->thread[h->param.i_threads];
*look_h = *h;
if( x264_macroblock_cache_allocate( look_h ) )
goto fail;
if( x264_macroblock_thread_allocate( look_h, 1 ) < 0 )
goto fail;
if( x264_pthread_create( &look->thread_handle, NULL, (void*)lookahead_thread, look_h ) )
goto fail;
look->b_thread_active = 1;
return 0;
fail:
/* Only `look` itself is released here: frame lists and macroblock buffers set up above are not torn down, and the
 * h->thread[i]->lookahead pointers still refer to the freed block.
 * NOTE(review): confirm the caller treats this failure as fatal and never touches h->lookahead afterwards. */
x264_free( look );
return -1;
}
/* Tear down the lookahead: stop and join the lookahead thread when one was
 * started (h->param.i_sync_lookahead), free its private context, then
 * release the frame lists, the last non-B frame and the lookahead struct. */
void x264_lookahead_delete( x264_t *h )
{
    if( h->param.i_sync_lookahead )
    {
        /* Request exit under ifbuf.mutex and wake the thread in case it is
         * sleeping on ifbuf.cv_fill. */
        x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
        h->lookahead->b_exit_thread = 1;
        x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill );
        x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
        x264_pthread_join( h->lookahead->thread_handle, NULL );

        /* The thread's context lives one past the regular thread contexts. */
        x264_t *look_h = h->thread[h->param.i_threads];
        x264_macroblock_cache_free( look_h );
        x264_macroblock_thread_free( look_h, 1 );
        x264_free( look_h );
    }

    x264_sync_frame_list_delete( &h->lookahead->ifbuf );
    x264_sync_frame_list_delete( &h->lookahead->next );
    if( h->lookahead->last_nonb )
    {
        x264_frame_push_unused( h, h->lookahead->last_nonb );
    }
    x264_sync_frame_list_delete( &h->lookahead->ofbuf );
    x264_free( h->lookahead );
}
/* Queue a new input frame for lookahead: into `next` directly when no
 * lookahead thread is used, otherwise into the thread's input buffer. */
void x264_lookahead_put_frame( x264_t *h, x264_frame_t *frame )
{
    if( !h->param.i_sync_lookahead )
    {
        x264_sync_frame_list_push( &h->lookahead->next, frame );
        return;
    }
    x264_sync_frame_list_push( &h->lookahead->ifbuf, frame );
}
/* Returns non-zero when neither `next` nor `ofbuf` holds a frame.
 * Both sizes are sampled with ofbuf.mutex and next.mutex held (taken in
 * that order). The input buffer `ifbuf` is not consulted. */
int x264_lookahead_is_empty( x264_t *h )
{
    int b_pending;

    x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
    x264_pthread_mutex_lock( &h->lookahead->next.mutex );
    b_pending = h->lookahead->next.i_size || h->lookahead->ofbuf.i_size;
    x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
    x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );

    return !b_pending;
}
/* Hand one decided group from `ofbuf` to the encoder: the head frame plus
 * its i_bframes following frames are appended to h->frames.current, and
 * waiters on ofbuf.cv_empty are woken. Does nothing when ofbuf is empty.
 * Takes no lock itself. */
static void lookahead_encoder_shift( x264_t *h )
{
    if( h->lookahead->ofbuf.i_size == 0 )
        return;

    const int group_size = h->lookahead->ofbuf.list[0]->i_bframes + 1;
    for( int remaining = group_size; remaining != 0; remaining-- )
    {
        x264_frame_push( h->frames.current, x264_frame_shift( h->lookahead->ofbuf.list ) );
        h->lookahead->ofbuf.i_size--;
    }
    x264_pthread_cond_broadcast( &h->lookahead->ofbuf.cv_empty );
}
/* Fetch the next decided group of frames into h->frames.current.
 * With a lookahead thread: wait until the thread has produced output (or has
 * finished), then take one group from ofbuf.
 * Without one: run slice-type decision inline, unless frames are already
 * pending in h->frames.current or nothing is queued in `next`. */
void x264_lookahead_get_frames( x264_t *h )
{
    if( h->param.i_sync_lookahead )
    {
        /* We have a lookahead thread, so get frames from there */
        x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
        while( !h->lookahead->ofbuf.i_size && h->lookahead->b_thread_active )
        {
            x264_pthread_cond_wait( &h->lookahead->ofbuf.cv_fill, &h->lookahead->ofbuf.mutex );
        }
        lookahead_encoder_shift( h );
        x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
        return;
    }

    /* We are not running a lookahead thread, so perform all the slicetype decide on the fly */
    if( h->frames.current[0] || !h->lookahead->next.i_size )
        return;

    x264_slicetype_decide( h );
    lookahead_update_last_nonb( h, h->lookahead->next.list[0] );

    const int group_size = h->lookahead->next.list[0]->i_bframes + 1;
    lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, group_size );

    /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */
    if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) )
    {
        x264_slicetype_analyse( h, group_size );
    }

    lookahead_encoder_shift( h );
}

1425
encoder/macroblock.c Normal file

File diff suppressed because it is too large Load Diff

215
encoder/macroblock.h Normal file
View File

@@ -0,0 +1,215 @@
/*****************************************************************************
* macroblock.h: macroblock encoding
*****************************************************************************
* Copyright (C) 2003-2025 x264 project
*
* Authors: Loren Merritt <lorenm@u.washington.edu>
* Laurent Aimar <fenrir@via.ecp.fr>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_ENCODER_MACROBLOCK_H
#define X264_ENCODER_MACROBLOCK_H
#include "common/macroblock.h"
/* Public entry points of the macroblock encoder.
 * Each function name is first #define'd through x264_template(); that macro
 * is not defined in this header - presumably it decorates the symbol per
 * build configuration (TODO confirm in the common headers). */
#define x264_rdo_init x264_template(rdo_init)
void x264_rdo_init( void );
#define x264_macroblock_probe_skip x264_template(macroblock_probe_skip)
int x264_macroblock_probe_skip( x264_t *h, int b_bidir );
/* Convenience wrappers around x264_macroblock_probe_skip() with b_bidir fixed to 0 and 1 respectively. */
#define x264_macroblock_probe_pskip( h )\
x264_macroblock_probe_skip( h, 0 )
#define x264_macroblock_probe_bskip( h )\
x264_macroblock_probe_skip( h, 1 )
/* Lossless-mode prediction helpers; the 4x4 and 8x8 variants are called from the inline encoders further down in this header. */
#define x264_predict_lossless_4x4 x264_template(predict_lossless_4x4)
void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_mode );
#define x264_predict_lossless_8x8 x264_template(predict_lossless_8x8)
void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_mode, pixel edge[36] );
#define x264_predict_lossless_16x16 x264_template(predict_lossless_16x16)
void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode );
#define x264_predict_lossless_chroma x264_template(predict_lossless_chroma)
void x264_predict_lossless_chroma( x264_t *h, int i_mode );
/* Macroblock encoding and bitstream writing (one writer per entropy coder). */
#define x264_macroblock_encode x264_template(macroblock_encode)
void x264_macroblock_encode ( x264_t *h );
#define x264_macroblock_write_cabac x264_template(macroblock_write_cabac)
void x264_macroblock_write_cabac ( x264_t *h, x264_cabac_t *cb );
#define x264_macroblock_write_cavlc x264_template(macroblock_write_cavlc)
void x264_macroblock_write_cavlc ( x264_t *h );
#define x264_macroblock_encode_p8x8 x264_template(macroblock_encode_p8x8)
void x264_macroblock_encode_p8x8( x264_t *h, int i8 );
#define x264_macroblock_encode_p4x4 x264_template(macroblock_encode_p4x4)
void x264_macroblock_encode_p4x4( x264_t *h, int i4 );
#define x264_mb_encode_chroma x264_template(mb_encode_chroma)
void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp );
/* CABAC helpers. */
#define x264_cabac_mb_skip x264_template(cabac_mb_skip)
void x264_cabac_mb_skip( x264_t *h, int b_skip );
#define x264_cabac_block_residual_c x264_template(cabac_block_residual_c)
void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l );
#define x264_cabac_block_residual_8x8_rd_c x264_template(cabac_block_residual_8x8_rd_c)
void x264_cabac_block_residual_8x8_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l );
#define x264_cabac_block_residual_rd_c x264_template(cabac_block_residual_rd_c)
void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l );
/* Trellis quantisation; the 4x4 and 8x8 variants are selected by x264_quant_4x4() / x264_quant_8x8() below when h->mb.b_trellis is set. */
#define x264_quant_luma_dc_trellis x264_template(quant_luma_dc_trellis)
int x264_quant_luma_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, int i_qp,
int ctx_block_cat, int b_intra, int idx );
#define x264_quant_chroma_dc_trellis x264_template(quant_chroma_dc_trellis)
int x264_quant_chroma_dc_trellis( x264_t *h, dctcoef *dct, int i_qp, int b_intra, int idx );
#define x264_quant_4x4_trellis x264_template(quant_4x4_trellis)
int x264_quant_4x4_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx );
#define x264_quant_8x8_trellis x264_template(quant_8x8_trellis)
int x264_quant_8x8_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx );
#define x264_noise_reduction_update x264_template(noise_reduction_update)
void x264_noise_reduction_update( x264_t *h );
/* Quantise one 4x4 block of DCT coefficients in place.
 *   b_intra / p  select the quantisation matrix category (p != 0 selects
 *                the chroma categories and is forwarded as b_chroma).
 *   idx          block index within the plane; only used by the trellis path.
 * Optional denoising runs first when h->mb.b_noise_reduction is set.
 * Returns whatever the selected quantiser returns (callers in this header
 * treat non-zero as "block has coefficients"). */
static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, dctcoef dct[16], int i_qp, int ctx_block_cat, int b_intra, int p, int idx )
{
    const int b_chroma = !!p;
    int i_quant_cat;

    if( b_intra )
        i_quant_cat = b_chroma ? CQM_4IC : CQM_4IY;
    else
        i_quant_cat = b_chroma ? CQM_4PC : CQM_4PY;

    if( h->mb.b_noise_reduction )
    {
        /* 4x4 noise-reduction tables: slot 0 when p == 0, slot 2 otherwise */
        const int nr_cat = 0 + b_chroma*2;
        h->quantf.denoise_dct( dct, h->nr_residual_sum[nr_cat], h->nr_offset[nr_cat], 16 );
    }

    if( !h->mb.b_trellis )
        return h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );

    return x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, ctx_block_cat, b_intra, b_chroma, idx+p*16 );
}
/* Quantise one 8x8 block of DCT coefficients in place.
 *   b_intra / p  select the quantisation matrix category (p != 0 selects
 *                the chroma categories and is forwarded as b_chroma).
 *   idx          block index within the plane; only used by the trellis path.
 * Optional denoising runs first when h->mb.b_noise_reduction is set.
 * Returns whatever the selected quantiser returns (callers in this header
 * treat non-zero as "block has coefficients"). */
static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, dctcoef dct[64], int i_qp, int ctx_block_cat, int b_intra, int p, int idx )
{
    const int b_chroma = !!p;
    int i_quant_cat;

    if( b_intra )
        i_quant_cat = b_chroma ? CQM_8IC : CQM_8IY;
    else
        i_quant_cat = b_chroma ? CQM_8PC : CQM_8PY;

    if( h->mb.b_noise_reduction )
    {
        /* 8x8 noise-reduction tables: slot 1 when p == 0, slot 3 otherwise */
        const int nr_cat = 1 + b_chroma*2;
        h->quantf.denoise_dct( dct, h->nr_residual_sum[nr_cat], h->nr_offset[nr_cat], 64 );
    }

    if( !h->mb.b_trellis )
        return h->quantf.quant_8x8( dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias[i_quant_cat][i_qp] );

    return x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, ctx_block_cat, b_intra, b_chroma, idx+p*4 );
}
/* Set four non-zero-count cache entries of 8x8 block `idx` in plane `p` to
 * `nz`: two adjacent bytes are written at once (nz * 0x0101) at the cache
 * position of 4x4 block p*16 + idx*4, and again 8 bytes further on.
 * Expects a variable `h` in scope at the expansion site.
 * All arguments are parenthesised so that expression arguments (e.g. `i+1`)
 * expand to the intended index. */
#define STORE_8x8_NNZ( p, idx, nz )\
do\
{\
    M16( &h->mb.cache.non_zero_count[x264_scan8[(p)*16+(idx)*4]+0] ) = (nz) * 0x0101;\
    M16( &h->mb.cache.non_zero_count[x264_scan8[(p)*16+(idx)*4]+8] ) = (nz) * 0x0101;\
} while( 0 )
/* Zero the non-zero-count cache entries of plane `p`: four 32-bit stores,
 * 8 bytes apart, starting at the cache position of 4x4 block 16*p.
 * Expects a variable `h` in scope at the expansion site. */
#define CLEAR_16x16_NNZ( p ) \
do\
{\
    M32( &h->mb.cache.non_zero_count[x264_scan8[16*(p)] + 0*8] ) = 0;\
    M32( &h->mb.cache.non_zero_count[x264_scan8[16*(p)] + 1*8] ) = 0;\
    M32( &h->mb.cache.non_zero_count[x264_scan8[16*(p)] + 2*8] ) = 0;\
    M32( &h->mb.cache.non_zero_count[x264_scan8[16*(p)] + 3*8] ) = 0;\
} while( 0 )
/* A special for loop that iterates branchlessly over each set
 * bit in a 4-bit input.
 * The loop declares three ints: `idx` (name chosen by the caller), plus the
 * fixed names `msk` and `skip`, which are therefore visible inside the loop
 * body. On each pass skip = x264_ctz_4bit(msk), idx advances by skip and msk
 * is shifted right by skip+1; idx is incremented once more after the body.
 * Provided x264_ctz_4bit() returns the position of the lowest set bit
 * (presumably count-trailing-zeros - confirm), the body sees
 * idx == start + bit position for every set bit of `mask`, lowest first. */
#define FOREACH_BIT(idx,start,mask) for( int idx = start, msk = mask, skip; msk && (skip = x264_ctz_4bit(msk), idx += skip, msk >>= skip+1, 1); idx++ )
/* Encode one intra 4x4 block `idx` of plane `p`.
 * When b_predict is set the prediction for i_mode is generated into the
 * reconstruction buffer first. The residual against the source is then
 * either stored directly (lossless) or transformed, quantised with i_qp and,
 * if any coefficient survives, reconstructed back into the buffer.
 * Updates h->dct.luma4x4[], the non-zero-count cache and h->mb.i_cbp_luma. */
static ALWAYS_INLINE void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_qp, int i_mode, int b_predict )
{
    const int blk = p*16 + idx;
    pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]];
    pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]];
    ALIGNED_ARRAY_64( dctcoef, dct4x4,[16] );
    int nz;

    if( b_predict )
    {
        if( !h->mb.b_lossless )
            h->predict_4x4[i_mode]( p_dst );
        else
            x264_predict_lossless_4x4( h, p_dst, p, idx, i_mode );
    }

    if( h->mb.b_lossless )
    {
        /* Lossless: the residual is stored as-is, with no quantisation or reconstruction. */
        nz = h->zigzagf.sub_4x4( h->dct.luma4x4[blk], p_src, p_dst );
        h->mb.cache.non_zero_count[x264_scan8[blk]] = nz;
        h->mb.i_cbp_luma |= nz<<(idx>>2);
        return;
    }

    h->dctf.sub4x4_dct( dct4x4, p_src, p_dst );
    nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 1, p, idx );
    h->mb.cache.non_zero_count[x264_scan8[blk]] = nz;

    /* Nothing survived quantisation: the prediction already is the reconstruction. */
    if( !nz )
        return;

    h->mb.i_cbp_luma |= 1<<(idx>>2);
    h->zigzagf.scan_4x4( h->dct.luma4x4[blk], dct4x4 );
    h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[p?CQM_4IC:CQM_4IY], i_qp );
    h->dctf.add4x4_idct( p_dst, dct4x4 );
}
/* Encode one intra 8x8 block `idx` of plane `p`.
 * When b_predict is set the prediction for i_mode is generated into the
 * reconstruction buffer first; if the caller passes no `edge` buffer, the
 * filtered edge is computed locally. The residual against the source is then
 * either stored directly (lossless) or transformed, quantised with i_qp and,
 * if any coefficient survives, reconstructed back into the buffer.
 * Updates h->dct.luma8x8[], the non-zero-count cache and h->mb.i_cbp_luma. */
static ALWAYS_INLINE void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_qp, int i_mode, pixel *edge, int b_predict )
{
    const int col = idx & 1;
    const int row = idx >> 1;
    const int coef_idx = p*4 + idx;
    pixel *p_src = &h->mb.pic.p_fenc[p][8*col + 8*row*FENC_STRIDE];
    pixel *p_dst = &h->mb.pic.p_fdec[p][8*col + 8*row*FDEC_STRIDE];
    ALIGNED_ARRAY_64( dctcoef, dct8x8,[64] );
    ALIGNED_ARRAY_32( pixel, edge_buf,[36] );
    int nz;

    if( b_predict )
    {
        if( !edge )
        {
            /* No edge supplied by the caller: build it in the local buffer. */
            h->predict_8x8_filter( p_dst, edge_buf, h->mb.i_neighbour8[idx], x264_pred_i4x4_neighbors[i_mode] );
            edge = edge_buf;
        }
        if( !h->mb.b_lossless )
            h->predict_8x8[i_mode]( p_dst, edge );
        else
            x264_predict_lossless_8x8( h, p_dst, p, idx, i_mode, edge );
    }

    if( h->mb.b_lossless )
    {
        /* Lossless: the residual is stored as-is, with no quantisation or reconstruction. */
        nz = h->zigzagf.sub_8x8( h->dct.luma8x8[coef_idx], p_src, p_dst );
        STORE_8x8_NNZ( p, idx, nz );
        h->mb.i_cbp_luma |= nz<<idx;
        return;
    }

    h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );
    nz = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 1, p, idx );

    /* Nothing survived quantisation: only the cache needs clearing. */
    if( !nz )
    {
        STORE_8x8_NNZ( p, idx, 0 );
        return;
    }

    h->mb.i_cbp_luma |= 1<<idx;
    h->zigzagf.scan_8x8( h->dct.luma8x8[coef_idx], dct8x8 );
    h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[p?CQM_8IC:CQM_8IY], i_qp );
    h->dctf.add8x8_idct8( p_dst, dct8x8 );
    STORE_8x8_NNZ( p, idx, 1 );
}
#endif

1355
encoder/me.c Normal file

File diff suppressed because it is too large Load Diff

111
encoder/me.h Normal file
View File

@@ -0,0 +1,111 @@
/*****************************************************************************
* me.h: motion estimation
*****************************************************************************
* Copyright (C) 2003-2025 x264 project
*
* Authors: Loren Merritt <lorenm@u.washington.edu>
* Laurent Aimar <fenrir@via.ecp.fr>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_ENCODER_ME_H
#define X264_ENCODER_ME_H
/* Large sentinel cost values for int-sized and 64-bit costs.
 * NOTE(review): presumably used as "no valid cost yet" initialisers - confirm at the use sites. */
#define COST_MAX (1<<28)
#define COST_MAX64 (1ULL<<60)
/* State of one motion search: the fields under "input" describe the block
 * and reference to search, the fields under "output" receive the result. */
typedef struct
{
/* aligning the first member is a gcc hack to force the struct to be aligned,
* as well as force sizeof(struct) to be a multiple of the alignment. */
/* input */
ALIGNED_64( int i_pixel ); /* PIXEL_WxH */
uint16_t *p_cost_mv; /* lambda * nbits for each possible mv */
int i_ref_cost;
int i_ref;
const x264_weight_t *weight;
pixel *p_fref[12];
pixel *p_fref_w;
pixel *p_fenc[3];
uint16_t *integral;
int i_stride[3];
ALIGNED_4( int16_t mvp[2] );
/* output */
int cost_mv; /* lambda * nbits for the chosen mv */
int cost; /* satd + lambda * nbits */
ALIGNED_8( int16_t mv[2] );
} ALIGNED_64( x264_me_t );
/* Motion search and refinement entry points.
 * Each function name is first #define'd through x264_template(); that macro
 * is not defined in this header (TODO confirm its effect in the common headers). */
#define x264_me_search_ref x264_template(me_search_ref)
void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_fullpel_thresh );
/* Same as x264_me_search_ref() with p_fullpel_thresh passed as NULL. */
#define x264_me_search( h, m, mvc, i_mvc )\
x264_me_search_ref( h, m, mvc, i_mvc, NULL )
#define x264_me_refine_qpel x264_template(me_refine_qpel)
void x264_me_refine_qpel( x264_t *h, x264_me_t *m );
#define x264_me_refine_qpel_refdupe x264_template(me_refine_qpel_refdupe)
void x264_me_refine_qpel_refdupe( x264_t *h, x264_me_t *m, int *p_halfpel_thresh );
#define x264_me_refine_qpel_rd x264_template(me_refine_qpel_rd)
void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int i_list );
#define x264_me_refine_bidir_rd x264_template(me_refine_bidir_rd)
void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2 );
#define x264_me_refine_bidir_satd x264_template(me_refine_bidir_satd)
void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight );
#define x264_rd_cost_part x264_template(rd_cost_part)
uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel );
/* Conditional-update helpers: when y compares better than x (smaller for the
 * _LT variants, larger for _GT), copy y into x and each following
 * (destination, source) pair along with it.
 * Caveats visible from the definitions:
 *  - x and y are evaluated twice whenever the condition holds, so arguments
 *    must be free of side effects;
 *  - every macro expands to a bare `if` statement, not a do { } while( 0 )
 *    block, so using one as the unbraced body of an if that has an `else`
 *    changes which `if` the `else` binds to (or fails to compile);
 *  - COPY1_IF_LT already ends in `;`, so a `;` at the call site adds an
 *    empty statement. */
#define COPY1_IF_LT(x,y)\
if( (y) < (x) )\
(x) = (y);
#define COPY2_IF_LT(x,y,a,b)\
if( (y) < (x) )\
{\
(x) = (y);\
(a) = (b);\
}
#define COPY3_IF_LT(x,y,a,b,c,d)\
if( (y) < (x) )\
{\
(x) = (y);\
(a) = (b);\
(c) = (d);\
}
#define COPY4_IF_LT(x,y,a,b,c,d,e,f)\
if( (y) < (x) )\
{\
(x) = (y);\
(a) = (b);\
(c) = (d);\
(e) = (f);\
}
#define COPY2_IF_GT(x,y,a,b)\
if( (y) > (x) )\
{\
(x) = (y);\
(a) = (b);\
}
#endif

3134
encoder/ratecontrol.c Normal file

File diff suppressed because it is too large Load Diff

87
encoder/ratecontrol.h Normal file
View File

@@ -0,0 +1,87 @@
/*****************************************************************************
* ratecontrol.h: ratecontrol
*****************************************************************************
* Copyright (C) 2003-2025 x264 project
*
* Authors: Loren Merritt <lorenm@u.washington.edu>
* Laurent Aimar <fenrir@via.ecp.fr>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_ENCODER_RATECONTROL_H
#define X264_ENCODER_RATECONTROL_H
/* Completely arbitrary. Ratecontrol lowers relative quality at higher framerates
* and the reverse at lower framerates; this serves as the center of the curve.
* Halve all the values for frame-packed 3D to compensate for the "doubled"
* framerate. */
/* All three duration macros read h->param.i_frame_packing, so an x264_t
 * pointer named `h` must be in scope wherever they are expanded. The divisor
 * ((i_frame_packing == 5)+1) is 2 for frame packing type 5 and 1 otherwise. */
#define BASE_FRAME_DURATION (0.04f / ((h->param.i_frame_packing == 5)+1))
/* Arbitrary limitations as a sanity check. */
#define MAX_FRAME_DURATION (1.00f / ((h->param.i_frame_packing == 5)+1))
#define MIN_FRAME_DURATION (0.01f / ((h->param.i_frame_packing == 5)+1))
/* Clamp a duration f to [MIN_FRAME_DURATION, MAX_FRAME_DURATION]; also needs `h` in scope. */
#define CLIP_DURATION(f) x264_clip3f(f,MIN_FRAME_DURATION,MAX_FRAME_DURATION)
/* Rate-control entry points.
 * Each function name is first #define'd through x264_template(); that macro
 * is not defined in this header (TODO confirm its effect in the common headers).
 * Return-value conventions of the int-returning functions are not visible
 * here - check the definitions in ratecontrol.c before relying on them. */
/* Lifetime and reconfiguration. */
#define x264_ratecontrol_new x264_template(ratecontrol_new)
int x264_ratecontrol_new ( x264_t * );
#define x264_ratecontrol_delete x264_template(ratecontrol_delete)
void x264_ratecontrol_delete( x264_t * );
#define x264_ratecontrol_init_reconfigurable x264_template(ratecontrol_init_reconfigurable)
void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init );
#define x264_encoder_reconfig_apply x264_template(encoder_reconfig_apply)
int x264_encoder_reconfig_apply( x264_t *h, x264_param_t *param );
/* Per-frame preparation. */
#define x264_adaptive_quant_frame x264_template(adaptive_quant_frame)
void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets );
#define x264_macroblock_tree_read x264_template(macroblock_tree_read)
int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offsets );
#define x264_reference_build_list_optimal x264_template(reference_build_list_optimal)
int x264_reference_build_list_optimal( x264_t *h );
#define x264_thread_sync_ratecontrol x264_template(thread_sync_ratecontrol)
void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next );
#define x264_ratecontrol_zone_init x264_template(ratecontrol_zone_init)
void x264_ratecontrol_zone_init( x264_t * );
/* Per-frame and per-macroblock rate control. */
#define x264_ratecontrol_start x264_template(ratecontrol_start)
void x264_ratecontrol_start( x264_t *, int i_force_qp, int overhead );
#define x264_ratecontrol_slice_type x264_template(ratecontrol_slice_type)
int x264_ratecontrol_slice_type( x264_t *, int i_frame );
#define x264_ratecontrol_set_weights x264_template(ratecontrol_set_weights)
void x264_ratecontrol_set_weights( x264_t *h, x264_frame_t *frm );
#define x264_ratecontrol_mb x264_template(ratecontrol_mb)
int x264_ratecontrol_mb( x264_t *, int bits );
#define x264_ratecontrol_qp x264_template(ratecontrol_qp)
int x264_ratecontrol_qp( x264_t * );
#define x264_ratecontrol_mb_qp x264_template(ratecontrol_mb_qp)
int x264_ratecontrol_mb_qp( x264_t *h );
#define x264_ratecontrol_end x264_template(ratecontrol_end)
int x264_ratecontrol_end( x264_t *, int bits, int *filler );
#define x264_ratecontrol_summary x264_template(ratecontrol_summary)
void x264_ratecontrol_summary( x264_t * );
#define x264_rc_analyse_slice x264_template(rc_analyse_slice)
int x264_rc_analyse_slice( x264_t *h );
/* Multi-threaded helpers. */
#define x264_threads_distribute_ratecontrol x264_template(threads_distribute_ratecontrol)
void x264_threads_distribute_ratecontrol( x264_t *h );
#define x264_threads_merge_ratecontrol x264_template(threads_merge_ratecontrol)
void x264_threads_merge_ratecontrol( x264_t *h );
#define x264_hrd_fullness x264_template(hrd_fullness)
void x264_hrd_fullness( x264_t *h );
#endif

1184
encoder/rdo.c Normal file

File diff suppressed because it is too large Load Diff

913
encoder/set.c Normal file
View File

@@ -0,0 +1,913 @@
/*****************************************************************************
* set: header writing
*****************************************************************************
* Copyright (C) 2003-2025 x264 project
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
* Loren Merritt <lorenm@u.washington.edu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common/common.h"
#include "set.h"
/* Every bs_write_ue() call in this file is routed to bs_write_ue_big().
 * NOTE(review): presumably because header fields can exceed the value range
 * of the default writer - confirm against the bitstream header. */
#define bs_write_ue bs_write_ue_big
// Indexed by pic_struct values
static const uint8_t num_clock_ts[10] = { 0, 1, 1, 1, 2, 2, 3, 3, 2, 3 };
// 16-byte identifier; judging by the name, the UUID written for AVC-Intra streams (its use is outside this excerpt)
static const uint8_t avcintra_uuid[] = {0xF7, 0x49, 0x3E, 0xB3, 0xD4, 0x00, 0x47, 0x96, 0x86, 0x86, 0xC9, 0x70, 0x7B, 0x64, 0x37, 0x2A};
/* Transpose, in place, the w x w byte matrix stored row by row in buf. */
static void transpose( uint8_t *buf, int w )
{
    /* Row 0 has nothing below the diagonal, so start at row 1. */
    for( int row = 1; row < w; row++ )
    {
        uint8_t *row_start = buf + w*row;
        for( int col = 0; col < row; col++ )
        {
            /* swap element (row, col) with its mirror (col, row) */
            XCHG( uint8_t, row_start[col], buf[w*col+row] );
        }
    }
}
/* Write scaling list `idx` of the SPS to the bitstream.
 * idx < 4 selects a 16-entry (4x4) list, anything else a 64-entry (8x8) list.
 * Three encodings are used, cheapest first:
 *   - present flag 0 when the list equals its fall-back list;
 *   - present flag 1 plus a single delta of -8 when it equals the JVT list;
 *   - present flag 1 plus explicit deltas in zigzag order otherwise. */
static void scaling_list_write( bs_t *s, x264_sps_t *sps, int idx )
{
const int len = idx<4 ? 16 : 64;
const uint8_t *zigzag = idx<4 ? x264_zigzag_scan4[0] : x264_zigzag_scan8[0];
const uint8_t *list = sps->scaling_list[idx];
/* Fall-back list: the chroma lists fall back to the matching luma list of this SPS, every other list to the JVT table. */
const uint8_t *def_list = (idx==CQM_4IC) ? sps->scaling_list[CQM_4IY]
: (idx==CQM_4PC) ? sps->scaling_list[CQM_4PY]
: (idx==CQM_8IC+4) ? sps->scaling_list[CQM_8IY+4]
: (idx==CQM_8PC+4) ? sps->scaling_list[CQM_8PY+4]
: x264_cqm_jvt[idx];
if( !memcmp( list, def_list, len ) )
bs_write1( s, 0 ); // scaling_list_present_flag
else if( !memcmp( list, x264_cqm_jvt[idx], len ) )
{
bs_write1( s, 1 ); // scaling_list_present_flag
bs_write_se( s, -8 ); // use jvt list
}
else
{
int run;
bs_write1( s, 1 ); // scaling_list_present_flag
// try run-length compression of trailing values
// after this loop, entries run-1 .. len-1 (in zigzag order) all hold the same value
for( run = len; run > 1; run-- )
if( list[zigzag[run-1]] != list[zigzag[run-2]] )
break;
// keep the explicit form when the terminating delta would cost more bits than the (len - run) entries it replaces
if( run < len && len - run < bs_size_se( (int8_t)-list[zigzag[run]] ) )
run = len;
// the first delta is relative to 8, each later one to the previous entry
for( int j = 0; j < run; j++ )
bs_write_se( s, (int8_t)(list[zigzag[j]] - (j>0 ? list[zigzag[j-1]] : 8)) ); // delta
// terminating delta equal to minus the repeated value
// NOTE(review): presumably this drives the decoder's next scale to 0 so it repeats the last value for the rest of the list - confirm against the H.264 scaling_list() syntax
if( run < len )
bs_write_se( s, (int8_t)-list[zigzag[run]] );
}
}
/* Write one SEI message to s: the payload type and payload size (each coded
 * as a run of 0xFF bytes, one per full 255, followed by the remainder), then
 * payload_size payload bytes and the RBSP trailing bits.
 * The stream is realigned before writing and flushed afterwards. */
void x264_sei_write( bs_t *s, uint8_t *payload, int payload_size, int payload_type )
{
    bs_realign( s );

    int type_left = payload_type;
    while( type_left >= 255 )
    {
        bs_write( s, 8, 255 );
        type_left -= 255;
    }
    bs_write( s, 8, type_left );

    int size_left = payload_size;
    while( size_left >= 255 )
    {
        bs_write( s, 8, 255 );
        size_left -= 255;
    }
    bs_write( s, 8, size_left );

    for( int n = 0; n < payload_size; n++ )
    {
        bs_write( s, 8, payload[n] );
    }

    bs_rbsp_trailing( s );
    bs_flush( s );
}
void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
{
int csp = param->i_csp & X264_CSP_MASK;
sps->i_id = i_id;
sps->i_mb_width = ( param->i_width + 15 ) / 16;
sps->i_mb_height= ( param->i_height + 15 ) / 16;
sps->b_frame_mbs_only = !(param->b_interlaced || param->b_fake_interlaced);
if( !sps->b_frame_mbs_only )
sps->i_mb_height = ( sps->i_mb_height + 1 ) & ~1;
sps->i_chroma_format_idc = csp >= X264_CSP_I444 ? CHROMA_444 :
csp >= X264_CSP_I422 ? CHROMA_422 :
csp >= X264_CSP_I420 ? CHROMA_420 : CHROMA_400;
sps->b_qpprime_y_zero_transform_bypass = param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant == 0;
if( sps->b_qpprime_y_zero_transform_bypass || sps->i_chroma_format_idc == CHROMA_444 )
sps->i_profile_idc = PROFILE_HIGH444_PREDICTIVE;
else if( sps->i_chroma_format_idc == CHROMA_422 )
sps->i_profile_idc = PROFILE_HIGH422;
else if( BIT_DEPTH > 8 )
sps->i_profile_idc = PROFILE_HIGH10;
else if( param->analyse.b_transform_8x8 || param->i_cqm_preset != X264_CQM_FLAT || sps->i_chroma_format_idc == CHROMA_400 )
sps->i_profile_idc = PROFILE_HIGH;
else if( param->b_cabac || param->i_bframe > 0 || param->b_interlaced || param->b_fake_interlaced || param->analyse.i_weighted_pred > 0 )
sps->i_profile_idc = PROFILE_MAIN;
else
sps->i_profile_idc = PROFILE_BASELINE;
sps->b_constraint_set0 = sps->i_profile_idc == PROFILE_BASELINE;
/* x264 doesn't support the features that are in Baseline and not in Main,
* namely arbitrary_slice_order and slice_groups. */
sps->b_constraint_set1 = sps->i_profile_idc <= PROFILE_MAIN;
/* Never set constraint_set2, it is not necessary and not used in real world. */
sps->b_constraint_set2 = 0;
sps->b_constraint_set3 = 0;
sps->i_level_idc = param->i_level_idc;
if( param->i_level_idc == 9 && ( sps->i_profile_idc == PROFILE_BASELINE || sps->i_profile_idc == PROFILE_MAIN ) )
{
sps->b_constraint_set3 = 1; /* level 1b with Baseline or Main profile is signalled via constraint_set3 */
sps->i_level_idc = 11;
}
/* Intra profiles */
if( param->i_keyint_max == 1 && sps->i_profile_idc >= PROFILE_HIGH )
sps->b_constraint_set3 = 1;
sps->vui.i_num_reorder_frames = param->i_bframe_pyramid ? 2 : param->i_bframe ? 1 : 0;
/* extra slot with pyramid so that we don't have to override the
* order of forgetting old pictures */
sps->vui.i_max_dec_frame_buffering =
sps->i_num_ref_frames = X264_MIN(X264_REF_MAX, X264_MAX4(param->i_frame_reference, 1 + sps->vui.i_num_reorder_frames,
param->i_bframe_pyramid ? 4 : 1, param->i_dpb_size));
sps->i_num_ref_frames -= param->i_bframe_pyramid == X264_B_PYRAMID_STRICT;
if( param->i_keyint_max == 1 )
{
sps->i_num_ref_frames = 0;
sps->vui.i_max_dec_frame_buffering = 0;
}
/* number of refs + current frame */
int max_frame_num = sps->vui.i_max_dec_frame_buffering * (!!param->i_bframe_pyramid+1) + 1;
/* Intra refresh cannot write a recovery time greater than max frame num-1 */
if( param->b_intra_refresh )
{
int time_to_recovery = X264_MIN( sps->i_mb_width - 1, param->i_keyint_max ) + param->i_bframe - 1;
max_frame_num = X264_MAX( max_frame_num, time_to_recovery+1 );
}
sps->i_log2_max_frame_num = 4;
while( (1 << sps->i_log2_max_frame_num) <= max_frame_num )
sps->i_log2_max_frame_num++;
sps->i_poc_type = param->i_bframe || param->b_interlaced || param->i_avcintra_class ? 0 : 2;
if( sps->i_poc_type == 0 )
{
int max_delta_poc = (param->i_bframe + 2) * (!!param->i_bframe_pyramid + 1) * 2;
sps->i_log2_max_poc_lsb = 4;
while( (1 << sps->i_log2_max_poc_lsb) <= max_delta_poc * 2 )
sps->i_log2_max_poc_lsb++;
}
sps->b_vui = 1;
sps->b_gaps_in_frame_num_value_allowed = 0;
sps->b_mb_adaptive_frame_field = param->b_interlaced;
sps->b_direct8x8_inference = 1;
x264_sps_init_reconfigurable( sps, param );
sps->vui.b_overscan_info_present = param->vui.i_overscan > 0 && param->vui.i_overscan <= 2;
if( sps->vui.b_overscan_info_present )
sps->vui.b_overscan_info = ( param->vui.i_overscan == 2 ? 1 : 0 );
sps->vui.b_signal_type_present = 0;
sps->vui.i_vidformat = ( param->vui.i_vidformat >= 0 && param->vui.i_vidformat <= 5 ? param->vui.i_vidformat : 5 );
sps->vui.b_fullrange = ( param->vui.b_fullrange >= 0 && param->vui.b_fullrange <= 1 ? param->vui.b_fullrange :
( csp >= X264_CSP_BGR ? 1 : 0 ) );
sps->vui.b_color_description_present = 0;
sps->vui.i_colorprim = ( param->vui.i_colorprim >= 0 && param->vui.i_colorprim <= 12 ? param->vui.i_colorprim : 2 );
sps->vui.i_transfer = ( param->vui.i_transfer >= 0 && param->vui.i_transfer <= 18 ? param->vui.i_transfer : 2 );
sps->vui.i_colmatrix = ( param->vui.i_colmatrix >= 0 && param->vui.i_colmatrix <= 14 ? param->vui.i_colmatrix :
( csp >= X264_CSP_BGR ? 0 : 2 ) );
if( sps->vui.i_colorprim != 2 || sps->vui.i_transfer != 2 || sps->vui.i_colmatrix != 2 )
sps->vui.b_color_description_present = 1;
if( sps->vui.i_vidformat != 5 || sps->vui.b_fullrange || sps->vui.b_color_description_present )
sps->vui.b_signal_type_present = 1;
/* FIXME: not sufficient for interlaced video */
sps->vui.b_chroma_loc_info_present = param->vui.i_chroma_loc > 0 && param->vui.i_chroma_loc <= 5 &&
sps->i_chroma_format_idc == CHROMA_420;
if( sps->vui.b_chroma_loc_info_present )
{
sps->vui.i_chroma_loc_top = param->vui.i_chroma_loc;
sps->vui.i_chroma_loc_bottom = param->vui.i_chroma_loc;
}
sps->vui.b_timing_info_present = param->i_timebase_num > 0 && param->i_timebase_den > 0;
if( sps->vui.b_timing_info_present )
{
sps->vui.i_num_units_in_tick = param->i_timebase_num;
sps->vui.i_time_scale = param->i_timebase_den * 2;
sps->vui.b_fixed_frame_rate = !param->b_vfr_input;
}
sps->vui.b_vcl_hrd_parameters_present = 0; // we don't support VCL HRD
sps->vui.b_nal_hrd_parameters_present = !!param->i_nal_hrd;
sps->vui.b_pic_struct_present = param->b_pic_struct;
// NOTE: HRD related parts of the SPS are initialised in x264_ratecontrol_init_reconfigurable
sps->vui.b_bitstream_restriction = !(sps->b_constraint_set3 && sps->i_profile_idc >= PROFILE_HIGH);
if( sps->vui.b_bitstream_restriction )
{
sps->vui.b_motion_vectors_over_pic_boundaries = 1;
sps->vui.i_max_bytes_per_pic_denom = 0;
sps->vui.i_max_bits_per_mb_denom = 0;
sps->vui.i_log2_max_mv_length_horizontal =
sps->vui.i_log2_max_mv_length_vertical = (int)log2f( X264_MAX( 1, param->analyse.i_mv_range*4-1 ) ) + 1;
}
sps->b_avcintra_hd = param->i_avcintra_class && param->i_avcintra_class <= 200;
sps->b_avcintra_4k = param->i_avcintra_class > 200;
sps->i_cqm_preset = param->i_cqm_preset;
}
/* Fill in the SPS fields that are derived from parameters which may be
 * changed after the SPS was first initialised: the cropping rectangle and
 * the sample aspect ratio carried in the VUI.
 * Reads sps->i_mb_width / sps->i_mb_height, so those must already be set. */
void x264_sps_init_reconfigurable( x264_sps_t *sps, x264_param_t *param )
{
    const int has_sar = param->vui.i_sar_width > 0 && param->vui.i_sar_height > 0;

    /* Left/top crop come straight from the user. Right/bottom additionally
     * absorb the difference between the macroblock-aligned coded size
     * (mb count * 16) and the requested picture size. */
    sps->crop.i_left   = param->crop_rect.i_left;
    sps->crop.i_right  = param->crop_rect.i_right + sps->i_mb_width*16 - param->i_width;
    sps->crop.i_top    = param->crop_rect.i_top;
    sps->crop.i_bottom = param->crop_rect.i_bottom + sps->i_mb_height*16 - param->i_height;

    /* Cropping is signalled as soon as any of the four offsets is non-zero. */
    sps->b_crop = sps->crop.i_left  || sps->crop.i_top ||
                  sps->crop.i_right || sps->crop.i_bottom;

    /* The aspect ratio is only signalled when both components are positive;
     * otherwise the stored SAR values are left untouched. */
    sps->vui.b_aspect_ratio_info_present = has_sar;
    if( has_sar )
    {
        sps->vui.i_sar_width  = param->vui.i_sar_width;
        sps->vui.i_sar_height = param->vui.i_sar_height;
    }
}
/* Point the eight sps->scaling_list[] entries at the quantisation matrices
 * selected by sps->i_cqm_preset.
 *
 *  - X264_CQM_FLAT:   every list uses x264_cqm_flat16.
 *  - X264_CQM_JVT:    list i uses x264_cqm_jvt[i].
 *  - X264_CQM_CUSTOM: lists point into the matrices stored in `param`.
 *                     NOTE: those matrices are transposed IN PLACE, so `param`
 *                     is modified by this call.
 * Any other preset value leaves sps->scaling_list untouched. */
void x264_sps_init_scaling_list( x264_sps_t *sps, x264_param_t *param )
{
    if( sps->i_cqm_preset == X264_CQM_FLAT )
    {
        for( int list = 0; list < 8; list++ )
            sps->scaling_list[list] = x264_cqm_flat16;
    }
    else if( sps->i_cqm_preset == X264_CQM_JVT )
    {
        for( int list = 0; list < 8; list++ )
            sps->scaling_list[list] = x264_cqm_jvt[list];
    }
    else if( sps->i_cqm_preset == X264_CQM_CUSTOM )
    {
        /* match the transposed DCT & zigzag: 4x4 matrices first ... */
        transpose( param->cqm_4iy, 4 );
        sps->scaling_list[CQM_4IY] = param->cqm_4iy;
        transpose( param->cqm_4py, 4 );
        sps->scaling_list[CQM_4PY] = param->cqm_4py;
        transpose( param->cqm_4ic, 4 );
        sps->scaling_list[CQM_4IC] = param->cqm_4ic;
        transpose( param->cqm_4pc, 4 );
        sps->scaling_list[CQM_4PC] = param->cqm_4pc;

        /* ... then the 8x8 matrices, which live in slots 4..7. */
        transpose( param->cqm_8iy, 8 );
        sps->scaling_list[CQM_8IY+4] = param->cqm_8iy;
        transpose( param->cqm_8py, 8 );
        sps->scaling_list[CQM_8PY+4] = param->cqm_8py;
        transpose( param->cqm_8ic, 8 );
        sps->scaling_list[CQM_8IC+4] = param->cqm_8ic;
        transpose( param->cqm_8pc, 8 );
        sps->scaling_list[CQM_8PC+4] = param->cqm_8pc;

        /* A custom list containing a zero coefficient is replaced by the
         * corresponding JVT list. The scan deliberately keeps going after a
         * replacement, exactly like the original loop. */
        for( int list = 0; list < 8; list++ )
        {
            const int n_coefs = list < 4 ? 16 : 64;
            for( int k = 0; k < n_coefs; k++ )
            {
                if( sps->scaling_list[list][k] == 0 )
                    sps->scaling_list[list] = x264_cqm_jvt[list];
            }
        }
    }
}
/* Serialise the sequence parameter set `sps` into the bitstream writer `s`.
 *
 * The statement order below IS the bitstream syntax: every bs_write* call
 * emits one syntax element, so nothing here may be reordered. The stream is
 * realigned before writing and closed with RBSP trailing bits + a flush.
 * The function only reads from `sps`; all decisions about which optional
 * sections are present come from flags already stored in it. */
void x264_sps_write( bs_t *s, x264_sps_t *sps )
{
bs_realign( s );
/* Fixed-length header: profile, the four constraint_set flags, four reserved
 * zero bits and the level, followed by the SPS id. */
bs_write( s, 8, sps->i_profile_idc );
bs_write1( s, sps->b_constraint_set0 );
bs_write1( s, sps->b_constraint_set1 );
bs_write1( s, sps->b_constraint_set2 );
bs_write1( s, sps->b_constraint_set3 );
bs_write( s, 4, 0 ); /* reserved */
bs_write( s, 8, sps->i_level_idc );
bs_write_ue( s, sps->i_id );
/* Chroma format, bit depth and scaling matrices are only written for
 * profile_idc values >= PROFILE_HIGH. */
if( sps->i_profile_idc >= PROFILE_HIGH )
{
bs_write_ue( s, sps->i_chroma_format_idc );
if( sps->i_chroma_format_idc == CHROMA_444 )
bs_write1( s, 0 ); // separate_colour_plane_flag
bs_write_ue( s, BIT_DEPTH-8 ); // bit_depth_luma_minus8
bs_write_ue( s, BIT_DEPTH-8 ); // bit_depth_chroma_minus8
bs_write1( s, sps->b_qpprime_y_zero_transform_bypass );
/* Exactly match the AVC-Intra bitstream */
/* Scaling matrices are carried in the SPS only when b_avcintra_hd is set;
 * each "no inter" bit below is a single 0 bit written in place of a list. */
bs_write1( s, sps->b_avcintra_hd ); // seq_scaling_matrix_present_flag
if( sps->b_avcintra_hd )
{
scaling_list_write( s, sps, CQM_4IY );
scaling_list_write( s, sps, CQM_4IC );
scaling_list_write( s, sps, CQM_4IC );
bs_write1( s, 0 ); // no inter
bs_write1( s, 0 ); // no inter
bs_write1( s, 0 ); // no inter
scaling_list_write( s, sps, CQM_8IY+4 );
bs_write1( s, 0 ); // no inter
/* 4:4:4 carries two further 8x8 intra lists, each followed by a 0 bit. */
if( sps->i_chroma_format_idc == CHROMA_444 )
{
scaling_list_write( s, sps, CQM_8IC+4 );
bs_write1( s, 0 ); // no inter
scaling_list_write( s, sps, CQM_8IC+4 );
bs_write1( s, 0 ); // no inter
}
}
}
/* Frame numbering / picture order count. The log2 values are written with a
 * -4 bias; the POC LSB size is only present for poc_type 0. */
bs_write_ue( s, sps->i_log2_max_frame_num - 4 );
bs_write_ue( s, sps->i_poc_type );
if( sps->i_poc_type == 0 )
bs_write_ue( s, sps->i_log2_max_poc_lsb - 4 );
bs_write_ue( s, sps->i_num_ref_frames );
bs_write1( s, sps->b_gaps_in_frame_num_value_allowed );
/* Picture size in macroblocks, minus 1. The height is halved first when
 * b_frame_mbs_only is 0 (shift by !b_frame_mbs_only). */
bs_write_ue( s, sps->i_mb_width - 1 );
bs_write_ue( s, (sps->i_mb_height >> !sps->b_frame_mbs_only) - 1);
bs_write1( s, sps->b_frame_mbs_only );
if( !sps->b_frame_mbs_only )
bs_write1( s, sps->b_mb_adaptive_frame_field );
bs_write1( s, sps->b_direct8x8_inference );
bs_write1( s, sps->b_crop );
if( sps->b_crop )
{
/* sps->crop.* are right-shifted before being written: horizontally by 1 for
 * 4:2:0 and 4:2:2, vertically by 1 for 4:2:0 plus 1 more when
 * b_frame_mbs_only is 0. */
int h_shift = sps->i_chroma_format_idc == CHROMA_420 || sps->i_chroma_format_idc == CHROMA_422;
int v_shift = (sps->i_chroma_format_idc == CHROMA_420) + !sps->b_frame_mbs_only;
bs_write_ue( s, sps->crop.i_left >> h_shift );
bs_write_ue( s, sps->crop.i_right >> h_shift );
bs_write_ue( s, sps->crop.i_top >> v_shift );
bs_write_ue( s, sps->crop.i_bottom >> v_shift );
}
/* Everything from here to the trailing bits is the optional VUI section. */
bs_write1( s, sps->b_vui );
if( sps->b_vui )
{
bs_write1( s, sps->vui.b_aspect_ratio_info_present );
if( sps->vui.b_aspect_ratio_info_present )
{
int i;
/* Table of predefined (width, height) -> aspect_ratio_idc mappings,
 * terminated by a sentinel whose idc is 255. */
static const struct { uint8_t w, h, sar; } sar[] =
{
// aspect_ratio_idc = 0 -> unspecified
{ 1, 1, 1 }, { 12, 11, 2 }, { 10, 11, 3 }, { 16, 11, 4 },
{ 40, 33, 5 }, { 24, 11, 6 }, { 20, 11, 7 }, { 32, 11, 8 },
{ 80, 33, 9 }, { 18, 11, 10}, { 15, 11, 11}, { 64, 33, 12},
{160, 99, 13}, { 4, 3, 14}, { 3, 2, 15}, { 2, 1, 16},
// aspect_ratio_idc = [17..254] -> reserved
{ 0, 0, 255 }
};
/* Linear search; if no predefined ratio matches, `i` ends on the sentinel
 * and idc 255 is written followed by the explicit 16-bit width and height. */
for( i = 0; sar[i].sar != 255; i++ )
{
if( sar[i].w == sps->vui.i_sar_width &&
sar[i].h == sps->vui.i_sar_height )
break;
}
bs_write( s, 8, sar[i].sar );
if( sar[i].sar == 255 ) /* aspect_ratio_idc (extended) */
{
bs_write( s, 16, sps->vui.i_sar_width );
bs_write( s, 16, sps->vui.i_sar_height );
}
}
bs_write1( s, sps->vui.b_overscan_info_present );
if( sps->vui.b_overscan_info_present )
bs_write1( s, sps->vui.b_overscan_info );
/* Video signal type: format, range flag and optional colour description. */
bs_write1( s, sps->vui.b_signal_type_present );
if( sps->vui.b_signal_type_present )
{
bs_write( s, 3, sps->vui.i_vidformat );
bs_write1( s, sps->vui.b_fullrange );
bs_write1( s, sps->vui.b_color_description_present );
if( sps->vui.b_color_description_present )
{
bs_write( s, 8, sps->vui.i_colorprim );
bs_write( s, 8, sps->vui.i_transfer );
bs_write( s, 8, sps->vui.i_colmatrix );
}
}
bs_write1( s, sps->vui.b_chroma_loc_info_present );
if( sps->vui.b_chroma_loc_info_present )
{
bs_write_ue( s, sps->vui.i_chroma_loc_top );
bs_write_ue( s, sps->vui.i_chroma_loc_bottom );
}
/* Timing info: two 32-bit values and the fixed-frame-rate flag. */
bs_write1( s, sps->vui.b_timing_info_present );
if( sps->vui.b_timing_info_present )
{
bs_write32( s, sps->vui.i_num_units_in_tick );
bs_write32( s, sps->vui.i_time_scale );
bs_write1( s, sps->vui.b_fixed_frame_rate );
}
/* NAL HRD parameters. Several fields are stored as their real value in
 * sps->vui.hrd and written minus 1. Only a single bit-rate/CPB-size pair is
 * written here, regardless of i_cpb_cnt. */
bs_write1( s, sps->vui.b_nal_hrd_parameters_present );
if( sps->vui.b_nal_hrd_parameters_present )
{
bs_write_ue( s, sps->vui.hrd.i_cpb_cnt - 1 );
bs_write( s, 4, sps->vui.hrd.i_bit_rate_scale );
bs_write( s, 4, sps->vui.hrd.i_cpb_size_scale );
bs_write_ue( s, sps->vui.hrd.i_bit_rate_value - 1 );
bs_write_ue( s, sps->vui.hrd.i_cpb_size_value - 1 );
bs_write1( s, sps->vui.hrd.b_cbr_hrd );
bs_write( s, 5, sps->vui.hrd.i_initial_cpb_removal_delay_length - 1 );
bs_write( s, 5, sps->vui.hrd.i_cpb_removal_delay_length - 1 );
bs_write( s, 5, sps->vui.hrd.i_dpb_output_delay_length - 1 );
bs_write( s, 5, sps->vui.hrd.i_time_offset_length );
}
/* Only the VCL HRD presence flag is written; no VCL HRD parameter block
 * follows it in this function. */
bs_write1( s, sps->vui.b_vcl_hrd_parameters_present );
if( sps->vui.b_nal_hrd_parameters_present || sps->vui.b_vcl_hrd_parameters_present )
bs_write1( s, 0 ); /* low_delay_hrd_flag */
bs_write1( s, sps->vui.b_pic_struct_present );
/* Bitstream restriction: MV limits, reorder depth and DPB size. */
bs_write1( s, sps->vui.b_bitstream_restriction );
if( sps->vui.b_bitstream_restriction )
{
bs_write1( s, sps->vui.b_motion_vectors_over_pic_boundaries );
bs_write_ue( s, sps->vui.i_max_bytes_per_pic_denom );
bs_write_ue( s, sps->vui.i_max_bits_per_mb_denom );
bs_write_ue( s, sps->vui.i_log2_max_mv_length_horizontal );
bs_write_ue( s, sps->vui.i_log2_max_mv_length_vertical );
bs_write_ue( s, sps->vui.i_num_reorder_frames );
bs_write_ue( s, sps->vui.i_max_dec_frame_buffering );
}
}
/* Terminate the RBSP and push any buffered bits out. */
bs_rbsp_trailing( s );
bs_flush( s );
}
/* Initialise the picture parameter set `pps` with id `i_id`, referring to
 * `sps`, from the encoder parameters in `param`.
 * Only `pps` is written; `param` and `sps` are read. */
void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *sps )
{
    /* Identity and entropy coder. */
    pps->i_id     = i_id;
    pps->i_sps_id = sps->i_id;
    pps->b_cabac  = param->b_cabac;

    /* b_pic_order is set for interlaced encodes, except with AVC-Intra. */
    pps->b_pic_order        = !param->i_avcintra_class && param->b_interlaced;
    pps->i_num_slice_groups = 1;

    /* Default active reference counts for list 0 / list 1. */
    pps->i_num_ref_idx_l0_default_active = param->i_frame_reference;
    pps->i_num_ref_idx_l1_default_active = 1;

    /* Weighted prediction: bipred is stored as the 2-bit value 2 when enabled. */
    pps->b_weighted_pred = param->analyse.i_weighted_pred > 0;
    if( param->analyse.b_weighted_bipred )
        pps->b_weighted_bipred = 2;
    else
        pps->b_weighted_bipred = 0;

    /* Initial QP: a neutral 26 (+ bit-depth offset) for ABR or stitchable
     * streams, otherwise the configured constant QP. */
    if( param->rc.i_rc_method == X264_RC_ABR || param->b_stitchable )
        pps->i_pic_init_qp = 26 + QP_BD_OFFSET;
    else
        pps->i_pic_init_qp = SPEC_QP( param->rc.i_qp_constant );
    pps->i_pic_init_qs = 26 + QP_BD_OFFSET;

    pps->i_chroma_qp_index_offset    = param->analyse.i_chroma_qp_offset;
    pps->b_deblocking_filter_control = 1;
    pps->b_constrained_intra_pred    = param->b_constrained_intra;
    pps->b_redundant_pic_cnt         = 0;

    /* Normalise the 8x8 transform switch to exactly 0 or 1. */
    pps->b_transform_8x8_mode = !!param->analyse.b_transform_8x8;
}
/* Serialise the picture parameter set `pps` into the bitstream writer `s`.
 * `sps` is consulted for the scaling-list preset, the AVC-Intra flags and the
 * chroma format, and is passed on to scaling_list_write().
 *
 * As with the SPS writer, statement order is the bitstream syntax and must
 * not be changed. The stream is realigned first and finished with RBSP
 * trailing bits and a flush. */
void x264_pps_write( bs_t *s, x264_sps_t *sps, x264_pps_t *pps )
{
bs_realign( s );
bs_write_ue( s, pps->i_id );
bs_write_ue( s, pps->i_sps_id );
bs_write1( s, pps->b_cabac );
bs_write1( s, pps->b_pic_order );
/* Counts are stored as real values and written minus 1. */
bs_write_ue( s, pps->i_num_slice_groups - 1 );
bs_write_ue( s, pps->i_num_ref_idx_l0_default_active - 1 );
bs_write_ue( s, pps->i_num_ref_idx_l1_default_active - 1 );
bs_write1( s, pps->b_weighted_pred );
/* b_weighted_bipred is a 2-bit field here, not a single flag. */
bs_write( s, 2, pps->b_weighted_bipred );
/* Initial QP/QS are written as signed offsets from 26 + QP_BD_OFFSET. */
bs_write_se( s, pps->i_pic_init_qp - 26 - QP_BD_OFFSET );
bs_write_se( s, pps->i_pic_init_qs - 26 - QP_BD_OFFSET );
bs_write_se( s, pps->i_chroma_qp_index_offset );
bs_write1( s, pps->b_deblocking_filter_control );
bs_write1( s, pps->b_constrained_intra_pred );
bs_write1( s, pps->b_redundant_pic_cnt );
/* Scaling lists go in the PPS unless the preset is flat or they were already
 * written in the SPS (b_avcintra_hd). The trailing extension below is only
 * emitted when it carries something: 8x8 transform or scaling lists. */
int b_scaling_list = !sps->b_avcintra_hd && sps->i_cqm_preset != X264_CQM_FLAT;
if( pps->b_transform_8x8_mode || b_scaling_list )
{
bs_write1( s, pps->b_transform_8x8_mode );
bs_write1( s, b_scaling_list );
if( b_scaling_list )
{
/* 4x4 lists. Single 0 bits stand in for lists that are not transmitted
 * (see the per-line "no inter" / "Cr = Cb" notes). */
scaling_list_write( s, sps, CQM_4IY );
scaling_list_write( s, sps, CQM_4IC );
if( sps->b_avcintra_4k )
{
scaling_list_write( s, sps, CQM_4IC );
bs_write1( s, 0 ); // no inter
bs_write1( s, 0 ); // no inter
bs_write1( s, 0 ); // no inter
}
else
{
bs_write1( s, 0 ); // Cr = Cb
scaling_list_write( s, sps, CQM_4PY );
scaling_list_write( s, sps, CQM_4PC );
bs_write1( s, 0 ); // Cr = Cb
}
/* 8x8 lists are only present when the 8x8 transform is enabled; the chroma
 * 8x8 lists are additionally restricted to 4:4:4. */
if( pps->b_transform_8x8_mode )
{
scaling_list_write( s, sps, CQM_8IY+4 );
if( sps->b_avcintra_4k )
bs_write1( s, 0 ); // no inter
else
scaling_list_write( s, sps, CQM_8PY+4 );
if( sps->i_chroma_format_idc == CHROMA_444 )
{
scaling_list_write( s, sps, CQM_8IC+4 );
scaling_list_write( s, sps, CQM_8PC+4 );
bs_write1( s, 0 ); // Cr = Cb
bs_write1( s, 0 ); // Cr = Cb
}
}
}
/* The same chroma QP offset is written a second time at the end of the
 * extension. */
bs_write_se( s, pps->i_chroma_qp_index_offset );
}
bs_rbsp_trailing( s );
bs_flush( s );
}
/* Build a recovery point SEI payload in a scratch buffer and hand it to
 * x264_sei_write() for emission into `s`.
 * `recovery_frame_cnt` is written as-is; `h` is not used by this function. */
void x264_sei_recovery_point_write( x264_t *h, bs_t *s, int recovery_frame_cnt )
{
    ALIGNED_4( uint8_t tmp_buf[100] );
    bs_t payload;
    int payload_bytes;

    M32( tmp_buf ) = 0; // shut up gcc
    bs_init( &payload, tmp_buf, sizeof(tmp_buf) );
    bs_realign( &payload );

    bs_write_ue( &payload, recovery_frame_cnt ); // recovery_frame_cnt
    bs_write1( &payload, 1 );                    // exact_match_flag = 1
    bs_write1( &payload, 0 );                    // broken_link_flag = 0
    bs_write( &payload, 2, 0 );                  // changing_slice_group = 0

    /* Pad to a byte boundary so the payload length is a whole byte count. */
    bs_align_10( &payload );
    payload_bytes = bs_pos( &payload ) / 8;

    x264_sei_write( s, tmp_buf, payload_bytes, SEI_RECOVERY_POINT );
}
/* Emit a "user data unregistered" SEI carrying the x264 version banner and
 * the option string of the current encode.
 *
 * The payload is a 16-byte UUID followed by a NUL-terminated text string;
 * the terminating NUL is included in the payload length.
 *
 * Returns 0 on success, -1 if the option string or the payload buffer could
 * not be allocated. Both temporary allocations are released before return. */
int x264_sei_version_write( x264_t *h, bs_t *s )
{
    // random ID number generated according to ISO-11578
    static const uint8_t uuid[16] =
    {
        0xdc, 0x45, 0xe9, 0xbd, 0xe6, 0xd9, 0x48, 0xb7,
        0x96, 0x2c, 0xd8, 0x20, 0xd9, 0x23, 0xee, 0xef
    };
    char *opts = x264_param2string( &h->param, 0 );
    char *payload;
    size_t payload_size;
    int length;

    if( !opts )
        return -1;

    /* 200 bytes of headroom cover the UUID and the fixed banner text. The
     * formatted write below is bounded by this size, so an unexpectedly long
     * version string truncates the text instead of overrunning the heap. */
    payload_size = 200 + strlen( opts );
    CHECKED_MALLOC( payload, payload_size );

    memcpy( payload, uuid, 16 );
    snprintf( payload+16, payload_size-16, "x264 - core %d%s - H.264/MPEG-4 AVC codec - "
              "Copy%s 2003-2025 - http://www.videolan.org/x264.html - options: %s",
              X264_BUILD, X264_VERSION, HAVE_GPL?"left":"right", opts );
    /* The UUID contains no zero byte, so strlen() spans UUID + text. */
    length = strlen(payload)+1;

    x264_sei_write( s, (uint8_t *)payload, length, SEI_USER_DATA_UNREGISTERED );

    x264_free( opts );
    x264_free( payload );
    return 0;
fail:
    /* Reached from CHECKED_MALLOC when the payload allocation fails. */
    x264_free( opts );
    return -1;
}
/* Build a buffering period SEI payload for the active SPS (h->sps) and pass
 * it to x264_sei_write() for emission into `s`.
 * The two initial CPB removal delay fields are only written when NAL HRD
 * parameters are present, each using the bit length stored in the SPS. */
void x264_sei_buffering_period_write( x264_t *h, bs_t *s )
{
    ALIGNED_4( uint8_t tmp_buf[100] );
    x264_sps_t *sps = h->sps;
    bs_t payload;

    M32( tmp_buf ) = 0; // shut up gcc
    bs_init( &payload, tmp_buf, sizeof(tmp_buf) );
    bs_realign( &payload );

    bs_write_ue( &payload, sps->i_id );

    if( sps->vui.b_nal_hrd_parameters_present )
    {
        const int delay_bits = sps->vui.hrd.i_initial_cpb_removal_delay_length;
        bs_write( &payload, delay_bits, h->initial_cpb_removal_delay );
        bs_write( &payload, delay_bits, h->initial_cpb_removal_delay_offset );
    }

    /* Pad to a byte boundary, then emit the whole-byte payload. */
    bs_align_10( &payload );
    x264_sei_write( s, tmp_buf, bs_pos( &payload ) / 8, SEI_BUFFERING_PERIOD );
}
/* Build a picture timing SEI payload for the frame being encoded (h->fenc)
 * and pass it to x264_sei_write() for emission into `s`.
 * Which parts are written depends on the HRD and pic_struct flags of the
 * active SPS (h->sps). */
void x264_sei_pic_timing_write( x264_t *h, bs_t *s )
{
    ALIGNED_4( uint8_t tmp_buf[100] );
    x264_sps_t *sps = h->sps;
    bs_t payload;
    const int have_hrd = sps->vui.b_nal_hrd_parameters_present ||
                         sps->vui.b_vcl_hrd_parameters_present;

    M32( tmp_buf ) = 0; // shut up gcc
    bs_init( &payload, tmp_buf, sizeof(tmp_buf) );
    bs_realign( &payload );

    if( have_hrd )
    {
        /* The CPB removal delay is written relative to i_cpb_delay_pir_offset. */
        bs_write( &payload, sps->vui.hrd.i_cpb_removal_delay_length,
                  h->fenc->i_cpb_delay - h->i_cpb_delay_pir_offset );
        bs_write( &payload, sps->vui.hrd.i_dpb_output_delay_length,
                  h->fenc->i_dpb_output_delay );
    }

    if( sps->vui.b_pic_struct_present )
    {
        int ts_left = num_clock_ts[h->fenc->i_pic_struct];

        bs_write( &payload, 4, h->fenc->i_pic_struct-1 ); // We use index 0 for "Auto"

        // These clock timestamps are not standardised so we don't set them
        // They could be time of origin, capture or alternative ideal display
        while( ts_left-- > 0 )
            bs_write1( &payload, 0 ); // clock_timestamp_flag
    }

    bs_align_10( &payload );
    x264_sei_write( s, tmp_buf, bs_pos( &payload ) / 8, SEI_PIC_TIMING );
}
/* Build a frame packing arrangement SEI payload from
 * h->param.i_frame_packing and pass it to x264_sei_write() for emission
 * into `s`.
 * Arrangement types 0, 5 and 6 get special treatment below; everything else
 * is written with fixed values. */
void x264_sei_frame_packing_write( x264_t *h, bs_t *s )
{
    const int arrangement = h->param.i_frame_packing;
    const int quincunx    = arrangement == 0;
    const int is_type5    = arrangement == 5;
    ALIGNED_4( uint8_t tmp_buf[100] );
    bs_t payload;

    M32( tmp_buf ) = 0; // shut up gcc
    bs_init( &payload, tmp_buf, sizeof(tmp_buf) );
    bs_realign( &payload );

    bs_write_ue( &payload, 0 );              // frame_packing_arrangement_id
    bs_write1( &payload, 0 );                // frame_packing_arrangement_cancel_flag
    bs_write ( &payload, 7, arrangement );   // frame_packing_arrangement_type
    bs_write1( &payload, quincunx );         // quincunx_sampling_flag

    // 0: views are unrelated, 1: left view is on the left, 2: left view is on the right
    bs_write ( &payload, 6, arrangement != 6 ); // content_interpretation_type

    bs_write1( &payload, 0 );                // spatial_flipping_flag
    bs_write1( &payload, 0 );                // frame0_flipped_flag
    bs_write1( &payload, 0 );                // field_views_flag
    /* For type 5 the flag is set on even frame numbers; h->fenc is only
     * consulted in that case. */
    bs_write1( &payload, is_type5 && !(h->fenc->i_frame&1) ); // current_frame_is_frame0_flag
    bs_write1( &payload, 0 );                // frame0_self_contained_flag
    bs_write1( &payload, 0 );                // frame1_self_contained_flag

    if( !quincunx && !is_type5 )
    {
        /* frame0_grid_position_x, frame0_grid_position_y,
         * frame1_grid_position_x, frame1_grid_position_y: all zero. */
        for( int field = 0; field < 4; field++ )
            bs_write( &payload, 4, 0 );
    }

    bs_write( &payload, 8, 0 );              // frame_packing_arrangement_reserved_byte
    // "frame_packing_arrangement_repetition_period equal to 1 specifies that the frame packing arrangement SEI message persists in output"
    // for (i_frame_packing == 5) this will undermine current_frame_is_frame0_flag which must alternate every view sequence
    bs_write_ue( &payload, !is_type5 );      // frame_packing_arrangement_repetition_period
    bs_write1( &payload, 0 );                // frame_packing_arrangement_extension_flag

    bs_align_10( &payload );
    x264_sei_write( s, tmp_buf, bs_pos( &payload ) / 8, SEI_FRAME_PACKING );
}
/* Build a mastering display SEI payload from h->param.mastering_display and
 * pass it to x264_sei_write() for emission into `s`.
 * Layout: eight 16-bit chromaticity coordinates in the order green, blue,
 * red, white (x then y each), followed by the 32-bit display maximum and
 * minimum. */
void x264_sei_mastering_display_write( x264_t *h, bs_t *s )
{
    ALIGNED_4( uint8_t tmp_buf[100] );
    bs_t payload;

    M32( tmp_buf ) = 0; // shut up gcc
    bs_init( &payload, tmp_buf, sizeof(tmp_buf) );
    bs_realign( &payload );

    /* Green primary */
    bs_write( &payload, 16, h->param.mastering_display.i_green_x );
    bs_write( &payload, 16, h->param.mastering_display.i_green_y );
    /* Blue primary */
    bs_write( &payload, 16, h->param.mastering_display.i_blue_x );
    bs_write( &payload, 16, h->param.mastering_display.i_blue_y );
    /* Red primary */
    bs_write( &payload, 16, h->param.mastering_display.i_red_x );
    bs_write( &payload, 16, h->param.mastering_display.i_red_y );
    /* White point */
    bs_write( &payload, 16, h->param.mastering_display.i_white_x );
    bs_write( &payload, 16, h->param.mastering_display.i_white_y );

    /* Display range: maximum first, then minimum. */
    bs_write32( &payload, h->param.mastering_display.i_display_max );
    bs_write32( &payload, h->param.mastering_display.i_display_min );

    bs_align_10( &payload );
    x264_sei_write( s, tmp_buf, bs_pos( &payload ) / 8, SEI_MASTERING_DISPLAY );
}
/* Build a content light level SEI payload (two 16-bit values taken from
 * h->param.content_light_level: max CLL, then max FALL) and pass it to
 * x264_sei_write() for emission into `s`. */
void x264_sei_content_light_level_write( x264_t *h, bs_t *s )
{
    ALIGNED_4( uint8_t tmp_buf[100] );
    bs_t payload;
    int payload_bytes;

    M32( tmp_buf ) = 0; // shut up gcc
    bs_init( &payload, tmp_buf, sizeof(tmp_buf) );
    bs_realign( &payload );

    bs_write( &payload, 16, h->param.content_light_level.i_max_cll );
    bs_write( &payload, 16, h->param.content_light_level.i_max_fall );

    bs_align_10( &payload );
    payload_bytes = bs_pos( &payload ) / 8;

    x264_sei_write( s, tmp_buf, payload_bytes, SEI_CONTENT_LIGHT_LEVEL );
}
/* Build an alternative transfer characteristics SEI payload (a single byte
 * holding h->param.i_alternative_transfer) and pass it to x264_sei_write()
 * for emission into `s`. */
void x264_sei_alternative_transfer_write( x264_t *h, bs_t *s )
{
    ALIGNED_4( uint8_t tmp_buf[100] );
    bs_t payload;
    int payload_bytes;

    M32( tmp_buf ) = 0; // shut up gcc
    bs_init( &payload, tmp_buf, sizeof(tmp_buf) );
    bs_realign( &payload );

    bs_write( &payload, 8, h->param.i_alternative_transfer ); // preferred_transfer_characteristics

    bs_align_10( &payload );
    payload_bytes = bs_pos( &payload ) / 8;

    x264_sei_write( s, tmp_buf, payload_bytes, SEI_ALTERNATIVE_TRANSFER );
}
/* Write `filler` bytes of 0xff into `s`, followed by the RBSP trailing bits,
 * and flush the writer. A non-positive `filler` writes no 0xff bytes.
 * `h` is not used by this function. */
void x264_filler_write( x264_t *h, bs_t *s, int filler )
{
    int remaining = filler;

    bs_realign( s );

    while( remaining-- > 0 )
        bs_write( s, 8, 0xff );

    bs_rbsp_trailing( s );
    bs_flush( s );
}
/* Build a decoded reference picture marking repetition SEI payload from the
 * backed-up slice header (h->sh_backup) and pass it to x264_sei_write() for
 * emission into `s`. */
void x264_sei_dec_ref_pic_marking_write( x264_t *h, bs_t *s )
{
    ALIGNED_4( uint8_t tmp_buf[100] );
    x264_slice_header_t *sh = &h->sh_backup;
    const int has_mmco = sh->i_mmco_command_count > 0;
    bs_t payload;

    M32( tmp_buf ) = 0; // shut up gcc
    bs_init( &payload, tmp_buf, sizeof(tmp_buf) );
    bs_realign( &payload );

    /* We currently only use this for repeating B-refs, as required by Blu-ray. */
    bs_write1( &payload, 0 );                 // original_idr_flag
    bs_write_ue( &payload, sh->i_frame_num ); // original_frame_num
    if( !h->sps->b_frame_mbs_only )
        bs_write1( &payload, 0 );             // original_field_pic_flag

    /* One flag says whether marking commands follow; if so, every command is
     * written as operation 1 with its picture-number difference minus 1, and
     * the list is closed with operation 0. */
    bs_write1( &payload, has_mmco );
    if( has_mmco )
    {
        int cmd = 0;
        while( cmd < sh->i_mmco_command_count )
        {
            bs_write_ue( &payload, 1 );
            bs_write_ue( &payload, sh->mmco[cmd].i_difference_of_pic_nums - 1 );
            cmd++;
        }
        bs_write_ue( &payload, 0 );
    }

    bs_align_10( &payload );
    x264_sei_write( s, tmp_buf, bs_pos( &payload ) / 8, SEI_DEC_REF_PIC_MARKING );
}
/* Emit the fixed 497-byte AVC-Intra "UMID" user-data SEI.
 *
 * The payload is 0xff filler with avcintra_uuid at the start, the tag "UMID"
 * at offset 16 and a handful of fixed marker/zero bytes after it.
 * Always returns 0.
 *
 * NOTE(review): the `s` argument is not used; the SEI is written to
 * h->out.bs directly. */
int x264_sei_avcintra_umid_write( x264_t *h, bs_t *s )
{
    /* Offsets that are forced to zero. */
    static const uint8_t zeroed[] =
    {
        22, 23, 25, 26,
        30, 31, 33, 34,
        62, 63, 65, 66,
        70, 71, 73, 74
    };
    const char *tag = "UMID";
    const int len = 497;
    uint8_t data[512];

    memset( data, 0xff, len );
    memcpy( data, avcintra_uuid, sizeof(avcintra_uuid) );
    memcpy( data+16, tag, strlen(tag) );

    /* These bytes appear to be some sort of frame/seconds counter in certain applications,
     * but others jump around, so leave them as zero for now */
    for( int k = 0; k < (int)sizeof(zeroed); k++ )
        data[zeroed[k]] = 0;

    /* Fixed marker bytes. */
    data[20] = 0x13;
    data[28] = 0x14;
    data[36] = 0x60;
    data[41] = 0x22; /* Believed to be some sort of end of basic UMID identifier */
    data[60] = 0x62;
    data[68] = 0x63;

    x264_sei_write( &h->out.bs, data, len, SEI_USER_DATA_UNREGISTERED );
    return 0;
}
/* Emit an AVC-Intra "VANC" user-data SEI of `len` bytes: 0xff filler with
 * avcintra_uuid at the start and the tag "VANC" at offset 16.
 *
 * Returns 0 on success. Returns -1 (after logging an error) when `len` is
 * negative or larger than the 6000-byte scratch buffer.
 *
 * NOTE(review): the `s` argument is not used; the SEI is written to
 * h->out.bs directly. */
int x264_sei_avcintra_vanc_write( x264_t *h, bs_t *s, int len )
{
    const char *tag = "VANC";
    uint8_t data[6000];
    const int fits = len >= 0 && (unsigned)len <= sizeof(data);

    if( !fits )
    {
        x264_log( h, X264_LOG_ERROR, "AVC-Intra SEI is too large (%d)\n", len );
        return -1;
    }

    /* Filler first, then the identifying header on top of it. */
    memset( data, 0xff, len );
    memcpy( data, avcintra_uuid, sizeof(avcintra_uuid) );
    memcpy( data+16, tag, strlen(tag) );

    x264_sei_write( &h->out.bs, data, len, SEI_USER_DATA_UNREGISTERED );
    return 0;
}
/* Report a level violation: log the printf-style message as a warning when
 * `verbose` is non-zero, and always set `ret` to 1.
 * The macro expands to a bare { } block and relies on `h`, `verbose` and `ret`
 * being in scope at the point of use. Any earlier ERROR definition is
 * discarded first. */
#undef ERROR
#define ERROR(...)\
{\
if( verbose )\
x264_log( h, X264_LOG_WARNING, __VA_ARGS__ );\
ret = 1;\
}
/* Check the current encoder configuration against the limits of the level
 * selected by h->param.i_level_idc.
 *
 * Returns 0 if every checked limit is respected, 1 if at least one is
 * exceeded. When `verbose` is non-zero each violation is also logged as a
 * warning (via the ERROR macro, which needs `h`, `verbose` and `ret`).
 *
 * If the level is not found in x264_levels, the checks run against the
 * table's terminating entry (level_idc == 0). */
int x264_validate_levels( x264_t *h, int verbose )
{
    int ret = 0;
    const int mb_w = h->sps->i_mb_width;
    const int mb_h = h->sps->i_mb_height;
    const int mb_count = mb_w * mb_h;
    const int dpb_mbs = mb_count * h->sps->vui.i_max_dec_frame_buffering;
    const int profile = h->sps->i_profile_idc;
    const x264_level_t *level;
    int cpb_scale;

    /* Scale applied (as cpb_scale/4) to the table's bitrate and CPB limits,
     * depending on the profile. */
    if( profile >= PROFILE_HIGH422 )
        cpb_scale = 16;
    else if( profile == PROFILE_HIGH10 )
        cpb_scale = 12;
    else if( profile == PROFILE_HIGH )
        cpb_scale = 5;
    else
        cpb_scale = 4;

    /* Locate the requested level; stops on the terminator if absent. */
    for( level = x264_levels; level->level_idc != 0; level++ )
        if( level->level_idc == h->param.i_level_idc )
            break;

    /* Frame size: total macroblocks, plus width^2 and height^2 may each be
     * at most 8 * frame_size. */
    if( level->frame_size < mb_count
        || level->frame_size*8 < mb_w * mb_w
        || level->frame_size*8 < mb_h * mb_h )
        ERROR( "frame MB size (%dx%d) > level limit (%d)\n",
               h->sps->i_mb_width, h->sps->i_mb_height, level->frame_size );
    if( dpb_mbs > level->dpb )
        ERROR( "DPB size (%d frames, %d mbs) > level limit (%d frames, %d mbs)\n",
               h->sps->vui.i_max_dec_frame_buffering, dpb_mbs, level->dpb / mb_count, level->dpb );

#define CHECK( name, limit, val ) \
    if( (val) > (limit) ) \
        ERROR( name " (%"PRId64") > level limit (%d)\n", (int64_t)(val), (limit) );

    CHECK( "VBV bitrate", (level->bitrate * cpb_scale) / 4, h->param.rc.i_vbv_max_bitrate );
    CHECK( "VBV buffer", (level->cpb * cpb_scale) / 4, h->param.rc.i_vbv_buffer_size );
    CHECK( "MV range", level->mv_range, h->param.analyse.i_mv_range );
    /* For the two interlace checks the "limit" is 1 unless the level is
     * frame-only, in which case it is 0. */
    CHECK( "interlaced", !level->frame_only, h->param.b_interlaced );
    CHECK( "fake interlaced", !level->frame_only, h->param.b_fake_interlaced );

    /* The macroblock rate can only be computed with a valid frame rate. */
    if( h->param.i_fps_den > 0 )
        CHECK( "MB rate", level->mbps, (int64_t)mb_count * h->param.i_fps_num / h->param.i_fps_den );

    /* TODO check the rest of the limits */
    return ret;
}

71
encoder/set.h Normal file
View File

@@ -0,0 +1,71 @@
/*****************************************************************************
* set.h: header writing
*****************************************************************************
* Copyright (C) 2003-2025 x264 project
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
* Loren Merritt <lorenm@u.washington.edu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_ENCODER_SET_H
#define X264_ENCODER_SET_H
/* Every public name below is first routed through x264_template(), then
 * declared. */

/* --- Sequence parameter set --- */
/* Initialise `sps` with id `i_id` from the encoder parameters. */
#define x264_sps_init x264_template(sps_init)
void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param );
/* Refresh the SPS fields derived from the crop rectangle and the sample
 * aspect ratio in `param`. */
#define x264_sps_init_reconfigurable x264_template(sps_init_reconfigurable)
void x264_sps_init_reconfigurable( x264_sps_t *sps, x264_param_t *param );
/* Select the scaling lists according to sps->i_cqm_preset. With the custom
 * preset the matrices inside `param` are transposed in place. */
#define x264_sps_init_scaling_list x264_template(sps_init_scaling_list)
void x264_sps_init_scaling_list( x264_sps_t *sps, x264_param_t *param );
/* Serialise the SPS into `s`. */
#define x264_sps_write x264_template(sps_write)
void x264_sps_write( bs_t *s, x264_sps_t *sps );

/* --- Picture parameter set --- */
/* Initialise `pps` with id `i_id`, referring to `sps`. */
#define x264_pps_init x264_template(pps_init)
void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *sps );
/* Serialise the PPS into `s`. */
#define x264_pps_write x264_template(pps_write)
void x264_pps_write( bs_t *s, x264_sps_t *sps, x264_pps_t *pps );

/* --- SEI messages and level validation --- */
/* Recovery point SEI carrying `recovery_frame_cnt`. */
#define x264_sei_recovery_point_write x264_template(sei_recovery_point_write)
void x264_sei_recovery_point_write( x264_t *h, bs_t *s, int recovery_frame_cnt );
/* Version/options banner as user-data SEI. Returns 0 on success, -1 on
 * allocation failure. */
#define x264_sei_version_write x264_template(sei_version_write)
int x264_sei_version_write( x264_t *h, bs_t *s );
/* Check the configuration against the selected level's limits. Returns 0
 * when all checked limits hold, 1 otherwise; warnings are logged only when
 * `verbose` is non-zero. */
#define x264_validate_levels x264_template(validate_levels)
int x264_validate_levels( x264_t *h, int verbose );
/* Buffering period SEI. */
#define x264_sei_buffering_period_write x264_template(sei_buffering_period_write)
void x264_sei_buffering_period_write( x264_t *h, bs_t *s );
/* Picture timing SEI for the frame in h->fenc. */
#define x264_sei_pic_timing_write x264_template(sei_pic_timing_write)
void x264_sei_pic_timing_write( x264_t *h, bs_t *s );
/* Decoded reference picture marking repetition SEI, built from
 * h->sh_backup. */
#define x264_sei_dec_ref_pic_marking_write x264_template(sei_dec_ref_pic_marking_write)
void x264_sei_dec_ref_pic_marking_write( x264_t *h, bs_t *s );
/* Frame packing arrangement SEI, built from h->param.i_frame_packing. */
#define x264_sei_frame_packing_write x264_template(sei_frame_packing_write)
void x264_sei_frame_packing_write( x264_t *h, bs_t *s );
/* Mastering display SEI, built from h->param.mastering_display. */
#define x264_sei_mastering_display_write x264_template(sei_mastering_display_write)
void x264_sei_mastering_display_write( x264_t *h, bs_t *s );
/* Content light level SEI, built from h->param.content_light_level. */
#define x264_sei_content_light_level_write x264_template(sei_content_light_level_write)
void x264_sei_content_light_level_write( x264_t *h, bs_t *s );
/* Alternative transfer characteristics SEI, built from
 * h->param.i_alternative_transfer. */
#define x264_sei_alternative_transfer_write x264_template(sei_alternative_transfer_write)
void x264_sei_alternative_transfer_write( x264_t *h, bs_t *s );
/* AVC-Intra "UMID" user-data SEI (fixed size). Always returns 0.
 * NOTE(review): the implementation writes to h->out.bs and ignores `s`. */
#define x264_sei_avcintra_umid_write x264_template(sei_avcintra_umid_write)
int x264_sei_avcintra_umid_write( x264_t *h, bs_t *s );
/* AVC-Intra "VANC" user-data SEI of `len` bytes. Returns 0, or -1 if `len`
 * is negative or exceeds the internal buffer.
 * NOTE(review): the implementation writes to h->out.bs and ignores `s`. */
#define x264_sei_avcintra_vanc_write x264_template(sei_avcintra_vanc_write)
int x264_sei_avcintra_vanc_write( x264_t *h, bs_t *s, int len );
/* Low-level SEI emitter used by the writers above: takes a finished payload
 * buffer, its size in bytes and the SEI payload type. */
#define x264_sei_write x264_template(sei_write)
void x264_sei_write( bs_t *s, uint8_t *payload, int payload_size, int payload_type );
/* Write `filler` bytes of 0xff followed by RBSP trailing bits. */
#define x264_filler_write x264_template(filler_write)
void x264_filler_write( x264_t *h, bs_t *s, int filler );
#endif

782
encoder/slicetype-cl.c Normal file
View File

@@ -0,0 +1,782 @@
/*****************************************************************************
* slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead)
*****************************************************************************
* Copyright (C) 2012-2025 x264 project
*
* Authors: Steve Borho <sborho@multicorewareinc.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common/common.h"
#include "macroblock.h"
#include "me.h"
#include "slicetype-cl.h"
#if HAVE_OPENCL
#ifdef _WIN32
#include <windows.h>
#endif
#define x264_weights_analyse x264_template(weights_analyse)
void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead );
/* We define CL_QUEUE_THREAD_HANDLE_AMD here because it is not defined
* in the OpenCL headers shipped with NVIDIA drivers. We need to be
* able to compile on an NVIDIA machine and run optimally on an AMD GPU. */
#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E
/* Call the OpenCL entry point `ocl->method( ... )` and bail out of the
 * enclosing function with -1 on failure.
 *
 * Requirements at the point of use: `h`, `ocl` and a `status` variable must be
 * in scope, and the enclosing function must return int.
 *
 * Behaviour:
 *  - if a fatal OpenCL error was already recorded, return -1 without calling;
 *  - otherwise store the call's result in `status`; on anything other than
 *    CL_SUCCESS, disable OpenCL in the parameters, latch the fatal-error flag,
 *    log the failing method name with the status code, and return -1. */
#define OCLCHECK( method, ... )\
do\
{\
if( h->opencl.b_fatal_error )\
return -1;\
status = ocl->method( __VA_ARGS__ );\
if( status != CL_SUCCESS ) {\
h->param.b_opencl = 0;\
h->opencl.b_fatal_error = 1;\
x264_log( h, X264_LOG_ERROR, # method " error '%d'\n", status );\
return -1;\
}\
} while( 0 )
/* Wait for the OpenCL command queue to finish, then complete all pending
 * device-to-host copies and reset the page-locked staging buffer.
 * After this call both the pending-copy count and the staging buffer
 * occupancy are zero. */
void x264_opencl_flush( x264_t *h )
{
    x264_opencl_function_t *ocl = h->opencl.ocl;
    int copy = 0;

    ocl->clFinish( h->opencl.queue );

    /* Finish copies from the GPU by copying from the page-locked buffer to
     * their final destination */
    while( copy < h->opencl.num_copies )
    {
        memcpy( h->opencl.copies[copy].dest,
                h->opencl.copies[copy].src,
                h->opencl.copies[copy].bytes );
        copy++;
    }

    /* Everything staged has been delivered: start over with an empty buffer. */
    h->opencl.num_copies = 0;
    h->opencl.pl_occupancy = 0;
}
/* Reserve `bytes` bytes from the page-locked staging buffer and return a
 * pointer to them.
 * If the request would reach or exceed PAGE_LOCKED_BUF_SIZE at the current
 * occupancy, the buffer is flushed first (x264_opencl_flush), which resets
 * the occupancy to zero. A single request must be smaller than the whole
 * buffer; this is only enforced by assert(). */
static void *opencl_alloc_locked( x264_t *h, int bytes )
{
    const int needs_flush = h->opencl.pl_occupancy + bytes >= PAGE_LOCKED_BUF_SIZE;

    if( needs_flush )
        x264_opencl_flush( h );
    assert( bytes < PAGE_LOCKED_BUF_SIZE );

    /* Simple bump allocation: hand out the current offset, then advance it. */
    char *chunk = h->opencl.page_locked_ptr + h->opencl.pl_occupancy;
    h->opencl.pl_occupancy += bytes;
    return chunk;
}
/* Prepare one frame for the OpenCL lookahead.
 *
 * On the first call this creates the device images/buffers shared by all
 * frames; on the first call for a given frame it creates that frame's own
 * images and buffers. It then queues, in order:
 *   1. the upload of the frame's luma plane,
 *   2. the upload of the per-MB inverse qscale factors, or a kernel that
 *      fills that buffer with 256 when they are not in use,
 *   3. the downscale kernels that fill fenc->opencl.scaled_image2Ds[] and
 *      fenc->opencl.luma_hpel,
 *   4. the intra cost kernel followed by its row-sum kernel,
 *   5. asynchronous reads of the intra costs, row SATDs and frame stats.
 *
 * Nothing is waited for here: the reads target the page-locked buffer and are
 * recorded in h->opencl.copies[], to be completed by x264_opencl_flush().
 *
 * Returns 0 on success, or immediately when the frame was already handled
 * (fenc->b_intra_calculated); returns -1 when an OpenCL call fails.
 *
 * NOTE(review): b_intra_calculated is set before any work is queued, so a
 * frame stays marked as calculated even if a later call in here fails. */
int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda )
{
/* Each frame only needs to go through this once. */
if( fenc->b_intra_calculated )
return 0;
fenc->b_intra_calculated = 1;
x264_opencl_function_t *ocl = h->opencl.ocl;
/* Number of bytes uploaded for the luma plane (stride * lines). */
int luma_length = fenc->i_stride[0] * fenc->i_lines[0];
/* Creation helpers: on failure they disable OpenCL, log the status and make
 * this function return -1. Unlike OCLCHECK they do not set
 * h->opencl.b_fatal_error. Both rely on the local `status` declared below. */
#define CREATEBUF( out, flags, size )\
out = ocl->clCreateBuffer( h->opencl.context, (flags), (size), NULL, &status );\
if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateBuffer error '%d'\n", status ); return -1; }
#define CREATEIMAGE( out, flags, pf, width, height )\
out = ocl->clCreateImage2D( h->opencl.context, (flags), &pf, width, height, 0, NULL, &status );\
if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateImage2D error '%d'\n", status ); return -1; }
int mb_count = h->mb.i_mb_count;
cl_int status;
/* lowres_mv_costs doubles as the "shared buffers already exist" marker. */
if( !h->opencl.lowres_mv_costs )
{
/* Allocate shared memory buffers */
int width = h->mb.i_mb_width * 8 * SIZEOF_PIXEL;
int height = h->mb.i_mb_height * 8 * SIZEOF_PIXEL;
cl_image_format pixel_format;
pixel_format.image_channel_order = CL_R;
pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
CREATEIMAGE( h->opencl.weighted_luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );
/* One image per scale, each half the width and height of the previous. */
for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
{
pixel_format.image_channel_order = CL_RGBA;
pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
CREATEIMAGE( h->opencl.weighted_scaled_images[i], CL_MEM_READ_WRITE, pixel_format, width, height );
width >>= 1;
height >>= 1;
}
/* The [0]/[1] pairs below are selected with h->opencl.last_buf, which is
 * toggled at the end of this function. */
CREATEBUF( h->opencl.lowres_mv_costs, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
CREATEBUF( h->opencl.lowres_costs[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
CREATEBUF( h->opencl.lowres_costs[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
CREATEBUF( h->opencl.mv_buffers[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
CREATEBUF( h->opencl.mv_buffers[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
CREATEBUF( h->opencl.mvp_buffer, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
CREATEBUF( h->opencl.frame_stats[0], CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
CREATEBUF( h->opencl.frame_stats[1], CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
CREATEBUF( h->opencl.row_satds[0], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
CREATEBUF( h->opencl.row_satds[1], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
CREATEBUF( h->opencl.luma_16x16_image[0], CL_MEM_READ_ONLY, luma_length );
CREATEBUF( h->opencl.luma_16x16_image[1], CL_MEM_READ_ONLY, luma_length );
}
/* intra_cost doubles as the "this frame's buffers already exist" marker. */
if( !fenc->opencl.intra_cost )
{
/* Allocate per-frame buffers */
int width = h->mb.i_mb_width * 8 * SIZEOF_PIXEL;
int height = h->mb.i_mb_height * 8 * SIZEOF_PIXEL;
cl_image_format pixel_format;
pixel_format.image_channel_order = CL_R;
pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
CREATEIMAGE( fenc->opencl.luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );
for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
{
pixel_format.image_channel_order = CL_RGBA;
pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
CREATEIMAGE( fenc->opencl.scaled_image2Ds[i], CL_MEM_READ_WRITE, pixel_format, width, height );
width >>= 1;
height >>= 1;
}
/* The MV and MV-cost buffers hold (i_bframe + 1) per-frame slices. */
CREATEBUF( fenc->opencl.inv_qscale_factor, CL_MEM_READ_ONLY, mb_count * sizeof(int16_t) );
CREATEBUF( fenc->opencl.intra_cost, CL_MEM_WRITE_ONLY, mb_count * sizeof(int16_t) );
CREATEBUF( fenc->opencl.lowres_mvs0, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
CREATEBUF( fenc->opencl.lowres_mvs1, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
CREATEBUF( fenc->opencl.lowres_mv_costs0, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
CREATEBUF( fenc->opencl.lowres_mv_costs1, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
}
#undef CREATEBUF
#undef CREATEIMAGE
/* Copy image to the GPU, downscale to unpadded 8x8, then continue for all scales */
/* The plane is staged in the page-locked buffer and written with a
 * non-blocking (CL_FALSE) transfer. */
char *locked = opencl_alloc_locked( h, luma_length );
memcpy( locked, fenc->plane[0], luma_length );
OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, h->opencl.luma_16x16_image[h->opencl.last_buf], CL_FALSE, 0, luma_length, locked, 0, NULL, NULL );
size_t gdim[2];
if( h->param.rc.i_aq_mode && fenc->i_inv_qscale_factor )
{
/* AQ is active and the frame has factors: upload them as they are. */
int size = h->mb.i_mb_count * sizeof(int16_t);
locked = opencl_alloc_locked( h, size );
memcpy( locked, fenc->i_inv_qscale_factor, size );
OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, fenc->opencl.inv_qscale_factor, CL_FALSE, 0, size, locked, 0, NULL, NULL );
}
else
{
/* Fill fenc->opencl.inv_qscale_factor with NOP (256) */
cl_uint arg = 0;
int16_t value = 256;
OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(int16_t), &value );
/* 1D launch, one work-item per macroblock. */
gdim[0] = h->mb.i_mb_count;
OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.memset_kernel, 1, NULL, gdim, NULL, 0, NULL, NULL );
}
/* First downscale pass: uploaded luma -> scaled_image2Ds[0] and luma_hpel. */
int stride = fenc->i_stride[0];
cl_uint arg = 0;
OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.luma_16x16_image[h->opencl.last_buf] );
OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.luma_hpel );
OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(int), &stride );
gdim[0] = 8 * h->mb.i_mb_width;
gdim[1] = 8 * h->mb.i_mb_height;
OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.downscale_hpel_kernel, 2, NULL, gdim, NULL, 0, NULL, NULL );
/* Remaining passes: scaled_image2Ds[i] -> scaled_image2Ds[i+1], halving the
 * launch size each time and stopping once either dimension drops below 16. */
for( int i = 0; i < NUM_IMAGE_SCALES - 1; i++ )
{
/* Workaround for AMD Southern Island:
 *
 * Alternate kernel instances. No perf impact to this, so we do it for
 * all GPUs. It prevents the same kernel from being enqueued
 * back-to-back, avoiding a dependency calculation bug in the driver.
 */
cl_kernel kern = i & 1 ? h->opencl.downscale_kernel1 : h->opencl.downscale_kernel2;
arg = 0;
OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i] );
OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i+1] );
gdim[0] >>= 1;
gdim[1] >>= 1;
if( gdim[0] < 16 || gdim[1] < 16 )
break;
OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, kern, 2, NULL, gdim, NULL, 0, NULL, NULL );
}
/* Intra cost kernel: 32x8 work-groups; the global width is the MB width
 * rounded up to a multiple of 32, the global height is 8 per MB row. */
size_t ldim[2];
gdim[0] = ((h->mb.i_mb_width + 31)>>5)<<5;
gdim[1] = 8*h->mb.i_mb_height;
ldim[0] = 32;
ldim[1] = 8;
arg = 0;
/* For presets slow, slower, and placebo, check all 10 intra modes that the
 * C lookahead supports. For faster presets, only check the most frequent 8
 * modes
 */
int slow = h->param.analyse.i_subpel_refine > 7;
OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &lambda );
OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &slow );
OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
/* Row-sum kernel: one 256-wide work-group per macroblock row. */
gdim[0] = 256;
gdim[1] = h->mb.i_mb_height;
ldim[0] = 256;
ldim[1] = 1;
arg = 0;
OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
/* Four deferred copies are registered below; flush first if the list is
 * too close to MAX_FINISH_COPIES to take them. */
if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
x264_opencl_flush( h );
/* Deferred copy 1: per-MB intra costs -> fenc->lowres_costs[0][0]. */
int size = h->mb.i_mb_count * sizeof(int16_t);
locked = opencl_alloc_locked( h, size );
OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.intra_cost, CL_FALSE, 0, size, locked, 0, NULL, NULL );
h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[0][0];
h->opencl.copies[h->opencl.num_copies].src = locked;
h->opencl.copies[h->opencl.num_copies].bytes = size;
h->opencl.num_copies++;
/* Deferred copy 2: per-row sums -> fenc->i_row_satds[0][0]. */
size = h->mb.i_mb_height * sizeof(int);
locked = opencl_alloc_locked( h, size );
OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[0][0];
h->opencl.copies[h->opencl.num_copies].src = locked;
h->opencl.copies[h->opencl.num_copies].bytes = size;
h->opencl.num_copies++;
/* Deferred copies 3 and 4: all four frame_stats ints are read back, then
 * int 0 goes to i_cost_est[0][0] and int 1 to i_cost_est_aq[0][0]. */
size = sizeof(int) * 4;
locked = opencl_alloc_locked( h, size );
OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[0][0];
h->opencl.copies[h->opencl.num_copies].src = locked;
h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
h->opencl.num_copies++;
h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[0][0];
h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
h->opencl.num_copies++;
/* Switch to the other set of double-buffered device buffers for the next call. */
h->opencl.last_buf = !h->opencl.last_buf;
return 0;
}
/* Choose the local work-group size for a 2D kernel launch.
 *
 * There is no way to compute perfect launch dimensions; applications that need
 * them self-tune by timing many candidates. This is only an educated guess,
 * tuned empirically on a number of AMD and NV GPUs, based on what GPUs
 * typically prefer.
 *
 *   gdims  in/out: requested global size. gdims[0] is rewritten so that it is
 *          a multiple of the chosen ldims[0]; gdims[1] is only read.
 *   ldims  out: chosen local size.
 *   kernel, device: queried for the work-group limit, the preferred
 *          work-group size multiple and the number of compute units. */
static void optimal_launch_dims( x264_t *h, size_t *gdims, size_t *ldims, const cl_kernel kernel, const cl_device_id device )
{
    x264_opencl_function_t *ocl = h->opencl.ocl;

    /* Reasonable defaults for OpenCL 1.0 devices: the queries below may fail,
     * in which case these values are simply kept. */
    size_t wg_limit = 256;
    size_t wg_multiple = 64;
    cl_uint cu_count = 6;
    ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_limit, NULL );
    ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &wg_multiple, NULL );
    ocl->clGetDeviceInfo( device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &cu_count, NULL );

    size_t lx = wg_multiple;
    size_t ly = 8;

    /* make ly an even divisor of gdims[1], widening lx as ly shrinks */
    for( ; gdims[1] & (ly - 1); ly >>= 1 )
        lx <<= 1;

    /* make the total local size fit under the kernel's work-group limit */
    while( lx * ly > wg_limit )
    {
        if( lx > wg_multiple || ly <= 1 )
            lx >>= 1;
        else
            ly >>= 1;
    }

    const size_t gx = gdims[0];
    if( lx > gx )
    {
        /* remove preferred multiples until we're close to the requested width */
        while( gx + wg_multiple < lx )
            lx -= wg_multiple;
        gdims[0] = lx;
    }
    else
    {
        /* round the global width up to a whole number of work-groups */
        gdims[0] = ( (gx + lx - 1) / lx ) * lx;
    }

    /* make the work-groups smaller to spread work across compute units */
    while( (gdims[0] / lx) * (gdims[1] / ly) * 2 <= cu_count )
    {
        if( lx > wg_multiple )
            lx >>= 1;
        else if( ly > 1 )
            ly >>= 1;
        else
            break;
    }

    /* for smaller GPUs, try not to abuse their texture cache */
    if( cu_count == 6 && lx == 64 && ly == 4 )
        lx = 32;

    ldims[0] = lx;
    ldims[1] = ly;
}
/* Queue a lowres motion search of frames[b] against frames[ref] on the GPU.
 *
 *   b, ref     indices into frames[] of the frame being searched and of its
 *              reference.
 *   b_islist1  non-zero selects the list1 outputs (fenc->opencl.lowres_mvs1 /
 *              lowres_mv_costs1, host fenc->lowres_mvs[1][ref-b-1]); zero
 *              selects the list0 ones (host fenc->lowres_mvs[0][b-ref-1]).
 *   lambda     passed through to the kernels.
 *   w          optional weights; when w->weightfn is set the reference images
 *              are first filtered into the shared weighted images and those
 *              are searched instead.
 *
 * Steps queued: optional weighting kernels, the hierarchical motion
 * estimation kernel from the coarsest usable scale down to scale 0, the subpel
 * refinement kernel, and an asynchronous read of the resulting MVs. The read
 * lands in the page-locked buffer and is recorded in h->opencl.copies[] for
 * x264_opencl_flush() to finish.
 *
 * Returns 0 on success, -1 if an OpenCL call fails (see OCLCHECK). */
int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w )
{
x264_opencl_function_t *ocl = h->opencl.ocl;
x264_frame_t *fenc = frames[b];
x264_frame_t *fref = frames[ref];
/* Reference images actually searched: weighted copies or the originals. */
cl_mem ref_scaled_images[NUM_IMAGE_SCALES];
cl_mem ref_luma_hpel;
cl_int status;
if( w && w->weightfn )
{
size_t gdims[2];
gdims[0] = 8 * h->mb.i_mb_width;
gdims[1] = 8 * h->mb.i_mb_height;
/* WeightP: Perform a filter on fref->opencl.scaled_image2Ds[] and fref->opencl.luma_hpel */
for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
{
cl_uint arg = 0;
OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &fref->opencl.scaled_image2Ds[i] );
OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_scaled_images[i] );
OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_offset );
OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_scale );
OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_denom );
OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_scaled_images_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );
/* Next scale is half the size; stop once either dimension drops below 16. */
gdims[0] >>= 1;
gdims[1] >>= 1;
if( gdims[0] < 16 || gdims[1] < 16 )
break;
}
/* Same weights applied to the half-pel luma image, at the scale-0 launch size. */
cl_uint arg = 0;
gdims[0] = 8 * h->mb.i_mb_width;
gdims[1] = 8 * h->mb.i_mb_height;
OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &fref->opencl.luma_hpel );
OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_luma_hpel );
OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_offset );
OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_scale );
OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_denom );
OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_hpel_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );
/* Use weighted reference planes for motion search */
for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
ref_scaled_images[i] = h->opencl.weighted_scaled_images[i];
ref_luma_hpel = h->opencl.weighted_luma_hpel;
}
else
{
/* Use unweighted reference planes for motion search */
for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
ref_scaled_images[i] = fref->opencl.scaled_image2Ds[i];
ref_luma_hpel = fref->opencl.luma_hpel;
}
/* Number of hme_kernel launches per scale, indexed by scale. */
const int num_iterations[NUM_IMAGE_SCALES] = { 1, 1, 2, 3 };
int b_first_iteration = 1;
int b_reverse_references = 1;
/* A indexes h->opencl.mv_buffers[]; A and !A are swapped after every launch. */
int A = 1;
int mb_per_group = 0;
int cost_local_size = 0;
int mvc_local_size = 0;
int mb_width;
/* gdims, ldims, mb_width and the *_local_size values keep whatever the last
 * processed scale left in them and are reused for the subpel launch below.
 * NOTE(review): if every scale is skipped by the `continue`, ldims is never
 * assigned and the local sizes stay 0 -- confirm callers cannot hit this. */
size_t gdims[2];
size_t ldims[2];
/* scale 0 is 8x8 */
for( int scale = NUM_IMAGE_SCALES-1; scale >= 0; scale-- )
{
mb_width = h->mb.i_mb_width >> scale;
gdims[0] = mb_width;
gdims[1] = h->mb.i_mb_height >> scale;
/* Skip scales that are less than 2 MBs wide or tall. */
if( gdims[0] < 2 || gdims[1] < 2 )
continue;
/* Four work-items per macroblock along x. */
gdims[0] <<= 2;
optimal_launch_dims( h, gdims, ldims, h->opencl.hme_kernel, h->opencl.device );
mb_per_group = (ldims[0] >> 2) * ldims[1];
cost_local_size = 4 * mb_per_group * sizeof(int16_t);
mvc_local_size = 4 * mb_per_group * sizeof(int16_t) * 2;
int scaled_me_range = h->param.analyse.i_me_range >> scale;
int b_shift_index = 1;
cl_uint arg = 0;
OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[scale] );
OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &ref_scaled_images[scale] );
OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), (void*)&h->opencl.mvp_buffer );
/* The next two arguments carry a size and a NULL value (scratch space
 * sized per work-group). */
OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, cost_local_size, NULL );
OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, mvc_local_size, NULL );
OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &mb_width );
OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &lambda );
OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scaled_me_range );
OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scale );
OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_shift_index );
OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_first_iteration );
OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_reverse_references );
for( int iter = 0; iter < num_iterations[scale]; iter++ )
{
OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.hme_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
/* Only the first launch of a scale gets b_shift_index = 1, and only
 * the very first launch overall gets b_first_iteration = 1. */
b_shift_index = 0;
b_first_iteration = 0;
/* alternate top-left vs bot-right MB references at lower scales, so
 * motion field smooths more quickly. */
if( scale > 2 )
b_reverse_references ^= 1;
else
b_reverse_references = 0;
/* Swap the two MV buffers and refresh the arguments that changed:
 * indices 2 and 3 are the MV buffers, the last three (arg-3..arg-1)
 * are the three flags set above. */
A = !A;
OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 2, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 3, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 3, sizeof(int), &b_shift_index );
OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 2, sizeof(int), &b_first_iteration );
OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 1, sizeof(int), &b_reverse_references );
}
}
/* Subpel refinement, launched with the dimensions left by the last scale
 * processed above. */
int satd_local_size = mb_per_group * sizeof(uint32_t) * 16;
cl_uint arg = 0;
OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &ref_luma_hpel );
OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, cost_local_size, NULL );
OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, satd_local_size, NULL );
OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, mvc_local_size, NULL );
/* Output buffers: the frame's list1 or list0 MV / MV-cost buffers. */
if( b_islist1 )
{
OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
}
else
{
OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
}
OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &mb_width );
OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &lambda );
OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b );
OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &ref );
OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b_islist1 );
if( h->opencl.b_device_AMD_SI )
{
/* workaround for AMD Southern Island driver scheduling bug (fixed in
 * July 2012), perform meaningless small copy to add a data dependency */
OCLCHECK( clEnqueueCopyBuffer, h->opencl.queue, h->opencl.mv_buffers[A], h->opencl.mv_buffers[!A], 0, 0, 20, 0, NULL, NULL );
}
OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.subpel_refine_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
/* Read back one frame's worth of MVs (two int16_t per MB). One deferred
 * copy is registered, so flush first if the list cannot take it. */
int mvlen = 2 * sizeof(int16_t) * h->mb.i_mb_count;
if( h->opencl.num_copies >= MAX_FINISH_COPIES - 1 )
x264_opencl_flush( h );
char *locked = opencl_alloc_locked( h, mvlen );
h->opencl.copies[h->opencl.num_copies].src = locked;
h->opencl.copies[h->opencl.num_copies].bytes = mvlen;
/* The device buffer holds one mvlen-sized slice per reference distance;
 * pick the slice for this b/ref pair and the matching host array. */
if( b_islist1 )
{
int mvs_offset = mvlen * (ref - b - 1);
OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs1, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[1][ref - b - 1];
}
else
{
int mvs_offset = mvlen * (b - ref - 1);
OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs0, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[0][b - ref - 1];
}
h->opencl.num_copies++;
return 0;
}
/* Queue the final lowres cost computation for frames[b] predicted from
 * frames[p0] (past) and frames[p1] (future), and schedule the read-back of
 * the results.
 *
 *   lambda             passed through to the mode selection kernel.
 *   p0, p1, b          indices into frames[]; b < p1 is treated as a B frame,
 *                      b == p1 as a P frame.
 *   dist_scale_factor  passed to the kernel and, when weighted bipred is
 *                      enabled, used to derive the bipred weight.
 *
 * Two kernels are queued (mode selection, then per-row/frame summation),
 * followed by asynchronous reads of the per-MB costs, the row SATDs and the
 * frame stats. The reads land in the page-locked buffer and are recorded in
 * h->opencl.copies[]; the host fields fenc->lowres_costs, i_row_satds,
 * i_cost_est, i_cost_est_aq (and i_intra_mbs for P frames) are only valid
 * after the next x264_opencl_flush().
 *
 * Returns 0 on success, -1 if an OpenCL call fails (see OCLCHECK). */
int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor )
{
x264_opencl_function_t *ocl = h->opencl.ocl;
cl_int status;
x264_frame_t *fenc = frames[b];
x264_frame_t *fref0 = frames[p0];
x264_frame_t *fref1 = frames[p1];
/* 32 unless weighted bipred is enabled. */
int bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor >> 2) : 32;
/* Tasks for this kernel:
 * 1. Select least cost mode (intra, ref0, ref1)
 * list_used 0, 1, 2, or 3. if B frame, do not allow intra
 * 2. if B frame, try bidir predictions.
 * 3. lowres_costs[i_mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT); */
/* Default launch: one work-item per MB, with a NULL local size so the OpenCL
 * runtime picks the work-group size, and minimal 4-byte local scratch sizes. */
size_t gdims[2] = { h->mb.i_mb_width, h->mb.i_mb_height };
size_t ldim_bidir[2];
size_t *ldims = NULL;
int cost_local_size = 4;
int satd_local_size = 4;
if( b < p1 )
{
/* For B frames, use 4 threads per MB for BIDIR checks */
ldims = ldim_bidir;
gdims[0] <<= 2;
optimal_launch_dims( h, gdims, ldims, h->opencl.mode_select_kernel, h->opencl.device );
int mb_per_group = (ldims[0] >> 2) * ldims[1];
cost_local_size = 4 * mb_per_group * sizeof(int16_t);
satd_local_size = 16 * mb_per_group * sizeof(uint32_t);
}
cl_uint arg = 0;
OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref0->opencl.luma_hpel );
OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.luma_hpel );
OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.lowres_mvs0 );
OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
/* The next two arguments carry a size and a NULL value (scratch space
 * sized per work-group). */
OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, cost_local_size, NULL );
OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, satd_local_size, NULL );
OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &bipred_weight );
OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &dist_scale_factor );
OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &b );
OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p0 );
OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p1 );
OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &lambda );
OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.mode_select_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
/* Sum costs across rows, atomicAdd down frame */
/* One 256-wide work-group per macroblock row. */
size_t gdim[2] = { 256, h->mb.i_mb_height };
size_t ldim[2] = { 256, 1 };
arg = 0;
OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->param.i_bframe_bias );
OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &b );
OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p0 );
OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p1 );
OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_inter_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
/* Up to five deferred copies are registered below (four, plus one more for
 * P frames). After this check num_copies is at most MAX_FINISH_COPIES - 5,
 * so all five fit -- assuming copies[] holds MAX_FINISH_COPIES entries;
 * TODO confirm against the struct definition. */
if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
x264_opencl_flush( h );
/* Deferred copy 1: per-MB costs -> fenc->lowres_costs[b-p0][p1-b]. */
int size = h->mb.i_mb_count * sizeof(int16_t);
char *locked = opencl_alloc_locked( h, size );
h->opencl.copies[h->opencl.num_copies].src = locked;
h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[b - p0][p1 - b];
h->opencl.copies[h->opencl.num_copies].bytes = size;
OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.lowres_costs[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
h->opencl.num_copies++;
/* Deferred copy 2: per-row sums -> fenc->i_row_satds[b-p0][p1-b]. */
size = h->mb.i_mb_height * sizeof(int);
locked = opencl_alloc_locked( h, size );
h->opencl.copies[h->opencl.num_copies].src = locked;
h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[b - p0][p1 - b];
h->opencl.copies[h->opencl.num_copies].bytes = size;
OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
h->opencl.num_copies++;
/* All four frame_stats ints are read back in one transfer and then split
 * into separate deferred copies. */
size = 4 * sizeof(int);
locked = opencl_alloc_locked( h, size );
OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
/* Every use of the current buffer set has been queued; switch to the other
 * set for the next call. */
h->opencl.last_buf = !h->opencl.last_buf;
/* frame_stats int 0 -> i_cost_est */
h->opencl.copies[h->opencl.num_copies].src = locked;
h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[b - p0][p1 - b];
h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
h->opencl.num_copies++;
/* frame_stats int 1 -> i_cost_est_aq */
h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[b - p0][p1 - b];
h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
h->opencl.num_copies++;
if( b == p1 ) // P frames only
{
/* frame_stats int 2 -> i_intra_mbs */
h->opencl.copies[h->opencl.num_copies].src = locked + 2 * sizeof(int);
h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_intra_mbs[b - p0];
h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
h->opencl.num_copies++;
}
return 0;
}
/* Warm up the GPU side before a slicetype decision.
 *
 * Does nothing unless h->param.b_opencl is set. Otherwise:
 *  - on Windows, raises the priority of the calling thread and, where the
 *    AMD queue-thread query succeeds, of the OpenCL driver thread (the old
 *    priorities are saved in h->opencl for x264_opencl_slicetype_end());
 *  - runs x264_opencl_lowres_init() on frames[0..num_frames] and flushes;
 *  - for trellis B-adapt with B-frames enabled, queues every motion search
 *    that has not been done yet (first MV still 0x7FFF) for distances
 *    1 .. i_bframe-1 in both directions, then flushes again.
 *
 * Return values of the queued operations are not inspected here. */
void x264_opencl_slicetype_prep( x264_t *h, x264_frame_t **frames, int num_frames, int lambda )
{
    if( !h->param.b_opencl )
        return;

#ifdef _WIN32
    /* Temporarily boost priority of this lookahead thread and the OpenCL
     * driver's thread until the end of this function. On AMD GPUs this
     * greatly reduces the latency of enqueuing kernels and getting results
     * on Windows. */
    HANDLE id = GetCurrentThread();
    h->opencl.lookahead_thread_pri = GetThreadPriority( id );
    SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
    x264_opencl_function_t *ocl = h->opencl.ocl;
    cl_int status = ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
    if( status == CL_SUCCESS )
    {
        h->opencl.opencl_thread_pri = GetThreadPriority( id );
        SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
    }
#endif

    /* precalculate intra and I frames */
    for( int i = 0; i <= num_frames; i++ )
        x264_opencl_lowres_init( h, frames[i], lambda );
    x264_opencl_flush( h );

    if( h->param.i_bframe_adaptive != X264_B_ADAPT_TRELLIS || !h->param.i_bframe )
        return;

    /* For trellis B-Adapt, precompute exhaustive motion searches */
    for( int b = 0; b <= num_frames; b++ )
    {
        x264_frame_t *cur = frames[b];

        for( int dist = 1; dist < h->param.i_bframe; dist++ )
        {
            /* lowres_mvs[] is indexed by reference distance minus one */
            const int slot = dist - 1;

            /* backward reference: frame b - dist, list0 */
            int p0 = b - dist;
            if( p0 >= 0 && cur->lowres_mvs[0][slot][0][0] == 0x7FFF )
            {
                const x264_weight_t *w = x264_weight_none;
                if( h->param.analyse.i_weighted_pred )
                {
                    x264_emms();
                    x264_weights_analyse( h, cur, frames[p0], 1 );
                    w = cur->weight[0];
                }
                /* clear the 0x7FFF marker so the search is not queued twice */
                cur->lowres_mvs[0][slot][0][0] = 0;
                x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w );
            }

            /* forward reference: frame b + dist, list1, never weighted */
            int p1 = b + dist;
            if( p1 <= num_frames && cur->lowres_mvs[1][slot][0][0] == 0x7FFF )
            {
                cur->lowres_mvs[1][slot][0][0] = 0;
                x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL );
            }
        }
    }

    x264_opencl_flush( h );
}
/* Counterpart of x264_opencl_slicetype_prep(): on Windows, when OpenCL is
 * enabled, put back the thread priorities saved in h->opencl -- the calling
 * thread always, the OpenCL queue thread only if the AMD queue-thread query
 * succeeds. On other platforms this is a no-op. */
void x264_opencl_slicetype_end( x264_t *h )
{
#ifdef _WIN32
    if( !h->param.b_opencl )
        return;

    HANDLE thread = GetCurrentThread();
    SetThreadPriority( thread, h->opencl.lookahead_thread_pri );

    /* Reuse the handle variable to receive the driver's queue thread. */
    x264_opencl_function_t *ocl = h->opencl.ocl;
    cl_int status = ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &thread, NULL );
    if( status != CL_SUCCESS )
        return;
    SetThreadPriority( thread, h->opencl.opencl_thread_pri );
#endif
}
/* Queue everything needed to compute the lowres cost of frames[b] predicted
 * from frames[p0] and frames[p1], unless that cost is already known or in
 * progress.
 *
 * Returns 0 without queuing anything when i_cost_est[b-p0][p1-b] is already
 * non-negative or when b == p0 == p1; otherwise queues the work and returns 1.
 * The results only become valid once the queue has been flushed. Failures of
 * the queued operations are not reported through the return value. */
int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b )
{
    x264_frame_t *fenc = frames[b];
    const int d0 = b - p0; /* distance to the past reference */
    const int d1 = p1 - b; /* distance to the future reference */

    if( fenc->i_cost_est[d0][d1] >= 0 || (b == p0 && b == p1) )
        return 0;

    /* avoid duplicating work: a non-negative estimate makes later calls for
     * the same (p0, p1, b) return early */
    fenc->i_cost_est[d0][d1] = 0;

    /* A search is still needed for a list if the frame really has a reference
     * in it and the first MV still carries the 0x7FFF marker. */
    const int search_l0 = b != p0 && fenc->lowres_mvs[0][d0-1][0][0] == 0x7FFF;
    const int search_l1 = b != p1 && fenc->lowres_mvs[1][d1-1][0][0] == 0x7FFF;

    const x264_weight_t *w = x264_weight_none;
    if( search_l0 )
    {
        /* weights are only analysed when b == p1 */
        if( h->param.analyse.i_weighted_pred && b == p1 )
        {
            x264_emms();
            x264_weights_analyse( h, fenc, frames[p0], 1 );
            w = fenc->weight[0];
        }
        fenc->lowres_mvs[0][d0-1][0][0] = 0;
    }
    if( search_l1 )
        fenc->lowres_mvs[1][d1-1][0][0] = 0;

    if( b == p1 )
        fenc->i_intra_mbs[d0] = 0;

    /* 128 when both references are the same frame, otherwise the rounded
     * value of 256 * (b - p0) / (p1 - p0) */
    int dist_scale_factor = 128;
    if( p1 != p0 )
        dist_scale_factor = ( (d0 << 8) + ((p1-p0) >> 1) ) / (p1-p0);

    fenc->i_cost_est[d0][d1] = 0;
    fenc->i_cost_est_aq[d0][d1] = 0;

    x264_opencl_lowres_init( h, fenc, lambda );

    if( search_l0 )
    {
        x264_opencl_lowres_init( h, frames[p0], lambda );
        x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w );
    }
    if( search_l1 )
    {
        x264_opencl_lowres_init( h, frames[p1], lambda );
        x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL );
    }

    x264_opencl_finalize_cost( h, lambda, frames, p0, p1, b, dist_scale_factor );
    return 1;
}
#endif

44
encoder/slicetype-cl.h Normal file
View File

@@ -0,0 +1,44 @@
/*****************************************************************************
* slicetype-cl.h: OpenCL slicetype decision code (lowres lookahead)
*****************************************************************************
* Copyright (C) 2017-2025 x264 project
*
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_ENCODER_SLICETYPE_CL_H
#define X264_ENCODER_SLICETYPE_CL_H
/* Each public name is wrapped in x264_template() before its prototype. */

/* Queue the upload, downscaling and intra cost estimation of one frame.
 * Returns 0 on success or if the frame was already processed, -1 on an
 * OpenCL failure. Results are available after x264_opencl_flush(). */
#define x264_opencl_lowres_init x264_template(opencl_lowres_init)
int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda );
/* Queue a lowres motion search of frames[b] against frames[ref];
 * b_islist1 selects the list1 outputs, w optionally weights the reference.
 * Returns 0 on success, -1 on an OpenCL failure. */
#define x264_opencl_motionsearch x264_template(opencl_motionsearch)
int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w );
/* Queue mode selection and cost summation for frames[b] with references
 * frames[p0] and frames[p1], plus the read-back of the results.
 * Returns 0 on success, -1 on an OpenCL failure. */
#define x264_opencl_finalize_cost x264_template(opencl_finalize_cost)
int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor );
/* Queue all work needed for the cost of (p0, p1, b) unless it is already
 * known. Returns 1 if work was queued, 0 otherwise. */
#define x264_opencl_precalculate_frame_cost x264_template(opencl_precalculate_frame_cost)
int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b );
/* Wait for the OpenCL queue to finish and complete all deferred copies from
 * the page-locked buffer to their host destinations. */
#define x264_opencl_flush x264_template(opencl_flush)
void x264_opencl_flush( x264_t *h );
/* If OpenCL is enabled: boost thread priorities on Windows, precompute the
 * intra costs of frames[0..num_frames] and, for trellis B-adapt, the motion
 * searches. */
#define x264_opencl_slicetype_prep x264_template(opencl_slicetype_prep)
void x264_opencl_slicetype_prep( x264_t *h, x264_frame_t **frames, int num_frames, int lambda );
/* Restore the thread priorities changed by x264_opencl_slicetype_prep()
 * (Windows only; a no-op elsewhere). */
#define x264_opencl_slicetype_end x264_template(opencl_slicetype_end)
void x264_opencl_slicetype_end( x264_t *h );
#endif

2036
encoder/slicetype.c Normal file

File diff suppressed because it is too large Load Diff