x264 source for verification 2026-05-22

2026-05-22 16:45:04 +08:00
commit 4647f166e5
270 changed files with 166522 additions and 0 deletions
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
--- a/encoder/analyse.h
+++ b/encoder/analyse.h
@@ -0,0 +1,55 @@
+/*****************************************************************************
+ * analyse.h: macroblock analysis
+ *****************************************************************************
+ * Copyright (C) 2003-2025 x264 project
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *          Loren Merritt <lorenm@u.washington.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_ENCODER_ANALYSE_H
+#define X264_ENCODER_ANALYSE_H
+
+/* Each prototype below is preceded by a #define that rewrites its plain name to
+ * x264_template(<name>), so the declaration is emitted under the templated name.
+ * NOTE(review): x264_template() is defined outside this header; presumably it
+ * selects a per-bit-depth symbol prefix -- confirm against its definition. */
+
+/* analyse_* / macroblock_analyse / slicetype_decide entry points.
+ * All of them take the encoder context h; only analyse_init_costs returns a value. */
+#define x264_analyse_init_costs x264_template(analyse_init_costs)
+int x264_analyse_init_costs( x264_t *h );
+#define x264_analyse_free_costs x264_template(analyse_free_costs)
+void x264_analyse_free_costs( x264_t *h );
+#define x264_analyse_weight_frame x264_template(analyse_weight_frame)
+void x264_analyse_weight_frame( x264_t *h, int end );
+#define x264_macroblock_analyse x264_template(macroblock_analyse)
+void x264_macroblock_analyse( x264_t *h );
+#define x264_slicetype_decide x264_template(slicetype_decide)
+void x264_slicetype_decide( x264_t *h );
+
+/* slicetype_analyse additionally takes an intra_minigop flag. */
+#define x264_slicetype_analyse x264_template(slicetype_analyse)
+void x264_slicetype_analyse( x264_t *h, int intra_minigop );
+
+/* lookahead_* interface: init and is_empty return int, the rest return nothing. */
+#define x264_lookahead_init x264_template(lookahead_init)
+int  x264_lookahead_init( x264_t *h, int i_slicetype_length );
+#define x264_lookahead_is_empty x264_template(lookahead_is_empty)
+int  x264_lookahead_is_empty( x264_t *h );
+#define x264_lookahead_put_frame x264_template(lookahead_put_frame)
+void x264_lookahead_put_frame( x264_t *h, x264_frame_t *frame );
+#define x264_lookahead_get_frames x264_template(lookahead_get_frames)
+void x264_lookahead_get_frames( x264_t *h );
+#define x264_lookahead_delete x264_template(lookahead_delete)
+void x264_lookahead_delete( x264_t *h );
+
+#endif /* X264_ENCODER_ANALYSE_H */
--- a/encoder/api.c
+++ b/encoder/api.c
@@ -0,0 +1,199 @@
+/*****************************************************************************
+ * api.c: bit depth independent interface
+ *****************************************************************************
+ * Copyright (C) 2003-2025 x264 project
+ *
+ * Authors: Vittorio Giovara <vittorio.giovara@gmail.com>
+ *          Luca Barbato <lu_zero@gentoo.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "common/base.h"
+
+/****************************************************************************
+ * global symbols
+ ****************************************************************************/
+/* Exported constant carrying the compile-time X264_CHROMA_FORMAT value. */
+const int x264_chroma_format = X264_CHROMA_FORMAT;
+
+/* Prototypes of the 8-bit implementation. They are declared locally rather than
+ * through a header and are only referenced inside the HAVE_BITDEPTH8 branch of
+ * x264_encoder_open(). Note that the depth-specific open takes an extra void *
+ * argument, which x264_encoder_open() fills with its dispatch record. */
+x264_t *x264_8_encoder_open( x264_param_t *, void * );
+void x264_8_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal );
+int  x264_8_encoder_reconfig( x264_t *, x264_param_t * );
+void x264_8_encoder_parameters( x264_t *, x264_param_t * );
+int  x264_8_encoder_headers( x264_t *, x264_nal_t **pp_nal, int *pi_nal );
+int  x264_8_encoder_encode( x264_t *, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_in, x264_picture_t *pic_out );
+void x264_8_encoder_close( x264_t * );
+int  x264_8_encoder_delayed_frames( x264_t * );
+int  x264_8_encoder_maximum_delayed_frames( x264_t * );
+void x264_8_encoder_intra_refresh( x264_t * );
+int  x264_8_encoder_invalidate_reference( x264_t *, int64_t pts );
+
+/* Prototypes of the 10-bit implementation; same shape as the 8-bit set above and
+ * only referenced inside the HAVE_BITDEPTH10 branch of x264_encoder_open(). */
+x264_t *x264_10_encoder_open( x264_param_t *, void * );
+void x264_10_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal );
+int  x264_10_encoder_reconfig( x264_t *, x264_param_t * );
+void x264_10_encoder_parameters( x264_t *, x264_param_t * );
+int  x264_10_encoder_headers( x264_t *, x264_nal_t **pp_nal, int *pi_nal );
+int  x264_10_encoder_encode( x264_t *, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_in, x264_picture_t *pic_out );
+void x264_10_encoder_close( x264_t * );
+int  x264_10_encoder_delayed_frames( x264_t * );
+int  x264_10_encoder_maximum_delayed_frames( x264_t * );
+void x264_10_encoder_intra_refresh( x264_t * );
+int  x264_10_encoder_invalidate_reference( x264_t *, int64_t pts );
+
+/* Dispatch record that stands behind the public x264_t handle in this file:
+ * x264_encoder_open() returns a pointer to one of these cast to x264_t *, and every
+ * public wrapper below casts the handle back and calls through the matching
+ * function pointer with the real encoder stored in `x264`. */
+typedef struct x264_api_t
+{
+    /* Internal reference to x264_t data */
+    x264_t *x264;
+
+    /* API entry points */
+    /* One pointer per public function; all are bound together for a single bit
+     * depth by x264_encoder_open(). */
+    void (*nal_encode)( x264_t *h, uint8_t *dst, x264_nal_t *nal );
+    int  (*encoder_reconfig)( x264_t *, x264_param_t * );
+    void (*encoder_parameters)( x264_t *, x264_param_t * );
+    int  (*encoder_headers)( x264_t *, x264_nal_t **pp_nal, int *pi_nal );
+    int  (*encoder_encode)( x264_t *, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_in, x264_picture_t *pic_out );
+    void (*encoder_close)( x264_t * );
+    int  (*encoder_delayed_frames)( x264_t * );
+    int  (*encoder_maximum_delayed_frames)( x264_t * );
+    void (*encoder_intra_refresh)( x264_t * );
+    int  (*encoder_invalidate_reference)( x264_t *, int64_t pts );
+} x264_api_t;
+
+/* Public constructor. Allocates the dispatch record, binds its entry points to the
+ * implementation matching param->i_bitdepth, and opens that implementation
+ * (passing the record itself as the second argument).
+ * Returns the record cast to x264_t * -- callers treat it as opaque -- or NULL when
+ * the allocation fails, the requested bit depth has no compiled-in implementation
+ * (an error is logged), or the depth-specific open leaves api->x264 NULL.
+ * NOTE(review): param is dereferenced without a NULL check. */
+REALIGN_STACK x264_t *x264_encoder_open( x264_param_t *param )
+{
+    x264_api_t *api = calloc( 1, sizeof( x264_api_t ) );
+    if( !api )
+        return NULL;
+
+#if HAVE_BITDEPTH8
+    if( param->i_bitdepth == 8 )
+    {
+        api->nal_encode = x264_8_nal_encode;
+        api->encoder_reconfig = x264_8_encoder_reconfig;
+        api->encoder_parameters = x264_8_encoder_parameters;
+        api->encoder_headers = x264_8_encoder_headers;
+        api->encoder_encode = x264_8_encoder_encode;
+        api->encoder_close = x264_8_encoder_close;
+        api->encoder_delayed_frames = x264_8_encoder_delayed_frames;
+        api->encoder_maximum_delayed_frames = x264_8_encoder_maximum_delayed_frames;
+        api->encoder_intra_refresh = x264_8_encoder_intra_refresh;
+        api->encoder_invalidate_reference = x264_8_encoder_invalidate_reference;
+
+        api->x264 = x264_8_encoder_open( param, api );
+    }
+    else
+#endif
+    /* The dangling `else` above binds to whatever statement the preprocessor leaves
+     * next: the 10-bit test when it is compiled in, otherwise the error log. */
+#if HAVE_BITDEPTH10
+    if( param->i_bitdepth == 10 )
+    {
+        api->nal_encode = x264_10_nal_encode;
+        api->encoder_reconfig = x264_10_encoder_reconfig;
+        api->encoder_parameters = x264_10_encoder_parameters;
+        api->encoder_headers = x264_10_encoder_headers;
+        api->encoder_encode = x264_10_encoder_encode;
+        api->encoder_close = x264_10_encoder_close;
+        api->encoder_delayed_frames = x264_10_encoder_delayed_frames;
+        api->encoder_maximum_delayed_frames = x264_10_encoder_maximum_delayed_frames;
+        api->encoder_intra_refresh = x264_10_encoder_intra_refresh;
+        api->encoder_invalidate_reference = x264_10_encoder_invalidate_reference;
+
+        api->x264 = x264_10_encoder_open( param, api );
+    }
+    else
+#endif
+        x264_log_internal( X264_LOG_ERROR, "not compiled with %d bit depth support\n", param->i_bitdepth );
+
+    /* calloc() zero-filled the record, so x264 is still unset when no branch ran;
+     * that case and a failed depth-specific open are both handled here. */
+    if( !api->x264 )
+    {
+        free( api );
+        return NULL;
+    }
+
+    /* x264_t is opaque */
+    return (x264_t *)api;
+}
+
+/* Closes the bit-depth-specific encoder held by the handle, then frees the
+ * dispatch record that x264_encoder_open() allocated. h must not be used afterwards. */
+REALIGN_STACK void x264_encoder_close( x264_t *h )
+{
+    x264_api_t *dispatch = (x264_api_t *)h;
+    x264_t *encoder = dispatch->x264;
+
+    dispatch->encoder_close( encoder );
+    free( dispatch );
+}
+
+/* Forwards to the nal_encode entry point bound for this handle, substituting the
+ * real encoder for the opaque handle; dst and nal are passed through unchanged. */
+REALIGN_STACK void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal )
+{
+    x264_api_t *dispatch = (x264_api_t *)h;
+    x264_t *encoder = dispatch->x264;
+
+    dispatch->nal_encode( encoder, dst, nal );
+}
+
+/* Forwards to the encoder_reconfig entry point bound for this handle and returns
+ * its result unchanged. */
+REALIGN_STACK int x264_encoder_reconfig( x264_t *h, x264_param_t *param)
+{
+    x264_api_t *dispatch = (x264_api_t *)h;
+    x264_t *encoder = dispatch->x264;
+
+    return dispatch->encoder_reconfig( encoder, param );
+}
+
+/* Forwards to the encoder_parameters entry point bound for this handle; param is
+ * handed to the implementation as-is. */
+REALIGN_STACK void x264_encoder_parameters( x264_t *h, x264_param_t *param )
+{
+    x264_api_t *dispatch = (x264_api_t *)h;
+    x264_t *encoder = dispatch->x264;
+
+    dispatch->encoder_parameters( encoder, param );
+}
+
+/* Forwards to the encoder_headers entry point bound for this handle and returns
+ * its result unchanged; pp_nal and pi_nal are passed straight through. */
+REALIGN_STACK int x264_encoder_headers( x264_t *h, x264_nal_t **pp_nal, int *pi_nal )
+{
+    x264_api_t *dispatch = (x264_api_t *)h;
+    x264_t *encoder = dispatch->x264;
+
+    return dispatch->encoder_headers( encoder, pp_nal, pi_nal );
+}
+
+/* Forwards to the encoder_encode entry point bound for this handle and returns its
+ * result unchanged; all NAL and picture arguments are passed straight through. */
+REALIGN_STACK int x264_encoder_encode( x264_t *h, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_in, x264_picture_t *pic_out )
+{
+    x264_api_t *dispatch = (x264_api_t *)h;
+    x264_t *encoder = dispatch->x264;
+
+    return dispatch->encoder_encode( encoder, pp_nal, pi_nal, pic_in, pic_out );
+}
+
+/* Forwards to the encoder_delayed_frames entry point bound for this handle and
+ * returns its result unchanged. */
+REALIGN_STACK int x264_encoder_delayed_frames( x264_t *h )
+{
+    x264_api_t *dispatch = (x264_api_t *)h;
+    x264_t *encoder = dispatch->x264;
+
+    return dispatch->encoder_delayed_frames( encoder );
+}
+
+/* Forwards to the encoder_maximum_delayed_frames entry point bound for this handle
+ * and returns its result unchanged. */
+REALIGN_STACK int x264_encoder_maximum_delayed_frames( x264_t *h )
+{
+    x264_api_t *dispatch = (x264_api_t *)h;
+    x264_t *encoder = dispatch->x264;
+
+    return dispatch->encoder_maximum_delayed_frames( encoder );
+}
+
+/* Forwards to the encoder_intra_refresh entry point bound for this handle. */
+REALIGN_STACK void x264_encoder_intra_refresh( x264_t *h )
+{
+    x264_api_t *dispatch = (x264_api_t *)h;
+    x264_t *encoder = dispatch->x264;
+
+    dispatch->encoder_intra_refresh( encoder );
+}
+
+/* Forwards to the encoder_invalidate_reference entry point bound for this handle
+ * and returns its result unchanged; pts is passed straight through. */
+REALIGN_STACK int x264_encoder_invalidate_reference( x264_t *h, int64_t pts )
+{
+    x264_api_t *dispatch = (x264_api_t *)h;
+    x264_t *encoder = dispatch->x264;
+
+    return dispatch->encoder_invalidate_reference( encoder, pts );
+}
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -0,0 +1,722 @@
+/*****************************************************************************
+ * cavlc.c: cavlc bitstream writing
+ *****************************************************************************
+ * Copyright (C) 2003-2025 x264 project
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *          Loren Merritt <lorenm@u.washington.edu>
+ *          Fiona Glaser <fiona@x264.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "macroblock.h"
+
+/* RDO_SKIP_BS defaults to 0 unless it was already defined before this point.
+ * When it is non-zero, the code in this file touches s->i_bits_encoded and leaves
+ * out the updates to h->mb.b_overflow, h->mb.i_qp, h->mb.field_decoding_flag and
+ * the h->stat.frame bit counters; the I_PCM path is compiled out as well. */
+#ifndef RDO_SKIP_BS
+#define RDO_SKIP_BS 0
+#endif
+
+/* [400,420][inter,intra] */
+/* Coded-block-pattern -> ue(v) code number. x264_macroblock_write_cavlc() looks it up
+ * as cbp_to_golomb[chroma][IS_INTRA(mb_type)][(i_cbp_chroma << 4) | i_cbp_luma]:
+ *   outer index 0: the caller's `chroma` flag is 0 (chroma format is neither 4:2:0
+ *                  nor 4:2:2); only the 16 luma-only patterns are filled in and the
+ *                  remaining 32 slots are zero-initialized;
+ *   outer index 1: 4:2:0 or 4:2:2, all 48 patterns are filled in;
+ *   middle index : 0 = inter, 1 = intra. */
+static const uint8_t cbp_to_golomb[2][2][48] =
+{
+    {{ 0,  1,  2,  5,  3,  6, 14, 10,  4, 15,  7, 11,  8, 12, 13,  9 },
+     { 1, 10, 11,  6, 12,  7, 14,  2, 13, 15,  8,  3,  9,  4,  5,  0 }},
+    {{ 0,  2,  3,  7,  4,  8, 17, 13,  5, 18,  9, 14, 10, 15, 16, 11,
+       1, 32, 33, 36, 34, 37, 44, 40, 35, 45, 38, 41, 39, 42, 43, 19,
+       6, 24, 25, 20, 26, 21, 46, 28, 27, 47, 22, 29, 23, 30, 31, 12 },
+     { 3, 29, 30, 17, 31, 18, 37,  8, 32, 38, 19,  9, 20, 10, 11,  2,
+      16, 33, 34, 21, 35, 22, 39,  4, 36, 40, 23,  5, 24,  6,  7,  1,
+      41, 42, 43, 25, 44, 26, 46, 12, 45, 47, 27, 13, 28, 14, 15,  0 }}
+};
+
+/* B macroblock type -> ue(v) code number. cavlc_mb_header_b() looks it up as
+ * mb_type_b_to_golomb[h->mb.i_partition - D_16x8][i_mb_type - B_L0_L0].
+ * The -1 initializers are stored as 255 because the element type is uint8_t.
+ * NOTE(review): presumably they mark partition/type pairs that are never looked
+ * up -- confirm against the mb_type enum order. */
+static const uint8_t mb_type_b_to_golomb[3][9]=
+{
+    { 4,  8, 12, 10,  6, 14, 16, 18, 20 }, /* D_16x8 */
+    { 5,  9, 13, 11,  7, 15, 17, 19, 21 }, /* D_8x16 */
+    { 1, -1, -1, -1,  2, -1, -1, -1,  3 }  /* D_16x16 */
+};
+
+/* P sub-macroblock type -> ue(v) code number, indexed directly by
+ * h->mb.i_sub_partition[i] in cavlc_mb_header_p(). */
+static const uint8_t subpartition_p_to_golomb[4]=
+{
+    3, 1, 2, 0
+};
+
+/* B sub-macroblock type -> ue(v) code number, indexed directly by
+ * h->mb.i_sub_partition[i] in cavlc_mb_header_b(). */
+static const uint8_t subpartition_b_to_golomb[13]=
+{
+    10,  4,  5,  1, 11,  6,  7,  2, 12,  8,  9,  3,  0
+};
+
+/* Writes one VLC table entry: (v).i_bits emitted with a width of (v).i_size bits.
+ * The argument v is expanded twice, so it must not have side effects. */
+#define bs_write_vlc(s,v) bs_write( s, (v).i_size, (v).i_bits )
+
+/****************************************************************************
+ * x264_cavlc_block_residual:
+ ****************************************************************************/
+/* Writes a single coefficient level that the caller did not take from the
+ * x264_level_token[] table, and returns the suffix length to use for the next level.
+ *   h                encoder context; bits go to h->out.bs
+ *   i_suffix_length  suffix length in effect for this level
+ *   level            signed level value to code
+ * If the code needs a prefix longer than 15 and the SPS profile is below
+ * PROFILE_HIGH, the level is not representable: with RDO_SKIP_BS a 2000-bit penalty
+ * is added to s->i_bits_encoded, otherwise h->mb.b_overflow is set. The escape is
+ * still written in that case, using only the low bits of the code. */
+static inline int cavlc_block_residual_escape( x264_t *h, int i_suffix_length, int level )
+{
+    bs_t *s = &h->out.bs;
+    /* Per-suffix-length magnitude thresholds; the 0xffff entry stops any further growth. */
+    static const uint16_t next_suffix[7] = { 0, 3, 6, 12, 24, 48, 0xffff };
+    int i_level_prefix = 15;
+    /* mask is 0 for level >= 0 and all ones for negative levels
+     * (relies on an arithmetic right shift of a 32-bit int). */
+    int mask = level >> 31;
+    int abs_level = (level^mask)-mask;
+    /* 2*|level|-2 for positive levels, 2*|level|-1 for negative ones. */
+    int i_level_code = abs_level*2-mask-2;
+    if( ( i_level_code >> i_suffix_length ) < 15 )
+    {
+        /* Prefix below 15: prefix, marker bit and suffix are emitted by one call,
+         * as the value (1<<suffix_length)+suffix in prefix+1+suffix_length bits. */
+        bs_write( s, (i_level_code >> i_suffix_length) + 1 + i_suffix_length,
+                 (1<<i_suffix_length) + (i_level_code & ((1<<i_suffix_length)-1)) );
+    }
+    else
+    {
+        i_level_code -= 15 << i_suffix_length;
+        if( i_suffix_length == 0 )
+            i_level_code -= 15;
+
+        /* If the prefix size exceeds 15, High Profile is required. */
+        if( i_level_code >= 1<<12 )
+        {
+            if( h->sps->i_profile_idc >= PROFILE_HIGH )
+            {
+                /* Grow the prefix until the remaining code fits in prefix-3 bits. */
+                while( i_level_code >= 1<<(i_level_prefix-3) )
+                {
+                    i_level_code -= 1<<(i_level_prefix-3);
+                    i_level_prefix++;
+                }
+            }
+            else
+            {
+#if RDO_SKIP_BS
+                /* Weight highly against overflows. */
+                s->i_bits_encoded += 2000;
+#else
+                /* We've had an overflow; note it down and re-encode the MB later. */
+                h->mb.b_overflow = 1;
+#endif
+            }
+        }
+        /* Escape prefix (value 1 in prefix+1 bits), then the code in prefix-3 bits. */
+        bs_write( s, i_level_prefix + 1, 1 );
+        bs_write( s, i_level_prefix - 3, i_level_code & ((1<<(i_level_prefix-3))-1) );
+    }
+    /* Suffix length update: it is at least 1 after any level, and grows by one
+     * when the magnitude exceeds the threshold for the current length. */
+    if( i_suffix_length == 0 )
+        i_suffix_length++;
+    if( abs_level > next_suffix[i_suffix_length] )
+        i_suffix_length++;
+    return i_suffix_length;
+}
+
+/* Writes one residual block: coeff_token, trailing-one signs, the remaining levels,
+ * total_zeros and the run_before codes, in that order.
+ *   h              encoder context; bits go to h->out.bs
+ *   ctx_block_cat  block category; selects the coeff_level_run routine and the
+ *                  maximum coefficient count used for the total_zeros decision
+ *   l              coefficient array handed to coeff_level_run
+ *   nC             coeff_token table selector
+ * Returns the number of coefficients reported by coeff_level_run.
+ * The x264_cavlc_block_residual() macro only calls this when the cached non-zero
+ * count of the block is non-zero; the code indexes tables with i_total-1. */
+static int cavlc_block_residual_internal( x264_t *h, int ctx_block_cat, dctcoef *l, int nC )
+{
+    bs_t *s = &h->out.bs;
+    /* Number of trailing zero bits of a 3-bit value, with 0 mapped to 3. */
+    static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0};
+    /* Coefficient-count limit per block category, compared against i_total below. */
+    static const uint8_t count_cat[14] = {16, 15, 16, 0, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64};
+    x264_run_level_t runlevel;
+    int i_total, i_trailing, i_total_zero, i_suffix_length;
+    unsigned int i_sign;
+
+    /* level and run and total */
+    i_total = h->quantf.coeff_level_run[ctx_block_cat]( l, &runlevel );
+    x264_prefetch( &x264_run_before[runlevel.mask] );
+    i_total_zero = runlevel.last + 1 - i_total;
+
+    /* branchless i_trailing calculation */
+    /* Two sentinel levels of 2 are stored past the real ones so that the three-entry
+     * test below never counts a slot beyond i_total as a +/-1 level. */
+    runlevel.level[i_total+0] = 2;
+    runlevel.level[i_total+1] = 2;
+    /* Bit k is set when |level[k]| > 1; ctz_index then yields how many of the first
+     * levels (at most 3) are +/-1 before the first larger one. */
+    i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1
+               | ((((runlevel.level[1]+1) | (1-runlevel.level[1])) >> 31) & 2)
+               | ((((runlevel.level[2]+1) | (1-runlevel.level[2])) >> 31) & 4);
+    i_trailing = ctz_index[i_trailing];
+    /* Sign bits of the first three levels with level[0] in the top position; the
+     * shift keeps only the i_trailing signs that are actually written. */
+    i_sign = ((runlevel.level[2] >> 31) & 1)
+           | ((runlevel.level[1] >> 31) & 2)
+           | ((runlevel.level[0] >> 31) & 4);
+    i_sign >>= 3-i_trailing;
+
+    /* total/trailing */
+    bs_write_vlc( s, x264_coeff_token[nC][i_total-1][i_trailing] );
+
+    /* Initial suffix length is 1 only for more than 10 coefficients with fewer
+     * than 3 trailing ones, otherwise 0. */
+    i_suffix_length = i_total > 10 && i_trailing < 3;
+    bs_write( s, i_trailing, i_sign );
+
+    if( i_trailing < i_total )
+    {
+        /* First level after the trailing ones: val is the value that selects the
+         * code bits (magnitude reduced by one when i_trailing < 3), val_original
+         * is the unmodified value used for the range test and the .i_next lookup. */
+        int val = runlevel.level[i_trailing];
+        int val_original = runlevel.level[i_trailing]+LEVEL_TABLE_SIZE/2;
+        val -= ((val>>31)|1) & -(i_trailing < 3); /* as runlevel.level[i] can't be 1 for the first one if i_trailing < 3 */
+        val += LEVEL_TABLE_SIZE/2;
+
+        if( (unsigned)val_original < LEVEL_TABLE_SIZE )
+        {
+            bs_write_vlc( s, x264_level_token[i_suffix_length][val] );
+            i_suffix_length = x264_level_token[i_suffix_length][val_original].i_next;
+        }
+        else
+            i_suffix_length = cavlc_block_residual_escape( h, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
+        /* Remaining levels: table lookup when the biased value is inside the table,
+         * escape coding otherwise; each step also yields the next suffix length. */
+        for( int i = i_trailing+1; i < i_total; i++ )
+        {
+            val = runlevel.level[i] + LEVEL_TABLE_SIZE/2;
+            if( (unsigned)val < LEVEL_TABLE_SIZE )
+            {
+                bs_write_vlc( s, x264_level_token[i_suffix_length][val] );
+                i_suffix_length = x264_level_token[i_suffix_length][val].i_next;
+            }
+            else
+                i_suffix_length = cavlc_block_residual_escape( h, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
+        }
+    }
+
+    /* total_zeros is only written when the block is not full. Chroma DC uses its
+     * own limit (8>>CHROMA_V_SHIFT) and a table picked by the chroma format. */
+    if( ctx_block_cat == DCT_CHROMA_DC )
+    {
+        if( i_total < 8>>CHROMA_V_SHIFT )
+        {
+            vlc_t total_zeros = CHROMA_FORMAT == CHROMA_420 ? x264_total_zeros_2x2_dc[i_total-1][i_total_zero]
+                                                            : x264_total_zeros_2x4_dc[i_total-1][i_total_zero];
+            bs_write_vlc( s, total_zeros );
+        }
+    }
+    else if( (uint8_t)i_total < count_cat[ctx_block_cat] )
+        bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );
+
+    /* x264_run_before[] packs the run_before codes for this mask into one word:
+     * the low 5 bits hold the bit count, the bits above them hold the code bits. */
+    int zero_run_code = x264_run_before[runlevel.mask];
+    bs_write( s, zero_run_code&0x1f, zero_run_code>>5 );
+
+    return i_total;
+}
+
+/* Maps a predicted non-zero count (0..16) to a coeff_token table selector (0..3). */
+static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
+
+/* Writes residual block `idx` of category `cat` with coefficients `l`.
+ * nC is 5 - CHROMA_V_SHIFT for chroma DC, otherwise ct_index[] of the value returned
+ * by x264_mb_predict_non_zero_code() (for luma DC the index passed to it is
+ * (idx - LUMA_DC)*16). When the cached non-zero count of the block is zero only
+ * x264_coeff0_token[nC] is written; otherwise the block is coded and the cache entry
+ * is overwritten with the count returned by cavlc_block_residual_internal().
+ * The macro expands to a braced block that declares locals named nC and nnz and
+ * expands cat and idx more than once, so pass side-effect-free arguments and do
+ * not pass variables named nC or nnz. */
+#define x264_cavlc_block_residual(h,cat,idx,l)\
+{\
+    int nC = cat == DCT_CHROMA_DC ? 5 - CHROMA_V_SHIFT\
+                                  : ct_index[x264_mb_predict_non_zero_code( h, cat == DCT_LUMA_DC ? (idx - LUMA_DC)*16 : idx )];\
+    uint8_t *nnz = &h->mb.cache.non_zero_count[x264_scan8[idx]];\
+    if( !*nnz )\
+        bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] );\
+    else\
+        *nnz = cavlc_block_residual_internal(h,cat,l,nC);\
+}
+
+/* Writes the QP delta of the current macroblock (h->mb.i_qp relative to
+ * h->mb.i_last_qp) as a signed Exp-Golomb code, after folding it into the range
+ * [-(QP_MAX_SPEC+1)/2, QP_MAX_SPEC/2]. */
+static void cavlc_qp_delta( x264_t *h )
+{
+    bs_t *bs = &h->out.bs;
+    int delta = h->mb.i_qp - h->mb.i_last_qp;
+
+    /* An I16x16 macroblock with no coded coefficients at all (e.g. a completely flat
+     * background area) does not need its own quantizer. */
+    int b_empty_i16x16 = h->mb.i_type == I_16x16
+                      && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma)
+                      && !h->mb.cache.non_zero_count[x264_scan8[LUMA_DC]]
+                      && !h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]]
+                      && !h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]];
+
+    /* Only suppress the delta when it would have raised the quantizer; keeping a
+     * lower one avoids unexpected deblocking artifacts. */
+    if( b_empty_i16x16 && h->mb.i_qp > h->mb.i_last_qp )
+    {
+#if !RDO_SKIP_BS
+        h->mb.i_qp = h->mb.i_last_qp;
+#endif
+        delta = 0;
+    }
+
+    /* Wrap a non-zero delta around by QP_MAX_SPEC+1 when it falls outside the range. */
+    if( delta != 0 )
+    {
+        if( delta < -(QP_MAX_SPEC+1)/2 )
+            delta += QP_MAX_SPEC+1;
+        else if( delta > QP_MAX_SPEC/2 )
+            delta -= QP_MAX_SPEC+1;
+    }
+
+    bs_write_se( bs, delta );
+}
+
+/* Writes the motion vector difference of partition `idx` in list `i_list`: the
+ * cached vector minus the prediction from x264_mb_predict_mv(), component 0 first,
+ * then component 1, each as a signed Exp-Golomb code. `width` is forwarded to the
+ * predictor unchanged. */
+static void cavlc_mvd( x264_t *h, int i_list, int idx, int width )
+{
+    bs_t *bs = &h->out.bs;
+    ALIGNED_4( int16_t mvp[2] );
+
+    x264_mb_predict_mv( h, i_list, idx, width, mvp );
+
+    for( int c = 0; c < 2; c++ )
+        bs_write_se( bs, h->mb.cache.mv[i_list][x264_scan8[idx]][c] - mvp[c] );
+}
+
+/* Writes the list-0 motion vector differences of 8x8 block `i` according to its
+ * sub-partition type: one vector for 8x8, two for 8x4 (blocks +0 and +2) and 4x8
+ * (blocks +0 and +1), four for 4x4. Any other sub-partition type writes nothing. */
+static inline void cavlc_8x8_mvd( x264_t *h, int i )
+{
+    const int sub_type = h->mb.i_sub_partition[i];
+    const int base = 4*i;
+
+    if( sub_type == D_L0_8x8 )
+    {
+        cavlc_mvd( h, 0, base, 2 );
+    }
+    else if( sub_type == D_L0_8x4 )
+    {
+        /* two rows, each 2 blocks wide */
+        for( int k = 0; k < 4; k += 2 )
+            cavlc_mvd( h, 0, base+k, 2 );
+    }
+    else if( sub_type == D_L0_4x8 )
+    {
+        /* two columns, each 1 block wide */
+        for( int k = 0; k < 2; k++ )
+            cavlc_mvd( h, 0, base+k, 1 );
+    }
+    else if( sub_type == D_L0_4x4 )
+    {
+        for( int k = 0; k < 4; k++ )
+            cavlc_mvd( h, 0, base+k, 1 );
+    }
+}
+
+/* Writes the 4x4 luma residual blocks of the macroblock for `plane_count` planes.
+ * With the 8x8 transform, every 8x8 block whose cached non-zero count is set is
+ * first split into four 4x4 coefficient lists by interleave_8x8_cavlc(). Then, per
+ * plane, the four 4x4 blocks of each 8x8 block selected by h->mb.i_cbp_luma are
+ * written. */
+static ALWAYS_INLINE void cavlc_macroblock_luma_residual( x264_t *h, int plane_count )
+{
+    if( h->mb.b_transform_8x8 )
+    {
+        /* shuffle 8x8 dct coeffs into 4x4 lists */
+        for( int p = 0; p < plane_count; p++ )
+            for( int i8 = 0; i8 < 4; i8++ )
+            {
+                const int first = p*16+i8*4;
+                uint8_t *count8 = &h->mb.cache.non_zero_count[x264_scan8[first]];
+
+                if( *count8 )
+                    h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[first], h->dct.luma8x8[p*4+i8], count8 );
+            }
+    }
+
+    for( int p = 0; p < plane_count; p++ )
+        FOREACH_BIT( i8, 0, h->mb.i_cbp_luma )
+        {
+            const int first = i8*4+p*16;
+
+            for( int blk = first; blk < first+4; blk++ )
+                x264_cavlc_block_residual( h, DCT_LUMA_4x4, blk, h->dct.luma4x4[blk] );
+        }
+}
+
+#if RDO_SKIP_BS
+/* Writes the luma residual of a single 8x8 block `i8` in plane `p`. With the 8x8
+ * transform and a non-zero cached count, the 8x8 coefficients are first split into
+ * 4x4 lists; the four 4x4 blocks are then written only if bit i8 of
+ * h->mb.i_cbp_luma is set. */
+static ALWAYS_INLINE void cavlc_partition_luma_residual( x264_t *h, int i8, int p )
+{
+    const int first = i8*4+p*16;
+    uint8_t *count8 = &h->mb.cache.non_zero_count[x264_scan8[first]];
+
+    if( h->mb.b_transform_8x8 && *count8 )
+        h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[first], h->dct.luma8x8[i8+p*4], count8 );
+
+    if( !(h->mb.i_cbp_luma & (1 << i8)) )
+        return;
+
+    for( int blk = first; blk < first+4; blk++ )
+        x264_cavlc_block_residual( h, DCT_LUMA_4x4, blk, h->dct.luma4x4[blk] );
+}
+#endif
+
+/* Writes the macroblock type and intra prediction modes of an intra macroblock.
+ *   i_mb_type      I_16x16, or otherwise treated as I_4x4 / I_8x8
+ *   i_mb_i_offset  value added to the mb_type code number (the callers pass 0, 5
+ *                  or 23 depending on the slice type)
+ *   chroma         when non-zero the chroma prediction mode is written as well */
+static void cavlc_mb_header_i( x264_t *h, int i_mb_type, int i_mb_i_offset, int chroma )
+{
+    bs_t *bs = &h->out.bs;
+
+    if( i_mb_type != I_16x16 )
+    {
+        /* I_4x4 or I_8x8: one mode per 4x4 block, or per 8x8 block (every 4th index) */
+        const int step = i_mb_type == I_8x8 ? 4 : 1;
+
+        bs_write_ue( bs, i_mb_i_offset + 0 );
+        if( h->pps->b_transform_8x8_mode )
+            bs_write1( bs, h->mb.b_transform_8x8 );
+
+        /* Prediction: Luma */
+        for( int blk = 0; blk < 16; blk += step )
+        {
+            int predicted = x264_mb_predict_intra4x4_mode( h, blk );
+            int actual = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[x264_scan8[blk]] );
+
+            if( predicted != actual )
+                /* 4 bits: a leading 0 flag plus the mode, skipping over the predicted one */
+                bs_write( bs, 4, actual - (actual > predicted) );
+            else
+                bs_write1( bs, 1 );  /* b_prev_intra4x4_pred_mode */
+        }
+    }
+    else
+    {
+        /* I_16x16: prediction mode and both cbp values are folded into mb_type */
+        bs_write_ue( bs, i_mb_i_offset + 1 + x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode] +
+                         h->mb.i_cbp_chroma * 4 + ( h->mb.i_cbp_luma ? 12 : 0 ) );
+    }
+
+    if( chroma )
+        bs_write_ue( bs, x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode] );
+}
+
+/* Writes the macroblock header of a macroblock in a P slice: mb_type, then reference
+ * indices (only when more than one list-0 reference is available), then motion
+ * vector differences.
+ *   i_mb_type  P_L0 (16x16 / 16x8 / 8x16 by h->mb.i_partition), P_8x8, or anything
+ *              else, which is handed to cavlc_mb_header_i() with offset 5
+ *   chroma     forwarded to cavlc_mb_header_i() */
+static ALWAYS_INLINE void cavlc_mb_header_p( x264_t *h, int i_mb_type, int chroma )
+{
+    bs_t *s = &h->out.bs;
+    if( i_mb_type == P_L0 )
+    {
+        if( h->mb.i_partition == D_16x16 )
+        {
+            /* A single 1 bit, which is the ue(v) encoding of code number 0. */
+            bs_write1( s, 1 );
+
+            if( h->mb.pic.i_fref[0] > 1 )
+                bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
+            cavlc_mvd( h, 0, 0, 4 );
+        }
+        else if( h->mb.i_partition == D_16x8 )
+        {
+            bs_write_ue( s, 1 );
+            /* Both reference indices come before both vectors. */
+            if( h->mb.pic.i_fref[0] > 1 )
+            {
+                bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
+                bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
+            }
+            cavlc_mvd( h, 0, 0, 4 );
+            cavlc_mvd( h, 0, 8, 4 );
+        }
+        else if( h->mb.i_partition == D_8x16 )
+        {
+            bs_write_ue( s, 2 );
+            if( h->mb.pic.i_fref[0] > 1 )
+            {
+                bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
+                bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
+            }
+            cavlc_mvd( h, 0, 0, 2 );
+            cavlc_mvd( h, 0, 4, 2 );
+        }
+    }
+    else if( i_mb_type == P_8x8 )
+    {
+        /* Code number 4 is used when all four 8x8 blocks reference index 0, in which
+         * case no reference indices are written; otherwise code number 3 is used and
+         * the four indices follow the sub-macroblock types. */
+        int b_sub_ref;
+        if( (h->mb.cache.ref[0][x264_scan8[0]] | h->mb.cache.ref[0][x264_scan8[ 4]] |
+             h->mb.cache.ref[0][x264_scan8[8]] | h->mb.cache.ref[0][x264_scan8[12]]) == 0 )
+        {
+            bs_write_ue( s, 4 );
+            b_sub_ref = 0;
+        }
+        else
+        {
+            bs_write_ue( s, 3 );
+            b_sub_ref = 1;
+        }
+
+        /* sub mb type */
+        /* Without sub-8x8 analysis, four 1 bits are written in one call, i.e. four
+         * ue(v) codes of value 0. NOTE(review): presumably every sub-partition is
+         * 8x8 in that configuration -- confirm. */
+        if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
+            for( int i = 0; i < 4; i++ )
+                bs_write_ue( s, subpartition_p_to_golomb[ h->mb.i_sub_partition[i] ] );
+        else
+            bs_write( s, 4, 0xf );
+
+        /* ref0 */
+        if( b_sub_ref )
+        {
+            bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
+            bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
+            bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
+            bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[12]] );
+        }
+
+        for( int i = 0; i < 4; i++ )
+            cavlc_8x8_mvd( h, i );
+    }
+    else //if( IS_INTRA( i_mb_type ) )
+        cavlc_mb_header_i( h, i_mb_type, 5, chroma );
+}
+
+/* Writes the macroblock header of a macroblock in a B slice: mb_type, then
+ * reference indices (list 0 before list 1, each only when that list has more than
+ * one reference), then motion vector differences (list 0 before list 1).
+ *   i_mb_type  B_8x8, a type in [B_L0_L0, B_BI_BI], B_DIRECT, or anything else,
+ *              which is handed to cavlc_mb_header_i() with offset 23
+ *   chroma     forwarded to cavlc_mb_header_i() */
+static ALWAYS_INLINE void cavlc_mb_header_b( x264_t *h, int i_mb_type, int chroma )
+{
+    bs_t *s = &h->out.bs;
+    if( i_mb_type == B_8x8 )
+    {
+        bs_write_ue( s, 22 );
+
+        /* sub mb type */
+        for( int i = 0; i < 4; i++ )
+            bs_write_ue( s, subpartition_b_to_golomb[ h->mb.i_sub_partition[i] ] );
+
+        /* ref */
+        /* x264_mb_partition_listX_table[list][sub_type] gates whether the given
+         * list contributes a reference index / vector for that 8x8 block. */
+        if( h->mb.pic.i_fref[0] > 1 )
+            for( int i = 0; i < 4; i++ )
+                if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
+                    bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[i*4]] );
+        if( h->mb.pic.i_fref[1] > 1 )
+            for( int i = 0; i < 4; i++ )
+                if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
+                    bs_write_te( s, h->mb.pic.i_fref[1] - 1, h->mb.cache.ref[1][x264_scan8[i*4]] );
+
+        /* mvd */
+        /* Every 8x8 block is written with a single vector of width 2 per list. */
+        for( int i = 0; i < 4; i++ )
+            if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
+                cavlc_mvd( h, 0, 4*i, 2 );
+        for( int i = 0; i < 4; i++ )
+            if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
+                cavlc_mvd( h, 1, 4*i, 2 );
+    }
+    else if( i_mb_type >= B_L0_L0 && i_mb_type <= B_BI_BI )
+    {
+        /* All B mode */
+        /* Motion Vector */
+        /* b_list[list][part] gates whether `list` is written for partition `part`
+         * (0 = first, 1 = second). */
+        const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
+        const int i_ref0_max = h->mb.pic.i_fref[0] - 1;
+        const int i_ref1_max = h->mb.pic.i_fref[1] - 1;
+
+        bs_write_ue( s, mb_type_b_to_golomb[ h->mb.i_partition - D_16x8 ][ i_mb_type - B_L0_L0 ] );
+        if( h->mb.i_partition == D_16x16 )
+        {
+            if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[0]] );
+            if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[0]] );
+            if( b_list[0][0] ) cavlc_mvd( h, 0, 0, 4 );
+            if( b_list[1][0] ) cavlc_mvd( h, 1, 0, 4 );
+        }
+        else
+        {
+            /* The second partition's reference is read at block 12 for both splits.
+             * NOTE(review): presumably block 12 lies inside the second partition of
+             * 16x8 as well as 8x16 -- confirm against the x264_scan8 layout. */
+            if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[ 0]] );
+            if( i_ref0_max && b_list[0][1] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[12]] );
+            if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[ 0]] );
+            if( i_ref1_max && b_list[1][1] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[12]] );
+            if( h->mb.i_partition == D_16x8 )
+            {
+                if( b_list[0][0] ) cavlc_mvd( h, 0, 0, 4 );
+                if( b_list[0][1] ) cavlc_mvd( h, 0, 8, 4 );
+                if( b_list[1][0] ) cavlc_mvd( h, 1, 0, 4 );
+                if( b_list[1][1] ) cavlc_mvd( h, 1, 8, 4 );
+            }
+            else //if( h->mb.i_partition == D_8x16 )
+            {
+                if( b_list[0][0] ) cavlc_mvd( h, 0, 0, 2 );
+                if( b_list[0][1] ) cavlc_mvd( h, 0, 4, 2 );
+                if( b_list[1][0] ) cavlc_mvd( h, 1, 0, 2 );
+                if( b_list[1][1] ) cavlc_mvd( h, 1, 4, 2 );
+            }
+        }
+    }
+    else if( i_mb_type == B_DIRECT )
+        /* A single 1 bit, which is the ue(v) encoding of code number 0. */
+        bs_write1( s, 1 );
+    else //if( IS_INTRA( i_mb_type ) )
+        cavlc_mb_header_i( h, i_mb_type, 23, chroma );
+}
+
+/*****************************************************************************
+ * x264_macroblock_write:
+ *****************************************************************************/
+/* Writes the current macroblock (h->mb) to h->out.bs: optional MBAFF field flag,
+ * macroblock header for the slice type, coded block pattern, 8x8 transform flag,
+ * QP delta and the luma / chroma residual blocks. I_PCM macroblocks are written as
+ * raw samples instead.
+ * Without RDO_SKIP_BS the header and texture bit counts are added to
+ * h->stat.frame.i_mv_bits and h->stat.frame.i_tex_bits. With RDO_SKIP_BS,
+ * s->i_bits_encoded is reset first, no statistics are touched and the I_PCM path
+ * is compiled out. */
+void x264_macroblock_write_cavlc( x264_t *h )
+{
+    bs_t *s = &h->out.bs;
+    const int i_mb_type = h->mb.i_type;
+    /* 4:4:4 codes all three planes through the luma path; separate chroma syntax is
+     * only written for 4:2:0 and 4:2:2. */
+    int plane_count = CHROMA444 ? 3 : 1;
+    int chroma = CHROMA_FORMAT == CHROMA_420 || CHROMA_FORMAT == CHROMA_422;
+
+#if RDO_SKIP_BS
+    s->i_bits_encoded = 0;
+#else
+    const int i_mb_pos_start = bs_pos( s );
+    int       i_mb_pos_tex;
+#endif
+
+    /* In MBAFF slices the field flag is written for macroblocks in even rows, and
+     * for odd rows when the macroblock one stride earlier is a skip. */
+    if( SLICE_MBAFF
+        && (!(h->mb.i_mb_y & 1) || IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride])) )
+    {
+        bs_write1( s, MB_INTERLACED );
+#if !RDO_SKIP_BS
+        h->mb.field_decoding_flag = MB_INTERLACED;
+#endif
+    }
+
+#if !RDO_SKIP_BS
+    if( i_mb_type == I_PCM )
+    {
+        /* Per-slice-type mb_type offsets, indexed by h->sh.i_type; the values 5 and
+         * 23 match the offsets the P and B header writers pass for intra types.
+         * NOTE(review): presumably the slice type enum is ordered P, B, I -- confirm. */
+        static const uint8_t i_offsets[3] = {5,23,0};
+        uint8_t *p_start = s->p_start;
+        bs_write_ue( s, i_offsets[h->sh.i_type] + 25 );
+        i_mb_pos_tex = bs_pos( s );
+        h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
+
+        bs_align_0( s );
+
+        /* Raw samples, BIT_DEPTH bits each: 256 per luma-path plane, then for each
+         * chroma plane 8 samples per row for 16>>CHROMA_V_SHIFT rows. */
+        for( int p = 0; p < plane_count; p++ )
+            for( int i = 0; i < 256; i++ )
+                bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[p][i] );
+        if( chroma )
+            for( int ch = 1; ch < 3; ch++ )
+                for( int i = 0; i < 16>>CHROMA_V_SHIFT; i++ )
+                    for( int j = 0; j < 8; j++ )
+                        bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] );
+
+        /* Re-initialize the writer at the current position, then restore the saved
+         * start pointer so that bs_pos() keeps counting from the original start.
+         * NOTE(review): presumably done to reset the writer's internal state after
+         * the raw samples -- confirm against bs_init(). */
+        bs_init( s, s->p, s->p_end - s->p );
+        s->p_start = p_start;
+
+        h->stat.frame.i_tex_bits += bs_pos(s) - i_mb_pos_tex;
+        return;
+    }
+#endif
+
+    if( h->sh.i_type == SLICE_TYPE_P )
+        cavlc_mb_header_p( h, i_mb_type, chroma );
+    else if( h->sh.i_type == SLICE_TYPE_B )
+        cavlc_mb_header_b( h, i_mb_type, chroma );
+    else //if( h->sh.i_type == SLICE_TYPE_I )
+        cavlc_mb_header_i( h, i_mb_type, 0, chroma );
+
+#if !RDO_SKIP_BS
+    /* Everything written so far is accounted as header bits, the rest as texture. */
+    i_mb_pos_tex = bs_pos( s );
+    h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
+#endif
+
+    /* Coded block pattern */
+    /* Not written for I_16x16, whose mb_type already carries both cbp values. */
+    if( i_mb_type != I_16x16 )
+        bs_write_ue( s, cbp_to_golomb[chroma][IS_INTRA(i_mb_type)][(h->mb.i_cbp_chroma << 4)|h->mb.i_cbp_luma] );
+
+    /* transform size 8x8 flag */
+    if( x264_mb_transform_8x8_allowed( h ) && h->mb.i_cbp_luma )
+        bs_write1( s, h->mb.b_transform_8x8 );
+
+    if( i_mb_type == I_16x16 )
+    {
+        cavlc_qp_delta( h );
+
+        /* DC Luma */
+        for( int p = 0; p < plane_count; p++ )
+        {
+            x264_cavlc_block_residual( h, DCT_LUMA_DC, LUMA_DC+p, h->dct.luma16x16_dc[p] );
+
+            /* AC Luma */
+            /* The +1 skips the first coefficient of each 4x4 block. */
+            if( h->mb.i_cbp_luma )
+                for( int i = p*16; i < p*16+16; i++ )
+                    x264_cavlc_block_residual( h, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1 );
+        }
+    }
+    else if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma )
+    {
+        /* The QP delta is only written when some residual is coded. */
+        cavlc_qp_delta( h );
+        cavlc_macroblock_luma_residual( h, plane_count );
+    }
+    if( h->mb.i_cbp_chroma )
+    {
+        /* Chroma DC residual present */
+        x264_cavlc_block_residual( h, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0] );
+        x264_cavlc_block_residual( h, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1] );
+        if( h->mb.i_cbp_chroma == 2 ) /* Chroma AC residual present */
+        {
+            /* Chroma AC blocks live at indices 16..47 of luma4x4[]; groups of four
+             * consecutive blocks are written, one group every `step` indices. */
+            int step = 8 << CHROMA_V_SHIFT;
+            for( int i = 16; i < 3*16; i += step )
+                for( int j = i; j < i+4; j++ )
+                    x264_cavlc_block_residual( h, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1 );
+        }
+    }
+
+#if !RDO_SKIP_BS
+    h->stat.frame.i_tex_bits += bs_pos(s) - i_mb_pos_tex;
+#endif
+}
+
+#if RDO_SKIP_BS
+/*****************************************************************************
+ * RD only; doesn't generate a valid bitstream
+ * doesn't write cbp or chroma dc (I don't know how much this matters)
+ * doesn't write ref (never varies between calls, so no point in doing so)
+ * only writes subpartition for p8x8, needed for sub-8x8 mode decision RDO
+ * works on all partition sizes except 16x16
+ *****************************************************************************/
+static int partition_size_cavlc( x264_t *h, int i8, int i_pixel )
+{
+    /* Clears the bit counter, accounts for the motion vector data of the
+     * partition starting at 8x8 block i8, then for the residual of every
+     * 8x8 block the partition covers.  Returns the accumulated counter. */
+    bs_t *s = &h->out.bs;
+    const int i_mb_type = h->mb.i_type;
+    const int n_planes = CHROMA444 ? 3 : 1;
+    /* width argument handed to cavlc_mvd: halved for 8x16 partitions */
+    const int mvd_width = h->mb.i_partition == D_8x16 ? 2 : 4;
+
+    s->i_bits_encoded = 0;
+
+    if( i_mb_type == P_8x8 )
+    {
+        cavlc_8x8_mvd( h, i8 );
+        bs_write_ue( s, subpartition_p_to_golomb[ h->mb.i_sub_partition[i8] ] );
+    }
+    else if( i_mb_type == P_L0 )
+        cavlc_mvd( h, 0, 4*i8, mvd_width );
+    else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
+    {
+        /* one MVD per list that the type table flags for this partition */
+        for( int i_list = 0; i_list < 2; i_list++ )
+            if( x264_mb_type_list_table[ i_mb_type ][i_list][!!i8] )
+                cavlc_mvd( h, i_list, 4*i8, mvd_width );
+    }
+    else //if( i_mb_type == B_8x8 )
+    {
+        const int i_sub = h->mb.i_sub_partition[i8];
+        for( int i_list = 0; i_list < 2; i_list++ )
+            if( x264_mb_partition_listX_table[i_list][i_sub] )
+                cavlc_mvd( h, i_list, 4*i8, 2 );
+    }
+
+    /* Partitions larger than 8x8 span two 8x8 blocks; i8 advances by the
+     * partition height in 8-pixel units between them. */
+    const int n_blocks = 1 + (i_pixel < PIXEL_8x8);
+    const int i8_step = x264_pixel_size[i_pixel].h >> 3;
+    for( int n = 0; n < n_blocks; n++, i8 += i8_step )
+    {
+        for( int p = 0; p < n_planes; p++ )
+            cavlc_partition_luma_residual( h, i8, p );
+
+        if( !h->mb.i_cbp_chroma )
+            continue;
+
+        if( CHROMA_FORMAT == CHROMA_422 )
+        {
+            const int offset = (5*i8) & 0x09;
+            for( int base = 16; base <= 32; base += 16 )
+                for( int row = 0; row <= 2; row += 2 )
+                {
+                    const int blk = base + row + offset;
+                    x264_cavlc_block_residual( h, DCT_CHROMA_AC, blk, h->dct.luma4x4[blk]+1 );
+                }
+        }
+        else
+        {
+            for( int base = 16; base <= 32; base += 16 )
+            {
+                const int blk = base + i8;
+                x264_cavlc_block_residual( h, DCT_CHROMA_AC, blk, h->dct.luma4x4[blk]+1 );
+            }
+        }
+    }
+
+    return s->i_bits_encoded;
+}
+
+static int subpartition_size_cavlc( x264_t *h, int i4, int i_pixel )
+{
+    /* Clears the bit counter, then accounts for the list-0 MVD and the luma
+     * residual of the sub-partition at 4x4 block i4.  Sub-partitions other
+     * than 4x4 cover a second block: i4+1 for 8x4, i4+2 otherwise. */
+    const int n_planes = CHROMA444 ? 3 : 1;
+    const int b_8x4 = i_pixel == PIXEL_8x4;
+    const int b_second_block = i_pixel != PIXEL_4x4;
+    const int i4_second = i4 + 2 - b_8x4;
+
+    h->out.bs.i_bits_encoded = 0;
+    cavlc_mvd( h, 0, i4, 1+b_8x4 );
+
+    for( int p = 0; p < n_planes; p++ )
+    {
+        const int blk0 = p*16 + i4;
+        x264_cavlc_block_residual( h, DCT_LUMA_4x4, blk0, h->dct.luma4x4[blk0] );
+        if( b_second_block )
+        {
+            const int blk1 = p*16 + i4_second;
+            x264_cavlc_block_residual( h, DCT_LUMA_4x4, blk1, h->dct.luma4x4[blk1] );
+        }
+    }
+
+    return h->out.bs.i_bits_encoded;
+}
+
+static int cavlc_intra4x4_pred_size( x264_t *h, int i4, int i_mode )
+{
+    /* 1 when the requested mode equals the predicted mode, 4 otherwise */
+    const int i_predicted = x264_mb_predict_intra4x4_mode( h, i4 );
+    const int i_requested = x264_mb_pred_mode4x4_fix( i_mode );
+    return i_predicted == i_requested ? 1 : 4;
+}
+
+static int partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode )
+{
+    /* Counter starts at the prediction-mode cost, then the cbp codeword and
+     * the luma residual of 8x8 block i8 are added for each coded plane. */
+    bs_t *s = &h->out.bs;
+    const int n_planes = CHROMA444 ? 3 : 1;
+
+    s->i_bits_encoded = cavlc_intra4x4_pred_size( h, 4*i8, i_mode );
+
+    const int i_cbp = (h->mb.i_cbp_chroma << 4)|h->mb.i_cbp_luma;
+    bs_write_ue( s, cbp_to_golomb[!CHROMA444][1][i_cbp] );
+
+    for( int p = 0; p < n_planes; p++ )
+        cavlc_partition_luma_residual( h, i8, p );
+
+    return s->i_bits_encoded;
+}
+
+static int partition_i4x4_size_cavlc( x264_t *h, int i4, int i_mode )
+{
+    /* Counter starts at the prediction-mode cost; the residual of 4x4 block
+     * i4 is then added for each coded plane. */
+    const int n_planes = CHROMA444 ? 3 : 1;
+
+    h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, i4, i_mode );
+    for( int p = 0; p < n_planes; p++ )
+    {
+        const int blk = p*16 + i4;
+        x264_cavlc_block_residual( h, DCT_LUMA_4x4, blk, h->dct.luma4x4[blk] );
+    }
+
+    return h->out.bs.i_bits_encoded;
+}
+
+static int chroma_size_cavlc( x264_t *h )
+{
+    /* Counter starts at the size of the chroma prediction mode codeword;
+     * chroma DC (and AC when i_cbp_chroma == 2) residuals are added on top. */
+    bs_t *s = &h->out.bs;
+
+    s->i_bits_encoded = bs_size_ue( x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode] );
+    if( !h->mb.i_cbp_chroma )
+        return s->i_bits_encoded;
+
+    for( int ch = 0; ch < 2; ch++ )
+    {
+        x264_cavlc_block_residual( h, DCT_CHROMA_DC, CHROMA_DC+ch, h->dct.chroma_dc[ch] );
+    }
+
+    if( h->mb.i_cbp_chroma == 2 )
+    {
+        const int i_step = 8 << CHROMA_V_SHIFT;
+        for( int base = 16; base < 3*16; base += i_step )
+            for( int k = 0; k < 4; k++ )
+            {
+                const int blk = base + k;
+                x264_cavlc_block_residual( h, DCT_CHROMA_AC, blk, h->dct.luma4x4[blk]+1 );
+            }
+    }
+
+    return s->i_bits_encoded;
+}
+#endif
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
--- a/encoder/lookahead.c
+++ b/encoder/lookahead.c
@@ -0,0 +1,250 @@
+/*****************************************************************************
+ * lookahead.c: high-level lookahead functions
+ *****************************************************************************
+ * Copyright (C) 2010-2025 Avail Media and x264 project
+ *
+ * Authors: Michael Kazmier <mkazmier@availmedia.com>
+ *          Alex Giladi <agiladi@availmedia.com>
+ *          Steven Walters <kemuri9@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+/* LOOKAHEAD (threaded and non-threaded mode)
+ *
+ * Lookahead types:
+ *     [1] Slice type / scene cut;
+ *
+ * In non-threaded mode, we run the existing slicetype decision code as it was.
+ * In threaded mode, we run in a separate thread, that lives between the calls
+ * to x264_encoder_open() and x264_encoder_close(), and performs lookahead for
+ * the number of frames specified in rc_lookahead.  Recommended setting is
+ * # of bframes + # of threads.
+ */
+#include "common/common.h"
+#include "analyse.h"
+
+static void lookahead_shift( x264_sync_frame_list_t *dst, x264_sync_frame_list_t *src, int count )
+{
+    /* Transfers `count` frames, one x264_frame_shift() at a time, from src
+     * to the end of dst, then signals both lists if anything was requested. */
+    for( int remaining = count; remaining != 0; remaining-- )
+    {
+        assert( dst->i_size < dst->i_max_size );
+        assert( src->i_size );
+        x264_frame_t *frame = x264_frame_shift( src->list );
+        dst->list[ dst->i_size ] = frame;
+        dst->i_size++;
+        src->i_size--;
+    }
+
+    if( !count )
+        return;
+
+    x264_pthread_cond_broadcast( &dst->cv_fill );
+    x264_pthread_cond_broadcast( &src->cv_empty );
+}
+
+static void lookahead_update_last_nonb( x264_t *h, x264_frame_t *new_nonb )
+{
+    /* Hands the previously remembered frame (if any) to
+     * x264_frame_push_unused(), then records new_nonb and takes a
+     * reference on it. */
+    x264_lookahead_t *look = h->lookahead;
+
+    if( look->last_nonb )
+        x264_frame_push_unused( h, look->last_nonb );
+
+    new_nonb->i_reference_count++;
+    look->last_nonb = new_nonb;
+}
+
+#if HAVE_THREAD
+static void lookahead_slicetype_decide( x264_t *h )
+{
+    /* Runs the slicetype decision, then moves the first frame of `next`
+     * plus its i_bframes followers into ofbuf, waiting while ofbuf is full. */
+    x264_lookahead_t *look = h->lookahead;
+
+    x264_slicetype_decide( h );
+
+    x264_frame_t *first = look->next.list[0];
+    lookahead_update_last_nonb( h, first );
+    int shift_frames = first->i_bframes + 1;
+
+    x264_pthread_mutex_lock( &look->ofbuf.mutex );
+    while( look->ofbuf.i_size == look->ofbuf.i_max_size )
+        x264_pthread_cond_wait( &look->ofbuf.cv_empty, &look->ofbuf.mutex );
+
+    /* ofbuf.mutex is held across the transfer; next.mutex only around it */
+    x264_pthread_mutex_lock( &look->next.mutex );
+    lookahead_shift( &look->ofbuf, &look->next, shift_frames );
+    x264_pthread_mutex_unlock( &look->next.mutex );
+
+    /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */
+    const int b_keyframe_pass = look->b_analyse_keyframe && IS_X264_TYPE_I( look->last_nonb->i_type );
+    if( b_keyframe_pass )
+        x264_slicetype_analyse( h, shift_frames );
+
+    x264_pthread_mutex_unlock( &look->ofbuf.mutex );
+}
+
+REALIGN_STACK static void *lookahead_thread( x264_t *h )
+{   /* Body of the lookahead thread (started by x264_lookahead_init).
+     * Until b_exit_thread is observed it keeps moving queued input frames
+     * from ifbuf into next; once next holds more than
+     * i_slicetype_length + b_vfr_input frames it runs
+     * lookahead_slicetype_decide(), otherwise it sleeps on ifbuf.cv_fill.
+     * On exit it drains ifbuf and next, clears b_thread_active and wakes
+     * waiters on ofbuf.cv_fill.  Always returns NULL. */
+    while( 1 )
+    {
+        x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
+        if( h->lookahead->b_exit_thread )
+        {
+            x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+            break;
+        }
+        x264_pthread_mutex_lock( &h->lookahead->next.mutex );
+        int shift = X264_MIN( h->lookahead->next.i_max_size - h->lookahead->next.i_size, h->lookahead->ifbuf.i_size ); /* limited by free room in next and by what ifbuf holds */
+        lookahead_shift( &h->lookahead->next, &h->lookahead->ifbuf, shift );
+        x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
+        if( h->lookahead->next.i_size <= h->lookahead->i_slicetype_length + h->param.b_vfr_input )
+        {   /* not enough frames yet: wait for more input or an exit request */
+            while( !h->lookahead->ifbuf.i_size && !h->lookahead->b_exit_thread )
+                x264_pthread_cond_wait( &h->lookahead->ifbuf.cv_fill, &h->lookahead->ifbuf.mutex );
+            x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+        }
+        else
+        {   /* enough frames: release ifbuf first, the decision may wait on ofbuf */
+            x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+            lookahead_slicetype_decide( h );
+        }
+    }   /* end of input frames */
+    x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex ); /* flush: move whatever is still queued in ifbuf into next */
+    x264_pthread_mutex_lock( &h->lookahead->next.mutex );
+    lookahead_shift( &h->lookahead->next, &h->lookahead->ifbuf, h->lookahead->ifbuf.i_size );
+    x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
+    x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+    while( h->lookahead->next.i_size ) /* decide every remaining frame */
+        lookahead_slicetype_decide( h );
+    x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
+    h->lookahead->b_thread_active = 0; /* x264_lookahead_get_frames() stops waiting once this is clear */
+    x264_pthread_cond_broadcast( &h->lookahead->ofbuf.cv_fill );
+    x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
+    return NULL;
+}
+
+#endif
+
+int x264_lookahead_init( x264_t *h, int i_slicetype_length )
+{   /* Allocates the lookahead context, attaches it to h->thread[0..i_threads-1],
+     * initialises the ifbuf/next/ofbuf frame lists and, when
+     * h->param.i_sync_lookahead is non-zero, prepares a separate x264_t and
+     * starts lookahead_thread on it.  Returns 0 on success, -1 on failure. */
+    x264_lookahead_t *look;
+    CHECKED_MALLOCZERO( look, sizeof(x264_lookahead_t) ); /* NOTE(review): presumably jumps to fail when allocation fails -- confirm macro */
+    for( int i = 0; i < h->param.i_threads; i++ )
+        h->thread[i]->lookahead = look;
+
+    look->i_last_keyframe = - h->param.i_keyint_max;
+    look->b_analyse_keyframe = (h->param.rc.b_mb_tree || (h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead))
+                               && !h->param.rc.b_stat_read;
+    look->i_slicetype_length = i_slicetype_length;
+
+    /* init frame lists */
+    if( x264_sync_frame_list_init( &look->ifbuf, h->param.i_sync_lookahead+3 ) ||
+        x264_sync_frame_list_init( &look->next, h->frames.i_delay+3 ) ||
+        x264_sync_frame_list_init( &look->ofbuf, h->frames.i_delay+3 ) )
+        goto fail;
+
+    if( !h->param.i_sync_lookahead )
+        return 0; /* no lookahead thread requested */
+
+    x264_t *look_h = h->thread[h->param.i_threads]; /* the slot right after the i_threads handles set up above */
+    *look_h = *h; /* struct copy of the encoder handle as the thread's starting state */
+    if( x264_macroblock_cache_allocate( look_h ) )
+        goto fail;
+
+    if( x264_macroblock_thread_allocate( look_h, 1 ) < 0 )
+        goto fail;
+
+    if( x264_pthread_create( &look->thread_handle, NULL, (void*)lookahead_thread, look_h ) )
+        goto fail;
+    look->b_thread_active = 1;
+
+    return 0;
+fail:
+    x264_free( look ); /* NOTE(review): only `look` is released here; frame lists that were
+                        * already initialised and the h->thread[i]->lookahead pointers are
+                        * left untouched -- confirm the caller discards h on failure */
+    return -1;
+}
+
+void x264_lookahead_delete( x264_t *h )
+{
+    /* With a lookahead thread: request exit, wake it, join it and release
+     * its private x264_t.  In every case: delete the three frame lists,
+     * return last_nonb to the unused pool and free the context. */
+    x264_lookahead_t *look = h->lookahead;
+
+    if( h->param.i_sync_lookahead )
+    {
+        x264_t *look_h = h->thread[h->param.i_threads];
+
+        x264_pthread_mutex_lock( &look->ifbuf.mutex );
+        look->b_exit_thread = 1;
+        x264_pthread_cond_broadcast( &look->ifbuf.cv_fill );
+        x264_pthread_mutex_unlock( &look->ifbuf.mutex );
+        x264_pthread_join( look->thread_handle, NULL );
+
+        x264_macroblock_cache_free( look_h );
+        x264_macroblock_thread_free( look_h, 1 );
+        x264_free( look_h );
+    }
+
+    x264_sync_frame_list_delete( &look->ifbuf );
+    x264_sync_frame_list_delete( &look->next );
+    if( look->last_nonb )
+        x264_frame_push_unused( h, look->last_nonb );
+    x264_sync_frame_list_delete( &look->ofbuf );
+    x264_free( look );
+}
+
+void x264_lookahead_put_frame( x264_t *h, x264_frame_t *frame )
+{
+    /* With a lookahead thread the frame is queued on ifbuf; without one it
+     * goes straight onto next. */
+    x264_sync_frame_list_t *target = h->param.i_sync_lookahead ? &h->lookahead->ifbuf
+                                                               : &h->lookahead->next;
+    x264_sync_frame_list_push( target, frame );
+}
+
+int x264_lookahead_is_empty( x264_t *h )
+{
+    /* Non-zero when both next and ofbuf hold no frames; the two sizes are
+     * sampled with both list mutexes held. */
+    x264_lookahead_t *look = h->lookahead;
+
+    x264_pthread_mutex_lock( &look->ofbuf.mutex );
+    x264_pthread_mutex_lock( &look->next.mutex );
+    int b_empty = !(look->next.i_size || look->ofbuf.i_size);
+    x264_pthread_mutex_unlock( &look->next.mutex );
+    x264_pthread_mutex_unlock( &look->ofbuf.mutex );
+
+    return b_empty;
+}
+
+static void lookahead_encoder_shift( x264_t *h )
+{
+    /* Moves the first ofbuf frame and its i_bframes followers onto
+     * h->frames.current, then signals ofbuf.cv_empty.  No-op on empty ofbuf. */
+    x264_lookahead_t *look = h->lookahead;
+
+    if( !look->ofbuf.i_size )
+        return;
+
+    for( int remaining = look->ofbuf.list[0]->i_bframes + 1; remaining != 0; remaining-- )
+    {
+        x264_frame_t *frame = x264_frame_shift( look->ofbuf.list );
+        x264_frame_push( h->frames.current, frame );
+        look->ofbuf.i_size--;
+    }
+
+    x264_pthread_cond_broadcast( &look->ofbuf.cv_empty );
+}
+
+void x264_lookahead_get_frames( x264_t *h )
+{
+    x264_lookahead_t *look = h->lookahead;
+
+    if( h->param.i_sync_lookahead )
+    {   /* We have a lookahead thread, so get frames from there */
+        x264_pthread_mutex_lock( &look->ofbuf.mutex );
+        while( !look->ofbuf.i_size && look->b_thread_active )
+            x264_pthread_cond_wait( &look->ofbuf.cv_fill, &look->ofbuf.mutex );
+        lookahead_encoder_shift( h );
+        x264_pthread_mutex_unlock( &look->ofbuf.mutex );
+        return;
+    }
+
+    /* We are not running a lookahead thread, so perform all the slicetype decide on the fly */
+
+    /* nothing to do while frames are still pending or no input is queued */
+    if( h->frames.current[0] || !look->next.i_size )
+        return;
+
+    x264_slicetype_decide( h );
+
+    x264_frame_t *first = look->next.list[0];
+    lookahead_update_last_nonb( h, first );
+    int shift_frames = first->i_bframes + 1;
+    lookahead_shift( &look->ofbuf, &look->next, shift_frames );
+
+    /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */
+    const int b_keyframe_pass = look->b_analyse_keyframe && IS_X264_TYPE_I( look->last_nonb->i_type );
+    if( b_keyframe_pass )
+        x264_slicetype_analyse( h, shift_frames );
+
+    lookahead_encoder_shift( h );
+}
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -0,0 +1,215 @@
+/*****************************************************************************
+ * macroblock.h: macroblock encoding
+ *****************************************************************************
+ * Copyright (C) 2003-2025 x264 project
+ *
+ * Authors: Loren Merritt <lorenm@u.washington.edu>
+ *          Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_ENCODER_MACROBLOCK_H
+#define X264_ENCODER_MACROBLOCK_H
+
+#include "common/macroblock.h"
+
+#define x264_rdo_init x264_template(rdo_init)
+void x264_rdo_init( void );
+
+#define x264_macroblock_probe_skip x264_template(macroblock_probe_skip)
+int x264_macroblock_probe_skip( x264_t *h, int b_bidir );
+
+#define x264_macroblock_probe_pskip( h )\
+    x264_macroblock_probe_skip( h, 0 ) /* shorthand: probe with b_bidir = 0 */
+#define x264_macroblock_probe_bskip( h )\
+    x264_macroblock_probe_skip( h, 1 ) /* shorthand: probe with b_bidir = 1 */
+
+#define x264_predict_lossless_4x4 x264_template(predict_lossless_4x4)
+void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_mode );
+#define x264_predict_lossless_8x8 x264_template(predict_lossless_8x8)
+void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_mode, pixel edge[36] );
+#define x264_predict_lossless_16x16 x264_template(predict_lossless_16x16)
+void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode );
+#define x264_predict_lossless_chroma x264_template(predict_lossless_chroma)
+void x264_predict_lossless_chroma( x264_t *h, int i_mode );
+
+#define x264_macroblock_encode x264_template(macroblock_encode)
+void x264_macroblock_encode      ( x264_t *h );
+#define x264_macroblock_write_cabac x264_template(macroblock_write_cabac)
+void x264_macroblock_write_cabac ( x264_t *h, x264_cabac_t *cb );
+#define x264_macroblock_write_cavlc x264_template(macroblock_write_cavlc)
+void x264_macroblock_write_cavlc ( x264_t *h );
+
+#define x264_macroblock_encode_p8x8 x264_template(macroblock_encode_p8x8)
+void x264_macroblock_encode_p8x8( x264_t *h, int i8 );
+#define x264_macroblock_encode_p4x4 x264_template(macroblock_encode_p4x4)
+void x264_macroblock_encode_p4x4( x264_t *h, int i4 );
+#define x264_mb_encode_chroma x264_template(mb_encode_chroma)
+void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp );
+
+#define x264_cabac_mb_skip x264_template(cabac_mb_skip)
+void x264_cabac_mb_skip( x264_t *h, int b_skip );
+#define x264_cabac_block_residual_c x264_template(cabac_block_residual_c)
+void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l );
+#define x264_cabac_block_residual_8x8_rd_c x264_template(cabac_block_residual_8x8_rd_c)
+void x264_cabac_block_residual_8x8_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l );
+#define x264_cabac_block_residual_rd_c x264_template(cabac_block_residual_rd_c)
+void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l );
+
+#define x264_quant_luma_dc_trellis x264_template(quant_luma_dc_trellis)
+int x264_quant_luma_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, int i_qp,
+                                int ctx_block_cat, int b_intra, int idx );
+#define x264_quant_chroma_dc_trellis x264_template(quant_chroma_dc_trellis)
+int x264_quant_chroma_dc_trellis( x264_t *h, dctcoef *dct, int i_qp, int b_intra, int idx );
+#define x264_quant_4x4_trellis x264_template(quant_4x4_trellis)
+int x264_quant_4x4_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
+                             int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx );
+#define x264_quant_8x8_trellis x264_template(quant_8x8_trellis)
+int x264_quant_8x8_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
+                             int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx );
+
+#define x264_noise_reduction_update x264_template(noise_reduction_update)
+void x264_noise_reduction_update( x264_t *h );
+
+static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, dctcoef dct[16], int i_qp, int ctx_block_cat, int b_intra, int p, int idx )
+{
+    /* Quantizes one 4x4 block in place: optional denoise pass, then either
+     * the trellis quantizer or the plain quantf.quant_4x4, whose result is
+     * returned.  Any non-zero plane index p selects the chroma matrices. */
+    const int b_chroma = !!p;
+    int i_quant_cat;
+    if( b_intra )
+        i_quant_cat = p ? CQM_4IC : CQM_4IY;
+    else
+        i_quant_cat = p ? CQM_4PC : CQM_4PY;
+
+    if( h->mb.b_noise_reduction )
+        h->quantf.denoise_dct( dct, h->nr_residual_sum[b_chroma*2], h->nr_offset[b_chroma*2], 16 );
+
+    if( !h->mb.b_trellis )
+        return h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
+
+    return x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, ctx_block_cat, b_intra, b_chroma, idx+p*16 );
+}
+
+static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, dctcoef dct[64], int i_qp, int ctx_block_cat, int b_intra, int p, int idx )
+{
+    /* Quantizes one 8x8 block in place: optional denoise pass, then either
+     * the trellis quantizer or the plain quantf.quant_8x8, whose result is
+     * returned.  Any non-zero plane index p selects the chroma matrices. */
+    const int b_chroma = !!p;
+    int i_quant_cat;
+    if( b_intra )
+        i_quant_cat = p ? CQM_8IC : CQM_8IY;
+    else
+        i_quant_cat = p ? CQM_8PC : CQM_8PY;
+
+    if( h->mb.b_noise_reduction )
+        h->quantf.denoise_dct( dct, h->nr_residual_sum[1+b_chroma*2], h->nr_offset[1+b_chroma*2], 64 );
+
+    if( !h->mb.b_trellis )
+        return h->quantf.quant_8x8( dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias[i_quant_cat][i_qp] );
+
+    return x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, ctx_block_cat, b_intra, b_chroma, idx+p*4 );
+}
+/* Writes nz into the non_zero_count cache entries of 8x8 block idx in
+ * plane p: two M16 stores, at x264_scan8[p*16+idx*4] and 8 entries further,
+ * each holding nz * 0x0101 (a byte-sized nz duplicated into both bytes).
+ * Expects a variable `h` in the expanding scope; p and idx are pasted
+ * unparenthesized, so pass simple expressions. */
+#define STORE_8x8_NNZ( p, idx, nz )\
+do\
+{\
+    M16( &h->mb.cache.non_zero_count[x264_scan8[p*16+idx*4]+0] ) = (nz) * 0x0101;\
+    M16( &h->mb.cache.non_zero_count[x264_scan8[p*16+idx*4]+8] ) = (nz) * 0x0101;\
+} while( 0 )
+/* Zeroes the non_zero_count cache for plane p with four M32 stores, at
+ * offsets 0, 8, 16 and 24 from x264_scan8[16*p] (presumably the four cache
+ * rows of the plane -- confirm against the scan8 layout).  Expects a
+ * variable `h` in the expanding scope. */
+#define CLEAR_16x16_NNZ( p ) \
+do\
+{\
+    M32( &h->mb.cache.non_zero_count[x264_scan8[16*p] + 0*8] ) = 0;\
+    M32( &h->mb.cache.non_zero_count[x264_scan8[16*p] + 1*8] ) = 0;\
+    M32( &h->mb.cache.non_zero_count[x264_scan8[16*p] + 2*8] ) = 0;\
+    M32( &h->mb.cache.non_zero_count[x264_scan8[16*p] + 3*8] ) = 0;\
+} while( 0 )
+
+/* A special for loop that iterates branchlessly over each set
+ * bit in a 4-bit input.
+ * The loop declares idx, msk and skip itself.  On every pass skip is set to
+ * x264_ctz_4bit(msk), idx advances by skip and the examined bits are shifted
+ * out of msk, so the body runs once per set bit with
+ * idx == start + position of that bit.  The loop ends when msk is zero.
+ * The names msk and skip are visible inside the body and hide any outer
+ * variables with those names. */
+#define FOREACH_BIT(idx,start,mask) for( int idx = start, msk = mask, skip; msk && (skip = x264_ctz_4bit(msk), idx += skip, msk >>= skip+1, 1); idx++ )
+
+static ALWAYS_INLINE void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_qp, int i_mode, int b_predict )
+{   /* Encodes intra 4x4 block idx of plane p.
+     * When b_predict is set, the prediction for i_mode is first generated
+     * into the fdec block.  Lossless: the residual is stored via
+     * zigzagf.sub_4x4 and the function returns.  Otherwise: DCT of
+     * (fenc - fdec), quantization, and for a non-zero result the
+     * coefficients are scanned into h->dct.luma4x4, dequantized and added
+     * back onto fdec.  Updates the non_zero_count cache and i_cbp_luma. */
+    int nz;
+    pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]];
+    pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]];
+    ALIGNED_ARRAY_64( dctcoef, dct4x4,[16] );
+
+    if( b_predict )
+    {
+        if( h->mb.b_lossless )
+            x264_predict_lossless_4x4( h, p_dst, p, idx, i_mode );
+        else
+            h->predict_4x4[i_mode]( p_dst );
+    }
+
+    if( h->mb.b_lossless )
+    {
+        nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+idx], p_src, p_dst );
+        h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz;
+        h->mb.i_cbp_luma |= nz<<(idx>>2); /* one cbp bit per group of four consecutive block indices */
+        return;
+    }
+
+    h->dctf.sub4x4_dct( dct4x4, p_src, p_dst );
+
+    nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 1, p, idx );
+    h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz;
+    if( nz )
+    {   /* only blocks with a non-zero quantization result are stored and reconstructed */
+        h->mb.i_cbp_luma |= 1<<(idx>>2);
+        h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4 );
+        h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[p?CQM_4IC:CQM_4IY], i_qp );
+        h->dctf.add4x4_idct( p_dst, dct4x4 );
+    }
+}
+
+static ALWAYS_INLINE void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_qp, int i_mode, pixel *edge, int b_predict )
+{   /* Encodes intra 8x8 block idx of plane p.
+     * When b_predict is set, the prediction for i_mode is generated into the
+     * fdec block, using `edge` or, if edge is NULL, a locally filtered edge
+     * buffer.  Lossless: the residual is stored via zigzagf.sub_8x8 and the
+     * function returns.  Otherwise: 8x8 DCT, quantization, and for a
+     * non-zero result scan into h->dct.luma8x8, dequant and IDCT-add onto
+     * fdec.  Updates the non_zero_count cache and i_cbp_luma. */
+    int x = idx&1; /* column of the 8x8 block inside the macroblock */
+    int y = idx>>1; /* row of the 8x8 block inside the macroblock */
+    int nz;
+    pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE];
+    pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE];
+    ALIGNED_ARRAY_64( dctcoef, dct8x8,[64] );
+    ALIGNED_ARRAY_32( pixel, edge_buf,[36] );
+
+    if( b_predict )
+    {
+        if( !edge )
+        {   /* caller supplied no edge: build it here for this mode */
+            h->predict_8x8_filter( p_dst, edge_buf, h->mb.i_neighbour8[idx], x264_pred_i4x4_neighbors[i_mode] );
+            edge = edge_buf;
+        }
+
+        if( h->mb.b_lossless )
+            x264_predict_lossless_8x8( h, p_dst, p, idx, i_mode, edge );
+        else
+            h->predict_8x8[i_mode]( p_dst, edge );
+    }
+
+    if( h->mb.b_lossless )
+    {
+        nz = h->zigzagf.sub_8x8( h->dct.luma8x8[p*4+idx], p_src, p_dst );
+        STORE_8x8_NNZ( p, idx, nz );
+        h->mb.i_cbp_luma |= nz<<idx;
+        return;
+    }
+
+    h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );
+
+    nz = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 1, p, idx );
+    if( nz )
+    {   /* non-zero result: store coefficients, reconstruct, flag the block */
+        h->mb.i_cbp_luma |= 1<<idx;
+        h->zigzagf.scan_8x8( h->dct.luma8x8[p*4+idx], dct8x8 );
+        h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[p?CQM_8IC:CQM_8IY], i_qp );
+        h->dctf.add8x8_idct8( p_dst, dct8x8 );
+        STORE_8x8_NNZ( p, idx, 1 );
+    }
+    else
+        STORE_8x8_NNZ( p, idx, 0 );
+}
+
+#endif
--- a/encoder/me.c
+++ b/encoder/me.c
--- a/encoder/me.h
+++ b/encoder/me.h
@@ -0,0 +1,111 @@
+/*****************************************************************************
+ * me.h: motion estimation
+ *****************************************************************************
+ * Copyright (C) 2003-2025 x264 project
+ *
+ * Authors: Loren Merritt <lorenm@u.washington.edu>
+ *          Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_ENCODER_ME_H
+#define X264_ENCODER_ME_H
+
+#define COST_MAX (1<<28)
+#define COST_MAX64 (1ULL<<60)
+
+typedef struct
+{   /* Per-search motion estimation state.  Going by the section markers
+     * below, the first group of fields is filled in before a search and the
+     * last three receive its result -- confirm against the x264_me_* users. */
+    /* aligning the first member is a gcc hack to force the struct to be aligned,
+     * as well as force sizeof(struct) to be a multiple of the alignment. */
+    /* input */
+    ALIGNED_64( int i_pixel );   /* PIXEL_WxH */
+    uint16_t *p_cost_mv; /* lambda * nbits for each possible mv */
+    int      i_ref_cost;
+    int      i_ref;
+    const x264_weight_t *weight;
+
+    pixel *p_fref[12];
+    pixel *p_fref_w;
+    pixel *p_fenc[3];
+    uint16_t *integral;
+    int      i_stride[3];
+
+    ALIGNED_4( int16_t mvp[2] );
+
+    /* output */
+    int cost_mv;        /* lambda * nbits for the chosen mv */
+    int cost;           /* satd + lambda * nbits */
+    ALIGNED_8( int16_t mv[2] );
+} ALIGNED_64( x264_me_t );
+
+#define x264_me_search_ref x264_template(me_search_ref)
+void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_fullpel_thresh );
+#define x264_me_search( h, m, mvc, i_mvc )\
+    x264_me_search_ref( h, m, mvc, i_mvc, NULL ) /* shorthand: same search with p_fullpel_thresh = NULL */
+
+#define x264_me_refine_qpel x264_template(me_refine_qpel)
+void x264_me_refine_qpel( x264_t *h, x264_me_t *m );
+#define x264_me_refine_qpel_refdupe x264_template(me_refine_qpel_refdupe)
+void x264_me_refine_qpel_refdupe( x264_t *h, x264_me_t *m, int *p_halfpel_thresh );
+#define x264_me_refine_qpel_rd x264_template(me_refine_qpel_rd)
+void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int i_list );
+#define x264_me_refine_bidir_rd x264_template(me_refine_bidir_rd)
+void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2 );
+#define x264_me_refine_bidir_satd x264_template(me_refine_bidir_satd)
+void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight );
+#define x264_rd_cost_part x264_template(rd_cost_part)
+uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel );
+/* Conditional-update helpers: when y compares less than x (greater than,
+ * for the _GT variant), y is assigned to x and each following (dst, src)
+ * pair is copied as well.
+ * Each macro expands to a bare `if` statement rather than a
+ * do { } while( 0 ) block, and COPY1_IF_LT already ends in a semicolon, so
+ * take care when using them as the body of an unbraced if/else.
+ * x and y are evaluated twice when the condition holds. */
+#define COPY1_IF_LT(x,y)\
+if( (y) < (x) )\
+    (x) = (y);
+
+#define COPY2_IF_LT(x,y,a,b)\
+if( (y) < (x) )\
+{\
+    (x) = (y);\
+    (a) = (b);\
+}
+
+#define COPY3_IF_LT(x,y,a,b,c,d)\
+if( (y) < (x) )\
+{\
+    (x) = (y);\
+    (a) = (b);\
+    (c) = (d);\
+}
+
+#define COPY4_IF_LT(x,y,a,b,c,d,e,f)\
+if( (y) < (x) )\
+{\
+    (x) = (y);\
+    (a) = (b);\
+    (c) = (d);\
+    (e) = (f);\
+}
+
+#define COPY2_IF_GT(x,y,a,b)\
+if( (y) > (x) )\
+{\
+    (x) = (y);\
+    (a) = (b);\
+}
+
+#endif
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
--- a/encoder/ratecontrol.h
+++ b/encoder/ratecontrol.h
@@ -0,0 +1,87 @@
+/*****************************************************************************
+ * ratecontrol.h: ratecontrol
+ *****************************************************************************
+ * Copyright (C) 2003-2025 x264 project
+ *
+ * Authors: Loren Merritt <lorenm@u.washington.edu>
+ *          Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_ENCODER_RATECONTROL_H
+#define X264_ENCODER_RATECONTROL_H
+
+/* Completely arbitrary.  Ratecontrol lowers relative quality at higher framerates
+ * and the reverse at lower framerates; this serves as the center of the curve.
+ * Halve all the values for frame-packed 3D to compensate for the "doubled"
+ * framerate. */
+/* The three *_FRAME_DURATION macros read h->param.i_frame_packing, so they
+ * only compile where an x264_t pointer named h is in scope.  The divisor
+ * ((i_frame_packing == 5)+1) is 2 for frame packing type 5 and 1 otherwise. */
+#define BASE_FRAME_DURATION (0.04f / ((h->param.i_frame_packing == 5)+1))
+
+/* Arbitrary limitations as a sanity check. */
+#define MAX_FRAME_DURATION (1.00f / ((h->param.i_frame_packing == 5)+1))
+#define MIN_FRAME_DURATION (0.01f / ((h->param.i_frame_packing == 5)+1))
+
+/* Passes f to x264_clip3f together with the MIN/MAX bounds above (presumably a
+ * clamp into [MIN_FRAME_DURATION, MAX_FRAME_DURATION]; x264_clip3f is defined
+ * elsewhere).  Needs h in scope for the same reason as the bounds. */
+#define CLIP_DURATION(f) x264_clip3f(f,MIN_FRAME_DURATION,MAX_FRAME_DURATION)
+
+/* Ratecontrol entry points.  Each public name is first mapped through
+ * x264_template() and then declared under that mapped name; the definitions
+ * live outside this header. */
+#define x264_ratecontrol_new x264_template(ratecontrol_new)
+int  x264_ratecontrol_new   ( x264_t * );
+#define x264_ratecontrol_delete x264_template(ratecontrol_delete)
+void x264_ratecontrol_delete( x264_t * );
+
+#define x264_ratecontrol_init_reconfigurable x264_template(ratecontrol_init_reconfigurable)
+void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init );
+#define x264_encoder_reconfig_apply x264_template(encoder_reconfig_apply)
+int x264_encoder_reconfig_apply( x264_t *h, x264_param_t *param );
+
+#define x264_adaptive_quant_frame x264_template(adaptive_quant_frame)
+void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets );
+#define x264_macroblock_tree_read x264_template(macroblock_tree_read)
+int  x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offsets );
+#define x264_reference_build_list_optimal x264_template(reference_build_list_optimal)
+int  x264_reference_build_list_optimal( x264_t *h );
+#define x264_thread_sync_ratecontrol x264_template(thread_sync_ratecontrol)
+void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next );
+#define x264_ratecontrol_zone_init x264_template(ratecontrol_zone_init)
+void x264_ratecontrol_zone_init( x264_t * );
+#define x264_ratecontrol_start x264_template(ratecontrol_start)
+void x264_ratecontrol_start( x264_t *, int i_force_qp, int overhead );
+#define x264_ratecontrol_slice_type x264_template(ratecontrol_slice_type)
+int  x264_ratecontrol_slice_type( x264_t *, int i_frame );
+#define x264_ratecontrol_set_weights x264_template(ratecontrol_set_weights)
+void x264_ratecontrol_set_weights( x264_t *h, x264_frame_t *frm );
+#define x264_ratecontrol_mb x264_template(ratecontrol_mb)
+int  x264_ratecontrol_mb( x264_t *, int bits );
+#define x264_ratecontrol_qp x264_template(ratecontrol_qp)
+int  x264_ratecontrol_qp( x264_t * );
+#define x264_ratecontrol_mb_qp x264_template(ratecontrol_mb_qp)
+int  x264_ratecontrol_mb_qp( x264_t *h );
+#define x264_ratecontrol_end x264_template(ratecontrol_end)
+int  x264_ratecontrol_end( x264_t *, int bits, int *filler );
+#define x264_ratecontrol_summary x264_template(ratecontrol_summary)
+void x264_ratecontrol_summary( x264_t * );
+#define x264_rc_analyse_slice x264_template(rc_analyse_slice)
+int  x264_rc_analyse_slice( x264_t *h );
+#define x264_threads_distribute_ratecontrol x264_template(threads_distribute_ratecontrol)
+void x264_threads_distribute_ratecontrol( x264_t *h );
+#define x264_threads_merge_ratecontrol x264_template(threads_merge_ratecontrol)
+void x264_threads_merge_ratecontrol( x264_t *h );
+#define x264_hrd_fullness x264_template(hrd_fullness)
+void x264_hrd_fullness( x264_t *h );
+
+#endif
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
--- a/encoder/set.c
+++ b/encoder/set.c
@@ -0,0 +1,913 @@
+/*****************************************************************************
+ * set: header writing
+ *****************************************************************************
+ * Copyright (C) 2003-2025 x264 project
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *          Loren Merritt <lorenm@u.washington.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "set.h"
+
+/* From here on, every bs_write_ue() in this file expands to bs_write_ue_big(). */
+#define bs_write_ue bs_write_ue_big
+
+// Indexed by pic_struct values
+// (x264_sei_pic_timing_write emits this many clock_timestamp_flag bits)
+static const uint8_t num_clock_ts[10] = { 0, 1, 1, 1, 2, 2, 3, 3, 2, 3 };
+// 16-byte identifier; presumably the UUID for AVC-Intra SEI payloads -- confirm at its use site
+static const uint8_t avcintra_uuid[] = {0xF7, 0x49, 0x3E, 0xB3, 0xD4, 0x00, 0x47, 0x96, 0x86, 0x86, 0xC9, 0x70, 0x7B, 0x64, 0x37, 0x2A};
+
+/* Transpose, in place, the w x w byte matrix stored row-major in buf. */
+static void transpose( uint8_t *buf, int w )
+{
+    /* Visit the strict lower triangle and swap each entry with its mirror
+     * across the diagonal; row 0 has nothing below the diagonal. */
+    for( int row = 1; row < w; row++ )
+    {
+        uint8_t *lower = buf + w*row;
+        for( int col = 0; col < row; col++ )
+            XCHG( uint8_t, lower[col], buf[w*col+row] );
+    }
+}
+
+/* Write scaling list number idx of sps to the bitstream s.
+ *
+ * idx < 4 selects a 16-entry list scanned with x264_zigzag_scan4, otherwise a
+ * 64-entry list scanned with x264_zigzag_scan8.  Three encodings are used:
+ *  - a single 0 bit when the list equals its fallback (the matching luma
+ *    list for the chroma indices, the JVT list otherwise);
+ *  - a 1 bit followed by se(-8) when the list equals the JVT list;
+ *  - a 1 bit followed by delta-coded entries in zigzag order, where a run of
+ *    identical trailing values may be replaced by one terminating delta. */
+static void scaling_list_write( bs_t *s, x264_sps_t *sps, int idx )
+{
+    const int is_4x4 = idx < 4;
+    const int len = is_4x4 ? 16 : 64;
+    const uint8_t *zigzag = is_4x4 ? x264_zigzag_scan4[0] : x264_zigzag_scan8[0];
+    const uint8_t *list = sps->scaling_list[idx];
+    const uint8_t *def_list;
+
+    /* Chroma lists fall back to the luma list of the same kind. */
+    if( idx == CQM_4IC )
+        def_list = sps->scaling_list[CQM_4IY];
+    else if( idx == CQM_4PC )
+        def_list = sps->scaling_list[CQM_4PY];
+    else if( idx == CQM_8IC+4 )
+        def_list = sps->scaling_list[CQM_8IY+4];
+    else if( idx == CQM_8PC+4 )
+        def_list = sps->scaling_list[CQM_8PY+4];
+    else
+        def_list = x264_cqm_jvt[idx];
+
+    if( !memcmp( list, def_list, len ) )
+    {
+        bs_write1( s, 0 );   // scaling_list_present_flag
+        return;
+    }
+
+    bs_write1( s, 1 );       // scaling_list_present_flag
+
+    if( !memcmp( list, x264_cqm_jvt[idx], len ) )
+    {
+        bs_write_se( s, -8 ); // use jvt list
+        return;
+    }
+
+    // try run-length compression of trailing values
+    int run = len;
+    while( run > 1 && list[zigzag[run-1]] == list[zigzag[run-2]] )
+        run--;
+    /* Keep the explicit form when the terminator would cost more bits than
+     * the entries it replaces. */
+    if( run < len && len - run < bs_size_se( (int8_t)-list[zigzag[run]] ) )
+        run = len;
+
+    /* Each entry is coded as the difference from the previous one; the first
+     * entry is predicted from 8. */
+    int prev = 8;
+    for( int j = 0; j < run; j++ )
+    {
+        const int cur = list[zigzag[j]];
+        bs_write_se( s, (int8_t)(cur - prev) ); // delta
+        prev = cur;
+    }
+
+    if( run < len )
+        bs_write_se( s, (int8_t)-list[zigzag[run]] );
+}
+
+/* Emit one SEI message into s: payload type, payload size, payload bytes,
+ * then RBSP trailing bits and a flush.
+ *
+ * Type and size are each coded as one 0xFF byte per full 255 followed by a
+ * final byte holding the remainder. */
+void x264_sei_write( bs_t *s, uint8_t *payload, int payload_size, int payload_type )
+{
+    int type_left = payload_type;
+    int size_left = payload_size;
+
+    bs_realign( s );
+
+    while( type_left >= 255 )
+    {
+        bs_write( s, 8, 255 );
+        type_left -= 255;
+    }
+    bs_write( s, 8, type_left );
+
+    while( size_left >= 255 )
+    {
+        bs_write( s, 8, 255 );
+        size_left -= 255;
+    }
+    bs_write( s, 8, size_left );
+
+    for( int n = 0; n < payload_size; n++ )
+        bs_write( s, 8, payload[n] );
+
+    bs_rbsp_trailing( s );
+    bs_flush( s );
+}
+
+/* Fill in sps (with id i_id) from the encoder parameters: frame geometry,
+ * chroma format, profile/level and constraint flags, reference and reorder
+ * counts, frame_num/POC widths, and the VUI fields.  The crop rectangle and
+ * sample aspect ratio are delegated to x264_sps_init_reconfigurable(). */
+void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
+{
+    int csp = param->i_csp & X264_CSP_MASK;
+
+    sps->i_id = i_id;
+    /* frame size rounded up to whole 16x16 macroblocks */
+    sps->i_mb_width = ( param->i_width + 15 ) / 16;
+    sps->i_mb_height= ( param->i_height + 15 ) / 16;
+    sps->b_frame_mbs_only = !(param->b_interlaced || param->b_fake_interlaced);
+    /* with field coding the macroblock height is rounded up to an even count */
+    if( !sps->b_frame_mbs_only )
+        sps->i_mb_height = ( sps->i_mb_height + 1 ) & ~1;
+    sps->i_chroma_format_idc = csp >= X264_CSP_I444 ? CHROMA_444 :
+                               csp >= X264_CSP_I422 ? CHROMA_422 :
+                               csp >= X264_CSP_I420 ? CHROMA_420 : CHROMA_400;
+
+    /* transform bypass is flagged for constant-QP encoding with QP 0 */
+    sps->b_qpprime_y_zero_transform_bypass = param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant == 0;
+    /* pick the profile: the first test that matches wins */
+    if( sps->b_qpprime_y_zero_transform_bypass || sps->i_chroma_format_idc == CHROMA_444 )
+        sps->i_profile_idc  = PROFILE_HIGH444_PREDICTIVE;
+    else if( sps->i_chroma_format_idc == CHROMA_422 )
+        sps->i_profile_idc  = PROFILE_HIGH422;
+    else if( BIT_DEPTH > 8 )
+        sps->i_profile_idc  = PROFILE_HIGH10;
+    else if( param->analyse.b_transform_8x8 || param->i_cqm_preset != X264_CQM_FLAT || sps->i_chroma_format_idc == CHROMA_400 )
+        sps->i_profile_idc  = PROFILE_HIGH;
+    else if( param->b_cabac || param->i_bframe > 0 || param->b_interlaced || param->b_fake_interlaced || param->analyse.i_weighted_pred > 0 )
+        sps->i_profile_idc  = PROFILE_MAIN;
+    else
+        sps->i_profile_idc  = PROFILE_BASELINE;
+
+    sps->b_constraint_set0  = sps->i_profile_idc == PROFILE_BASELINE;
+    /* x264 doesn't support the features that are in Baseline and not in Main,
+     * namely arbitrary_slice_order and slice_groups. */
+    sps->b_constraint_set1  = sps->i_profile_idc <= PROFILE_MAIN;
+    /* Never set constraint_set2, it is not necessary and not used in real world. */
+    sps->b_constraint_set2  = 0;
+    sps->b_constraint_set3  = 0;
+
+    sps->i_level_idc = param->i_level_idc;
+    if( param->i_level_idc == 9 && ( sps->i_profile_idc == PROFILE_BASELINE || sps->i_profile_idc == PROFILE_MAIN ) )
+    {
+        sps->b_constraint_set3 = 1; /* level 1b with Baseline or Main profile is signalled via constraint_set3 */
+        sps->i_level_idc      = 11;
+    }
+    /* Intra profiles */
+    if( param->i_keyint_max == 1 && sps->i_profile_idc >= PROFILE_HIGH )
+        sps->b_constraint_set3 = 1;
+
+    sps->vui.i_num_reorder_frames = param->i_bframe_pyramid ? 2 : param->i_bframe ? 1 : 0;
+    /* extra slot with pyramid so that we don't have to override the
+     * order of forgetting old pictures */
+    sps->vui.i_max_dec_frame_buffering =
+    sps->i_num_ref_frames = X264_MIN(X264_REF_MAX, X264_MAX4(param->i_frame_reference, 1 + sps->vui.i_num_reorder_frames,
+                            param->i_bframe_pyramid ? 4 : 1, param->i_dpb_size));
+    /* strict pyramid uses one reference fewer; max_dec_frame_buffering keeps the larger value */
+    sps->i_num_ref_frames -= param->i_bframe_pyramid == X264_B_PYRAMID_STRICT;
+    /* keyint 1 (all-intra): no references and no decoder buffering */
+    if( param->i_keyint_max == 1 )
+    {
+        sps->i_num_ref_frames = 0;
+        sps->vui.i_max_dec_frame_buffering = 0;
+    }
+
+    /* number of refs + current frame */
+    int max_frame_num = sps->vui.i_max_dec_frame_buffering * (!!param->i_bframe_pyramid+1) + 1;
+    /* Intra refresh cannot write a recovery time greater than max frame num-1 */
+    if( param->b_intra_refresh )
+    {
+        int time_to_recovery = X264_MIN( sps->i_mb_width - 1, param->i_keyint_max ) + param->i_bframe - 1;
+        max_frame_num = X264_MAX( max_frame_num, time_to_recovery+1 );
+    }
+
+    /* smallest width, but at least 4 bits, with (1 << width) > max_frame_num */
+    sps->i_log2_max_frame_num = 4;
+    while( (1 << sps->i_log2_max_frame_num) <= max_frame_num )
+        sps->i_log2_max_frame_num++;
+
+    /* POC type 0 when B-frames, interlacing or AVC-Intra are in use, type 2 otherwise */
+    sps->i_poc_type = param->i_bframe || param->b_interlaced || param->i_avcintra_class ? 0 : 2;
+    if( sps->i_poc_type == 0 )
+    {
+        int max_delta_poc = (param->i_bframe + 2) * (!!param->i_bframe_pyramid + 1) * 2;
+        /* smallest width, but at least 4 bits, with (1 << width) > 2*max_delta_poc */
+        sps->i_log2_max_poc_lsb = 4;
+        while( (1 << sps->i_log2_max_poc_lsb) <= max_delta_poc * 2 )
+            sps->i_log2_max_poc_lsb++;
+    }
+
+    /* VUI is always written */
+    sps->b_vui = 1;
+
+    sps->b_gaps_in_frame_num_value_allowed = 0;
+    sps->b_mb_adaptive_frame_field = param->b_interlaced;
+    sps->b_direct8x8_inference = 1;
+
+    /* crop rectangle and sample aspect ratio */
+    x264_sps_init_reconfigurable( sps, param );
+
+    /* overscan: param values 1 and 2 are signalled; 2 maps to overscan_info = 1 */
+    sps->vui.b_overscan_info_present = param->vui.i_overscan > 0 && param->vui.i_overscan <= 2;
+    if( sps->vui.b_overscan_info_present )
+        sps->vui.b_overscan_info = ( param->vui.i_overscan == 2 ? 1 : 0 );
+
+    /* Out-of-range VUI parameters fall back to a default: 5 for the video
+     * format, 2 for primaries/transfer, and a colorspace-dependent value for
+     * full range and the colour matrix. */
+    sps->vui.b_signal_type_present = 0;
+    sps->vui.i_vidformat = ( param->vui.i_vidformat >= 0 && param->vui.i_vidformat <= 5 ? param->vui.i_vidformat : 5 );
+    sps->vui.b_fullrange = ( param->vui.b_fullrange >= 0 && param->vui.b_fullrange <= 1 ? param->vui.b_fullrange :
+                           ( csp >= X264_CSP_BGR ? 1 : 0 ) );
+    sps->vui.b_color_description_present = 0;
+
+    sps->vui.i_colorprim = ( param->vui.i_colorprim >= 0 && param->vui.i_colorprim <= 12 ? param->vui.i_colorprim : 2 );
+    sps->vui.i_transfer  = ( param->vui.i_transfer  >= 0 && param->vui.i_transfer  <= 18 ? param->vui.i_transfer  : 2 );
+    sps->vui.i_colmatrix = ( param->vui.i_colmatrix >= 0 && param->vui.i_colmatrix <= 14 ? param->vui.i_colmatrix :
+                           ( csp >= X264_CSP_BGR ? 0 : 2 ) );
+    /* the colour description is only signalled when some field differs from 2 */
+    if( sps->vui.i_colorprim != 2 || sps->vui.i_transfer != 2 || sps->vui.i_colmatrix != 2 )
+        sps->vui.b_color_description_present = 1;
+
+    if( sps->vui.i_vidformat != 5 || sps->vui.b_fullrange || sps->vui.b_color_description_present )
+        sps->vui.b_signal_type_present = 1;
+
+    /* FIXME: not sufficient for interlaced video */
+    sps->vui.b_chroma_loc_info_present = param->vui.i_chroma_loc > 0 && param->vui.i_chroma_loc <= 5 &&
+                                         sps->i_chroma_format_idc == CHROMA_420;
+    if( sps->vui.b_chroma_loc_info_present )
+    {
+        sps->vui.i_chroma_loc_top = param->vui.i_chroma_loc;
+        sps->vui.i_chroma_loc_bottom = param->vui.i_chroma_loc;
+    }
+
+    sps->vui.b_timing_info_present = param->i_timebase_num > 0 && param->i_timebase_den > 0;
+
+    if( sps->vui.b_timing_info_present )
+    {
+        /* time_scale is stored as twice the timebase denominator */
+        sps->vui.i_num_units_in_tick = param->i_timebase_num;
+        sps->vui.i_time_scale = param->i_timebase_den * 2;
+        sps->vui.b_fixed_frame_rate = !param->b_vfr_input;
+    }
+
+    sps->vui.b_vcl_hrd_parameters_present = 0; // we don't support VCL HRD
+    sps->vui.b_nal_hrd_parameters_present = !!param->i_nal_hrd;
+    sps->vui.b_pic_struct_present = param->b_pic_struct;
+
+    // NOTE: HRD related parts of the SPS are initialised in x264_ratecontrol_init_reconfigurable
+
+    /* bitstream restriction info is left out when constraint_set3 is set on a High-or-above profile */
+    sps->vui.b_bitstream_restriction = !(sps->b_constraint_set3 && sps->i_profile_idc >= PROFILE_HIGH);
+    if( sps->vui.b_bitstream_restriction )
+    {
+        sps->vui.b_motion_vectors_over_pic_boundaries = 1;
+        sps->vui.i_max_bytes_per_pic_denom = 0;
+        sps->vui.i_max_bits_per_mb_denom = 0;
+        sps->vui.i_log2_max_mv_length_horizontal =
+        sps->vui.i_log2_max_mv_length_vertical = (int)log2f( X264_MAX( 1, param->analyse.i_mv_range*4-1 ) ) + 1;
+    }
+
+    /* AVC-Intra classes up to 200 count as "hd", anything above as "4k" */
+    sps->b_avcintra_hd = param->i_avcintra_class && param->i_avcintra_class <= 200;
+    sps->b_avcintra_4k = param->i_avcintra_class > 200;
+    sps->i_cqm_preset = param->i_cqm_preset;
+}
+
+/* Set the SPS fields handled separately from the rest of x264_sps_init():
+ * the crop rectangle and the sample aspect ratio.
+ *
+ * Reads sps->i_mb_width and sps->i_mb_height, so those must already be set. */
+void x264_sps_init_reconfigurable( x264_sps_t *sps, x264_param_t *param )
+{
+    /* Left/top come straight from the user's crop rectangle; right/bottom
+     * also absorb the padding added by rounding up to whole macroblocks. */
+    sps->crop.i_left   = param->crop_rect.i_left;
+    sps->crop.i_top    = param->crop_rect.i_top;
+    sps->crop.i_right  = param->crop_rect.i_right + sps->i_mb_width*16 - param->i_width;
+    sps->crop.i_bottom = param->crop_rect.i_bottom + sps->i_mb_height*16 - param->i_height;
+
+    /* Cropping is signalled as soon as any edge is non-zero. */
+    sps->b_crop = sps->crop.i_left  || sps->crop.i_top ||
+                  sps->crop.i_right || sps->crop.i_bottom;
+
+    /* The aspect ratio is only signalled when both components are positive;
+     * otherwise the stored SAR values are left untouched. */
+    if( param->vui.i_sar_width > 0 && param->vui.i_sar_height > 0 )
+    {
+        sps->vui.b_aspect_ratio_info_present = 1;
+        sps->vui.i_sar_width = param->vui.i_sar_width;
+        sps->vui.i_sar_height= param->vui.i_sar_height;
+    }
+    else
+        sps->vui.b_aspect_ratio_info_present = 0;
+}
+
+/* Point the eight sps->scaling_list entries at the quantisation matrices
+ * selected by sps->i_cqm_preset:
+ *  - X264_CQM_FLAT:   every entry uses x264_cqm_flat16;
+ *  - X264_CQM_JVT:    entry i uses x264_cqm_jvt[i];
+ *  - X264_CQM_CUSTOM: entries use the matrices stored in param.
+ * Any other preset leaves the lists untouched.
+ *
+ * NOTE: the custom path transposes the matrices inside param in place, so
+ * param is modified by this call. */
+void x264_sps_init_scaling_list( x264_sps_t *sps, x264_param_t *param )
+{
+    const int preset = sps->i_cqm_preset;
+
+    if( preset == X264_CQM_FLAT )
+    {
+        for( int idx = 0; idx < 8; idx++ )
+            sps->scaling_list[idx] = x264_cqm_flat16;
+        return;
+    }
+
+    if( preset == X264_CQM_JVT )
+    {
+        for( int idx = 0; idx < 8; idx++ )
+            sps->scaling_list[idx] = x264_cqm_jvt[idx];
+        return;
+    }
+
+    if( preset != X264_CQM_CUSTOM )
+        return;
+
+    /* match the transposed DCT & zigzag */
+    transpose( param->cqm_4iy, 4 );
+    transpose( param->cqm_4py, 4 );
+    transpose( param->cqm_4ic, 4 );
+    transpose( param->cqm_4pc, 4 );
+    transpose( param->cqm_8iy, 8 );
+    transpose( param->cqm_8py, 8 );
+    transpose( param->cqm_8ic, 8 );
+    transpose( param->cqm_8pc, 8 );
+
+    sps->scaling_list[CQM_4IY]   = param->cqm_4iy;
+    sps->scaling_list[CQM_4PY]   = param->cqm_4py;
+    sps->scaling_list[CQM_4IC]   = param->cqm_4ic;
+    sps->scaling_list[CQM_4PC]   = param->cqm_4pc;
+    sps->scaling_list[CQM_8IY+4] = param->cqm_8iy;
+    sps->scaling_list[CQM_8PY+4] = param->cqm_8py;
+    sps->scaling_list[CQM_8IC+4] = param->cqm_8ic;
+    sps->scaling_list[CQM_8PC+4] = param->cqm_8pc;
+
+    /* A custom list containing a zero entry is replaced by the JVT list of
+     * the same index. */
+    for( int idx = 0; idx < 8; idx++ )
+    {
+        const int entries = idx < 4 ? 16 : 64;
+        for( int pos = 0; pos < entries; pos++ )
+            if( sps->scaling_list[idx][pos] == 0 )
+                sps->scaling_list[idx] = x264_cqm_jvt[idx];
+    }
+}
+
+/* Serialise sps into the bitstream s: profile/level header, the High-profile
+ * extension fields, frame_num/POC widths, frame geometry, cropping and the
+ * VUI block, then RBSP trailing bits and a flush.  The statement order is the
+ * bitstream order, so it must not be rearranged. */
+void x264_sps_write( bs_t *s, x264_sps_t *sps )
+{
+    bs_realign( s );
+    bs_write( s, 8, sps->i_profile_idc );
+    bs_write1( s, sps->b_constraint_set0 );
+    bs_write1( s, sps->b_constraint_set1 );
+    bs_write1( s, sps->b_constraint_set2 );
+    bs_write1( s, sps->b_constraint_set3 );
+
+    bs_write( s, 4, 0 );    /* reserved */
+
+    bs_write( s, 8, sps->i_level_idc );
+
+    bs_write_ue( s, sps->i_id );
+
+    /* chroma format, bit depth and scaling matrices are only coded for
+     * profiles numbered PROFILE_HIGH and above */
+    if( sps->i_profile_idc >= PROFILE_HIGH )
+    {
+        bs_write_ue( s, sps->i_chroma_format_idc );
+        if( sps->i_chroma_format_idc == CHROMA_444 )
+            bs_write1( s, 0 ); // separate_colour_plane_flag
+        bs_write_ue( s, BIT_DEPTH-8 ); // bit_depth_luma_minus8
+        bs_write_ue( s, BIT_DEPTH-8 ); // bit_depth_chroma_minus8
+        bs_write1( s, sps->b_qpprime_y_zero_transform_bypass );
+        /* Exactly match the AVC-Intra bitstream */
+        bs_write1( s, sps->b_avcintra_hd ); // seq_scaling_matrix_present_flag
+        if( sps->b_avcintra_hd )
+        {
+            /* intra lists only; CQM_4IC is written twice (presumably once
+             * each for Cb and Cr) */
+            scaling_list_write( s, sps, CQM_4IY );
+            scaling_list_write( s, sps, CQM_4IC );
+            scaling_list_write( s, sps, CQM_4IC );
+            bs_write1( s, 0 ); // no inter
+            bs_write1( s, 0 ); // no inter
+            bs_write1( s, 0 ); // no inter
+            scaling_list_write( s, sps, CQM_8IY+4 );
+            bs_write1( s, 0 ); // no inter
+            if( sps->i_chroma_format_idc == CHROMA_444 )
+            {
+                scaling_list_write( s, sps, CQM_8IC+4 );
+                bs_write1( s, 0 ); // no inter
+                scaling_list_write( s, sps, CQM_8IC+4 );
+                bs_write1( s, 0 ); // no inter
+            }
+        }
+    }
+
+    /* both log2 widths are coded minus 4 */
+    bs_write_ue( s, sps->i_log2_max_frame_num - 4 );
+    bs_write_ue( s, sps->i_poc_type );
+    if( sps->i_poc_type == 0 )
+        bs_write_ue( s, sps->i_log2_max_poc_lsb - 4 );
+    bs_write_ue( s, sps->i_num_ref_frames );
+    bs_write1( s, sps->b_gaps_in_frame_num_value_allowed );
+    bs_write_ue( s, sps->i_mb_width - 1 );
+    /* the height is halved first when field coding is enabled */
+    bs_write_ue( s, (sps->i_mb_height >> !sps->b_frame_mbs_only) - 1);
+    bs_write1( s, sps->b_frame_mbs_only );
+    if( !sps->b_frame_mbs_only )
+        bs_write1( s, sps->b_mb_adaptive_frame_field );
+    bs_write1( s, sps->b_direct8x8_inference );
+
+    bs_write1( s, sps->b_crop );
+    if( sps->b_crop )
+    {
+        /* Crop offsets are scaled down before coding: one horizontal shift
+         * for 4:2:0 and 4:2:2, one vertical shift for 4:2:0 plus one more
+         * when field coding is enabled. */
+        int h_shift = sps->i_chroma_format_idc == CHROMA_420 || sps->i_chroma_format_idc == CHROMA_422;
+        int v_shift = (sps->i_chroma_format_idc == CHROMA_420) + !sps->b_frame_mbs_only;
+        bs_write_ue( s, sps->crop.i_left   >> h_shift );
+        bs_write_ue( s, sps->crop.i_right  >> h_shift );
+        bs_write_ue( s, sps->crop.i_top    >> v_shift );
+        bs_write_ue( s, sps->crop.i_bottom >> v_shift );
+    }
+
+    bs_write1( s, sps->b_vui );
+    if( sps->b_vui )
+    {
+        bs_write1( s, sps->vui.b_aspect_ratio_info_present );
+        if( sps->vui.b_aspect_ratio_info_present )
+        {
+            int i;
+            /* predefined aspect ratios; the final { 0, 0, 255 } entry is the
+             * sentinel that ends the search and selects the extended form */
+            static const struct { uint8_t w, h, sar; } sar[] =
+            {
+                // aspect_ratio_idc = 0 -> unspecified
+                {  1,  1, 1 }, { 12, 11, 2 }, { 10, 11, 3 }, { 16, 11, 4 },
+                { 40, 33, 5 }, { 24, 11, 6 }, { 20, 11, 7 }, { 32, 11, 8 },
+                { 80, 33, 9 }, { 18, 11, 10}, { 15, 11, 11}, { 64, 33, 12},
+                {160, 99, 13}, {  4,  3, 14}, {  3,  2, 15}, {  2,  1, 16},
+                // aspect_ratio_idc = [17..254] -> reserved
+                { 0, 0, 255 }
+            };
+            /* stop at the first table entry matching the SAR exactly, or at the sentinel */
+            for( i = 0; sar[i].sar != 255; i++ )
+            {
+                if( sar[i].w == sps->vui.i_sar_width &&
+                    sar[i].h == sps->vui.i_sar_height )
+                    break;
+            }
+            bs_write( s, 8, sar[i].sar );
+            if( sar[i].sar == 255 ) /* aspect_ratio_idc (extended) */
+            {
+                bs_write( s, 16, sps->vui.i_sar_width );
+                bs_write( s, 16, sps->vui.i_sar_height );
+            }
+        }
+
+        bs_write1( s, sps->vui.b_overscan_info_present );
+        if( sps->vui.b_overscan_info_present )
+            bs_write1( s, sps->vui.b_overscan_info );
+
+        bs_write1( s, sps->vui.b_signal_type_present );
+        if( sps->vui.b_signal_type_present )
+        {
+            bs_write( s, 3, sps->vui.i_vidformat );
+            bs_write1( s, sps->vui.b_fullrange );
+            bs_write1( s, sps->vui.b_color_description_present );
+            if( sps->vui.b_color_description_present )
+            {
+                bs_write( s, 8, sps->vui.i_colorprim );
+                bs_write( s, 8, sps->vui.i_transfer );
+                bs_write( s, 8, sps->vui.i_colmatrix );
+            }
+        }
+
+        bs_write1( s, sps->vui.b_chroma_loc_info_present );
+        if( sps->vui.b_chroma_loc_info_present )
+        {
+            bs_write_ue( s, sps->vui.i_chroma_loc_top );
+            bs_write_ue( s, sps->vui.i_chroma_loc_bottom );
+        }
+
+        bs_write1( s, sps->vui.b_timing_info_present );
+        if( sps->vui.b_timing_info_present )
+        {
+            bs_write32( s, sps->vui.i_num_units_in_tick );
+            bs_write32( s, sps->vui.i_time_scale );
+            bs_write1( s, sps->vui.b_fixed_frame_rate );
+        }
+
+        bs_write1( s, sps->vui.b_nal_hrd_parameters_present );
+        if( sps->vui.b_nal_hrd_parameters_present )
+        {
+            /* count, values and the three delay lengths are coded minus 1 */
+            bs_write_ue( s, sps->vui.hrd.i_cpb_cnt - 1 );
+            bs_write( s, 4, sps->vui.hrd.i_bit_rate_scale );
+            bs_write( s, 4, sps->vui.hrd.i_cpb_size_scale );
+
+            bs_write_ue( s, sps->vui.hrd.i_bit_rate_value - 1 );
+            bs_write_ue( s, sps->vui.hrd.i_cpb_size_value - 1 );
+
+            bs_write1( s, sps->vui.hrd.b_cbr_hrd );
+
+            bs_write( s, 5, sps->vui.hrd.i_initial_cpb_removal_delay_length - 1 );
+            bs_write( s, 5, sps->vui.hrd.i_cpb_removal_delay_length - 1 );
+            bs_write( s, 5, sps->vui.hrd.i_dpb_output_delay_length - 1 );
+            bs_write( s, 5, sps->vui.hrd.i_time_offset_length );
+        }
+
+        bs_write1( s, sps->vui.b_vcl_hrd_parameters_present );
+
+        if( sps->vui.b_nal_hrd_parameters_present || sps->vui.b_vcl_hrd_parameters_present )
+            bs_write1( s, 0 );   /* low_delay_hrd_flag */
+
+        bs_write1( s, sps->vui.b_pic_struct_present );
+        bs_write1( s, sps->vui.b_bitstream_restriction );
+        if( sps->vui.b_bitstream_restriction )
+        {
+            bs_write1( s, sps->vui.b_motion_vectors_over_pic_boundaries );
+            bs_write_ue( s, sps->vui.i_max_bytes_per_pic_denom );
+            bs_write_ue( s, sps->vui.i_max_bits_per_mb_denom );
+            bs_write_ue( s, sps->vui.i_log2_max_mv_length_horizontal );
+            bs_write_ue( s, sps->vui.i_log2_max_mv_length_vertical );
+            bs_write_ue( s, sps->vui.i_num_reorder_frames );
+            bs_write_ue( s, sps->vui.i_max_dec_frame_buffering );
+        }
+    }
+
+    bs_rbsp_trailing( s );
+    bs_flush( s );
+}
+
+/* Fill in pps (with id i_id) from the encoder parameters; sps supplies only
+ * the id of the sequence parameter set this PPS refers to. */
+void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *sps )
+{
+    /* identity */
+    pps->i_id = i_id;
+    pps->i_sps_id = sps->i_id;
+
+    /* entropy coding and slice layout */
+    pps->b_cabac = param->b_cabac;
+    pps->b_pic_order = !param->i_avcintra_class && param->b_interlaced;
+    pps->i_num_slice_groups = 1;
+
+    /* default active reference counts */
+    pps->i_num_ref_idx_l0_default_active = param->i_frame_reference;
+    pps->i_num_ref_idx_l1_default_active = 1;
+
+    /* weighted prediction: the bipred field is stored as 2 when enabled */
+    pps->b_weighted_pred = param->analyse.i_weighted_pred > 0;
+    if( param->analyse.b_weighted_bipred )
+        pps->b_weighted_bipred = 2;
+    else
+        pps->b_weighted_bipred = 0;
+
+    /* ABR and stitchable streams use the neutral initial QP; otherwise the
+     * constant QP from the parameters is used. */
+    if( param->rc.i_rc_method == X264_RC_ABR || param->b_stitchable )
+        pps->i_pic_init_qp = 26 + QP_BD_OFFSET;
+    else
+        pps->i_pic_init_qp = SPEC_QP( param->rc.i_qp_constant );
+    pps->i_pic_init_qs = 26 + QP_BD_OFFSET;
+
+    pps->i_chroma_qp_index_offset = param->analyse.i_chroma_qp_offset;
+    pps->b_deblocking_filter_control = 1;
+    pps->b_constrained_intra_pred = param->b_constrained_intra;
+    pps->b_redundant_pic_cnt = 0;
+
+    /* normalised to 0 or 1 */
+    pps->b_transform_8x8_mode = !!param->analyse.b_transform_8x8;
+}
+
+/* Serialise pps into the bitstream s, then write RBSP trailing bits and
+ * flush.  sps is consulted for the scaling-list decision (AVC-Intra flags,
+ * CQM preset, chroma format) and provides the lists themselves.  The
+ * statement order is the bitstream order, so it must not be rearranged. */
+void x264_pps_write( bs_t *s, x264_sps_t *sps, x264_pps_t *pps )
+{
+    bs_realign( s );
+    bs_write_ue( s, pps->i_id );
+    bs_write_ue( s, pps->i_sps_id );
+
+    bs_write1( s, pps->b_cabac );
+    bs_write1( s, pps->b_pic_order );
+    bs_write_ue( s, pps->i_num_slice_groups - 1 );
+
+    bs_write_ue( s, pps->i_num_ref_idx_l0_default_active - 1 );
+    bs_write_ue( s, pps->i_num_ref_idx_l1_default_active - 1 );
+    bs_write1( s, pps->b_weighted_pred );
+    bs_write( s, 2, pps->b_weighted_bipred );
+
+    /* initial QP/QS are coded relative to 26 + QP_BD_OFFSET */
+    bs_write_se( s, pps->i_pic_init_qp - 26 - QP_BD_OFFSET );
+    bs_write_se( s, pps->i_pic_init_qs - 26 - QP_BD_OFFSET );
+    bs_write_se( s, pps->i_chroma_qp_index_offset );
+
+    bs_write1( s, pps->b_deblocking_filter_control );
+    bs_write1( s, pps->b_constrained_intra_pred );
+    bs_write1( s, pps->b_redundant_pic_cnt );
+
+    /* Scaling lists are carried in the PPS unless the stream is AVC-Intra "hd"
+     * (x264_sps_write puts them in the SPS in that case) or the flat preset
+     * is used. */
+    int b_scaling_list = !sps->b_avcintra_hd && sps->i_cqm_preset != X264_CQM_FLAT;
+    /* the trailing fields below are only written when at least one of them is needed */
+    if( pps->b_transform_8x8_mode || b_scaling_list )
+    {
+        bs_write1( s, pps->b_transform_8x8_mode );
+        bs_write1( s, b_scaling_list );
+        if( b_scaling_list )
+        {
+            scaling_list_write( s, sps, CQM_4IY );
+            scaling_list_write( s, sps, CQM_4IC );
+            if( sps->b_avcintra_4k )
+            {
+                scaling_list_write( s, sps, CQM_4IC );
+                bs_write1( s, 0 ); // no inter
+                bs_write1( s, 0 ); // no inter
+                bs_write1( s, 0 ); // no inter
+            }
+            else
+            {
+                bs_write1( s, 0 ); // Cr = Cb
+                scaling_list_write( s, sps, CQM_4PY );
+                scaling_list_write( s, sps, CQM_4PC );
+                bs_write1( s, 0 ); // Cr = Cb
+            }
+            if( pps->b_transform_8x8_mode )
+            {
+                scaling_list_write( s, sps, CQM_8IY+4 );
+                if( sps->b_avcintra_4k )
+                    bs_write1( s, 0 ); // no inter
+                else
+                    scaling_list_write( s, sps, CQM_8PY+4 );
+                if( sps->i_chroma_format_idc == CHROMA_444 )
+                {
+                    scaling_list_write( s, sps, CQM_8IC+4 );
+                    scaling_list_write( s, sps, CQM_8PC+4 );
+                    bs_write1( s, 0 ); // Cr = Cb
+                    bs_write1( s, 0 ); // Cr = Cb
+                }
+            }
+        }
+        /* the same chroma QP offset is written a second time here
+         * (presumably as second_chroma_qp_index_offset -- confirm against the spec) */
+        bs_write_se( s, pps->i_chroma_qp_index_offset );
+    }
+
+    bs_rbsp_trailing( s );
+    bs_flush( s );
+}
+
+/* Build a recovery point SEI payload in a scratch buffer and emit it to s
+ * through x264_sei_write().  h is not read by this function. */
+void x264_sei_recovery_point_write( x264_t *h, bs_t *s, int recovery_frame_cnt )
+{
+    ALIGNED_4( uint8_t scratch[100] );
+    bs_t payload;
+
+    M32( scratch ) = 0; // shut up gcc
+    bs_init( &payload, scratch, sizeof(scratch) );
+    bs_realign( &payload );
+
+    bs_write_ue( &payload, recovery_frame_cnt ); // recovery_frame_cnt
+    bs_write1( &payload, 1 );   //exact_match_flag 1
+    bs_write1( &payload, 0 );   //broken_link_flag 0
+    bs_write( &payload, 2, 0 ); //changing_slice_group 0
+
+    bs_align_10( &payload );
+
+    /* bs_pos() counts bits; the SEI writer wants a byte count */
+    const int payload_bytes = bs_pos( &payload ) / 8;
+    x264_sei_write( s, scratch, payload_bytes, SEI_RECOVERY_POINT );
+}
+
+/* Emit a user-data-unregistered SEI that carries the x264 version banner and
+ * the encoder options string.
+ *
+ * Payload layout: the 16-byte uuid below, followed by a NUL-terminated text
+ * string.  Returns 0 on success, -1 when the options string or the payload
+ * buffer cannot be allocated. */
+int x264_sei_version_write( x264_t *h, bs_t *s )
+{
+    // random ID number generated according to ISO-11578
+    static const uint8_t uuid[16] =
+    {
+        0xdc, 0x45, 0xe9, 0xbd, 0xe6, 0xd9, 0x48, 0xb7,
+        0x96, 0x2c, 0xd8, 0x20, 0xd9, 0x23, 0xee, 0xef
+    };
+    char *opts = x264_param2string( &h->param, 0 );
+    char *payload;
+    size_t payload_size;
+    int length;
+
+    if( !opts )
+        return -1;
+    /* 200 bytes of headroom for the uuid, the fixed banner text and the
+     * build's version string, plus room for the options. */
+    payload_size = 200 + strlen( opts );
+    CHECKED_MALLOC( payload, payload_size );
+
+    memcpy( payload, uuid, 16 );
+    /* Bounded write: X264_VERSION is a build-time string of unknown length,
+     * so never let the banner run past the allocation.  An oversized banner
+     * is truncated instead of overflowing the heap buffer. */
+    snprintf( payload+16, payload_size-16, "x264 - core %d%s - H.264/MPEG-4 AVC codec - "
+              "Copy%s 2003-2025 - http://www.videolan.org/x264.html - options: %s",
+              X264_BUILD, X264_VERSION, HAVE_GPL?"left":"right", opts );
+    /* guarantee termination even with a non-conforming snprintf */
+    payload[payload_size-1] = '\0';
+    /* The uuid contains no zero byte, so strlen() runs through it and stops
+     * at the terminator of the text; +1 keeps that terminator in the SEI. */
+    length = strlen(payload)+1;
+
+    x264_sei_write( s, (uint8_t *)payload, length, SEI_USER_DATA_UNREGISTERED );
+
+    x264_free( opts );
+    x264_free( payload );
+    return 0;
+fail:
+    x264_free( opts );
+    return -1;
+}
+
+/* Build a buffering period SEI payload in a scratch buffer and emit it to s
+ * through x264_sei_write().
+ *
+ * The payload holds the SPS id and, when NAL HRD parameters are present, the
+ * initial CPB removal delay and its offset taken from h. */
+void x264_sei_buffering_period_write( x264_t *h, bs_t *s )
+{
+    x264_sps_t *sps = h->sps;
+    ALIGNED_4( uint8_t scratch[100] );
+    bs_t payload;
+
+    M32( scratch ) = 0; // shut up gcc
+    bs_init( &payload, scratch, sizeof(scratch) );
+    bs_realign( &payload );
+
+    bs_write_ue( &payload, sps->i_id );
+
+    if( sps->vui.b_nal_hrd_parameters_present )
+    {
+        /* both fields use the bit width announced in the SPS HRD parameters */
+        bs_write( &payload, sps->vui.hrd.i_initial_cpb_removal_delay_length, h->initial_cpb_removal_delay );
+        bs_write( &payload, sps->vui.hrd.i_initial_cpb_removal_delay_length, h->initial_cpb_removal_delay_offset );
+    }
+
+    bs_align_10( &payload );
+
+    /* bs_pos() counts bits; the SEI writer wants a byte count */
+    const int payload_bytes = bs_pos( &payload ) / 8;
+    x264_sei_write( s, scratch, payload_bytes, SEI_BUFFERING_PERIOD );
+}
+
+/* Build a picture timing SEI payload in a scratch buffer and emit it to s
+ * through x264_sei_write().
+ *
+ * With HRD parameters present the payload carries the CPB removal delay and
+ * DPB output delay of the frame being encoded (h->fenc); with pic_struct
+ * signalling enabled it also carries the pic_struct code followed by one
+ * zero clock_timestamp_flag per entry in num_clock_ts[]. */
+void x264_sei_pic_timing_write( x264_t *h, bs_t *s )
+{
+    x264_sps_t *sps = h->sps;
+    ALIGNED_4( uint8_t scratch[100] );
+    bs_t payload;
+
+    M32( scratch ) = 0; // shut up gcc
+    bs_init( &payload, scratch, sizeof(scratch) );
+    bs_realign( &payload );
+
+    if( sps->vui.b_nal_hrd_parameters_present || sps->vui.b_vcl_hrd_parameters_present )
+    {
+        bs_write( &payload, sps->vui.hrd.i_cpb_removal_delay_length, h->fenc->i_cpb_delay - h->i_cpb_delay_pir_offset );
+        bs_write( &payload, sps->vui.hrd.i_dpb_output_delay_length, h->fenc->i_dpb_output_delay );
+    }
+
+    if( sps->vui.b_pic_struct_present )
+    {
+        bs_write( &payload, 4, h->fenc->i_pic_struct-1 ); // We use index 0 for "Auto"
+
+        // These clock timestamps are not standardised so we don't set them
+        // They could be time of origin, capture or alternative ideal display
+        int flags_left = num_clock_ts[h->fenc->i_pic_struct];
+        while( flags_left-- > 0 )
+            bs_write1( &payload, 0 ); // clock_timestamp_flag
+    }
+
+    bs_align_10( &payload );
+
+    /* bs_pos() counts bits; the SEI writer wants a byte count */
+    const int payload_bytes = bs_pos( &payload ) / 8;
+    x264_sei_write( s, scratch, payload_bytes, SEI_PIC_TIMING );
+}
+
+/* Write a frame packing arrangement SEI message derived from
+ * h->param.i_frame_packing.
+ *
+ * Arrangement type 0 sets quincunx_sampling_flag; type 5 toggles
+ * current_frame_is_frame0_flag on the parity of h->fenc->i_frame and uses a
+ * repetition period of 0 instead of 1; type 6 gets content_interpretation_type
+ * 0 instead of 1.  Grid positions are written (as zero) only when neither
+ * quincunx sampling nor type 5 is in use. */
+void x264_sei_frame_packing_write( x264_t *h, bs_t *s )
+{
+    ALIGNED_4( uint8_t tmp_buf[100] );
+    bs_t payload;
+    int quincunx_sampling_flag = h->param.i_frame_packing == 0;
+    /* Type 5 alternates views frame by frame; several fields depend on it. */
+    int b_not_type5 = h->param.i_frame_packing != 5;
+
+    M32( tmp_buf ) = 0; // shut up gcc
+    bs_init( &payload, tmp_buf, 100 );
+    bs_realign( &payload );
+
+    bs_write_ue( &payload, 0 );                         // frame_packing_arrangement_id
+    bs_write1( &payload, 0 );                           // frame_packing_arrangement_cancel_flag
+    bs_write ( &payload, 7, h->param.i_frame_packing ); // frame_packing_arrangement_type
+    bs_write1( &payload, quincunx_sampling_flag );      // quincunx_sampling_flag
+
+    // content_interpretation_type:
+    // 0: views are unrelated, 1: left view is on the left, 2: left view is on the right
+    bs_write ( &payload, 6, h->param.i_frame_packing != 6 );
+
+    bs_write1( &payload, 0 );                           // spatial_flipping_flag
+    bs_write1( &payload, 0 );                           // frame0_flipped_flag
+    bs_write1( &payload, 0 );                           // field_views_flag
+    // current_frame_is_frame0_flag: set on even frame numbers for type 5 only
+    bs_write1( &payload, !b_not_type5 && !(h->fenc->i_frame&1) );
+    bs_write1( &payload, 0 );                           // frame0_self_contained_flag
+    bs_write1( &payload, 0 );                           // frame1_self_contained_flag
+
+    if( !quincunx_sampling_flag && b_not_type5 )
+    {
+        bs_write( &payload, 4, 0 );                     // frame0_grid_position_x
+        bs_write( &payload, 4, 0 );                     // frame0_grid_position_y
+        bs_write( &payload, 4, 0 );                     // frame1_grid_position_x
+        bs_write( &payload, 4, 0 );                     // frame1_grid_position_y
+    }
+    bs_write( &payload, 8, 0 );                         // frame_packing_arrangement_reserved_byte
+
+    // "frame_packing_arrangement_repetition_period equal to 1 specifies that the frame packing arrangement SEI message persists in output"
+    // for (i_frame_packing == 5) this will undermine current_frame_is_frame0_flag which must alternate every view sequence
+    bs_write_ue( &payload, b_not_type5 );               // frame_packing_arrangement_repetition_period
+    bs_write1( &payload, 0 );                           // frame_packing_arrangement_extension_flag
+
+    bs_align_10( &payload );
+
+    int payload_bytes = bs_pos( &payload ) / 8;
+    x264_sei_write( s, tmp_buf, payload_bytes, SEI_FRAME_PACKING );
+}
+
+/* Write a mastering display SEI message from h->param.mastering_display.
+ *
+ * Eight 16-bit chromaticity coordinates are written in the order green, blue,
+ * red, white point (x then y for each), followed by the 32-bit maximum and
+ * minimum display values. */
+void x264_sei_mastering_display_write( x264_t *h, bs_t *s )
+{
+    ALIGNED_4( uint8_t tmp_buf[100] );
+    bs_t payload;
+
+    M32( tmp_buf ) = 0; // shut up gcc
+    bs_init( &payload, tmp_buf, 100 );
+    bs_realign( &payload );
+
+    /* Green primary */
+    bs_write( &payload, 16, h->param.mastering_display.i_green_x );
+    bs_write( &payload, 16, h->param.mastering_display.i_green_y );
+    /* Blue primary */
+    bs_write( &payload, 16, h->param.mastering_display.i_blue_x );
+    bs_write( &payload, 16, h->param.mastering_display.i_blue_y );
+    /* Red primary */
+    bs_write( &payload, 16, h->param.mastering_display.i_red_x );
+    bs_write( &payload, 16, h->param.mastering_display.i_red_y );
+    /* White point */
+    bs_write( &payload, 16, h->param.mastering_display.i_white_x );
+    bs_write( &payload, 16, h->param.mastering_display.i_white_y );
+    /* Display range, maximum first */
+    bs_write32( &payload, h->param.mastering_display.i_display_max );
+    bs_write32( &payload, h->param.mastering_display.i_display_min );
+
+    bs_align_10( &payload );
+
+    int payload_bytes = bs_pos( &payload ) / 8;
+    x264_sei_write( s, tmp_buf, payload_bytes, SEI_MASTERING_DISPLAY );
+}
+
+/* Write a content light level SEI message: the 16-bit i_max_cll followed by
+ * the 16-bit i_max_fall from h->param.content_light_level. */
+void x264_sei_content_light_level_write( x264_t *h, bs_t *s )
+{
+    ALIGNED_4( uint8_t tmp_buf[100] );
+    bs_t payload;
+
+    M32( tmp_buf ) = 0; // shut up gcc
+    bs_init( &payload, tmp_buf, 100 );
+    bs_realign( &payload );
+
+    bs_write( &payload, 16, h->param.content_light_level.i_max_cll );
+    bs_write( &payload, 16, h->param.content_light_level.i_max_fall );
+
+    bs_align_10( &payload );
+
+    int payload_bytes = bs_pos( &payload ) / 8;
+    x264_sei_write( s, tmp_buf, payload_bytes, SEI_CONTENT_LIGHT_LEVEL );
+}
+
+/* Write an alternative transfer characteristics SEI message carrying the
+ * single 8-bit value h->param.i_alternative_transfer. */
+void x264_sei_alternative_transfer_write( x264_t *h, bs_t *s )
+{
+    ALIGNED_4( uint8_t tmp_buf[100] );
+    bs_t payload;
+
+    M32( tmp_buf ) = 0; // shut up gcc
+    bs_init( &payload, tmp_buf, 100 );
+    bs_realign( &payload );
+
+    // preferred_transfer_characteristics
+    bs_write ( &payload, 8, h->param.i_alternative_transfer );
+
+    bs_align_10( &payload );
+
+    int payload_bytes = bs_pos( &payload ) / 8;
+    x264_sei_write( s, tmp_buf, payload_bytes, SEI_ALTERNATIVE_TRANSFER );
+}
+
+/* Write `filler` bytes of 0xff directly into s, then the RBSP trailing bits,
+ * and flush the bitstream.  A non-positive `filler` writes no 0xff bytes.
+ * The encoder handle h is not used. */
+void x264_filler_write( x264_t *h, bs_t *s, int filler )
+{
+    bs_realign( s );
+
+    int remaining = filler;
+    while( remaining > 0 )
+    {
+        bs_write( s, 8, 0xff );
+        remaining--;
+    }
+
+    bs_rbsp_trailing( s );
+    bs_flush( s );
+}
+
+/* Write a decoded reference picture marking repetition SEI message built from
+ * the backed-up slice header (h->sh_backup).
+ *
+ * The message repeats frame_num and, if the slice header carried any MMCO
+ * commands, one (operation 1, difference_of_pic_nums - 1) pair per command
+ * followed by a terminating ue(0). */
+void x264_sei_dec_ref_pic_marking_write( x264_t *h, bs_t *s )
+{
+    ALIGNED_4( uint8_t tmp_buf[100] );
+    bs_t payload;
+    x264_slice_header_t *sh = &h->sh_backup;
+
+    M32( tmp_buf ) = 0; // shut up gcc
+    bs_init( &payload, tmp_buf, 100 );
+    bs_realign( &payload );
+
+    /* We currently only use this for repeating B-refs, as required by Blu-ray. */
+    bs_write1( &payload, 0 );                 //original_idr_flag
+    bs_write_ue( &payload, sh->i_frame_num ); //original_frame_num
+    if( !h->sps->b_frame_mbs_only )
+        bs_write1( &payload, 0 );             //original_field_pic_flag
+
+    /* One flag says whether MMCO commands follow at all. */
+    const int b_has_mmco = sh->i_mmco_command_count > 0;
+    bs_write1( &payload, b_has_mmco );
+    if( b_has_mmco )
+    {
+        int cmd = 0;
+        while( cmd < sh->i_mmco_command_count )
+        {
+            bs_write_ue( &payload, 1 );
+            bs_write_ue( &payload, sh->mmco[cmd].i_difference_of_pic_nums - 1 );
+            cmd++;
+        }
+        /* ue(0) closes the command list */
+        bs_write_ue( &payload, 0 );
+    }
+
+    bs_align_10( &payload );
+
+    int payload_bytes = bs_pos( &payload ) / 8;
+    x264_sei_write( s, tmp_buf, payload_bytes, SEI_DEC_REF_PIC_MARKING );
+}
+
+/* Write the fixed 497-byte AVC-Intra "UMID" user-data SEI message.
+ *
+ * The payload is 0xff-filled, prefixed with avcintra_uuid and the 4-byte tag
+ * "UMID", and then patched at a handful of fixed offsets.  Always returns 0.
+ *
+ * NOTE(review): the message is written to h->out.bs; the `s` argument is not
+ * used by this function -- confirm that every caller passes &h->out.bs. */
+int x264_sei_avcintra_umid_write( x264_t *h, bs_t *s )
+{
+    static const char umid_tag[] = "UMID";
+    /* These bytes appear to be some sort of frame/seconds counter in certain applications,
+     * but others jump around, so leave them as zero for now */
+    static const uint8_t zeroed_offsets[] =
+    {
+        22, 23, 25, 26,
+        30, 31, 33, 34,
+        62, 63, 65, 66,
+        70, 71, 73, 74
+    };
+    enum { UMID_SEI_SIZE = 497 };
+    uint8_t data[512];
+
+    memset( data, 0xff, UMID_SEI_SIZE );
+    memcpy( data, avcintra_uuid, sizeof(avcintra_uuid) );
+    memcpy( data+16, umid_tag, sizeof(umid_tag)-1 ); /* tag without its NUL */
+
+    for( int i = 0; i < (int)sizeof(zeroed_offsets); i++ )
+        data[zeroed_offsets[i]] = 0;
+
+    data[20] = 0x13;
+    data[28] = 0x14;
+    data[36] = 0x60;
+    data[41] = 0x22; /* Believed to be some sort of end of basic UMID identifier */
+    data[60] = 0x62;
+    data[68] = 0x63;
+
+    x264_sei_write( &h->out.bs, data, UMID_SEI_SIZE, SEI_USER_DATA_UNREGISTERED );
+
+    return 0;
+}
+
+/* Write an AVC-Intra "VANC" user-data SEI message of `len` bytes.
+ *
+ * The payload is 0xff-filled for `len` bytes and prefixed with avcintra_uuid
+ * and the 4-byte tag "VANC".  Returns 0 on success, or -1 (after logging an
+ * error) when `len` is negative or larger than the 6000-byte local buffer.
+ *
+ * NOTE(review): the message is written to h->out.bs; the `s` argument is not
+ * used.  A `len` below 20 is accepted and would emit a truncated UUID/tag --
+ * presumably callers never pass such a value; verify against the caller. */
+int x264_sei_avcintra_vanc_write( x264_t *h, bs_t *s, int len )
+{
+    static const char vanc_tag[] = "VANC";
+    uint8_t data[6000];
+
+    const int b_len_ok = len >= 0 && (unsigned)len <= sizeof(data);
+    if( !b_len_ok )
+    {
+        x264_log( h, X264_LOG_ERROR, "AVC-Intra SEI is too large (%d)\n", len );
+        return -1;
+    }
+
+    memset( data, 0xff, len );
+    memcpy( data, avcintra_uuid, sizeof(avcintra_uuid) );
+    memcpy( data+16, vanc_tag, sizeof(vanc_tag)-1 ); /* tag without its NUL */
+
+    x264_sei_write( &h->out.bs, data, len, SEI_USER_DATA_UNREGISTERED );
+
+    return 0;
+}
+
+/* ERROR(...): helper for x264_validate_levels().
+ * Records a violation by setting the local `ret` to 1 and, when the local
+ * `verbose` is nonzero, logs the printf-style message as a warning via
+ * x264_log().  It relies on `h`, `verbose` and `ret` being in scope at the
+ * point of use.  Any earlier definition of ERROR is discarded first. */
+#undef ERROR
+#define ERROR(...)\
+{\
+    if( verbose )\
+        x264_log( h, X264_LOG_WARNING, __VA_ARGS__ );\
+    ret = 1;\
+}
+
+/* Check the current encoder configuration against the limits of the level
+ * selected by h->param.i_level_idc.
+ *
+ * Checked limits: frame size in macroblocks (total and per-dimension), DPB
+ * size, VBV bitrate and buffer size (scaled by a profile-dependent factor),
+ * MV range, interlacing on frame-only levels, and macroblock rate (only when
+ * i_fps_den is positive).
+ *
+ * Returns 0 if no limit is exceeded and 1 otherwise.  Warnings are logged for
+ * each violation only when `verbose` is nonzero. */
+int x264_validate_levels( x264_t *h, int verbose )
+{
+    int ret = 0;
+    const int mb_w = h->sps->i_mb_width;
+    const int mb_h = h->sps->i_mb_height;
+    int mbs = mb_w * mb_h;
+    int dpb = mbs * h->sps->vui.i_max_dec_frame_buffering;
+
+    /* Profile-dependent multiplier applied (as factor/4) to the bitrate and
+     * CPB limits of the level table. */
+    int cbp_factor = 4;
+    if( h->sps->i_profile_idc >= PROFILE_HIGH422 )
+        cbp_factor = 16;
+    else if( h->sps->i_profile_idc == PROFILE_HIGH10 )
+        cbp_factor = 12;
+    else if( h->sps->i_profile_idc == PROFILE_HIGH )
+        cbp_factor = 5;
+
+    /* Find the matching level; stop on the table's level_idc == 0 entry if
+     * the requested level is not listed. */
+    const x264_level_t *l;
+    for( l = x264_levels; l->level_idc != 0; l++ )
+        if( l->level_idc == h->param.i_level_idc )
+            break;
+
+    const int b_too_many_mbs = l->frame_size < mbs;
+    const int b_too_wide     = l->frame_size*8 < mb_w * mb_w;
+    const int b_too_tall     = l->frame_size*8 < mb_h * mb_h;
+    if( b_too_many_mbs || b_too_wide || b_too_tall )
+        ERROR( "frame MB size (%dx%d) > level limit (%d)\n",
+               mb_w, mb_h, l->frame_size );
+    if( dpb > l->dpb )
+        ERROR( "DPB size (%d frames, %d mbs) > level limit (%d frames, %d mbs)\n",
+                h->sps->vui.i_max_dec_frame_buffering, dpb, l->dpb / mbs, l->dpb );
+
+/* CHECK: flag a violation when `val` exceeds `limit`. */
+#define CHECK( name, limit, val ) \
+    if( (val) > (limit) ) \
+        ERROR( name " (%"PRId64") > level limit (%d)\n", (int64_t)(val), (limit) );
+
+    CHECK( "VBV bitrate", (l->bitrate * cbp_factor) / 4, h->param.rc.i_vbv_max_bitrate );
+    CHECK( "VBV buffer", (l->cpb * cbp_factor) / 4, h->param.rc.i_vbv_buffer_size );
+    CHECK( "MV range", l->mv_range, h->param.analyse.i_mv_range );
+    /* On frame-only levels the limit is 0, so any interlaced flag trips it. */
+    CHECK( "interlaced", !l->frame_only, h->param.b_interlaced );
+    CHECK( "fake interlaced", !l->frame_only, h->param.b_fake_interlaced );
+
+    if( h->param.i_fps_den > 0 )
+        CHECK( "MB rate", l->mbps, (int64_t)mbs * h->param.i_fps_num / h->param.i_fps_den );
+
+    /* TODO check the rest of the limits */
+    return ret;
+}
--- a/encoder/set.h
+++ b/encoder/set.h
@@ -0,0 +1,71 @@
+/*****************************************************************************
+ * set.h: header writing
+ *****************************************************************************
+ * Copyright (C) 2003-2025 x264 project
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *          Loren Merritt <lorenm@u.washington.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_ENCODER_SET_H
+#define X264_ENCODER_SET_H
+
+/* Every entry point below is first aliased through x264_template().
+ * NOTE(review): that macro is defined outside this header; presumably it maps
+ * each name to a build-specific symbol -- confirm against its definition. */
+
+/* Sequence / picture parameter set setup and serialisation. */
+#define x264_sps_init x264_template(sps_init)
+void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param );
+#define x264_sps_init_reconfigurable x264_template(sps_init_reconfigurable)
+void x264_sps_init_reconfigurable( x264_sps_t *sps, x264_param_t *param );
+#define x264_sps_init_scaling_list x264_template(sps_init_scaling_list)
+void x264_sps_init_scaling_list( x264_sps_t *sps, x264_param_t *param );
+#define x264_sps_write x264_template(sps_write)
+void x264_sps_write( bs_t *s, x264_sps_t *sps );
+#define x264_pps_init x264_template(pps_init)
+void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *sps );
+#define x264_pps_write x264_template(pps_write)
+void x264_pps_write( bs_t *s, x264_sps_t *sps, x264_pps_t *pps );
+
+/* SEI message writers. */
+#define x264_sei_recovery_point_write x264_template(sei_recovery_point_write)
+void x264_sei_recovery_point_write( x264_t *h, bs_t *s, int recovery_frame_cnt );
+/* Returns 0 on success, -1 on failure. */
+#define x264_sei_version_write x264_template(sei_version_write)
+int  x264_sei_version_write( x264_t *h, bs_t *s );
+/* Returns 0 when no checked level limit is exceeded, 1 otherwise; warnings
+ * are logged only when verbose is nonzero. */
+#define x264_validate_levels x264_template(validate_levels)
+int  x264_validate_levels( x264_t *h, int verbose );
+#define x264_sei_buffering_period_write x264_template(sei_buffering_period_write)
+void x264_sei_buffering_period_write( x264_t *h, bs_t *s );
+#define x264_sei_pic_timing_write x264_template(sei_pic_timing_write)
+void x264_sei_pic_timing_write( x264_t *h, bs_t *s );
+#define x264_sei_dec_ref_pic_marking_write x264_template(sei_dec_ref_pic_marking_write)
+void x264_sei_dec_ref_pic_marking_write( x264_t *h, bs_t *s );
+#define x264_sei_frame_packing_write x264_template(sei_frame_packing_write)
+void x264_sei_frame_packing_write( x264_t *h, bs_t *s );
+#define x264_sei_mastering_display_write x264_template(sei_mastering_display_write)
+void x264_sei_mastering_display_write( x264_t *h, bs_t *s );
+#define x264_sei_content_light_level_write x264_template(sei_content_light_level_write)
+void x264_sei_content_light_level_write( x264_t *h, bs_t *s );
+#define x264_sei_alternative_transfer_write x264_template(sei_alternative_transfer_write)
+void x264_sei_alternative_transfer_write( x264_t *h, bs_t *s );
+/* AVC-Intra user-data SEI writers.  Both return 0 on success; the VANC writer
+ * returns -1 when len is negative or exceeds its internal buffer. */
+#define x264_sei_avcintra_umid_write x264_template(sei_avcintra_umid_write)
+int  x264_sei_avcintra_umid_write( x264_t *h, bs_t *s );
+#define x264_sei_avcintra_vanc_write x264_template(sei_avcintra_vanc_write)
+int  x264_sei_avcintra_vanc_write( x264_t *h, bs_t *s, int len );
+/* Generic SEI writer: takes a payload buffer, its size in bytes and the SEI
+ * payload type. */
+#define x264_sei_write x264_template(sei_write)
+void x264_sei_write( bs_t *s, uint8_t *payload, int payload_size, int payload_type );
+/* Writes `filler` bytes of 0xff plus RBSP trailing bits into s. */
+#define x264_filler_write x264_template(filler_write)
+void x264_filler_write( x264_t *h, bs_t *s, int filler );
+
+#endif
--- a/encoder/slicetype-cl.c
+++ b/encoder/slicetype-cl.c
@@ -0,0 +1,782 @@
+/*****************************************************************************
+ * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead)
+ *****************************************************************************
+ * Copyright (C) 2012-2025 x264 project
+ *
+ * Authors: Steve Borho <sborho@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "macroblock.h"
+#include "me.h"
+#include "slicetype-cl.h"
+
+#if HAVE_OPENCL
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+/* Forward declaration of the weighted-prediction analysis routine, declared
+ * here directly rather than through an included header.
+ * NOTE(review): the definition is not in the visible part of this file;
+ * presumably it lives in the C slicetype code -- confirm. */
+#define x264_weights_analyse x264_template(weights_analyse)
+void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead );
+
+/* We define CL_QUEUE_THREAD_HANDLE_AMD here because it is not defined
+ * in the OpenCL headers shipped with NVIDIA drivers.  We need to be
+ * able to compile on an NVIDIA machine and run optimally on an AMD GPU. */
+#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E
+
+/* OCLCHECK( method, args... ): call ocl->method( args... ) with error handling.
+ *
+ * Requires `h`, `ocl` and a `status` variable in scope, and may only be used
+ * inside a function returning int, because it contains `return -1`:
+ *  - if a fatal OpenCL error was already recorded, it returns -1 without
+ *    making the call;
+ *  - if the call does not return CL_SUCCESS, it clears h->param.b_opencl,
+ *    sets h->opencl.b_fatal_error, logs the method name and status code, and
+ *    returns -1. */
+#define OCLCHECK( method, ... )\
+do\
+{\
+    if( h->opencl.b_fatal_error )\
+        return -1;\
+    status = ocl->method( __VA_ARGS__ );\
+    if( status != CL_SUCCESS ) {\
+        h->param.b_opencl = 0;\
+        h->opencl.b_fatal_error = 1;\
+        x264_log( h, X264_LOG_ERROR, # method " error '%d'\n", status );\
+        return -1;\
+    }\
+} while( 0 )
+
+/* Wait for all queued OpenCL work to finish, then complete every pending
+ * device-to-host transfer by copying it out of the page-locked staging buffer
+ * into its final destination.  Afterwards the pending-copy list and the
+ * staging buffer occupancy are both reset to empty. */
+void x264_opencl_flush( x264_t *h )
+{
+    x264_opencl_function_t *ocl = h->opencl.ocl;
+
+    ocl->clFinish( h->opencl.queue );
+
+    int idx = 0;
+    while( idx < h->opencl.num_copies )
+    {
+        memcpy( h->opencl.copies[idx].dest,
+                h->opencl.copies[idx].src,
+                h->opencl.copies[idx].bytes );
+        idx++;
+    }
+
+    /* Everything has been drained; the staging area can be reused. */
+    h->opencl.pl_occupancy = 0;
+    h->opencl.num_copies = 0;
+}
+
+/* Reserve `bytes` bytes from the page-locked staging buffer and return a
+ * pointer to the reserved region.  If the request would reach or exceed
+ * PAGE_LOCKED_BUF_SIZE at the current occupancy, the buffer is first drained
+ * with x264_opencl_flush().  A single request must be smaller than the whole
+ * buffer (asserted). */
+static void *opencl_alloc_locked( x264_t *h, int bytes )
+{
+    const int b_needs_flush = h->opencl.pl_occupancy + bytes >= PAGE_LOCKED_BUF_SIZE;
+    if( b_needs_flush )
+        x264_opencl_flush( h );
+    assert( bytes < PAGE_LOCKED_BUF_SIZE );
+
+    char *region = h->opencl.page_locked_ptr + h->opencl.pl_occupancy;
+    h->opencl.pl_occupancy += bytes;
+    return region;
+}
+
+/* Upload a frame to the OpenCL device and queue its intra cost analysis.
+ *
+ * Does nothing and returns 0 if fenc->b_intra_calculated is already set.
+ * Otherwise it:
+ *   1. creates the encoder-wide device buffers on first use and the per-frame
+ *      device buffers if this frame has none yet;
+ *   2. uploads the luma plane and the AQ inverse qscale factors (or fills the
+ *      latter with 256 on the device when AQ data is not available);
+ *   3. queues the downscale, intra and row-sum kernels;
+ *   4. queues asynchronous reads of the results into page-locked memory and
+ *      registers the final host-side copies, which are carried out later by
+ *      x264_opencl_flush().
+ *
+ * Returns 0 on success and -1 if any OpenCL call fails; on failure
+ * h->param.b_opencl is cleared.
+ *
+ * NOTE(review): fenc->b_intra_calculated is set before any fallible call, so
+ * it stays set when this function returns -1 -- confirm that is intended. */
+int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda )
+{
+    if( fenc->b_intra_calculated )
+        return 0;
+    fenc->b_intra_calculated = 1;
+
+    x264_opencl_function_t *ocl = h->opencl.ocl;
+    int luma_length = fenc->i_stride[0] * fenc->i_lines[0];
+
+/* Allocation helpers: on failure they disable OpenCL, log, and return -1 from
+ * this function.  Unlike OCLCHECK they do not set h->opencl.b_fatal_error. */
+#define CREATEBUF( out, flags, size )\
+    out = ocl->clCreateBuffer( h->opencl.context, (flags), (size), NULL, &status );\
+    if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateBuffer error '%d'\n", status ); return -1; }
+#define CREATEIMAGE( out, flags, pf, width, height )\
+    out = ocl->clCreateImage2D( h->opencl.context, (flags), &pf, width, height, 0, NULL, &status );\
+    if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateImage2D error '%d'\n", status ); return -1; }
+
+    int mb_count = h->mb.i_mb_count;
+    cl_int status;
+
+    /* lowres_mv_costs doubles as the "shared buffers already exist" marker */
+    if( !h->opencl.lowres_mv_costs )
+    {
+        /* Allocate shared memory buffers */
+        int width = h->mb.i_mb_width * 8 * SIZEOF_PIXEL;
+        int height = h->mb.i_mb_height * 8 * SIZEOF_PIXEL;
+
+        cl_image_format pixel_format;
+        pixel_format.image_channel_order = CL_R;
+        pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
+        CREATEIMAGE( h->opencl.weighted_luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );
+
+        /* Each successive scale halves both dimensions */
+        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
+        {
+            pixel_format.image_channel_order = CL_RGBA;
+            pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
+            CREATEIMAGE( h->opencl.weighted_scaled_images[i], CL_MEM_READ_WRITE, pixel_format, width, height );
+            width >>= 1;
+            height >>= 1;
+        }
+
+        CREATEBUF( h->opencl.lowres_mv_costs,     CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
+        CREATEBUF( h->opencl.lowres_costs[0],     CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
+        CREATEBUF( h->opencl.lowres_costs[1],     CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
+        CREATEBUF( h->opencl.mv_buffers[0],       CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
+        CREATEBUF( h->opencl.mv_buffers[1],       CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
+        CREATEBUF( h->opencl.mvp_buffer,          CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
+        CREATEBUF( h->opencl.frame_stats[0],      CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
+        CREATEBUF( h->opencl.frame_stats[1],      CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
+        CREATEBUF( h->opencl.row_satds[0],        CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
+        CREATEBUF( h->opencl.row_satds[1],        CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
+        CREATEBUF( h->opencl.luma_16x16_image[0], CL_MEM_READ_ONLY,  luma_length );
+        CREATEBUF( h->opencl.luma_16x16_image[1], CL_MEM_READ_ONLY,  luma_length );
+    }
+
+    /* intra_cost doubles as the "per-frame buffers already exist" marker */
+    if( !fenc->opencl.intra_cost )
+    {
+        /* Allocate per-frame buffers */
+        int width = h->mb.i_mb_width * 8 * SIZEOF_PIXEL;
+        int height = h->mb.i_mb_height * 8 * SIZEOF_PIXEL;
+
+        cl_image_format pixel_format;
+        pixel_format.image_channel_order = CL_R;
+        pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
+        CREATEIMAGE( fenc->opencl.luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );
+
+        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
+        {
+            pixel_format.image_channel_order = CL_RGBA;
+            pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
+            CREATEIMAGE( fenc->opencl.scaled_image2Ds[i], CL_MEM_READ_WRITE, pixel_format, width, height );
+            width >>= 1;
+            height >>= 1;
+        }
+        CREATEBUF( fenc->opencl.inv_qscale_factor, CL_MEM_READ_ONLY,  mb_count * sizeof(int16_t) );
+        CREATEBUF( fenc->opencl.intra_cost,        CL_MEM_WRITE_ONLY, mb_count * sizeof(int16_t) );
+        CREATEBUF( fenc->opencl.lowres_mvs0,       CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
+        CREATEBUF( fenc->opencl.lowres_mvs1,       CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
+        CREATEBUF( fenc->opencl.lowres_mv_costs0,  CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
+        CREATEBUF( fenc->opencl.lowres_mv_costs1,  CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
+    }
+#undef CREATEBUF
+#undef CREATEIMAGE
+
+    /* Copy image to the GPU, downscale to unpadded 8x8, then continue for all scales */
+
+    /* The upload is non-blocking (CL_FALSE), so the source is first staged in
+     * the page-locked buffer rather than read straight from fenc->plane[0] */
+    char *locked = opencl_alloc_locked( h, luma_length );
+    memcpy( locked, fenc->plane[0], luma_length );
+    OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue,  h->opencl.luma_16x16_image[h->opencl.last_buf], CL_FALSE, 0, luma_length, locked, 0, NULL, NULL );
+
+    size_t gdim[2];
+    if( h->param.rc.i_aq_mode && fenc->i_inv_qscale_factor )
+    {
+        /* AQ data available: upload the per-macroblock factors as they are */
+        int size = h->mb.i_mb_count * sizeof(int16_t);
+        locked = opencl_alloc_locked( h, size );
+        memcpy( locked, fenc->i_inv_qscale_factor, size );
+        OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, fenc->opencl.inv_qscale_factor, CL_FALSE, 0, size, locked, 0, NULL, NULL );
+    }
+    else
+    {
+        /* Fill fenc->opencl.inv_qscale_factor with NOP (256) */
+        cl_uint arg = 0;
+        int16_t value = 256;
+        OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
+        OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(int16_t), &value );
+        gdim[0] = h->mb.i_mb_count;
+        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.memset_kernel, 1, NULL, gdim, NULL, 0, NULL, NULL );
+    }
+
+    /* First downscale pass: writes scale 0 and the luma_hpel image */
+    int stride = fenc->i_stride[0];
+    cl_uint arg = 0;
+    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.luma_16x16_image[h->opencl.last_buf] );
+    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
+    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.luma_hpel );
+    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(int), &stride );
+    gdim[0] = 8 * h->mb.i_mb_width;
+    gdim[1] = 8 * h->mb.i_mb_height;
+    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.downscale_hpel_kernel, 2, NULL, gdim, NULL, 0, NULL, NULL );
+
+    /* Remaining scales: each pass reads scale i and writes scale i+1, and
+     * stops early once either dimension would drop below 16 */
+    for( int i = 0; i < NUM_IMAGE_SCALES - 1; i++ )
+    {
+        /* Workaround for AMD Southern Island:
+         *
+         * Alternate kernel instances.  No perf impact to this, so we do it for
+         * all GPUs.  It prevents the same kernel from being enqueued
+         * back-to-back, avoiding a dependency calculation bug in the driver.
+         */
+        cl_kernel kern = i & 1 ? h->opencl.downscale_kernel1 : h->opencl.downscale_kernel2;
+
+        arg = 0;
+        OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i] );
+        OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i+1] );
+        gdim[0] >>= 1;
+        gdim[1] >>= 1;
+        if( gdim[0] < 16 || gdim[1] < 16 )
+            break;
+        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, kern, 2, NULL, gdim, NULL, 0, NULL, NULL );
+    }
+
+    /* Intra kernel launch: global width is the MB width rounded up to a
+     * multiple of 32, with 32x8 work-groups */
+    size_t ldim[2];
+    gdim[0] = ((h->mb.i_mb_width + 31)>>5)<<5;
+    gdim[1] = 8*h->mb.i_mb_height;
+    ldim[0] = 32;
+    ldim[1] = 8;
+    arg = 0;
+
+    /* For presets slow, slower, and placebo, check all 10 intra modes that the
+     * C lookahead supports.  For faster presets, only check the most frequent 8
+     * modes
+     */
+    int slow = h->param.analyse.i_subpel_refine > 7;
+    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
+    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
+    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
+    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &lambda );
+    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
+    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &slow );
+    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
+
+    /* Row-sum kernel launch: one 256-wide work-group per macroblock row */
+    gdim[0] = 256;
+    gdim[1] = h->mb.i_mb_height;
+    ldim[0] = 256;
+    ldim[1] = 1;
+    arg = 0;
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
+    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
+
+    /* Up to four copy records are added below; make room for them first */
+    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
+        x264_opencl_flush( h );
+
+    /* Per-macroblock intra costs -> fenc->lowres_costs[0][0] */
+    int size = h->mb.i_mb_count * sizeof(int16_t);
+    locked = opencl_alloc_locked( h, size );
+    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.intra_cost, CL_FALSE, 0, size, locked, 0, NULL, NULL );
+    h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[0][0];
+    h->opencl.copies[h->opencl.num_copies].src = locked;
+    h->opencl.copies[h->opencl.num_copies].bytes = size;
+    h->opencl.num_copies++;
+
+    /* Per-row SATD sums -> fenc->i_row_satds[0][0] */
+    size = h->mb.i_mb_height * sizeof(int);
+    locked = opencl_alloc_locked( h, size );
+    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
+    h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[0][0];
+    h->opencl.copies[h->opencl.num_copies].src = locked;
+    h->opencl.copies[h->opencl.num_copies].bytes = size;
+    h->opencl.num_copies++;
+
+    /* Frame stats: four ints are read back, but only the first two are
+     * copied out (to i_cost_est and i_cost_est_aq respectively) */
+    size = sizeof(int) * 4;
+    locked = opencl_alloc_locked( h, size );
+    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
+    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[0][0];
+    h->opencl.copies[h->opencl.num_copies].src = locked;
+    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
+    h->opencl.num_copies++;
+    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[0][0];
+    h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
+    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
+    h->opencl.num_copies++;
+
+    /* Alternate between the two sets of shared buffers for the next call */
+    h->opencl.last_buf = !h->opencl.last_buf;
+    return 0;
+}
+
+/* Pick work-group (local) dimensions for a 2D kernel launch and round the
+ * global width to match.
+ *
+ * On entry gdims[] holds the desired global size.  On return ldims[] holds the
+ * chosen local size and gdims[0] may have been adjusted (gdims[1] is only
+ * read).  The heuristic was tuned empirically on a number of AMD and NV GPUs:
+ * perfect launch dimensions cannot be computed in general, and some
+ * applications self-tune by timing many candidates.  Here we simply make an
+ * educated guess based on what GPUs typically prefer. */
+static void optimal_launch_dims( x264_t *h, size_t *gdims, size_t *ldims, const cl_kernel kernel, const cl_device_id device )
+{
+    x264_opencl_function_t *ocl = h->opencl.ocl;
+
+    /* Reasonable defaults for OpenCL 1.0 devices, where the queries below may
+     * fail and leave these values untouched. */
+    size_t max_work_group = 256;
+    size_t preferred_multiple = 64;
+    cl_uint num_cus = 6;
+
+    ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group, NULL );
+    ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &preferred_multiple, NULL );
+    ocl->clGetDeviceInfo( device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &num_cus, NULL );
+
+    /* Work on local copies; results are stored back at the end. */
+    size_t global_x = gdims[0];
+    const size_t global_y = gdims[1];
+    size_t local_x = preferred_multiple;
+    size_t local_y = 8;
+
+    /* Shrink local_y until it evenly divides global_y, widening local_x to
+     * keep the work-group size constant. */
+    while( global_y & (local_y - 1) )
+    {
+        local_x <<= 1;
+        local_y >>= 1;
+    }
+
+    /* Fit the work-group under the device/kernel limit, preferring to give up
+     * height before dropping below the preferred width multiple. */
+    while( local_x * local_y > max_work_group )
+    {
+        if( local_x <= preferred_multiple && local_y > 1 )
+            local_y >>= 1;
+        else
+            local_x >>= 1;
+    }
+
+    if( local_x > global_x )
+    {
+        /* Drop preferred multiples until we are close to the global width,
+         * then launch exactly one work-group across. */
+        while( global_x + preferred_multiple < local_x )
+            local_x -= preferred_multiple;
+        global_x = local_x;
+    }
+    else
+    {
+        /* Round the global width up to a whole number of work-groups. */
+        size_t groups_x = (global_x + local_x - 1) / local_x;
+        global_x = groups_x * local_x;
+    }
+
+    /* Too few work-groups to occupy the compute units: make them smaller. */
+    while( (global_x/local_x) * (global_y/local_y) * 2 <= num_cus )
+    {
+        if( local_x > preferred_multiple )
+            local_x >>= 1;
+        else if( local_y > 1 )
+            local_y >>= 1;
+        else
+            break;
+    }
+
+    /* For smaller GPUs, try not to abuse their texture cache. */
+    if( num_cus == 6 && local_x == 64 && local_y == 4 )
+        local_x = 32;
+
+    gdims[0] = global_x;
+    ldims[0] = local_x;
+    ldims[1] = local_y;
+}
+
+int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w )
+{
+    x264_opencl_function_t *ocl = h->opencl.ocl;
+    x264_frame_t *fenc = frames[b];
+    x264_frame_t *fref = frames[ref];
+
+    cl_mem ref_scaled_images[NUM_IMAGE_SCALES];
+    cl_mem ref_luma_hpel;
+    cl_int status;
+
+    if( w && w->weightfn )
+    {
+        size_t gdims[2];
+
+        gdims[0] = 8 * h->mb.i_mb_width;
+        gdims[1] = 8 * h->mb.i_mb_height;
+
+        /* WeightP: Perform a filter on fref->opencl.scaled_image2Ds[] and fref->opencl.luma_hpel */
+        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
+        {
+            cl_uint arg = 0;
+            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &fref->opencl.scaled_image2Ds[i] );
+            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_scaled_images[i] );
+            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_offset );
+            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_scale );
+            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_denom );
+            OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_scaled_images_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );
+
+            gdims[0] >>= 1;
+            gdims[1] >>= 1;
+            if( gdims[0] < 16 || gdims[1] < 16 )
+                break;
+        }
+
+        cl_uint arg = 0;
+        gdims[0] = 8 * h->mb.i_mb_width;
+        gdims[1] = 8 * h->mb.i_mb_height;
+
+        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &fref->opencl.luma_hpel );
+        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_luma_hpel );
+        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_offset );
+        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_scale );
+        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_denom );
+        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_hpel_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );
+
+        /* Use weighted reference planes for motion search */
+        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
+            ref_scaled_images[i] = h->opencl.weighted_scaled_images[i];
+        ref_luma_hpel = h->opencl.weighted_luma_hpel;
+    }
+    else
+    {
+        /* Use unweighted reference planes for motion search */
+        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
+            ref_scaled_images[i] = fref->opencl.scaled_image2Ds[i];
+        ref_luma_hpel = fref->opencl.luma_hpel;
+    }
+
+    const int num_iterations[NUM_IMAGE_SCALES] = { 1, 1, 2, 3 };
+    int b_first_iteration = 1;
+    int b_reverse_references = 1;
+    int A = 1;
+
+
+    int mb_per_group = 0;
+    int cost_local_size = 0;
+    int mvc_local_size = 0;
+    int mb_width;
+
+    size_t gdims[2];
+    size_t ldims[2];
+
+    /* scale 0 is 8x8 */
+    for( int scale = NUM_IMAGE_SCALES-1; scale >= 0; scale-- )
+    {
+        mb_width = h->mb.i_mb_width >> scale;
+        gdims[0] = mb_width;
+        gdims[1] = h->mb.i_mb_height >> scale;
+        if( gdims[0] < 2 || gdims[1] < 2 )
+            continue;
+        gdims[0] <<= 2;
+        optimal_launch_dims( h, gdims, ldims, h->opencl.hme_kernel, h->opencl.device );
+
+        mb_per_group = (ldims[0] >> 2) * ldims[1];
+        cost_local_size = 4 * mb_per_group * sizeof(int16_t);
+        mvc_local_size = 4 * mb_per_group * sizeof(int16_t) * 2;
+        int scaled_me_range = h->param.analyse.i_me_range >> scale;
+        int b_shift_index = 1;
+
+        cl_uint arg = 0;
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[scale] );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &ref_scaled_images[scale] );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), (void*)&h->opencl.mvp_buffer );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, cost_local_size, NULL );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, mvc_local_size, NULL );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &mb_width );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &lambda );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scaled_me_range );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scale );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_shift_index );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_first_iteration );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_reverse_references );
+
+        for( int iter = 0; iter < num_iterations[scale]; iter++ )
+        {
+            OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.hme_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
+
+            b_shift_index = 0;
+            b_first_iteration = 0;
+
+            /* alternate top-left vs bot-right MB references at lower scales, so
+             * motion field smooths more quickly.  */
+            if( scale > 2 )
+                b_reverse_references ^= 1;
+            else
+                b_reverse_references = 0;
+            A = !A;
+            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 2, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
+            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 3, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
+            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 3, sizeof(int), &b_shift_index );
+            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 2, sizeof(int), &b_first_iteration );
+            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 1, sizeof(int), &b_reverse_references );
+        }
+    }
+
+    int satd_local_size = mb_per_group * sizeof(uint32_t) * 16;
+    cl_uint arg = 0;
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &ref_luma_hpel );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, cost_local_size, NULL );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, satd_local_size, NULL );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, mvc_local_size, NULL );
+
+    if( b_islist1 )
+    {
+        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
+        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
+    }
+    else
+    {
+        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
+        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
+    }
+
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &mb_width );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &lambda );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &ref );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b_islist1 );
+
+    if( h->opencl.b_device_AMD_SI )
+    {
+        /* workaround for AMD Southern Island driver scheduling bug (fixed in
+         * July 2012), perform meaningless small copy to add a data dependency */
+        OCLCHECK( clEnqueueCopyBuffer, h->opencl.queue, h->opencl.mv_buffers[A], h->opencl.mv_buffers[!A], 0, 0, 20, 0, NULL, NULL );
+    }
+
+    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.subpel_refine_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
+
+    int mvlen = 2 * sizeof(int16_t) * h->mb.i_mb_count;
+
+    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 1 )
+        x264_opencl_flush( h );
+
+    char *locked = opencl_alloc_locked( h, mvlen );
+    h->opencl.copies[h->opencl.num_copies].src = locked;
+    h->opencl.copies[h->opencl.num_copies].bytes = mvlen;
+
+    if( b_islist1 )
+    {
+        int mvs_offset = mvlen * (ref - b - 1);
+        OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs1, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
+        h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[1][ref - b - 1];
+    }
+    else
+    {
+        int mvs_offset = mvlen * (b - ref - 1);
+        OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs0, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
+        h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[0][b - ref - 1];
+    }
+
+    h->opencl.num_copies++;
+
+    return 0;
+}
+/* Enqueue the final lowres cost pass for frame b against references p0/p1 and
+ * queue the asynchronous readback of its results.
+ *
+ * Steps, in order:
+ *   1. launch mode_select_kernel over the macroblock grid;
+ *   2. launch rowsum_inter_kernel;
+ *   3. enqueue non-blocking reads of lowres_costs, row_satds and frame_stats
+ *      into memory returned by opencl_alloc_locked(), and record matching
+ *      src/dest/bytes entries in h->opencl.copies[] whose destinations are
+ *      fenc's lowres_costs, i_row_satds, i_cost_est, i_cost_est_aq and, when
+ *      b == p1, i_intra_mbs.  Nothing is copied to those destinations here;
+ *      presumably x264_opencl_flush() drains copies[] -- confirm.
+ *
+ * h:                 encoder context; supplies kernels, queue and buffers.
+ * lambda:            forwarded unchanged to mode_select_kernel.
+ * frames:            frame array indexed by p0, p1 and b.
+ * p0, p1:            indices of the two references (fref0 = frames[p0],
+ *                    fref1 = frames[p1]).
+ * b:                 index of the frame being costed (fenc = frames[b]).
+ * dist_scale_factor: forwarded to mode_select_kernel; also determines
+ *                    bipred_weight when weighted bipred is enabled.
+ *
+ * Returns 0 once every call has been enqueued.  OCLCHECK is defined outside
+ * this function.  NOTE(review): it presumably returns early on failure and
+ * uses the otherwise unreferenced locals `ocl` and `status`, so those names
+ * must stay as they are -- confirm against the macro definition. */
+int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor )
+{
+    x264_opencl_function_t *ocl = h->opencl.ocl;
+    cl_int status;
+    x264_frame_t *fenc = frames[b];
+    x264_frame_t *fref0 = frames[p0];
+    x264_frame_t *fref1 = frames[p1];
+
+    int bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor >> 2) : 32;
+
+    /* Tasks for this kernel:
+     * 1. Select least cost mode (intra, ref0, ref1)
+     *    list_used 0, 1, 2, or 3.  if B frame, do not allow intra
+     * 2. if B frame, try bidir predictions.
+     * 3. lowres_costs[i_mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT); */
+    size_t gdims[2] = { h->mb.i_mb_width, h->mb.i_mb_height };
+    size_t ldim_bidir[2];
+    size_t *ldims = NULL; /* stays NULL unless b < p1; a NULL local size lets the OpenCL runtime pick the work-group shape */
+    int cost_local_size = 4; /* local-memory byte counts; these defaults are used unchanged when b >= p1 */
+    int satd_local_size = 4;
+    if( b < p1 )
+    {
+        /* For B frames, use 4 threads per MB for BIDIR checks */
+        ldims = ldim_bidir;
+        gdims[0] <<= 2;
+        optimal_launch_dims( h, gdims, ldims, h->opencl.mode_select_kernel, h->opencl.device );
+        int mb_per_group = (ldims[0] >> 2) * ldims[1]; /* 4 work-items per MB along dimension 0 */
+        cost_local_size = 4 * mb_per_group * sizeof(int16_t);
+        satd_local_size = 16 * mb_per_group * sizeof(uint32_t);
+    }
+    /* Kernel arguments are positional (arg++): the order below has to match the
+     * mode_select kernel's parameter list, which is not visible from this file. */
+    cl_uint arg = 0;
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref0->opencl.luma_hpel );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.luma_hpel );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.lowres_mvs0 );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, cost_local_size, NULL );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, satd_local_size, NULL );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &bipred_weight );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &dist_scale_factor );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &b );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p0 );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p1 );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &lambda );
+    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.mode_select_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
+
+    /* Sum costs across rows, atomicAdd down frame.  Global size 256 x mb_height
+     * with local size 256 x 1 gives exactly one work-group per macroblock row. */
+    size_t gdim[2] = { 256, h->mb.i_mb_height };
+    size_t ldim[2] = { 256, 1 };
+
+    arg = 0;
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->param.i_bframe_bias );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &b );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p0 );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p1 );
+    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_inter_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
+    /* Up to five copies[] entries are appended below (four when b != p1), so
+     * flush first when num_copies is within four of MAX_FINISH_COPIES.
+     * NOTE(review): assumes the flush empties copies[] -- confirm. */
+    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
+        x264_opencl_flush( h );
+
+    int size =  h->mb.i_mb_count * sizeof(int16_t); /* one int16_t cost per macroblock */
+    char *locked = opencl_alloc_locked( h, size );
+    h->opencl.copies[h->opencl.num_copies].src = locked;
+    h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[b - p0][p1 - b];
+    h->opencl.copies[h->opencl.num_copies].bytes = size;
+    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.lowres_costs[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
+    h->opencl.num_copies++;
+    /* Row sums: one int per macroblock row. */
+    size =  h->mb.i_mb_height * sizeof(int);
+    locked = opencl_alloc_locked( h, size );
+    h->opencl.copies[h->opencl.num_copies].src = locked;
+    h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[b - p0][p1 - b];
+    h->opencl.copies[h->opencl.num_copies].bytes = size;
+    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
+    h->opencl.num_copies++;
+    /* Frame stats: a single four-int read; ints 0, 1 and (b == p1 only) 2 are routed to separate destinations below. */
+    size =  4 * sizeof(int);
+    locked = opencl_alloc_locked( h, size );
+    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
+    h->opencl.last_buf = !h->opencl.last_buf; /* the next call uses the other index (0/1) of lowres_costs, row_satds and frame_stats */
+    /* int 0 -> i_cost_est, int 1 -> i_cost_est_aq */
+    h->opencl.copies[h->opencl.num_copies].src = locked;
+    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[b - p0][p1 - b];
+    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
+    h->opencl.num_copies++;
+    h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
+    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[b - p0][p1 - b];
+    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
+    h->opencl.num_copies++;
+
+    if( b == p1 ) // P frames only: int 2 -> i_intra_mbs
+    {
+        h->opencl.copies[h->opencl.num_copies].src = locked + 2 * sizeof(int);
+        h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_intra_mbs[b - p0];
+        h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
+        h->opencl.num_copies++;
+    }
+    return 0;
+}
+
+/* Prime the OpenCL lookahead before a slicetype decision.
+ *
+ * Does nothing unless h->param.b_opencl is set.  Otherwise it
+ *   - on Windows, raises the priority of the calling thread (and of the AMD
+ *     driver thread when the queue reports one), saving the old values in
+ *     h->opencl for x264_opencl_slicetype_end();
+ *   - runs x264_opencl_lowres_init() on frames[0..num_frames] (inclusive) and
+ *     flushes;
+ *   - for trellis B-adapt with B-frames enabled, enqueues a motion search for
+ *     every frame/distance pair whose first lowres MV still holds the 0x7FFF
+ *     "not searched" marker, then flushes again.
+ *
+ * Return values of the OpenCL helpers are not inspected here. */
+void x264_opencl_slicetype_prep( x264_t *h, x264_frame_t **frames, int num_frames, int lambda )
+{
+    if( !h->param.b_opencl )
+        return;
+
+#ifdef _WIN32
+    /* Temporarily boost priority of this lookahead thread and the OpenCL
+     * driver's thread until the end of this function.  On AMD GPUs this
+     * greatly reduces the latency of enqueuing kernels and getting results
+     * on Windows. */
+    HANDLE thread = GetCurrentThread();
+    h->opencl.lookahead_thread_pri = GetThreadPriority( thread );
+    SetThreadPriority( thread, THREAD_PRIORITY_ABOVE_NORMAL );
+    x264_opencl_function_t *ocl = h->opencl.ocl;
+    cl_int status = ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &thread, NULL );
+    if( status == CL_SUCCESS )
+    {
+        /* `thread` now refers to the driver's thread */
+        h->opencl.opencl_thread_pri = GetThreadPriority( thread );
+        SetThreadPriority( thread, THREAD_PRIORITY_ABOVE_NORMAL );
+    }
+#endif
+
+    /* precalculate intra and I frames */
+    for( int i = 0; i <= num_frames; i++ )
+        x264_opencl_lowres_init( h, frames[i], lambda );
+    x264_opencl_flush( h );
+
+    /* Only trellis B-adapt with B-frames needs the exhaustive searches below */
+    if( h->param.i_bframe_adaptive != X264_B_ADAPT_TRELLIS || !h->param.i_bframe )
+        return;
+
+    for( int b = 0; b <= num_frames; b++ )
+    {
+        for( int dist = 1; dist < h->param.i_bframe; dist++ )
+        {
+            const int past = b - dist;
+            const int future = b + dist;
+
+            /* list0: search against the frame `dist` positions earlier */
+            if( past >= 0 && frames[b]->lowres_mvs[0][dist-1][0][0] == 0x7FFF )
+            {
+                const x264_weight_t *weight = x264_weight_none;
+
+                if( h->param.analyse.i_weighted_pred )
+                {
+                    x264_emms();
+                    x264_weights_analyse( h, frames[b], frames[past], 1 );
+                    weight = frames[b]->weight[0];
+                }
+                /* clear the marker so the pair is not searched twice */
+                frames[b]->lowres_mvs[0][dist-1][0][0] = 0;
+                x264_opencl_motionsearch( h, frames, b, past, 0, lambda, weight );
+            }
+
+            /* list1: search against the frame `dist` positions later, unweighted */
+            if( future <= num_frames && frames[b]->lowres_mvs[1][dist-1][0][0] == 0x7FFF )
+            {
+                frames[b]->lowres_mvs[1][dist-1][0][0] = 0;
+                x264_opencl_motionsearch( h, frames, b, future, 1, lambda, NULL );
+            }
+        }
+    }
+
+    x264_opencl_flush( h );
+}
+
+
+/* Undo the Windows thread-priority boost applied by
+ * x264_opencl_slicetype_prep(): restore the saved priority of the calling
+ * thread and, when the queue reports an AMD driver thread, of that thread
+ * too.  Compiles to an empty function on other platforms. */
+void x264_opencl_slicetype_end( x264_t *h )
+{
+#ifdef _WIN32
+    if( !h->param.b_opencl )
+        return;
+
+    HANDLE thread = GetCurrentThread();
+    SetThreadPriority( thread, h->opencl.lookahead_thread_pri );
+
+    /* On success the query overwrites `thread` with the driver thread's handle */
+    x264_opencl_function_t *ocl = h->opencl.ocl;
+    if( ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &thread, NULL ) == CL_SUCCESS )
+        SetThreadPriority( thread, h->opencl.opencl_thread_pri );
+#endif
+}
+
+/* Enqueue the OpenCL work needed to cost frame b against references p0/p1.
+ *
+ * Returns 0 without doing anything when a cost estimate for this (p0, p1, b)
+ * combination is already present (>= 0) or when b == p0 == p1.  Otherwise
+ * it marks the estimate as in progress, enqueues lowres init, the motion
+ * searches that are still outstanding and the final cost pass, and returns 1.
+ * Return values of the enqueue helpers are not inspected. */
+int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b )
+{
+    x264_frame_t *fenc = frames[b];
+    const int dist0 = b - p0;   /* distance to the list0 reference */
+    const int dist1 = p1 - b;   /* distance to the list1 reference */
+
+    if( fenc->i_cost_est[dist0][dist1] >= 0 || (b == p0 && b == p1) )
+        return 0;
+
+    const x264_weight_t *w = x264_weight_none;
+
+    // avoid duplicating work
+    fenc->i_cost_est[dist0][dist1] = 0;
+
+    /* A first MV of 0x7FFF marks a reference that has not been searched yet */
+    const int search_l0 = b != p0 && fenc->lowres_mvs[0][dist0-1][0][0] == 0x7FFF;
+    const int search_l1 = b != p1 && fenc->lowres_mvs[1][dist1-1][0][0] == 0x7FFF;
+
+    if( search_l0 )
+    {
+        /* weights are only analysed when b == p1 */
+        if( h->param.analyse.i_weighted_pred && b == p1 )
+        {
+            x264_emms();
+            x264_weights_analyse( h, fenc, frames[p0], 1 );
+            w = fenc->weight[0];
+        }
+        fenc->lowres_mvs[0][dist0-1][0][0] = 0;
+    }
+    if( search_l1 )
+        fenc->lowres_mvs[1][dist1-1][0][0] = 0;
+    if( b == p1 )
+        fenc->i_intra_mbs[dist0] = 0;
+
+    /* position of b between p0 and p1 in 1/256 units, rounded; 128 when p0 == p1 */
+    int dist_scale_factor = 128;
+    if( p1 != p0 )
+        dist_scale_factor = ( (dist0 << 8) + ((p1-p0) >> 1) ) / (p1-p0);
+
+    fenc->i_cost_est[dist0][dist1] = 0;
+    fenc->i_cost_est_aq[dist0][dist1] = 0;
+
+    x264_opencl_lowres_init( h, fenc, lambda );
+
+    if( search_l0 )
+    {
+        x264_opencl_lowres_init( h, frames[p0], lambda );
+        x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w );
+    }
+    if( search_l1 )
+    {
+        x264_opencl_lowres_init( h, frames[p1], lambda );
+        x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL );
+    }
+    x264_opencl_finalize_cost( h, lambda, frames, p0, p1, b, dist_scale_factor );
+    return 1;
+}
+
+#endif
--- a/encoder/slicetype-cl.h
+++ b/encoder/slicetype-cl.h
@@ -0,0 +1,44 @@
+/*****************************************************************************
+ * slicetype-cl.h: OpenCL slicetype decision code (lowres lookahead)
+ *****************************************************************************
+ * Copyright (C) 2017-2025 x264 project
+ *
+ * Authors: Anton Mitrofanov <BugMaster@narod.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_ENCODER_SLICETYPE_CL_H
+#define X264_ENCODER_SLICETYPE_CL_H
+/* Entry points of the OpenCL lowres lookahead.  Each name is first mapped
+ * through x264_template() (defined elsewhere; presumably it decorates the
+ * symbol per build configuration -- confirm) and then declared.
+ *
+ * x264_opencl_lowres_init:  called per frame before any search or cost pass;
+ *                           NOTE(review): exact contract to be confirmed
+ *                           against its definition.
+ * x264_opencl_motionsearch: enqueues the motion search of frames[b] against
+ *                           frames[ref]; b_islist1 selects the list1 MV/cost
+ *                           buffers, w (may be NULL) supplies weights.
+ * x264_opencl_finalize_cost: enqueues mode selection, row sums and the
+ *                           readback of costs for (p0, p1, b).
+ * x264_opencl_precalculate_frame_cost: returns 0 when the cost is already
+ *                           available or b == p0 == p1, else enqueues the
+ *                           work above and returns 1.
+ * x264_opencl_flush:        called by the functions above before queuing more
+ *                           pending copies than fit; NOTE(review): presumably
+ *                           waits for the queue and completes them -- confirm.
+ * x264_opencl_slicetype_prep / _end: bracket a slicetype decision; on Windows
+ *                           they raise and restore thread priorities. */
+#define x264_opencl_lowres_init x264_template(opencl_lowres_init)
+int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda );
+#define x264_opencl_motionsearch x264_template(opencl_motionsearch)
+int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w );
+#define x264_opencl_finalize_cost x264_template(opencl_finalize_cost)
+int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor );
+#define x264_opencl_precalculate_frame_cost x264_template(opencl_precalculate_frame_cost)
+int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b );
+#define x264_opencl_flush x264_template(opencl_flush)
+void x264_opencl_flush( x264_t *h );
+#define x264_opencl_slicetype_prep x264_template(opencl_slicetype_prep)
+void x264_opencl_slicetype_prep( x264_t *h, x264_frame_t **frames, int num_frames, int lambda );
+#define x264_opencl_slicetype_end x264_template(opencl_slicetype_end)
+void x264_opencl_slicetype_end( x264_t *h );
+
+#endif
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c