x264 source for verification 2026-05-22

2026-05-22 16:45:04 +08:00
commit 4647f166e5
270 changed files with 166522 additions and 0 deletions
--- a/common/arm/asm.S
+++ b/common/arm/asm.S
@@ -0,0 +1,263 @@
+/*****************************************************************************
+ * asm.S: arm utility macros
+ *****************************************************************************
+ * Copyright (C) 2008-2025 x264 project
+ *
+ * Authors: Mans Rullgard <mans@mansr.com>
+ *          David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "config.h"
+
+.syntax unified
+
+#ifdef __ELF__
+.arch armv7-a
+.fpu neon
+#endif
+
+#define GLUE(a, b) a ## b  // raw token paste; a and b are not macro-expanded first
+#define JOIN(a, b) GLUE(a, b)  // paste after a and b have been macro-expanded (extra level of indirection)
+
+#ifdef PREFIX  // with PREFIX defined, symbol names get a leading underscore
+#   define BASE _x264_
+#   define SYM_PREFIX _
+#else
+#   define BASE x264_
+#   define SYM_PREFIX
+#endif
+
+#ifdef BIT_DEPTH  // with BIT_DEPTH defined, the export prefix becomes BASE, the depth value, then an underscore
+#   define EXTERN_ASM JOIN(JOIN(BASE, BIT_DEPTH), _)
+#else
+#   define EXTERN_ASM BASE
+#endif
+
+#define X(s) JOIN(EXTERN_ASM, s)  // s with the export prefix, i.e. the name the function macro gives exported functions
+#define X264(s) JOIN(BASE, s)  // s with the plain BASE prefix (no bit depth)
+#define EXT(s) JOIN(SYM_PREFIX, s)  // s with only the optional leading underscore
+
+#ifdef __ELF__  // ELF, MACH, NONMACH and FUNC are line prefixes: empty keeps the line, @ comments it out
+#   define ELF
+#else
+#   define ELF @
+#endif
+
+#ifdef __MACH__  // MACH lines are kept only on Mach-O targets, NONMACH lines everywhere else
+#   define MACH
+#   define NONMACH @
+#else
+#   define MACH @
+#   define NONMACH
+#endif
+
+#if HAVE_AS_FUNC  // FUNC lines (.func / .endfunc) are kept only when config.h sets HAVE_AS_FUNC
+#   define FUNC
+#else
+#   define FUNC @
+#endif
+
+#if SYS_LINUX || SYS_OPENBSD  // only these systems are given a .data.rel.ro section by the const macro
+#define HAVE_SECTION_DATA_REL_RO 1
+#else
+#define HAVE_SECTION_DATA_REL_RO 0
+#endif
+
+.macro require8, val=1
+ELF     .eabi_attribute 24, \val  // ELF only: set EABI build attribute 24 to val, default 1 (presumably Tag_ABI_align_needed - confirm in the ARM EABI addenda)
+.endm
+
+.macro preserve8, val=1
+ELF     .eabi_attribute 25, \val  // ELF only: set EABI build attribute 25 to val, default 1 (presumably Tag_ABI_align_preserved - confirm in the ARM EABI addenda)
+.endm
+
+.macro function name, export=1
+    .macro endfunc  // function opens a 4-byte aligned routine in .text; this matching endfunc is redefined per function so .size can name it
+.if \export == 1
+ELF     .size   EXTERN_ASM\name, . - EXTERN_ASM\name
+.else
+ELF     .size   \name, . - \name
+.endif
+FUNC    .endfunc
+        .purgem endfunc  // remove this definition so the next function can install its own
+    .endm
+        .text
+        .align  2  // power-of-two alignment on ARM: 4 bytes
+.if \export == 1  // exported: global (hidden on ELF) symbol carrying the EXTERN_ASM prefix; endfunc above uses the same test so both always agree
+        .global EXTERN_ASM\name
+ELF     .hidden EXTERN_ASM\name
+ELF     .type   EXTERN_ASM\name, %function
+FUNC    .func   EXTERN_ASM\name
+EXTERN_ASM\name:
+.else  // any other value: file-local helper with an unprefixed, non-global label
+ELF     .hidden \name
+ELF     .type   \name, %function
+FUNC    .func   \name
+\name:
+.endif
+.endm
+
+.macro const name, align=2, relocate=0
+    .macro endconst  // const opens a named data object; this matching endconst records its size on ELF and then removes itself
+ELF     .size   \name, . - \name
+        .purgem endconst
+    .endm
+.if HAVE_SECTION_DATA_REL_RO && \relocate  // data flagged relocate goes to .data.rel.ro where the platform provides it
+        .section        .data.rel.ro
+.else
+NONMACH .section        .rodata  // otherwise read-only data ...
+MACH    .const_data  // ... spelled .const_data on Mach-O
+.endif
+        .align          \align  // caller-selected alignment (default argument 2)
+\name:
+.endm
+
+.macro movrel rd, val  // rd = address of val, using the sequence the build configuration allows
+#if defined(PIC)
+        ldr             \rd,  1f  // load the pc-relative offset stored in the inline literal below
+        b               2f  // jump over the literal
+1:
+@ FIXME: thumb
+        .word           \val - (2f + 8)  // offset of val from the pc value read at label 2 (label + 8, presumably the ARM-state pc read-ahead; Thumb is not handled, see FIXME)
+2:
+        add             \rd,  \rd,  pc  // offset + pc = address of val
+#elif HAVE_ARMV6T2
+        movw            \rd, #:lower16:\val  // low halfword of the absolute address
+        movt            \rd, #:upper16:\val  // high halfword of the absolute address
+#else
+        ldr             \rd, =\val  // literal-pool load of the absolute address
+#endif
+.endm
+
+.macro movrelx rd, val, got  // rd = address of symbol val via an indirection slot in PIC builds; got is a scratch register, written only in the ELF PIC path
+#if defined(PIC) && defined(__ELF__)
+        ldr             \got, 2f  // pc-relative offset of the global offset table
+        ldr             \rd,  1f  // offset of the GOT entry for val
+        b               3f  // skip both inline literals
+1:
+@ FIXME: thumb
+        .word \val(GOT)
+2:
+        .word _GLOBAL_OFFSET_TABLE_ - (3f + 8)
+3:
+        add             \got, \got, pc  // got = address of the global offset table
+        ldr             \rd, [\got, \rd]  // read the address of val from its GOT entry
+#elif defined(PIC) && defined(__APPLE__)
+        ldr             \rd,  1f  // pc-relative offset of the pointer slot emitted at label 3
+        b               2f
+1:
+@ FIXME: thumb
+        .word           3f - (2f + 8)
+2:
+        ldr             \rd, [pc, \rd]  // read the symbol address from that pointer slot
+        .non_lazy_symbol_pointer  // switch sections to emit the slot itself
+3:
+        .indirect_symbol \val
+        .word           0  // placeholder; presumably filled in with the address of val at link or load time - confirm
+        .text  // NOTE(review): always returns to .text, whatever section was active before the macro
+#else
+        movrel          \rd, \val  // every other configuration: same as movrel
+#endif
+.endm
+
+.macro movconst rd, val  // rd = the constant val
+#if HAVE_ARMV6T2
+    movw        \rd, #:lower16:\val  // low 16 bits of val
+.if \val >> 16
+    movt        \rd, #:upper16:\val  // emitted only when val has bits set above bit 15
+.endif
+#else
+    ldr         \rd, =\val  // literal-pool load when movw/movt are not enabled
+#endif
+.endm
+
+#define FENC_STRIDE 16  // row step, in bytes, applied to the r1 pixel pointer by the sub*_dct routines (presumably the encode buffer stride)
+#define FDEC_STRIDE 32  // row step, in bytes, applied to the r2 / r0 pixel pointer by the dct and idct routines (presumably the decode buffer stride)
+
+.macro HORIZ_ADD dest, a, b  // sums adjacent u16 lanes of a (plus b, when given) into u64 lanes of dest; a is overwritten
+.ifnb \b
+    vadd.u16    \a, \a, \b  // optional third argument: fold b into a lane by lane first
+.endif
+    vpaddl.u16  \a, \a  // pairwise add, widening u16 lanes to u32
+    vpaddl.u32  \dest, \a  // pairwise add again, widening u32 lanes to u64
+.endm
+
+.macro SUMSUB_AB sum, diff, a, b  // butterfly on s16 lanes: sum = a + b, diff = a - b
+    vadd.s16    \sum,  \a, \b  // written first, so sum must not be the same register as a or b
+    vsub.s16    \diff, \a, \b  // diff may alias a or b (several callers pass the same register for diff and b)
+.endm
+
+.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d  // two butterflies in sequence: (s1, d1) from (a, b), then (s2, d2) from (c, d)
+    SUMSUB_AB   \s1, \d1, \a, \b  // s1 = a + b, d1 = a - b
+    SUMSUB_AB   \s2, \d2, \c, \d  // s2 = c + d, d2 = c - d; runs after the first pair, so it sees s1/d1 if they alias c or d
+.endm
+
+.macro ABS2 a b  // in-place lane-wise absolute value (s16) of both registers; the two operations are independent
+    vabs.s16 \b, \b
+    vabs.s16 \a, \a
+.endm
+
+// dist = distance in elements (0 for vertical pass, 1/2 for horizontal passes); only dist == 1 selects vtrn.16, every other value (0 included) uses vtrn.32
+// op = sumsub/amax (sum and diff / maximum of absolutes); any op other than sumsub takes the amax path
+// d1/2 = destination registers (d2 is not written in the amax path)
+// s1/2 = source registers (always modified by the vtrn; in the amax path also replaced by their absolute values)
+.macro HADAMARD dist, op, d1, d2, s1, s2
+.if \dist == 1
+    vtrn.16     \s1, \s2  // pair up elements one lane apart
+.else
+    vtrn.32     \s1, \s2  // pair up elements two lanes apart
+.endif
+.ifc \op, sumsub
+    SUMSUB_AB   \d1, \d2, \s1, \s2  // d1 = s1 + s2, d2 = s1 - s2
+.else
+    vabs.s16    \s1, \s1
+    vabs.s16    \s2, \s2
+    vmax.s16    \d1, \s1, \s2  // d1 = lane-wise max(|s1|, |s2|)
+.endif
+.endm
+
+.macro TRANSPOSE8x8 r0 r1 r2 r3 r4 r5 r6 r7  // in-place transpose in three vtrn stages (32, 16, 8 bit); presumably an 8x8 byte matrix with one row per register
+    vtrn.32         \r0, \r4  // stage 1: exchange 32-bit groups between registers four apart
+    vtrn.32         \r1, \r5
+    vtrn.32         \r2, \r6
+    vtrn.32         \r3, \r7
+    vtrn.16         \r0, \r2  // stage 2: exchange 16-bit groups between registers two apart
+    vtrn.16         \r1, \r3
+    vtrn.16         \r4, \r6
+    vtrn.16         \r5, \r7
+    vtrn.8          \r0, \r1  // stage 3: exchange bytes between neighbouring registers
+    vtrn.8          \r2, \r3
+    vtrn.8          \r4, \r5
+    vtrn.8          \r6, \r7
+.endm
+
+.macro TRANSPOSE4x4 r0 r1 r2 r3  // in-place transpose across four distinct registers: one 16-bit stage, then one 8-bit stage
+    vtrn.16         \r1, \r3
+    vtrn.16         \r0, \r2
+    vtrn.8          \r2, \r3
+    vtrn.8          \r0, \r1
+.endm
+
+.macro TRANSPOSE4x4_16  d0 d1 d2 d3  // in-place transpose of 4x4 blocks of 16-bit elements; with q registers both 64-bit halves are transposed independently
+    vtrn.32     \d0, \d2  // exchange 32-bit pairs between rows two apart
+    vtrn.32     \d1, \d3
+    vtrn.16     \d0, \d1  // exchange 16-bit elements between neighbouring rows
+    vtrn.16     \d2, \d3
+.endm
--- a/common/arm/bitstream-a.S
+++ b/common/arm/bitstream-a.S
@@ -0,0 +1,84 @@
+/*****************************************************************************
+ * bitstream-a.S: arm bitstream functions
+ *****************************************************************************
+ * Copyright (C) 2014-2025 x264 project
+ *
+ * Authors: Janne Grunau <janne-x264@jannau.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+function nal_escape_neon  // r0 = dst, r1 = src, r2 = end of src (order per the prototype in bitstream.h); copies src to dst inserting a byte 3 before any byte 0..3 that follows two zero bytes; returns the advanced dst in r0
+    push        {r4-r5,lr}
+    vmov.u8     q0,  #0xff  // q0 = history block; all 0xff so that nothing before the start counts as a zero byte
+    vmov.u8     q8,  #4  // threshold: only bytes below 4 can need escaping
+    mov         r3,  #3  // the escape byte
+    subs        lr,  r1,  r2  // lr = src - end, i.e. minus the number of bytes left
+    beq         99f  // empty input
+0:
+    cmn         lr,  #15  // flags from lr + 15
+    blt         16f  // at least 16 bytes left: vector path (the first 16: label below)
+    mov         r1,  r2  // fewer than 16 left: address the tail as end + lr
+    b           100f
+16:
+    vld1.8      {q1}, [r1]!  // next 16 source bytes; src advances by 16
+    vext.8      q2,  q0,  q1, #14  // q2[i] = the byte two positions before q1[i]
+    vext.8      q3,  q0,  q1, #15  // q3[i] = the byte one position before q1[i]
+    vcgt.u8     q11, q8,  q1  // current byte < 4
+    vceq.u8     q9,  q2,  #0  // byte at i-2 is zero
+    vceq.u8     q10, q3,  #0  // byte at i-1 is zero
+    vand        q9,  q9,  q11
+    vand        q9,  q9,  q10  // q9 = positions that need an escape byte
+    vshrn.u16   d22, q9,  #4  // narrow the 128-bit mask to 64 bits
+    vmov        ip,  lr,  d22  // move the mask to two core registers (lr is scratch here)
+    orrs        ip,  ip,  lr
+    beq         16f  // nothing to escape: forward to the second 16: label (bulk copy)
+    mov         lr,  #-16  // otherwise redo this block bytewise; src already points past it
+100:
+    vmov.u8     r5,  d1[6]  // byte 14 of the history block
+    vmov.u8     r4,  d1[7]  // byte 15 of the history block
+    orr         r5,  r4,  r5, lsl #8  // r5 low halfword = the two most recent bytes
+101:
+    ldrb        r4,  [r1, lr]  // current source byte
+    orr         ip,  r4,  r5, lsl #16  // two previous bytes in the top half, current byte in the low byte
+    cmp         ip,  #3
+    bhi         102f  // above 3 unless both previous bytes are zero and the current byte is 0..3
+    strb        r3,  [r0], #1  // emit the escape byte
+    orr         r5,  r3,  r5, lsl #8  // the emitted 3 becomes part of the history
+102:
+    adds        lr,  lr,  #1  // step the negative index; these flags feed the blt below
+    strb        r4,  [r0], #1  // emit the source byte
+    orr         r5,  r4,  r5, lsl #8
+    blt         101b  // until the index reaches zero
+    subs        lr,  r1,  r2  // remaining bytes (negated) for the next round
+    lsr         ip,  r5,  #8
+    vmov.u8     d1[6],  ip  // save the last two emitted bytes back into history bytes 14 and 15
+    vmov.u8     d1[7],  r5
+    blt         0b  // more input left (flags from the subs above; the vmovs leave them alone)
+
+    pop         {r4-r5,pc}
+16:
+    subs        lr,  r1,  r2  // recompute src - end, lr was overwritten by the mask test
+    vst1.8      {q1}, [r0]!  // copy the 16 bytes unchanged; dst advances by 16
+    vmov        q0, q1  // they become the history for the next block
+    blt         0b
+99:
+    pop         {r4-r5,pc}
+endfunc
--- a/common/arm/bitstream.h
+++ b/common/arm/bitstream.h
@@ -0,0 +1,32 @@
+/*****************************************************************************
+ * bitstream.h: arm bitstream functions
+ *****************************************************************************
+ * Copyright (C) 2017-2025 x264 project
+ *
+ * Authors: Anton Mitrofanov <BugMaster@narod.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_ARM_BITSTREAM_H
+#define X264_ARM_BITSTREAM_H
+
+#define x264_nal_escape_neon x264_template(nal_escape_neon) /* final symbol name is produced by x264_template, defined outside this header */
+uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end ); /* NEON routine from bitstream-a.S: copies [src, end) to dst with escape bytes inserted; returns dst advanced past the last byte written */
+
+#endif
--- a/common/arm/cpu-a.S
+++ b/common/arm/cpu-a.S
@@ -0,0 +1,108 @@
+/*****************************************************************************
+ * cpu-a.S: arm cpu detection
+ *****************************************************************************
+ * Copyright (C) 2009-2025 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.align 2
+
+// done in gas because .fpu neon overrides the refusal to assemble
+// instructions the selected -march/-mcpu doesn't support
+function cpu_neon_test  // executes a single NEON instruction and returns; presumably the caller detects the resulting fault on CPUs without NEON - confirm at the call site
+    vadd.i16    q0, q0, q0  // the probe instruction; note that it modifies q0 (each lane is doubled)
+    bx          lr
+endfunc
+
+// return: 0 on success
+//         1 if counters were already enabled
+//         9 if lo-res counters were already enabled
+function cpu_enable_armv7_counter, export=0
+    mrc         p15, 0, r2, c9, c12, 0      // read PMNC (file-local helper; result in r0, clobbers r2)
+    ands        r0, r2, #1                  // r0 = enable bit; Z is set when the counters were off
+    andne       r0, r2, #9                  // already on: also report bit 3, giving 1 or 9; flags unchanged
+
+    orr         r2, r2, #1                  // enable counters
+    bic         r2, r2, #8                  // full resolution
+    mcreq       p15, 0, r2, c9, c12, 0      // write PMNC, only when the counters were off (an existing setup is left alone)
+    mov         r2, #1 << 31                // enable cycle counter
+    mcr         p15, 0, r2, c9, c12, 1      // write CNTENS
+    bx          lr
+endfunc
+
+function cpu_disable_armv7_counter, export=0
+    mrc         p15, 0, r0, c9, c12, 0      // read PMNC (file-local helper; clobbers r0, no meaningful return value)
+    bic         r0, r0, #1                  // disable counters
+    mcr         p15, 0, r0, c9, c12, 0      // write PMNC
+    bx          lr
+endfunc
+
+
+.macro READ_TIME r  // r = current value of the counter enabled by cpu_enable_armv7_counter
+    mrc         p15, 0, \r, c9, c13, 0  // CP15 c9, c13, 0; presumably the cycle count register - confirm in the ARMv7 manual
+.endm
+
+// return: 0 if neon -> arm transfers take more than 10 cycles on average, or if the user access register reads as zero
+//         otherwise the measured average per transfer (nonzero unless the average itself rounds down to 0)
+function cpu_fast_neon_mrc_test
+    // check for user access to performance counters
+    mrc         p15, 0, r0, c9, c14, 0
+    cmp         r0, #0
+    bxeq        lr  // register reads as zero: return 0 without measuring
+
+    push        {r4-r6,lr}
+    bl          cpu_enable_armv7_counter  // r0 = previous counter state, kept until the end
+    ands        r1, r0, #8  // were low-resolution counters already running? (flags used by moveq/movne below)
+    mov         r3, #0  // r3 = sum of accepted measurements
+    mov         ip, #4  // ip = measurements still to accept
+    mov         r6, #4  // r6 = pass budget; once it reaches zero, measurements are accepted unconditionally
+    moveq       r5, #1  // full resolution: one inner iteration per measurement
+    movne       r5, #64  // low resolution: 64 inner iterations per measurement
+
+average_loop:
+    mov         r4, r5  // inner iteration count
+    READ_TIME   r1  // start time
+1:  subs        r4, r4, #1  // flags consumed by the bgt after the transfer block
+.rept 8
+    vmov.u32    lr, d0[0]  // the neon -> arm transfer being timed
+    add         lr, lr, lr  // consume the result so the transfer has to complete
+.endr
+    bgt         1b
+    READ_TIME   r2  // end time
+
+    subs        r6, r6, #1  // one pass used
+    sub         r2, r2, r1  // elapsed count
+    cmpgt       r2, #30 << 3    // assume context switch if it took over 30 cycles (per transfer, times 8 transfers); skipped once r6 has run out
+    addle       r3, r3, r2  // accept: le comes from the cmp, or from the subs when the cmp was skipped
+    subsle      ip, ip, #1  // one fewer measurement to take
+    bgt         average_loop  // taken on a discard (gt from the cmp) or while measurements remain
+
+    // disable counters if we enabled them
+    ands        r0, r0, #1
+    bleq        cpu_disable_armv7_counter
+
+    lsr         r0, r3, #5  // sum / 32: four measurements of eight transfers each
+    cmp         r0, #10
+    movgt       r0, #0  // more than 10 on average: report 0
+    pop         {r4-r6,pc}
+endfunc
--- a/common/arm/dct-a.S
+++ b/common/arm/dct-a.S
@@ -0,0 +1,764 @@
+/****************************************************************************
+ * dct-a.S: arm transform and zigzag
+ *****************************************************************************
+ * Copyright (C) 2009-2025 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *          Martin Storsjo <martin@martin.st>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+const scan4x4_frame, align=4
+.byte    0,1,   8,9,   2,3,   4,5  // 32-byte index table; every entry is a pair (n, n+1), presumably the two bytes of one 16-bit coefficient - confirm at the use site
+.byte    2,3,   8,9,  16,17, 10,11  // NOTE(review): indices in rows 2-4 do not continue from row 1; presumably each row is relative to a different source register window - confirm
+.byte   12,13,  6,7,  14,15, 20,21
+.byte   10,11, 12,13,  6,7,  14,15
+endconst
+
+.text
+
+// sum = a + (b>>shift)   sub = (a>>shift) - b   (s16 lanes, arithmetic shift; t0/t1 are scratch; sub may alias b, sum must not alias b)
+.macro SUMSUB_SHR shift sum sub a b t0 t1
+    vshr.s16    \t0,  \b, #\shift  // t0 = b >> shift
+    vshr.s16    \t1,  \a, #\shift  // t1 = a >> shift, taken before sum can overwrite a
+    vadd.s16    \sum, \a, \t0
+    vsub.s16    \sub, \t1, \b
+.endm
+
+// sum = (a>>shift) + b   sub = a - (b>>shift)   (s16 lanes, arithmetic shift; t0/t1 are scratch; sub may alias a or b, sum must not alias a)
+.macro SUMSUB_SHR2 shift sum sub a b t0 t1
+    vshr.s16    \t0,  \a, #\shift  // t0 = a >> shift
+    vshr.s16    \t1,  \b, #\shift  // t1 = b >> shift, taken before sum can overwrite b
+    vadd.s16    \sum, \t0, \b
+    vsub.s16    \sub, \a, \t1
+.endm
+
+// a += 1.5*ma   b -= 1.5*mb   (1.5*x is computed as x + (x>>1) on s16 lanes; t0/t1 are scratch, ma/mb are left unchanged)
+.macro SUMSUB_15 a b ma mb t0 t1
+    vshr.s16    \t0, \ma, #1  // t0 = ma >> 1
+    vshr.s16    \t1, \mb, #1  // t1 = mb >> 1
+    vadd.s16    \t0, \t0, \ma  // t0 = ma + (ma >> 1)
+    vadd.s16    \t1, \t1, \mb  // t1 = mb + (mb >> 1)
+    vadd.s16    \a,  \a,  \t0
+    vsub.s16    \b,  \b,  \t1
+.endm
+
+
+function dct4x4dc_neon  // r0 = 16 int16 values (32 bytes, 128-bit aligned), transformed in place; the last stage halves with rounding
+    vld1.64         {d0-d3}, [r0,:128]  // four s16 values per d register
+    SUMSUB_ABCD     d4, d5, d6, d7, d0, d1, d2, d3  // butterflies between registers, stage 1
+    SUMSUB_ABCD     d0, d2, d3, d1, d4, d6, d5, d7  // butterflies between registers, stage 2
+
+    vmov.s16        d31, #1  // +1 bias for the two halving subtracts below
+    HADAMARD        1, sumsub, q2, q3, q0, q1  // vtrn.16 then butterfly: combines lanes one apart
+    vtrn.32         d4,  d5  // pair up lanes two apart for the final stage
+    vadd.s16        d16, d4,  d31  // d4 + 1, so the vhsub below gives (d4 - d5 + 1) >> 1
+    vtrn.32         d6,  d7
+    vadd.s16        d17, d6,  d31
+    vrhadd.s16      d0,  d4,  d5  // (d4 + d5 + 1) >> 1
+    vhsub.s16       d1,  d16, d5  // (d4 + 1 - d5) >> 1
+    vhsub.s16       d2,  d17, d7  // (d6 + 1 - d7) >> 1
+    vrhadd.s16      d3,  d6,  d7  // (d6 + d7 + 1) >> 1
+    vst1.64         {d0-d3}, [r0,:128]
+    bx              lr
+endfunc
+
+function idct4x4dc_neon  // r0 = 16 int16 values (32 bytes, 128-bit aligned), transformed in place; only adds and subtracts, no scaling or rounding
+    vld1.64         {d0-d3}, [r0,:128]  // four s16 values per d register
+    SUMSUB_ABCD     d4, d5, d6, d7, d0, d1, d2, d3  // butterflies between registers, stage 1
+    SUMSUB_ABCD     d0, d2, d3, d1, d4, d6, d5, d7  // butterflies between registers, stage 2
+
+    HADAMARD        1, sumsub, q2, q3, q0, q1  // vtrn.16 then butterfly: combines lanes one apart
+    HADAMARD        2, sumsub, d0, d1, d4, d5  // vtrn.32 then butterfly: combines lanes two apart
+    HADAMARD        2, sumsub, d3, d2, d6, d7  // destinations d3, d2 in that order
+    vst1.64         {d0-d3}, [r0,:128]
+    bx              lr
+endfunc
+
+
+.macro DCT_1D d0 d1 d2 d3  d4 d5 d6 d7  // one 4-point pass: inputs s0..s3 in d4..d7 (all four are overwritten), outputs in d0..d3
+    SUMSUB_AB       \d1, \d6, \d5, \d6  // d1 = s1 + s2, d6 = s1 - s2
+    SUMSUB_AB       \d3, \d7, \d4, \d7  // d3 = s0 + s3, d7 = s0 - s3
+    vadd.s16        \d0, \d3, \d1  // out0 = (s0 + s3) + (s1 + s2)
+    vadd.s16        \d4, \d7, \d7  // 2 * (s0 - s3)
+    vadd.s16        \d5, \d6, \d6  // 2 * (s1 - s2)
+    vsub.s16        \d2, \d3, \d1  // out2 = (s0 + s3) - (s1 + s2)
+    vadd.s16        \d1, \d4, \d6  // out1 = 2 * (s0 - s3) + (s1 - s2)
+    vsub.s16        \d3, \d7, \d5  // out3 = (s0 - s3) - 2 * (s1 - s2)
+.endm
+
+function sub4x4_dct_neon  // r0 = output (16 int16, 128-bit aligned); r1, r2 = two 4x4 byte blocks with rows FENC_STRIDE / FDEC_STRIDE apart; transforms r1 minus r2
+    mov             r3, #FENC_STRIDE  // row step for r1
+    mov             ip, #FDEC_STRIDE  // row step for r2
+    vld1.32         {d0[]}, [r1,:32], r3  // four bytes of row 0, replicated to both 32-bit lanes
+    vld1.32         {d1[]}, [r2,:32], ip
+    vld1.32         {d2[]}, [r1,:32], r3
+    vsubl.u8        q8,  d0,  d1  // row 0 difference widened to s16; the low half d16 is what DCT_1D reads
+    vld1.32         {d3[]}, [r2,:32], ip
+    vld1.32         {d4[]}, [r1,:32], r3
+    vsubl.u8        q9,  d2,  d3  // row 1
+    vld1.32         {d5[]}, [r2,:32], ip
+    vld1.32         {d6[]}, [r1,:32], r3
+    vsubl.u8        q10, d4,  d5  // row 2
+    vld1.32         {d7[]}, [r2,:32], ip
+    vsubl.u8        q11, d6,  d7  // row 3
+
+    DCT_1D          d0, d1, d2, d3, d16, d18, d20, d22  // first pass, results in d0-d3
+    TRANSPOSE4x4_16 d0, d1, d2, d3
+    DCT_1D          d4, d5, d6, d7, d0, d1, d2, d3  // second pass on the transposed data, results in d4-d7
+    vst1.64         {d4-d7}, [r0,:128]
+    bx              lr
+endfunc
+
+function sub8x4_dct_neon, export=0
+    vld1.64         {d0}, [r1,:64], r3  // file-local helper: 8x4 difference (r1 minus r2) -> two 4x4 transforms, 64 bytes written at r0; the caller must preload r3 and ip with the row steps
+    vld1.64         {d1}, [r2,:64], ip
+    vsubl.u8        q8,  d0,  d1  // row 0 difference as eight s16 (low half = left four pixels)
+    vld1.64         {d2}, [r1,:64], r3
+    vld1.64         {d3}, [r2,:64], ip
+    vsubl.u8        q9,  d2,  d3  // row 1
+    vld1.64         {d4}, [r1,:64], r3
+    vld1.64         {d5}, [r2,:64], ip
+    vsubl.u8        q10, d4,  d5  // row 2
+    vld1.64         {d6}, [r1,:64], r3
+    vld1.64         {d7}, [r2,:64], ip
+    vsubl.u8        q11, d6,  d7  // row 3; r1 and r2 now point four rows further down
+
+    DCT_1D          q0, q1, q2, q3,  q8, q9, q10, q11  // first pass on both 4x4 halves at once
+    TRANSPOSE4x4_16 q0, q1, q2, q3  // transposes the left (low d halves) and right (high d halves) block independently
+
+    SUMSUB_AB       q8,  q12, q0,  q3  // second pass written out by hand so results can be regrouped per block
+    SUMSUB_AB       q9,  q10, q1,  q2
+    vadd.i16        q13, q12, q12  // 2 * (s0 - s3)
+    vadd.i16        q11, q10, q10  // 2 * (s1 - s2)
+    vadd.i16        d0,  d16, d18  // left block, output rows 0-3 in d0-d3
+    vadd.i16        d1,  d26, d20
+    vsub.i16        d2,  d16, d18
+    vsub.i16        d3,  d24, d22
+    vst1.64         {d0-d1}, [r0,:128]!
+    vadd.i16        d4,  d17, d19  // right block, output rows 0-3 in d4-d7
+    vadd.i16        d5,  d27, d21
+    vst1.64         {d2-d3}, [r0,:128]!
+    vsub.i16        d6,  d17, d19
+    vsub.i16        d7,  d25, d23
+    vst1.64         {d4-d5}, [r0,:128]!
+    vst1.64         {d6-d7}, [r0,:128]!  // r0 has advanced by 64 bytes in total
+    bx              lr
+endfunc
+
+function sub8x8_dct_neon  // r0 = output, r1 / r2 = 8x8 byte blocks (r1 minus r2): two sub8x4_dct_neon calls, top four rows then bottom four
+    push            {lr}
+    mov             r3, #FENC_STRIDE  // row steps expected by the helper
+    mov             ip, #FDEC_STRIDE
+    bl              sub8x4_dct_neon  // leaves r0, r1, r2 positioned for the next four rows
+    pop             {lr}
+    b               sub8x4_dct_neon  // tail call: the helper returns straight to our caller
+endfunc
+
+function sub16x16_dct_neon  // r0 = output, r1 / r2 = 16x16 byte blocks (r1 minus r2): eight sub8x4_dct_neon calls covering four 8x8 quadrants
+    push            {lr}
+    mov             r3, #FENC_STRIDE  // row steps expected by the helper
+    mov             ip, #FDEC_STRIDE
+    bl              sub8x4_dct_neon  // quadrant at rows 0-7, columns 0-7
+    bl              sub8x4_dct_neon
+    sub             r1, r1, #8*FENC_STRIDE-8  // back up eight rows, right eight columns
+    sub             r2, r2, #8*FDEC_STRIDE-8
+    bl              sub8x4_dct_neon  // rows 0-7, columns 8-15
+    bl              sub8x4_dct_neon
+    sub             r1, r1, #8  // back to column 0; the pointers are already at row 8
+    sub             r2, r2, #8
+    bl              sub8x4_dct_neon  // rows 8-15, columns 0-7
+    bl              sub8x4_dct_neon
+    sub             r1, r1, #8*FENC_STRIDE-8  // back up eight rows, right eight columns
+    sub             r2, r2, #8*FDEC_STRIDE-8
+    bl              sub8x4_dct_neon  // rows 8-15, columns 8-15
+    pop             {lr}
+    b               sub8x4_dct_neon  // tail call for the last four rows
+endfunc
+
+
+.macro DCT8_1D type  // one 8-point pass over q8-q15 in place; q0-q3 are scratch; the type argument is not referenced in the body
+    SUMSUB_AB       q2,  q1,  q11, q12  // s34/d34
+    SUMSUB_AB       q3,  q11, q10, q13  // s25/d25
+    SUMSUB_AB       q13, q10, q9,  q14  // s16/d16
+    SUMSUB_AB       q14, q8,  q8,  q15  // s07/d07
+
+    SUMSUB_AB       q9,  q2,  q14, q2   // a0/a2
+    SUMSUB_AB       q12, q14, q13, q3   // a1/a3
+
+    SUMSUB_AB       q3,  q13, q8,  q1   // a6/a5
+    vshr.s16        q0,  q10, #1  // q0 = 1.5 * d16, built as x + (x >> 1) over the next lines
+    vshr.s16        q15, q11, #1  // q15 = 1.5 * d25
+    vadd.s16        q0,  q0,  q10
+    vadd.s16        q15, q15, q11
+    vsub.s16        q3,  q3,  q0
+    vsub.s16        q13, q13, q15
+
+    SUMSUB_AB       q0,  q15, q10, q11  // a4/a7
+    vshr.s16        q10, q8,  #1  // q10 = 1.5 * d07
+    vshr.s16        q11, q1,  #1  // q11 = 1.5 * d34
+    vadd.s16        q10, q10, q8
+    vadd.s16        q11, q11, q1
+    vadd.s16        q10, q0,  q10
+    vadd.s16        q15, q15, q11
+
+    SUMSUB_AB       q8,  q12, q9,  q12  // outputs in q8 and q12
+    SUMSUB_SHR      2, q9,  q15, q10, q15,  q0, q1  // outputs in q9 and q15
+    SUMSUB_SHR      1, q10, q14, q2,  q14,  q0, q1  // outputs in q10 and q14
+    SUMSUB_SHR2     2, q11, q13, q3,  q13,  q0, q1  // outputs in q11 and q13
+.endm
+
+function sub8x8_dct8_neon  // r0 = output (64 int16, 128-bit aligned); r1, r2 = 8x8 byte blocks with rows FENC_STRIDE / FDEC_STRIDE apart; 8x8 transform of r1 minus r2
+    mov             r3, #FENC_STRIDE
+    mov             ip, #FDEC_STRIDE
+    vld1.64         {d16}, [r1,:64], r3  // per row: eight bytes from each block, then the difference widened to s16
+    vld1.64         {d17}, [r2,:64], ip
+    vsubl.u8        q8,  d16, d17  // row 0
+    vld1.64         {d18}, [r1,:64], r3
+    vld1.64         {d19}, [r2,:64], ip
+    vsubl.u8        q9,  d18, d19  // row 1
+    vld1.64         {d20}, [r1,:64], r3
+    vld1.64         {d21}, [r2,:64], ip
+    vsubl.u8        q10, d20, d21  // row 2
+    vld1.64         {d22}, [r1,:64], r3
+    vld1.64         {d23}, [r2,:64], ip
+    vsubl.u8        q11, d22, d23  // row 3
+    vld1.64         {d24}, [r1,:64], r3
+    vld1.64         {d25}, [r2,:64], ip
+    vsubl.u8        q12, d24, d25  // row 4
+    vld1.64         {d26}, [r1,:64], r3
+    vld1.64         {d27}, [r2,:64], ip
+    vsubl.u8        q13, d26, d27  // row 5
+    vld1.64         {d28}, [r1,:64], r3
+    vld1.64         {d29}, [r2,:64], ip
+    vsubl.u8        q14, d28, d29  // row 6
+    vld1.64         {d30}, [r1,:64], r3
+    vld1.64         {d31}, [r2,:64], ip
+    vsubl.u8        q15, d30, d31  // row 7; r1 and r2 now point eight rows further down
+
+    DCT8_1D         row  // first pass
+    vswp            d17, d24    // 8, 12
+    vswp            d21, d28    // 10,14
+    vtrn.32         q8,  q10  // the swaps and vtrn.32 / vtrn.16 steps here transpose the 8x8 block of s16 held in q8-q15
+    vtrn.32         q12, q14
+
+    vswp            d19, d26    // 9, 13
+    vswp            d23, d30    // 11,15
+    vtrn.32         q9,  q11
+    vtrn.32         q13, q15
+
+    vtrn.16         q10, q11
+    vtrn.16         q12, q13
+    vtrn.16         q8,  q9
+    vtrn.16         q14, q15
+    DCT8_1D         col  // second pass on the transposed block
+
+    vst1.64         {d16-d19}, [r0,:128]!  // 128 bytes stored in register order q8..q15
+    vst1.64         {d20-d23}, [r0,:128]!
+    vst1.64         {d24-d27}, [r0,:128]!
+    vst1.64         {d28-d31}, [r0,:128]!
+    bx              lr
+endfunc
+
+function sub16x16_dct8_neon  // r0 = output, r1 / r2 = 16x16 byte blocks (r1 minus r2): four sub8x8_dct8_neon calls, one per 8x8 quadrant
+    push            {lr}
+    bl              X(sub8x8_dct8_neon)  // rows 0-7, columns 0-7; returns with r1 / r2 eight rows further down
+    sub             r1,  r1,  #FENC_STRIDE*8 - 8  // back up eight rows, right eight columns
+    sub             r2,  r2,  #FDEC_STRIDE*8 - 8
+    bl              X(sub8x8_dct8_neon)  // rows 0-7, columns 8-15
+    sub             r1,  r1,  #8  // back to column 0; the pointers are already at row 8
+    sub             r2,  r2,  #8
+    bl              X(sub8x8_dct8_neon)  // rows 8-15, columns 0-7
+    pop             {lr}
+    sub             r1,  r1,  #FENC_STRIDE*8 - 8  // back up eight rows, right eight columns
+    sub             r2,  r2,  #FDEC_STRIDE*8 - 8
+    b               X(sub8x8_dct8_neon)  // tail call for rows 8-15, columns 8-15
+endfunc
+
+
+// First part of the 4-point IDCT pass; callers complete it with a final SUMSUB_AB butterfly on the results. Inputs d0..d3 are not modified.
+.macro IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3
+    SUMSUB_AB       \d4, \d5, \d0, \d2  // d4 = d0 + d2, d5 = d0 - d2
+    vshr.s16        \d7, \d1, #1
+    vshr.s16        \d6, \d3, #1
+    vsub.s16        \d7, \d7, \d3  // d7 = (d1 >> 1) - d3
+    vadd.s16        \d6, \d6, \d1  // d6 = (d3 >> 1) + d1
+.endm
+
+function add4x4_idct_neon  // r0 = 4x4 byte block with rows FDEC_STRIDE apart, updated in place; r1 = 16 int16 coefficients (128-bit aligned)
+    mov             r2, #FDEC_STRIDE
+    vld1.64         {d0-d3}, [r1,:128]
+
+    IDCT_1D         d4, d5, d6, d7, d0, d1, d2, d3  // first pass
+    vld1.32         {d30[0]}, [r0,:32], r2  // pixel row 0, loaded early between the arithmetic
+    SUMSUB_AB       q0, q1, q2, q3  // final butterfly: q0 = d0, d1; q1 = d2, d3 with the last two results in reversed order
+
+    TRANSPOSE4x4_16 d0, d1, d3, d2  // d3, d2 order undoes that reversal
+
+    IDCT_1D         d4, d5, d6, d7, d0, d1, d3, d2  // second pass
+    vld1.32         {d30[1]}, [r0,:32], r2  // pixel row 1
+    SUMSUB_AB       q0, q1, q2, q3  // again leaves row 3 in d2 and row 2 in d3
+
+    vrshr.s16       q0, q0, #6  // rounding shift: (x + 32) >> 6
+    vld1.32         {d31[1]}, [r0,:32], r2  // pixel row 2 goes into lane 1 to match the reversed result order
+    vrshr.s16       q1, q1, #6
+    vld1.32         {d31[0]}, [r0,:32], r2  // pixel row 3 goes into lane 0
+
+    sub             r0, r0, r2, lsl #2  // rewind four rows for the stores
+    vaddw.u8        q0, q0, d30  // residual + pixels, still s16
+    vaddw.u8        q1, q1, d31
+    vqmovun.s16     d0, q0  // saturate to unsigned bytes
+    vqmovun.s16     d2, q1
+
+    vst1.32         {d0[0]}, [r0,:32], r2  // row 0
+    vst1.32         {d0[1]}, [r0,:32], r2  // row 1
+    vst1.32         {d2[1]}, [r0,:32], r2  // row 2
+    vst1.32         {d2[0]}, [r0,:32], r2  // row 3
+    bx              lr
+endfunc
+
+function add8x4_idct_neon, export=0
+    vld1.64         {d0-d3}, [r1,:128]!  // file-local helper: adds two 4x4 inverse transforms (64 bytes of coefficients at r1) to an 8x4 byte block at r0; the caller must preload r2 with the row step
+    IDCT_1D         d16, d18, d20, d22, d0, d1, d2, d3  // first block, results in the low halves of q8-q11
+    vld1.64         {d4-d7}, [r1,:128]!  // coefficients of the second block
+    IDCT_1D         d17, d19, d21, d23, d4, d5, d6, d7  // second block, results in the high halves of q8-q11
+    SUMSUB_AB       q0,  q3,  q8,  q10  // final butterfly of the first pass, both blocks at once
+    SUMSUB_AB       q1,  q2,  q9,  q11
+
+    TRANSPOSE4x4_16 q0,  q1,  q2,  q3
+
+    IDCT_1D         q8,  q9,  q10, q11, q0, q1, q2, q3  // second pass, both blocks at once
+    SUMSUB_AB       q0,  q3,  q8,  q10
+    SUMSUB_AB       q1,  q2,  q9,  q11
+
+    vrshr.s16       q0,  q0,  #6  // rounding shift: (x + 32) >> 6
+    vld1.32         {d28}, [r0,:64], r2  // pixel rows are loaded between the shifts
+    vrshr.s16       q1,  q1,  #6
+    vld1.32         {d29}, [r0,:64], r2
+    vrshr.s16       q2,  q2,  #6
+    vld1.32         {d30}, [r0,:64], r2
+    vrshr.s16       q3,  q3,  #6
+    vld1.32         {d31}, [r0,:64], r2
+
+    sub             r0,  r0,  r2,  lsl #2  // rewind four rows for the stores
+    vaddw.u8        q0,  q0,  d28  // residual + pixels, still s16
+    vaddw.u8        q1,  q1,  d29
+    vaddw.u8        q2,  q2,  d30
+    vaddw.u8        q3,  q3,  d31
+
+    vqmovun.s16     d0,  q0  // saturate to unsigned bytes
+    vqmovun.s16     d1,  q1
+    vst1.32         {d0}, [r0,:64], r2
+    vqmovun.s16     d2,  q2
+    vst1.32         {d1}, [r0,:64], r2
+    vqmovun.s16     d3,  q3
+    vst1.32         {d2}, [r0,:64], r2
+    vst1.32         {d3}, [r0,:64], r2  // r0 ends four rows below where it started
+    bx              lr
+endfunc
+
+function add8x8_idct_neon  // r0 = 8x8 byte block updated in place, r1 = coefficients: two add8x4_idct_neon calls, top four rows then bottom four
+    mov             r2, #FDEC_STRIDE  // row step expected by the helper
+    mov             ip, lr  // keep the return address in ip instead of on the stack; the helper does not touch ip
+    bl              add8x4_idct_neon  // leaves r0 and r1 positioned for the next four rows
+    mov             lr, ip
+    b               add8x4_idct_neon  // tail call: the helper returns straight to our caller
+endfunc
+
+function add16x16_idct_neon  // r0 = 16x16 byte block updated in place, r1 = coefficients: eight add8x4_idct_neon calls covering four 8x8 quadrants
+    mov             r2, #FDEC_STRIDE  // row step expected by the helper
+    mov             ip, lr  // return address kept in ip; the helper does not touch ip
+    bl              add8x4_idct_neon  // quadrant at rows 0-7, columns 0-7
+    bl              add8x4_idct_neon
+    sub             r0, r0, #8*FDEC_STRIDE-8  // back up eight rows, right eight columns
+    bl              add8x4_idct_neon  // rows 0-7, columns 8-15
+    bl              add8x4_idct_neon
+    sub             r0, r0, #8  // back to column 0; r0 is already at row 8
+    bl              add8x4_idct_neon  // rows 8-15, columns 0-7
+    bl              add8x4_idct_neon
+    sub             r0, r0, #8*FDEC_STRIDE-8  // back up eight rows, right eight columns
+    bl              add8x4_idct_neon  // rows 8-15, columns 8-15
+    mov             lr, ip
+    b               add8x4_idct_neon  // tail call for the last four rows
+endfunc
+
+
+.macro IDCT8_1D type  // one 8-point inverse pass over q8-q15 in place; q0-q3 are scratch; type is row or col and selects the interleaved load / swap steps
+.ifc \type, col
+    vswp            d21, d28
+.endif
+    SUMSUB_AB       q0,  q1,  q8,  q12              // a0/a2
+.ifc \type, row
+    vld1.64         {d28-d31}, [r1,:128]!  // row pass: fetch the last two coefficient rows (q14, q15) here; the caller has loaded q8-q13
+.else
+    vswp            d19, d26  // col pass: this and the other two vswp steps are 64-bit half swaps, presumably finishing the transpose started by the caller
+.endif
+    SUMSUB_SHR      1, q2,  q3,  q10, q14,  q8, q12    // a6/a4
+.ifc \type, col
+    vswp            d23, d30
+.endif
+    SUMSUB_AB       q8,  q10, q13, q11
+    SUMSUB_15       q8,  q10, q9,  q15,  q12, q14   // a7/a1
+    SUMSUB_AB       q14, q15, q15, q9
+    SUMSUB_15       q15, q14, q13, q11,  q12, q9    // a5/a3
+
+    SUMSUB_SHR      2, q13, q14, q14, q15,  q11, q9    // b3/b5
+    SUMSUB_SHR2     2, q12, q15, q8,  q10,  q11, q9    // b1/b7
+
+    SUMSUB_AB       q10, q2,  q0,  q2               // b0/b6
+    SUMSUB_AB       q11, q3,  q1,  q3               // b2/b4
+
+    SUMSUB_AB       q8,  q15, q10, q15  // outputs in q8 and q15
+    SUMSUB_AB       q9,  q14, q11, q14  // outputs in q9 and q14
+    SUMSUB_AB       q10, q13, q3,  q13  // outputs in q10 and q13
+.ifc \type, row
+    vtrn.16         q8,  q9  // row pass: first step of the transpose, issued as soon as q8 and q9 are final
+.endif
+    SUMSUB_AB       q11, q12, q2,  q12  // outputs in q11 and q12
+.endm
+
+function add8x8_idct8_neon  // r0 = 8x8 byte block with rows FDEC_STRIDE apart, updated in place; r1 = 64 int16 coefficients (128-bit aligned), advanced by 128 bytes
+    mov             r2,  #FDEC_STRIDE
+    vld1.64         {d16-d19}, [r1,:128]!  // coefficient rows 0-5 into q8-q13; rows 6-7 are loaded inside IDCT8_1D row
+    vld1.64         {d20-d23}, [r1,:128]!
+    vld1.64         {d24-d27}, [r1,:128]!
+
+    IDCT8_1D        row  // first pass; also performs vtrn.16 q8, q9
+    vtrn.16         q10, q11  // remaining transpose steps; the last 64-bit swaps happen inside IDCT8_1D col
+    vtrn.16         q12, q13
+    vtrn.16         q14, q15
+    vtrn.32         q8,  q10
+    vtrn.32         q9,  q11
+    vtrn.32         q12, q14
+    vtrn.32         q13, q15
+    vswp            d17, d24
+    IDCT8_1D        col  // second pass
+
+    vld1.64         {d0}, [r0,:64], r2  // pixel rows 0-7 into d0-d7, interleaved with the rounding shifts
+    vrshr.s16       q8,  q8,  #6  // rounding shift: (x + 32) >> 6
+    vld1.64         {d1}, [r0,:64], r2
+    vrshr.s16       q9,  q9,  #6
+    vld1.64         {d2}, [r0,:64], r2
+    vrshr.s16       q10, q10, #6
+    vld1.64         {d3}, [r0,:64], r2
+    vrshr.s16       q11, q11, #6
+    vld1.64         {d4}, [r0,:64], r2
+    vrshr.s16       q12, q12, #6
+    vld1.64         {d5}, [r0,:64], r2
+    vrshr.s16       q13, q13, #6
+    vld1.64         {d6}, [r0,:64], r2
+    vrshr.s16       q14, q14, #6
+    vld1.64         {d7}, [r0,:64], r2
+    vrshr.s16       q15, q15, #6
+    sub             r0,  r0,  r2,  lsl #3  // rewind eight rows for the stores
+
+    vaddw.u8        q8,  q8,  d0  // residual + pixels, still s16
+    vaddw.u8        q9,  q9,  d1
+    vaddw.u8        q10, q10, d2
+    vqmovun.s16     d0,  q8  // saturate to unsigned bytes; d0-d7 are reused for the results
+    vqmovun.s16     d1,  q9
+    vqmovun.s16     d2,  q10
+    vaddw.u8        q11, q11, d3
+    vst1.64         {d0}, [r0,:64], r2
+    vaddw.u8        q12, q12, d4
+    vst1.64         {d1}, [r0,:64], r2
+    vaddw.u8        q13, q13, d5
+    vst1.64         {d2}, [r0,:64], r2
+    vqmovun.s16     d3,  q11
+    vqmovun.s16     d4,  q12
+    vaddw.u8        q14, q14, d6
+    vaddw.u8        q15, q15, d7
+    vst1.64         {d3}, [r0,:64], r2
+    vqmovun.s16     d5,  q13
+    vst1.64         {d4}, [r0,:64], r2
+    vqmovun.s16     d6,  q14
+    vqmovun.s16     d7,  q15
+    vst1.64         {d5}, [r0,:64], r2
+    vst1.64         {d6}, [r0,:64], r2
+    vst1.64         {d7}, [r0,:64], r2  // r0 ends eight rows below where it started
+    bx              lr
+endfunc
+
+function add16x16_idct8_neon  // r0 = 16x16 byte block updated in place, r1 = coefficients: four add8x8_idct8_neon calls, one per 8x8 quadrant
+    mov             ip,  lr  // return address kept in ip; add8x8_idct8_neon does not touch ip
+    bl              X(add8x8_idct8_neon)  // rows 0-7, columns 0-7; returns with r0 eight rows further down
+    sub             r0,  r0,  #8*FDEC_STRIDE-8  // back up eight rows, right eight columns
+    bl              X(add8x8_idct8_neon)  // rows 0-7, columns 8-15
+    sub             r0,  r0,  #8  // back to column 0; r0 is already at row 8
+    bl              X(add8x8_idct8_neon)  // rows 8-15, columns 0-7
+    sub             r0,  r0,  #8*FDEC_STRIDE-8  // back up eight rows, right eight columns
+    mov             lr,  ip
+    b               X(add8x8_idct8_neon)  // tail call for rows 8-15, columns 8-15
+endfunc
+
+
+function add8x8_idct_dc_neon  // r0 = 8x8 byte block with rows FDEC_STRIDE apart, updated in place; r1 = four int16 DC values, one per 4x4 quadrant; result saturates to 0..255
+    mov             r2,  #FDEC_STRIDE
+    vld1.64         {d16}, [r1,:64]  // the four DC values
+    vrshr.s16       d16, d16, #6  // rounding shift: (dc + 32) >> 6
+    vld1.64         {d0}, [r0,:64], r2  // pixel rows 0-7 go to d0-d7, interleaved with the DC setup
+    vmov.i16        q15, #0  // zero, used for the negation below
+    vld1.64         {d1}, [r0,:64], r2
+    vld1.64         {d2}, [r0,:64], r2
+    vdup.16         d20, d16[0]  // q10 = dc0 x4, dc1 x4 (top quadrants)
+    vld1.64         {d3}, [r0,:64], r2
+    vdup.16         d21, d16[1]
+    vld1.64         {d4}, [r0,:64], r2
+    vdup.16         d22, d16[2]  // q11 = dc2 x4, dc3 x4 (bottom quadrants)
+    vld1.64         {d5}, [r0,:64], r2
+    vdup.16         d23, d16[3]
+    vld1.64         {d6}, [r0,:64], r2
+    vsub.s16        q12, q15, q10  // q12 = -q10
+    vld1.64         {d7}, [r0,:64], r2
+    vsub.s16        q13, q15, q11  // q13 = -q11
+
+    sub             r0,  r0,  #8*FDEC_STRIDE  // rewind to row 0 for the stores
+
+    vqmovun.s16     d20, q10  // positive DCs as bytes (negative values saturate to 0)
+    vqmovun.s16     d22, q11
+    vqmovun.s16     d24, q12  // magnitudes of the negative DCs as bytes
+    vqmovun.s16     d26, q13
+
+    vmov            d21, d20  // duplicate the 8-byte pattern so one q register covers two rows
+    vqadd.u8        q0,  q0,  q10  // rows 0-1: saturating add of the positive part
+    vmov            d23, d22
+    vqadd.u8        q1,  q1,  q10  // rows 2-3
+    vmov            d25, d24
+    vqadd.u8        q2,  q2,  q11  // rows 4-5
+    vmov            d27, d26
+    vqadd.u8        q3,  q3,  q11  // rows 6-7
+    vqsub.u8        q0,  q0,  q12  // saturating subtract of the negative part
+    vqsub.u8        q1,  q1,  q12
+    vqsub.u8        q2,  q2,  q13
+
+    vst1.64         {d0}, [r0,:64], r2
+    vqsub.u8        q3,  q3,  q13
+    vst1.64         {d1}, [r0,:64], r2
+    vst1.64         {d2}, [r0,:64], r2
+    vst1.64         {d3}, [r0,:64], r2
+    vst1.64         {d4}, [r0,:64], r2
+    vst1.64         {d5}, [r0,:64], r2
+    vst1.64         {d6}, [r0,:64], r2
+    vst1.64         {d7}, [r0,:64], r2
+    bx              lr
+endfunc
+
+.macro ADD16x4_IDCT_DC dc    @ add the four 16-bit DCs in d-register dc to four 16-byte rows; expects r0 = read pointer, r2 = write pointer, r3 = row step, q15 = 0
+    vld1.64         {d16-d17}, [r0,:128], r3    @ q8-q11 = four pixel rows
+    vld1.64         {d18-d19}, [r0,:128], r3
+    vdup.16         d4,  \dc[0]    @ q2 = dc0 x4, dc1 x4 as 16-bit lanes
+    vdup.16         d5,  \dc[1]
+    vld1.64         {d20-d21}, [r0,:128], r3
+    vdup.16         d6,  \dc[2]    @ q3 = dc2 x4, dc3 x4
+    vdup.16         d7,  \dc[3]
+    vld1.64         {d22-d23}, [r0,:128], r3
+    vsub.s16        q12, q15, q2    @ q12 = -q2
+    vsub.s16        q13, q15, q3    @ q13 = -q3
+
+    vqmovun.s16     d4,  q2    @ q2 = 16 bytes holding the positive DCs (negative ones saturate to 0)
+    vqmovun.s16     d5,  q3
+    vqmovun.s16     d6,  q12    @ q3 = 16 bytes holding the magnitudes of the negative DCs
+    vqmovun.s16     d7,  q13
+
+    vqadd.u8        q8,  q8,  q2    @ saturating add of the positive part to every row
+    vqadd.u8        q9,  q9,  q2
+    vqadd.u8        q10, q10, q2
+    vqadd.u8        q11, q11, q2
+
+    vqsub.u8        q8,  q8,  q3    @ saturating subtract of the negative part
+    vqsub.u8        q9,  q9,  q3
+    vqsub.u8        q10, q10, q3
+    vst1.64         {d16-d17}, [r2,:128], r3    @ stores go through r2, which trails the read pointer r0
+    vqsub.u8        q11, q11, q3
+    vst1.64         {d18-d19}, [r2,:128], r3
+    vst1.64         {d20-d21}, [r2,:128], r3
+    vst1.64         {d22-d23}, [r2,:128], r3
+.endm
+
+function add16x16_idct_dc_neon    @ r0 = p_dst (16 rows of 16 bytes), r1 = dct[16]: add one rounded DC to each 4x4 block
+    mov             r2,  r0    @ r2 = write pointer, r0 stays the read pointer (both advanced inside the macro)
+    mov             r3,  #FDEC_STRIDE    @ r3 = row step
+    vmov.i16        q15, #0    @ zero register the macro uses to negate the DCs
+
+    vld1.64         {d0-d3}, [r1,:64]    @ d0-d3 = the sixteen 16-bit DC values
+    vrshr.s16       q0, #6    @ dc = (dc + 32) >> 6
+    vrshr.s16       q1, #6
+
+    ADD16x4_IDCT_DC d0    @ rows 0-3 use dct[0..3], one DC per group of four columns
+    ADD16x4_IDCT_DC d1    @ rows 4-7 use dct[4..7]
+    ADD16x4_IDCT_DC d2    @ rows 8-11 use dct[8..11]
+    ADD16x4_IDCT_DC d3    @ rows 12-15 use dct[12..15]
+    bx              lr
+endfunc
+
+function sub8x8_dct_dc_neon    @ r0 = dct[4] out, r1 = pix1, r2 = pix2: combine the four 4x4 quadrant sums of pix1 - pix2
+    mov             r3,  #FENC_STRIDE    @ row step for pix1
+    mov             ip,  #FDEC_STRIDE    @ row step for pix2
+    vld1.64         {d16}, [r1,:64], r3
+    vld1.64         {d17}, [r2,:64], ip
+    vsubl.u8        q8,  d16, d17    @ q8 = row 0 of pix1 - pix2, widened to 16 bits (q9-q15 = rows 1-7 below)
+    vld1.64         {d18}, [r1,:64], r3
+    vld1.64         {d19}, [r2,:64], ip
+    vsubl.u8        q9,  d18, d19
+    vld1.64         {d20}, [r1,:64], r3
+    vld1.64         {d21}, [r2,:64], ip
+    vsubl.u8        q10, d20, d21
+    vld1.64         {d22}, [r1,:64], r3
+    vadd.s16        q0,  q8,  q9    @ q0 accumulates rows 0-3, one sum per column
+    vld1.64         {d23}, [r2,:64], ip
+    vsubl.u8        q11, d22, d23
+    vld1.64         {d24}, [r1,:64], r3
+    vadd.s16        q0,  q0,  q10
+    vld1.64         {d25}, [r2,:64], ip
+    vsubl.u8        q12, d24, d25
+    vld1.64         {d26}, [r1,:64], r3
+    vadd.s16        q0,  q0,  q11    @ d0 = columns 0-3 of the top half, d1 = columns 4-7
+    vld1.64         {d27}, [r2,:64], ip
+    vsubl.u8        q13, d26, d27
+    vld1.64         {d28}, [r1,:64], r3
+    vld1.64         {d29}, [r2,:64], ip
+    vsubl.u8        q14, d28, d29
+    vld1.64         {d30}, [r1,:64], r3
+    vadd.s16        q1,  q12, q13    @ q1 accumulates rows 4-7
+    vld1.64         {d31}, [r2,:64], ip
+    vsubl.u8        q15, d30, d31
+    vadd.s16        q1,  q1,  q14
+
+    vadd.s16        d4,  d0,  d1    @ top half: left + right
+    vadd.s16        q1,  q1,  q15    @ d2 = columns 0-3 of the bottom half, d3 = columns 4-7
+    vsub.s16        d5,  d0,  d1    @ top half: left - right
+    vadd.s16        d6,  d2,  d3    @ bottom half: left + right
+    vsub.s16        d7,  d2,  d3    @ bottom half: left - right
+    vadd.s16        q0,  q2,  q3    @ d0 = TL+TR+BL+BR, d1 = TL-TR+BL-BR (still four partial lanes each; T/B = top/bottom, L/R = left/right quadrant)
+    vsub.s16        q1,  q2,  q3    @ d2 = TL+TR-BL-BR, d3 = TL-TR-BL+BR
+
+    vpadd.s16       d0,  d0,  d2    @ pairwise adds fold the four lanes of each term into one value
+    vpadd.s16       d1,  d1,  d3
+    vpadd.s16       d0,  d0,  d1    @ d0 = totals of the old d0, d2, d1, d3, in that order
+    vst1.64         {d0}, [r0,:64]    @ dct[0..3]
+    bx              lr
+endfunc
+
+function sub8x16_dct_dc_neon    @ r0 = dct[8] out, r1 = pix1, r2 = pix2: combine the eight 4x4 block sums of pix1 - pix2 over 16 rows
+    mov             r3,  #FENC_STRIDE    @ row step for pix1
+    mov             ip,  #FDEC_STRIDE    @ row step for pix2
+    vld1.64         {d16}, [r1,:64], r3
+    vld1.64         {d17}, [r2,:64], ip
+    vsubl.u8        q8,  d16, d17    @ q8-q15 = rows 0-7 of pix1 - pix2, widened to 16 bits
+    vld1.64         {d18}, [r1,:64], r3
+    vld1.64         {d19}, [r2,:64], ip
+    vsubl.u8        q9,  d18, d19
+    vld1.64         {d20}, [r1,:64], r3
+    vld1.64         {d21}, [r2,:64], ip
+    vsubl.u8        q10, d20, d21
+    vld1.64         {d22}, [r1,:64], r3
+    vadd.s16        q0,  q8,  q9    @ q0 accumulates rows 0-3, one sum per column
+    vld1.64         {d23}, [r2,:64], ip
+    vsubl.u8        q11, d22, d23
+    vld1.64         {d24}, [r1,:64], r3
+    vadd.s16        q0,  q0,  q10
+    vld1.64         {d25}, [r2,:64], ip
+    vsubl.u8        q12, d24, d25
+    vld1.64         {d26}, [r1,:64], r3
+    vadd.s16        q0,  q0,  q11
+    vld1.64         {d27}, [r2,:64], ip
+    vsubl.u8        q13, d26, d27
+    vld1.64         {d28}, [r1,:64], r3
+    vld1.64         {d29}, [r2,:64], ip
+    vsubl.u8        q14, d28, d29
+    vld1.64         {d30}, [r1,:64], r3
+    vadd.s16        q1,  q12, q13    @ q1 accumulates rows 4-7
+    vld1.64         {d31}, [r2,:64], ip
+    vsubl.u8        q15, d30, d31
+
+    vld1.64         {d16}, [r1,:64], r3    @ rows 8-15 reuse q8-q15 while q1 is still being finished
+    vadd.s16        q1,  q1,  q14
+    vld1.64         {d17}, [r2,:64], ip
+    vadd.s16        q1,  q1,  q15
+    vld1.64         {d18}, [r1,:64], r3
+    vsubl.u8        q8,  d16, d17
+    vld1.64         {d19}, [r2,:64], ip
+    vsubl.u8        q9,  d18, d19
+    vld1.64         {d20}, [r1,:64], r3
+    vld1.64         {d21}, [r2,:64], ip
+    vsubl.u8        q10, d20, d21
+    vld1.64         {d22}, [r1,:64], r3
+    vadd.s16        q2,  q8,  q9    @ q2 accumulates rows 8-11
+    vld1.64         {d23}, [r2,:64], ip
+    vsubl.u8        q11, d22, d23
+    vld1.64         {d24}, [r1,:64], r3
+    vadd.s16        q2,  q2,  q10
+    vld1.64         {d25}, [r2,:64], ip
+    vsubl.u8        q12, d24, d25
+    vld1.64         {d26}, [r1,:64], r3
+    vadd.s16        q2,  q2,  q11
+    vld1.64         {d27}, [r2,:64], ip
+    vsubl.u8        q13, d26, d27
+    vld1.64         {d28}, [r1,:64], r3
+    vld1.64         {d29}, [r2,:64], ip
+    vsubl.u8        q14, d28, d29
+    vld1.64         {d30}, [r1,:64], r3
+    vadd.s16        q3,  q12, q13    @ q3 accumulates rows 12-15
+    vld1.64         {d31}, [r2,:64], ip
+    vsubl.u8        q15, d30, d31
+    vadd.s16        q3,  q3,  q14
+
+    vadd.s16        d16, d0,  d1  @ b0
+    vadd.s16        q3,  q3,  q15
+    vsub.s16        d17, d0,  d1  @ b4
+    vadd.s16        d18, d2,  d3  @ b1
+    vsub.s16        d19, d2,  d3  @ b5
+    vadd.s16        d20, d4,  d5  @ b2
+    vsub.s16        d21, d4,  d5  @ b6
+    vadd.s16        d22, d6,  d7  @ b3
+    vsub.s16        d23, d6,  d7  @ b7
+    vadd.s16        q0,  q8,  q9  @ b0 + b1, b4 + b5; a0, a2
+    vsub.s16        q1,  q8,  q9  @ b0 - b1, b4 - b5; a4, a6
+    vadd.s16        q2,  q10, q11 @ b2 + b3, b6 + b7; a1, a3
+    vsub.s16        q3,  q10, q11 @ b2 - b3, b6 - b7; a5, a7
+
+    vadd.s16        q8,  q0,  q2  @ a0 + a1, a2 + a3
+    vsub.s16        q9,  q0,  q2  @ a0 - a1, a2 - a3
+    vsub.s16        q10, q1,  q3  @ a4 - a5, a6 - a7
+    vadd.s16        q11, q1,  q3  @ a4 + a5, a6 + a7
+
+    vpadd.s16       d0,  d16, d17    @ pairwise adds fold the four partial lanes of each of d16-d23 into one value
+    vpadd.s16       d1,  d18, d19
+    vpadd.s16       d2,  d20, d21
+    vpadd.s16       d3,  d22, d23
+    vpadd.s16       d0,  d0,  d1    @ d0 = totals of d16, d17, d18, d19
+    vpadd.s16       d1,  d2,  d3    @ d1 = totals of d20, d21, d22, d23
+    vst1.64         {q0}, [r0,:64]    @ dct[0..7]
+    bx              lr
+endfunc
+
+
+function zigzag_scan_4x4_frame_neon    @ r0 = level[16] out, r1 = dct[16] in: reorder the coefficients through a byte-index table
+    movrel      r2, scan4x4_frame    @ r2 = address of the 32-byte index table (defined outside this view)
+    vld1.64     {d0-d3},   [r1,:128]    @ d0-d3 = the 16 input coefficients
+    vld1.64     {d16-d19}, [r2,:128]    @ d16-d19 = byte indices, one d register per 8 output bytes
+    vtbl.8      d4, {d0-d1}, d16    @ each lookup reads a different window of d0-d3, so the indices in its table row are relative to that window
+    vtbl.8      d5, {d1-d3}, d17
+    vtbl.8      d6, {d0-d2}, d18
+    vtbl.8      d7, {d2-d3}, d19
+    vst1.64     {d4-d7},   [r0,:128]    @ write the 16 reordered coefficients
+    bx          lr
+endfunc
--- a/common/arm/dct.h
+++ b/common/arm/dct.h
@@ -0,0 +1,70 @@
+/*****************************************************************************
+ * dct.h: arm transform and zigzag
+ *****************************************************************************
+ * Copyright (C) 2009-2025 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_ARM_DCT_H
+#define X264_ARM_DCT_H
+
+#define x264_dct4x4dc_neon x264_template(dct4x4dc_neon) /* x264_template is defined outside this header; every prototype below is renamed through it */
+void x264_dct4x4dc_neon( int16_t d[16] );
+#define x264_idct4x4dc_neon x264_template(idct4x4dc_neon)
+void x264_idct4x4dc_neon( int16_t d[16] );
+
+#define x264_sub4x4_dct_neon x264_template(sub4x4_dct_neon)
+void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
+#define x264_sub8x8_dct_neon x264_template(sub8x8_dct_neon)
+void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
+#define x264_sub16x16_dct_neon x264_template(sub16x16_dct_neon)
+void x264_sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+
+#define x264_add4x4_idct_neon x264_template(add4x4_idct_neon)
+void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] );
+#define x264_add8x8_idct_neon x264_template(add8x8_idct_neon)
+void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );
+#define x264_add16x16_idct_neon x264_template(add16x16_idct_neon)
+void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );
+
+#define x264_add8x8_idct_dc_neon x264_template(add8x8_idct_dc_neon)
+void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] ); /* adds (dct[i] + 32) >> 6 to 4x4 quadrant i of the 8x8 block at p_dst, saturating to 8 bits; rows are FDEC_STRIDE apart */
+#define x264_add16x16_idct_dc_neon x264_template(add16x16_idct_dc_neon)
+void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] ); /* same DC add over 16x16: dct[4*r + c] is applied to the 4x4 block in block row r, block column c */
+#define x264_sub8x8_dct_dc_neon x264_template(sub8x8_dct_dc_neon)
+void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 ); /* writes four sum/difference combinations of the 4x4 quadrant sums of pix1 - pix2; pix1 rows are FENC_STRIDE apart, pix2 rows FDEC_STRIDE apart */
+#define x264_sub8x16_dct_dc_neon x264_template(sub8x16_dct_dc_neon)
+void x264_sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 ); /* 8 columns x 16 rows variant: writes eight combinations, same strides */
+
+#define x264_sub8x8_dct8_neon x264_template(sub8x8_dct8_neon)
+void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
+#define x264_sub16x16_dct8_neon x264_template(sub16x16_dct8_neon)
+void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
+
+#define x264_add8x8_idct8_neon x264_template(add8x8_idct8_neon)
+void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] );
+#define x264_add16x16_idct8_neon x264_template(add16x16_idct8_neon)
+void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] ); /* the asm implements this as four calls to add8x8_idct8_neon */
+
+#define x264_zigzag_scan_4x4_frame_neon x264_template(zigzag_scan_4x4_frame_neon)
+void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] ); /* level = the 16 entries of dct permuted through the scan4x4_frame byte table */
+
+#endif
--- a/common/arm/deblock-a.S
+++ b/common/arm/deblock-a.S
@@ -0,0 +1,795 @@
+/*****************************************************************************
+ * deblock-a.S: arm deblocking
+ *****************************************************************************
+ * Copyright (C) 2009-2025 x264 project
+ *
+ * Authors: Mans Rullgard <mans@mansr.com>
+ *          Martin Storsjo <martin@martin.st>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.macro h264_loop_filter_start    @ load the four tc0 bytes into d24 and return from the invoking function when all four are negative; must run before sp is moved
+    ldr             ip,  [sp]    @ ip = first stack argument, used as a pointer (tc0 by H.264 naming; the C prototype is not visible here)
+    ldr             ip,  [ip]    @ ip = the four tc0 bytes as one word
+    vdup.32         d24, ip    @ d24 = those four bytes, twice; the filter macros expand them per pixel
+    and             ip,  ip,  ip, lsl #16    @ top byte = byte3 & byte1, next byte = byte2 & byte0
+    ands            ip,  ip,  ip, lsl #8    @ bit 31, and so the N flag, = AND of the sign bits of all four bytes
+    bxmi            lr    @ test N alone: ands leaves V untouched, so the former lt test (N != V) depended on the stale V flag of the caller
+.endm
+
+.macro align_push_regs    @ save d8-d15 (q4-q7) in a 16-byte aligned stack area; leaves the undo amount in ip for align_pop_regs
+    and             ip,  sp,  #15    @ ip = misalignment of sp
+    add             ip,  ip,  #32    @ plus room for d12-d15
+    sub             sp,  sp,  ip    @ sp is now 16-byte aligned
+    vst1.64         {d12-d15}, [sp,:128]
+    sub             sp,  sp,  #32    @ second 32-byte slot, for d8-d11
+    vst1.64         {d8-d11},  [sp,:128]    @ ip must stay intact until align_pop_regs runs
+.endm
+
+.macro align_pop_regs    @ undo align_push_regs: reload d8-d15 and put sp back; needs ip exactly as align_push_regs left it
+    vld1.64         {d8-d11},  [sp,:128]!    @ writeback advances sp by 32
+    vld1.64         {d12-d15}, [sp,:128], ip    @ post-increment by ip (32 + original misalignment) restores the entry sp
+.endm
+
+.macro h264_loop_filter_luma    @ in: q10/q9/q8 = p2/p1/p0, q0/q1/q2 = q0/q1/q2, d24 = tc0 bytes, r2 = alpha, r3 = beta; out: q4 = p1, q8 = p0, q0 = q0, q5 = q1; overwrites q4-q7 and q10-q15
+    vdup.8          q11, r2         @ alpha
+    vmovl.u8        q12, d24    @ widen the tc0 bytes to 16 bits
+    vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
+    vmovl.u16       q12, d24    @ then to 32 bits: one tc0 value per 32-bit lane
+    vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
+    vsli.16         q12, q12, #8    @ copy each tc0 into byte 1 of its lane
+    vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
+    vsli.32         q12, q12, #16    @ and into bytes 2-3: each tc0 now fills four consecutive bytes
+    vclt.u8         q6,  q6,  q11   @ < alpha
+    vdup.8          q11, r3         @ beta
+    vclt.s8         q7,  q12, #0    @ q7 = tc0 < 0
+    vclt.u8         q14, q14, q11   @ < beta
+    vclt.u8         q15, q15, q11   @ < beta
+    vbic            q6,  q6,  q7    @ drop lanes whose tc0 is negative
+    vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
+    vand            q6,  q6,  q14
+    vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
+    vclt.u8         q4,  q4,  q11   @ < beta
+    vand            q6,  q6,  q15    @ q6 = filter mask: alpha test, both beta tests, and tc0 >= 0
+    vclt.u8         q5,  q5,  q11   @ < beta
+    vand            q4,  q4,  q6    @ q4 = lanes where p1 is rewritten
+    vand            q5,  q5,  q6    @ q5 = lanes where q1 is rewritten
+    vand            q12, q12, q6    @ tc0, zeroed where the filter is off
+    vrhadd.u8       q14, q8,  q0    @ (p0 + q0 + 1) >> 1
+    vsub.i8         q6,  q12, q4    @ a set mask byte is -1, so this adds 1 to tc0 where q4 is set
+    vqadd.u8        q7,  q9,  q12    @ p1 + tc0
+    vhadd.u8        q10, q10, q14    @ (p2 + ((p0 + q0 + 1) >> 1)) >> 1
+    vsub.i8         q6,  q6,  q5    @ q6 = tc = tc0 + (q4 set) + (q5 set)
+    vhadd.u8        q14, q2,  q14    @ (q2 + ((p0 + q0 + 1) >> 1)) >> 1
+    vmin.u8         q7,  q7,  q10
+    vqsub.u8        q11, q9,  q12    @ p1 - tc0
+    vqadd.u8        q2,  q1,  q12    @ q1 + tc0
+    vmax.u8         q7,  q7,  q11    @ q7 = p1 candidate, clipped to [p1 - tc0, p1 + tc0]
+    vqsub.u8        q11, q1,  q12    @ q1 - tc0
+    vmin.u8         q14, q2,  q14
+    vmovl.u8        q2,  d0    @ start of delta: widen q0 (low half here, high half in q10)
+    vmax.u8         q14, q14, q11    @ q14 = q1 candidate, clipped to [q1 - tc0, q1 + tc0]
+    vmovl.u8        q10, d1
+    vsubw.u8        q2,  q2,  d16    @ q0 - p0
+    vsubw.u8        q10, q10, d17
+    vshl.i16        q2,  q2,  #2    @ (q0 - p0) * 4
+    vshl.i16        q10, q10, #2
+    vaddw.u8        q2,  q2,  d18    @ + p1
+    vaddw.u8        q10, q10, d19
+    vsubw.u8        q2,  q2,  d2    @ - q1
+    vsubw.u8        q10, q10, d3
+    vrshrn.i16      d4,  q2,  #3    @ delta = ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, narrowed to 8 bits
+    vrshrn.i16      d5,  q10, #3
+    vbsl            q4,  q7,  q9    @ q4 = p1 candidate where its mask is set, original p1 elsewhere
+    vbsl            q5,  q14, q1    @ q5 = q1 candidate where its mask is set, original q1 elsewhere
+    vneg.s8         q7,  q6    @ -tc
+    vmovl.u8        q14, d16
+    vmin.s8         q2,  q2,  q6    @ clamp delta to [-tc, tc]; tc is 0 where the filter is off
+    vmovl.u8        q6,  d17
+    vmax.s8         q2,  q2,  q7
+    vmovl.u8        q11, d0
+    vmovl.u8        q12, d1
+    vaddw.s8        q14, q14, d4    @ p0 + delta
+    vaddw.s8        q6,  q6,  d5
+    vsubw.s8        q11, q11, d4    @ q0 - delta
+    vsubw.s8        q12, q12, d5
+    vqmovun.s16     d16, q14    @ saturate back to 8 bits: q8 = new p0
+    vqmovun.s16     d17, q6
+    vqmovun.s16     d0,  q11    @ q0 = new q0
+    vqmovun.s16     d1,  q12
+.endm
+
+function deblock_v_luma_neon    @ r0 = pix (first row below the edge), r1 = stride, r2 = alpha, r3 = beta, [sp] = tc0 pointer: filter 16 columns across a horizontal edge
+    h264_loop_filter_start    @ loads tc0 into d24; returns at once when all four tc0 bytes are negative
+
+    vld1.64         {d0, d1},  [r0,:128], r1    @ q0 = row pix
+    vld1.64         {d2, d3},  [r0,:128], r1    @ q1 = row pix + stride
+    vld1.64         {d4, d5},  [r0,:128], r1    @ q2 = row pix + 2*stride
+    sub             r0,  r0,  r1, lsl #2    @ back up six rows in two steps:
+    sub             r0,  r0,  r1, lsl #1    @ r0 = pix - 3*stride
+    vld1.64         {d20,d21}, [r0,:128], r1    @ q10 = p2
+    vld1.64         {d18,d19}, [r0,:128], r1    @ q9 = p1
+    vld1.64         {d16,d17}, [r0,:128], r1    @ q8 = p0; r0 is back at pix
+
+    align_push_regs    @ the filter macro overwrites q4-q7 (d8-d15)
+
+    h264_loop_filter_luma
+
+    sub             r0,  r0,  r1, lsl #1    @ r0 = pix - 2*stride
+    vst1.64         {d8, d9},  [r0,:128], r1    @ new p1
+    vst1.64         {d16,d17}, [r0,:128], r1    @ new p0
+    vst1.64         {d0, d1},  [r0,:128], r1    @ new q0
+    vst1.64         {d10,d11}, [r0,:128]    @ new q1
+
+    align_pop_regs
+    bx              lr
+endfunc
+
+function deblock_h_luma_neon    @ r0 = pix (first column right of the edge), r1 = stride, r2 = alpha, r3 = beta, [sp] = tc0 pointer: filter 16 rows across a vertical edge
+    h264_loop_filter_start    @ loads tc0 into d24; returns at once when all four tc0 bytes are negative
+
+    sub             r0,  r0,  #4    @ start four bytes left of the edge
+    vld1.64         {d6},  [r0], r1    @ 16 rows of 8 bytes: rows 0-7 go to the low d halves, rows 8-15 to the high halves
+    vld1.64         {d20}, [r0], r1
+    vld1.64         {d18}, [r0], r1
+    vld1.64         {d16}, [r0], r1
+    vld1.64         {d0},  [r0], r1
+    vld1.64         {d2},  [r0], r1
+    vld1.64         {d4},  [r0], r1
+    vld1.64         {d26}, [r0], r1
+    vld1.64         {d7},  [r0], r1
+    vld1.64         {d21}, [r0], r1
+    vld1.64         {d19}, [r0], r1
+    vld1.64         {d17}, [r0], r1
+    vld1.64         {d1},  [r0], r1
+    vld1.64         {d3},  [r0], r1
+    vld1.64         {d5},  [r0], r1
+    vld1.64         {d27}, [r0], r1
+
+    TRANSPOSE8x8    q3, q10, q9, q8, q0, q1, q2, q13    @ macro defined outside this view; afterwards the filter reads q10/q9/q8 as p2/p1/p0 and q0/q1/q2 as q0/q1/q2
+
+    align_push_regs    @ the filter macro overwrites q4-q7 (d8-d15)
+
+    h264_loop_filter_luma
+
+    TRANSPOSE4x4    q4, q8, q0, q5    @ turn the four result columns (p1, p0, q0, q1) back into per-row groups of 4 bytes
+
+    sub             r0,  r0,  r1, lsl #4    @ back to the first row
+    add             r0,  r0,  #2    @ r0 = pix - 2: only p1, p0, q0, q1 are written
+    vst1.32         {d8[0]},  [r0], r1    @ one 32-bit lane per row, 16 rows
+    vst1.32         {d16[0]}, [r0], r1
+    vst1.32         {d0[0]},  [r0], r1
+    vst1.32         {d10[0]}, [r0], r1
+    vst1.32         {d8[1]},  [r0], r1
+    vst1.32         {d16[1]}, [r0], r1
+    vst1.32         {d0[1]},  [r0], r1
+    vst1.32         {d10[1]}, [r0], r1
+    vst1.32         {d9[0]},  [r0], r1
+    vst1.32         {d17[0]}, [r0], r1
+    vst1.32         {d1[0]},  [r0], r1
+    vst1.32         {d11[0]}, [r0], r1
+    vst1.32         {d9[1]},  [r0], r1
+    vst1.32         {d17[1]}, [r0], r1
+    vst1.32         {d1[1]},  [r0], r1
+    vst1.32         {d11[1]}, [r0], r1
+
+    align_pop_regs
+    bx              lr
+endfunc
+
+.macro h264_loop_filter_luma_intra    @ in: q11/q10/q9/q8 = p3/p2/p1/p0, q0-q3 = q0-q3, r2 = alpha, r3 = beta; updates q10, q9, q8, q0, q1, q2 in place; overwrites r2, lr and q4-q7; needs sp 16-byte aligned and a label 9 in the invoking function
+    vdup.8          q14, r2         @ alpha
+    vabd.u8         q4,  q8,  q0    @ abs(p0 - q0)
+    vabd.u8         q5,  q9,  q8    @ abs(p1 - p0)
+    vabd.u8         q6,  q1,  q0    @ abs(q1 - q0)
+    vdup.8          q15, r3         @ beta
+    vmov.u8         q13, #2
+    vclt.u8         q7,  q4,  q14   @ < alpha
+    vshr.u8         q14, q14, #2    @ alpha >> 2
+    vclt.u8         q5,  q5,  q15   @ < beta
+    vadd.u8         q14, q14, q13   @ (alpha >> 2) + 2
+    vand            q7,  q7,  q5
+    vclt.u8         q6,  q6,  q15   @ < beta
+    vclt.u8         q13, q4,  q14   @ < (alpha >> 2) + 2 if_2
+    vand            q12, q7,  q6    @ if_1
+    vshrn.u16       d28, q12,  #4    @ squeeze the 16 mask bytes into 64 bits; the result is nonzero exactly when some mask byte is set
+    vmov            r2,  lr,  d28    @ move them to core registers (this is why r2 and lr are overwritten)
+    orrs            r2,  r2,  lr
+    beq             9f    @ if_1 is false in every lane: jump to label 9 of the invoking function, nothing was pushed yet
+
+    sub             sp,  sp,  #32    @ 32 bytes of stack scratch for the two masks
+    vst1.8          {q12-q13}, [sp,:128]    @ save if_1 (q12) and if_2 (q13); q12-q15 are reused for results below
+
+    vshll.u8        q4,  d18, #1    @ 2*p1
+    vshll.u8        q5,  d19, #1
+    vaddw.u8        q4,  q4,  d16   @ 2*p1 + p0
+    vaddw.u8        q5,  q5,  d17
+    vaddw.u8        q4,  q4,  d2    @ 2*p1 + p0 + q1
+    vaddw.u8        q5,  q5,  d3
+    vrshrn.u16      d24, q4,  #2    @ q12 = (2*p1 + p0 + q1 + 2) >> 2, the p0 result for lanes where if_2 && if_3 fails
+    vrshrn.u16      d25, q5,  #2
+
+    vaddl.u8        q6,  d20, d16   @ p2 + p0
+    vaddl.u8        q7,  d21, d17
+    vaddw.u8        q6,  q6,  d0    @ p2 + p0 + q0
+    vaddw.u8        q7,  q7,  d1
+    vadd.u16        q4,  q4,  q6    @ p2 + 2*p1 + 2*p0 + q0 + q1
+    vadd.u16        q5,  q5,  q7
+    vaddw.u8        q4,  q4,  d0    @ p2 + 2*p1 + 2*p0 + 2*q0 + q1
+    vaddw.u8        q5,  q5,  d1
+    vrshrn.u16      d26, q4,  #3    @ p0'_2
+    vrshrn.u16      d27, q5,  #3
+    vaddw.u8        q6,  q6,  d18   @ p2 + p1 + p0 + q0
+    vaddw.u8        q7,  q7,  d19
+    vrshrn.u16      d28, q6,  #2    @ p1'_2
+    vrshrn.u16      d29, q7,  #2
+    vaddl.u8        q4,  d22, d20   @ p3 + p2
+    vaddl.u8        q5,  d23, d21
+    vshl.u16        q4,  q4,  #1    @ 2*p3 + 2*p2
+    vshl.u16        q5,  q5,  #1
+    vadd.u16        q4,  q4,  q6    @ 2*p3 + 3*p2 + p1 + p0 + q0
+    vadd.u16        q5,  q5,  q7
+    vrshrn.u16      d30, q4,  #3    @ p2'_2
+    vrshrn.u16      d31, q5,  #3
+
+    vdup.8          q4,  r3         @ beta
+    vabd.u8         q5,  q10, q8    @ abs(p2 - p0)
+    vld1.8          {q6-q7}, [sp,:128]   @ if_1, if_2
+    vclt.u8         q5,  q5,  q4    @ < beta if_3
+
+    vand            q7,  q7,  q5    @ if_2 && if_3
+    vmvn            q4,  q7
+    vand            q7,  q7,  q6    @ if_1 && if_2 && if_3
+    vand            q6,  q4,  q6    @ if_1 && !(if_2 && if_3)
+
+    @ copy p0 to q15 so it can be clobbered
+    vbit            q10, q15, q7    @ p2 takes the p2 result still held in q15, where q7 is set; only then is q15 free
+    vmov            q15, q8    @ q15 = unfiltered p0, read by the q-side sums below
+    vbit            q8,  q12, q6    @ p0 takes q12 where q6 is set
+
+    @ wait for q9 to clobber
+    vshll.u8        q4,  d2,  #1    @ 2*q1
+    vshll.u8        q5,  d3,  #1
+
+    vbit            q8,  q12, q6    @ NOTE(review): same operands as the previous vbit, and q8, q12, q6 are not written in between, so this has no further effect
+
+    vaddw.u8        q4,  q4,  d0    @ 2*q1 + q0
+    vaddw.u8        q5,  q5,  d1
+
+    vbit            q8,  q13, q7    @ p0 takes q13 where q7 is set
+
+    vaddw.u8        q4,  q4,  d18   @ 2*q1 + q0 + p1
+    vaddw.u8        q5,  q5,  d19
+
+    vbit            q9,  q14, q7    @ p1 takes q14 where q7 is set; placed after the last read of unfiltered p1 (d18/d19)
+
+    vrshrn.u16      d24, q4,  #2    @ q12 = (2*q1 + q0 + p1 + 2) >> 2, the q0 result for lanes where if_2 && if_4 fails
+    vrshrn.u16      d25, q5,  #2
+
+    vaddl.u8        q6,  d4,  d0    @ q2 + q0
+    vaddl.u8        q7,  d5,  d1
+    vaddw.u8        q6,  q6,  d30   @ q2 + q0 + p0
+    vaddw.u8        q7,  q7,  d31
+    vadd.u16        q4,  q4,  q6    @ q2 + 2*q1 + 2*q0 + p0 + p1
+    vadd.u16        q5,  q5,  q7
+    vaddw.u8        q4,  q4,  d30   @ q2 + 2*q1 + 2*q0 + 2*p0 + p1
+    vaddw.u8        q5,  q5,  d31
+    vrshrn.u16      d26, q4,  #3    @ q0'_2
+    vrshrn.u16      d27, q5,  #3
+    vaddw.u8        q6,  q6,  d2    @ q2 + q1 + q0 + p0
+    vaddw.u8        q7,  q7,  d3
+    vrshrn.u16      d28, q6,  #2    @ q1'_2
+    vrshrn.u16      d29, q7,  #2
+    vaddl.u8        q4,  d6,  d4    @ q3 + q2
+    vaddl.u8        q5,  d7,  d5
+    vshl.u16        q4,  q4,  #1    @ 2*q3 + 2*q2
+    vshl.u16        q5,  q5,  #1
+    vadd.u16        q4,  q4,  q6    @ 2*q3 + 3*q2 + q1 + q0 + p0
+    vadd.u16        q5,  q5,  q7
+    vrshrn.u16      d30, q4,  #3    @ q2'_2
+    vrshrn.u16      d31, q5,  #3
+
+    vdup.8          q4,  r3         @ beta
+    vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
+    vld1.8          {q6-q7}, [sp,:128]!   @ if_1, if_2
+    vclt.u8         q5,  q5,  q4    @ < beta if_4
+
+    vand            q7,  q7,  q5    @ if_2 && if_4
+    vmvn            q4,  q7
+    vand            q7,  q6,  q7    @ if_1 && if_2 && if_4
+    vand            q6,  q6,  q4    @ if_1 && !(if_2 && if_4)
+
+    vbit            q0,  q12, q6    @ q0 takes q12 where q6 is set
+    vbit            q1,  q14, q7    @ q1 takes q14 where q7 is set
+    vbit            q0,  q13, q7    @ q0 takes q13 where q7 is set
+    vbit            q2,  q15, q7    @ q2 takes q15 where q7 is set
+
+.endm
+
+function deblock_v_luma_intra_neon    @ r0 = pix (first row below the edge), r1 = stride, r2 = alpha, r3 = beta: intra filter of 16 columns across a horizontal edge
+    push            {lr}    @ the filter macro overwrites lr
+    vld1.64         {d0, d1},  [r0,:128], r1    @ q0-q3 = rows pix .. pix + 3*stride
+    vld1.64         {d2, d3},  [r0,:128], r1
+    vld1.64         {d4, d5},  [r0,:128], r1
+    vld1.64         {d6, d7},  [r0,:128], r1
+    sub             r0,  r0,  r1, lsl #3    @ r0 = pix - 4*stride
+    vld1.64         {d22,d23}, [r0,:128], r1    @ q11 = p3
+    vld1.64         {d20,d21}, [r0,:128], r1    @ q10 = p2
+    vld1.64         {d18,d19}, [r0,:128], r1    @ q9 = p1
+    vld1.64         {d16,d17}, [r0,:128]    @ q8 = p0; no post-increment, r0 stays at pix - stride
+
+    align_push_regs    @ saves d8-d15 and leaves sp 16-byte aligned for the macro scratch area
+
+    h264_loop_filter_luma_intra    @ jumps to 9: below when no lane needs filtering
+
+    sub             r0,  r0,  r1, lsl #1    @ r0 = pix - 3*stride
+    vst1.64         {d20,d21}, [r0,:128], r1    @ p2
+    vst1.64         {d18,d19}, [r0,:128], r1    @ p1
+    vst1.64         {d16,d17}, [r0,:128], r1    @ p0
+    vst1.64         {d0, d1},  [r0,:128], r1    @ q0
+    vst1.64         {d2, d3},  [r0,:128], r1    @ q1
+    vst1.64         {d4, d5},  [r0,:128]    @ q2
+9:
+    align_pop_regs
+    pop             {pc}    @ return through the lr pushed on entry
+endfunc
+
+function deblock_h_luma_intra_neon    @ r0 = pix (first column right of the edge), r1 = stride, r2 = alpha, r3 = beta: intra filter of 16 rows across a vertical edge
+    push            {lr}    @ the filter macro overwrites lr
+    sub             r0,  r0,  #4    @ start four bytes left of the edge
+    vld1.64         {d22}, [r0], r1    @ 16 rows of 8 bytes: rows 0-7 go to the low d halves, rows 8-15 to the high halves
+    vld1.64         {d20}, [r0], r1
+    vld1.64         {d18}, [r0], r1
+    vld1.64         {d16}, [r0], r1
+    vld1.64         {d0},  [r0], r1
+    vld1.64         {d2},  [r0], r1
+    vld1.64         {d4},  [r0], r1
+    vld1.64         {d6},  [r0], r1
+    vld1.64         {d23}, [r0], r1
+    vld1.64         {d21}, [r0], r1
+    vld1.64         {d19}, [r0], r1
+    vld1.64         {d17}, [r0], r1
+    vld1.64         {d1},  [r0], r1
+    vld1.64         {d3},  [r0], r1
+    vld1.64         {d5},  [r0], r1
+    vld1.64         {d7},  [r0], r1
+
+    TRANSPOSE8x8    q11, q10, q9, q8, q0, q1, q2, q3    @ macro defined outside this view; afterwards the filter reads q11..q8 as p3..p0 and q0..q3 as q0..q3
+
+    align_push_regs    @ saves d8-d15 and leaves sp 16-byte aligned for the macro scratch area
+
+    h264_loop_filter_luma_intra    @ jumps to 9: below when no lane needs filtering
+
+    TRANSPOSE8x8    q11, q10, q9, q8, q0, q1, q2, q3    @ back to row layout
+
+    sub             r0,  r0,  r1, lsl #4    @ back to the first row
+    vst1.64         {d22}, [r0], r1    @ write all 16 rows, 8 bytes each, in the order they were loaded
+    vst1.64         {d20}, [r0], r1
+    vst1.64         {d18}, [r0], r1
+    vst1.64         {d16}, [r0], r1
+    vst1.64         {d0},  [r0], r1
+    vst1.64         {d2},  [r0], r1
+    vst1.64         {d4},  [r0], r1
+    vst1.64         {d6},  [r0], r1
+    vst1.64         {d23}, [r0], r1
+    vst1.64         {d21}, [r0], r1
+    vst1.64         {d19}, [r0], r1
+    vst1.64         {d17}, [r0], r1
+    vst1.64         {d1},  [r0], r1
+    vst1.64         {d3},  [r0], r1
+    vst1.64         {d5},  [r0], r1
+    vst1.64         {d7},  [r0], r1
+9:
+    align_pop_regs
+    pop             {pc}    @ return through the lr pushed on entry
+endfunc
+
+.macro h264_loop_filter_chroma    // in: q9/q8 = p1/p0, q0/q1 = q0/q1, d24 = tc0 bytes, r2 = alpha, r3 = beta; out: q8 = p0, q0 = q0; overwrites q2, q3, q10-q15
+    vdup.8          q11, r2         // alpha
+    vmovl.u8        q12, d24    // widen the tc0 bytes to 16 bits
+    vabd.u8         q13, q8,  q0    // abs(p0 - q0)
+    vabd.u8         q14, q9,  q8    // abs(p1 - p0)
+    vsubl.u8        q2,  d0,  d16    // q0 - p0, widened (low half in q2, high half in q3)
+    vsubl.u8        q3,  d1,  d17
+    vsli.16         q12, q12, #8    // each tc0 now fills two bytes
+    vshl.i16        q2,  q2,  #2    // (q0 - p0) * 4
+    vshl.i16        q3,  q3,  #2
+    vabd.u8         q15, q1,  q0    // abs(q1 - q0)
+    vmovl.u8        q12, d24    // widen again
+    vaddw.u8        q2,  q2,  d18    // + p1
+    vaddw.u8        q3,  q3,  d19
+    vclt.u8         q13, q13, q11   // < alpha
+    vsubw.u8        q2,  q2,  d2    // - q1
+    vsubw.u8        q3,  q3,  d3
+    vsli.16         q12, q12, #8    // each tc0 now fills four consecutive bytes of q12
+    vdup.8          q11, r3         // beta
+    vclt.s8         q10, q12, #0    // q10 = tc0 < 0
+    vrshrn.i16      d4,  q2,  #3    // delta = ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3
+    vrshrn.i16      d5,  q3,  #3
+    vclt.u8         q14, q14, q11   // < beta
+    vbic            q13, q13, q10    // drop lanes whose tc0 is negative
+    vclt.u8         q15, q15, q11   // < beta
+    vand            q13, q13, q14
+    vneg.s8         q10, q12    // -tc0
+    vand            q13, q13, q15    // q13 = filter mask
+    vmin.s8         q2,  q2,  q12    // delta = min(delta, tc0)
+    vmovl.u8        q14, d16
+    vand            q2,  q2,  q13    // delta = 0 where the filter is off
+    vmovl.u8        q15, d17
+    vmax.s8         q2,  q2,  q10    // delta = max(delta, -tc0)
+    vmovl.u8        q11, d0
+    vmovl.u8        q12, d1
+    vaddw.s8        q14, q14, d4    // p0 + delta
+    vaddw.s8        q15, q15, d5
+    vsubw.s8        q11, q11, d4    // q0 - delta
+    vsubw.s8        q12, q12, d5
+    vqmovun.s16     d16, q14    // saturate back to 8 bits: q8 = new p0
+    vqmovun.s16     d17, q15
+    vqmovun.s16     d0,  q11    // q0 = new q0
+    vqmovun.s16     d1,  q12
+.endm
+
+function deblock_v_chroma_neon    @ r0 = pix (first row below the edge), r1 = stride, r2 = alpha, r3 = beta, [sp] = tc0 pointer: filter 16 bytes per row across a horizontal edge
+    h264_loop_filter_start    @ loads tc0 into d24; returns at once when all four tc0 bytes are negative
+
+    sub             r0,  r0,  r1, lsl #1    @ r0 = pix - 2*stride
+    vld1.8          {d18,d19}, [r0,:128], r1    @ q9 = p1
+    vld1.8          {d16,d17}, [r0,:128], r1    @ q8 = p0
+    vld1.8          {d0, d1},  [r0,:128], r1    @ q0 = q0
+    vld1.8          {d2, d3},  [r0,:128]    @ q1 = q1; no post-increment, r0 stays at pix + stride
+
+    h264_loop_filter_chroma
+
+    sub             r0,  r0,  r1, lsl #1    @ r0 = pix - stride
+    vst1.8          {d16,d17}, [r0,:128], r1    @ new p0
+    vst1.8          {d0, d1},  [r0,:128], r1    @ new q0
+
+    bx              lr
+endfunc
+
+function deblock_h_chroma_neon    @ r0 = pix, r1 = stride, r2 = alpha, r3 = beta, [sp] = tc0 pointer: filter 8 rows across a vertical edge; samples are handled as 16-bit (2-byte) units
+    h264_loop_filter_start    @ loads tc0 into d24; returns at once when all four tc0 bytes are negative
+
+    sub             r0,  r0,  #4    @ start four bytes left of the edge
+deblock_h_chroma:    @ shared entry: deblock_h_chroma_422_neon enters here with r0, r1 and d24 already set up
+    vld1.8          {d18}, [r0], r1    @ 8 rows of 8 bytes: rows 0-3 go to the low d halves, rows 4-7 to the high halves
+    vld1.8          {d16}, [r0], r1
+    vld1.8          {d0},  [r0], r1
+    vld1.8          {d2},  [r0], r1
+    vld1.8          {d19}, [r0], r1
+    vld1.8          {d17}, [r0], r1
+    vld1.8          {d1},  [r0], r1
+    vld1.8          {d3},  [r0], r1
+
+    TRANSPOSE4x4_16 q9, q8, q0, q1    @ macro defined outside this view; afterwards the filter reads q9/q8 as p1/p0 and q0/q1 as q0/q1
+
+    h264_loop_filter_chroma
+
+    vtrn.16         q8,  q0    @ pair the new p0 and q0 16-bit units so each 32-bit lane holds the 4 bytes of one row
+
+    sub             r0,  r0,  r1, lsl #3    @ back to the first row
+    add             r0,  r0,  #2    @ r0 = pix - 2: only p0 and q0 are written
+    vst1.32         {d16[0]}, [r0], r1    @ one 32-bit lane per row, 8 rows
+    vst1.32         {d0[0]},  [r0], r1
+    vst1.32         {d16[1]}, [r0], r1
+    vst1.32         {d0[1]},  [r0], r1
+    vst1.32         {d17[0]}, [r0], r1
+    vst1.32         {d1[0]},  [r0], r1
+    vst1.32         {d17[1]}, [r0], r1
+    vst1.32         {d1[1]},  [r0], r1    @ r0 ends at pix - 2 + 8*r1, which the 422 variant relies on
+
+    bx              lr
+endfunc
+
+function deblock_h_chroma_422_neon    @ same arguments as deblock_h_chroma_neon; covers 16 rows as two 8-row passes over the shared deblock_h_chroma body
+    h264_loop_filter_start    @ must come before the push: it reads the tc0 pointer at [sp] and may return at once
+    push            {lr}    @ bl below overwrites lr
+    sub             r0,  r0,  #4    @ start four bytes left of the edge
+    add             r1,  r1,  r1    @ double the stride: each pass takes every second row
+    bl              deblock_h_chroma    @ first pass: rows 0, 2, ..., 14
+    ldr             ip,  [sp, #4]    @ tc0 pointer again; it sits 4 bytes higher because of the push
+    ldr             ip,  [ip]
+    vdup.32         d24, ip    @ reload d24, which the filter macro overwrote
+    sub             r0,  r0,  r1, lsl #3    @ undo the eight doubled-stride steps of the first pass
+    add             r0,  r0,  r1, lsr #1    @ down one original row
+    sub             r0,  r0,  #2    @ and back to four bytes left of the edge
+    pop             {lr}
+    b               deblock_h_chroma    @ second pass (rows 1, 3, ..., 15) as a tail call
+endfunc
+
+.macro h264_loop_filter_chroma8    @ 64-bit form of h264_loop_filter_chroma; in: d18/d16 = p1/p0, d0/d2 = q0/q1, d24 = tc0 bytes, r2 = alpha, r3 = beta; out: d16 = p0, d0 = q0
+    vdup.8          d22, r2         @ alpha
+    vmovl.u8        q12, d24    @ widen the tc0 bytes to 16 bits
+    vabd.u8         d26, d16, d0    @ abs(p0 - q0)
+    vabd.u8         d28, d18, d16   @ abs(p1 - p0)
+    vsubl.u8        q2,  d0,  d16    @ q0 - p0, widened
+    vsli.16         d24, d24, #8    @ each tc0 now fills two consecutive bytes of d24
+    vshl.i16        q2,  q2,  #2    @ (q0 - p0) * 4
+    vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
+    vaddw.u8        q2,  q2,  d18    @ + p1
+    vclt.u8         d26, d26, d22   @ < alpha
+    vsubw.u8        q2,  q2,  d2    @ - q1
+    vdup.8          d22, r3         @ beta
+    vclt.s8         d20, d24, #0    @ d20 = tc0 < 0
+    vrshrn.i16      d4,  q2,  #3    @ delta = ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3
+    vclt.u8         d28, d28, d22   @ < beta
+    vbic            d26, d26, d20    @ drop lanes whose tc0 is negative
+    vclt.u8         d30, d30, d22   @ < beta
+    vand            d26, d26, d28
+    vneg.s8         d20, d24    @ -tc0
+    vand            d26, d26, d30    @ d26 = filter mask
+    vmin.s8         d4,  d4,  d24    @ delta = min(delta, tc0)
+    vmovl.u8        q14, d16
+    vand            d4,  d4,  d26    @ delta = 0 where the filter is off
+    vmax.s8         d4,  d4,  d20    @ delta = max(delta, -tc0)
+    vmovl.u8        q11, d0
+    vaddw.s8        q14, q14, d4    @ p0 + delta
+    vsubw.s8        q11, q11, d4    @ q0 - delta
+    vqmovun.s16     d16, q14    @ saturate back to 8 bits
+    vqmovun.s16     d0,  q11
+.endm
+
+function deblock_h_chroma_mbaff_neon    @ r0 = pix, r1 = stride, r2 = alpha, r3 = beta, [sp] = tc0 pointer: 4-row version of deblock_h_chroma_neon
+    h264_loop_filter_start    @ loads tc0 into d24; returns at once when all four tc0 bytes are negative
+
+    sub             r0,  r0,  #4    @ start four bytes left of the edge
+    vld1.8          {d18}, [r0], r1    @ 4 rows of 8 bytes
+    vld1.8          {d16}, [r0], r1
+    vld1.8          {d0},  [r0], r1
+    vld1.8          {d2},  [r0], r1
+
+    TRANSPOSE4x4_16 d18, d16, d0, d2    @ macro defined outside this view; afterwards the filter reads d18/d16 as p1/p0 and d0/d2 as q0/q1
+
+    h264_loop_filter_chroma8
+
+    vtrn.16         d16, d0    @ pair the new p0 and q0 16-bit units so each 32-bit lane holds the 4 bytes of one row
+
+    sub             r0,  r0,  r1, lsl #2    @ back to the first row
+    add             r0,  r0,  #2    @ r0 = pix - 2: only p0 and q0 are written
+    vst1.32         {d16[0]}, [r0], r1
+    vst1.32         {d0[0]},  [r0], r1
+    vst1.32         {d16[1]}, [r0], r1
+    vst1.32         {d0[1]},  [r0]
+
+    bx              lr
+endfunc
+
+.macro h264_loop_filter_chroma_intra, width=16    @ in: q9/q8 = p1/p0, q0/q1 = q0/q1, r2 = alpha, r3 = beta; out: q8 = p0, q0 = q0; with width=8 only the low d halves are computed
+    vdup.8          q11, r2         @ alpha
+    vabd.u8         q13, q8,  q0    @ abs(p0 - q0)
+    vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
+    vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
+    vclt.u8         q13, q13, q11   @ < alpha
+    vdup.8          q11, r3         @ beta
+    vclt.u8         q14, q14, q11   @ < beta
+    vclt.u8         q15, q15, q11   @ < beta
+    vand            q13, q13, q14
+    vand            q13, q13, q15    @ q13 = filter mask: all three threshold tests pass
+
+    vshll.u8        q14, d18, #1    @ 2*p1 (low half)
+    vshll.u8        q2,  d2,  #1    @ 2*q1 (low half)
+.ifc \width, 16
+    vshll.u8        q15, d19, #1    @ 2*p1 (high half)
+    vshll.u8        q3,  d3,  #1    @ 2*q1 (high half)
+    vaddl.u8        q12, d17, d3    @ p0 + q1 (high half)
+    vaddl.u8        q10, d1,  d19    @ q0 + p1 (high half)
+.endif
+    vaddl.u8        q11, d16, d2    @ p0 + q1 (low half)
+    vaddl.u8        q1,  d18, d0    @ or vaddw q2, to not clobber q1
+    vadd.u16        q14, q14, q11    @ 2*p1 + p0 + q1
+    vadd.u16        q2,  q2,  q1    @ 2*q1 + q0 + p1
+.ifc \width, 16
+    vadd.u16        q15, q15, q12
+    vadd.u16        q3,  q3,  q10
+.endif
+    vqrshrn.u16     d28, q14, #2    @ new p0 = (2*p1 + p0 + q1 + 2) >> 2
+    vqrshrn.u16     d4,  q2, #2    @ new q0 = (2*q1 + q0 + p1 + 2) >> 2
+.ifc \width, 16
+    vqrshrn.u16     d29, q15, #2
+    vqrshrn.u16     d5,  q3, #2
+.endif
+    vbit            q8,  q14, q13    @ p0 takes the new value where the mask is set
+    vbit            q0,  q2,  q13    @ q0 takes the new value where the mask is set
+.endm
+
+function deblock_v_chroma_intra_neon    @ r0 = pix (first row below the edge), r1 = stride, r2 = alpha, r3 = beta: intra filter of 16 bytes per row across a horizontal edge
+    sub             r0,  r0,  r1, lsl #1    @ r0 = pix - 2*stride
+    vld2.8          {d18,d19}, [r0,:128], r1    @ q9 = p1; vld2 puts even bytes in the first d register and odd bytes in the second
+    vld2.8          {d16,d17}, [r0,:128], r1    @ q8 = p0
+    vld2.8          {d0, d1},  [r0,:128], r1    @ q0 = q0
+    vld2.8          {d2, d3},  [r0,:128]    @ q1 = q1; no post-increment, r0 stays at pix + stride
+
+    h264_loop_filter_chroma_intra
+
+    sub             r0,  r0,  r1, lsl #1    @ r0 = pix - stride
+    vst2.8          {d16,d17}, [r0,:128], r1    @ new p0; vst2 re-interleaves the bytes
+    vst2.8          {d0, d1},  [r0,:128], r1    @ new q0
+
+    bx              lr
+endfunc
+
+function deblock_h_chroma_intra_neon    @ r0 = pix, r1 = stride, r2 = alpha, r3 = beta: intra filter of 8 rows across a vertical edge; samples are handled as 16-bit (2-byte) units
+    sub             r0,  r0,  #4    @ start four bytes left of the edge
+    vld1.8          {d18}, [r0], r1    @ 8 rows of 8 bytes: rows 0-3 go to the low d halves, rows 4-7 to the high halves
+    vld1.8          {d16}, [r0], r1
+    vld1.8          {d0},  [r0], r1
+    vld1.8          {d2},  [r0], r1
+    vld1.8          {d19}, [r0], r1
+    vld1.8          {d17}, [r0], r1
+    vld1.8          {d1},  [r0], r1
+    vld1.8          {d3},  [r0], r1
+
+    TRANSPOSE4x4_16 q9, q8, q0, q1    @ macro defined outside this view; afterwards the filter reads q9/q8 as p1/p0 and q0/q1 as q0/q1
+
+    h264_loop_filter_chroma_intra
+
+    vtrn.16         q8,  q0    @ pair the new p0 and q0 16-bit units so each 32-bit lane holds the 4 bytes of one row
+
+    sub             r0,  r0,  r1, lsl #3    @ back to the first row
+    add             r0,  r0,  #2    @ r0 = pix - 2: only p0 and q0 are written
+    vst1.32         {d16[0]}, [r0], r1    @ one 32-bit lane per row, 8 rows
+    vst1.32         {d0[0]},  [r0], r1
+    vst1.32         {d16[1]}, [r0], r1
+    vst1.32         {d0[1]},  [r0], r1
+    vst1.32         {d17[0]}, [r0], r1
+    vst1.32         {d1[0]},  [r0], r1
+    vst1.32         {d17[1]}, [r0], r1
+    vst1.32         {d1[1]},  [r0], r1    @ r0 ends at pix - 2 + 8*stride, which the 422 variant relies on
+
+    bx              lr
+endfunc
+
+@ Intra chroma deblocking of a vertical edge over 16 rows, done as two
+@ back-to-back runs of the 8-row deblock_h_chroma_intra_neon.
+@ C prototype (deblock.h):
+@   void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta )
+@ Depends on the 8-row routine returning with r0 = pix - 2 + 8*stride and
+@ with r1-r3 unchanged, so that adding 2 yields the pix argument for rows 8-15.
+@ Only lr is pushed, so sp is 4 bytes off its entry value during the first
+@ call; the callee does not access the stack.
+function deblock_h_chroma_422_intra_neon
+    push            {lr}
+    bl              X(deblock_h_chroma_intra_neon)      @ rows 0-7
+    add             r0, r0,  #2                         @ r0 = pix + 8*stride
+    pop             {lr}
+    b               X(deblock_h_chroma_intra_neon)      @ rows 8-15 as a tail call
+endfunc
+
+@ Intra chroma deblocking of a vertical edge over 4 rows.
+@ C prototype (deblock.h):
+@   void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta )
+@ In:   r0 = pix (first byte right of the edge), r1 = stride, r2 = alpha, r3 = beta
+@ Same scheme as the 8-row deblock_h_chroma_intra_neon, but working on
+@ d registers only and using the width=8 variant of the filter macro.
+@ Out:  r0 = pix - 2 + 3*stride (the last store has no writeback).
+@ Clobbers: the NEON registers used by h264_loop_filter_chroma_intra.
+function deblock_h_chroma_intra_mbaff_neon
+    sub             r0,  r0,  #4                @ start 4 bytes left of the edge
+    vld1.8          {d18}, [r0], r1             @ row 0
+    vld1.8          {d16}, [r0], r1             @ row 1
+    vld1.8          {d0},  [r0], r1             @ row 2
+    vld1.8          {d2},  [r0], r1             @ row 3
+
+    @ NOTE(review): TRANSPOSE4x4_16 is defined outside this view; presumably it
+    @ leaves d18 = p1, d16 = p0, d0 = q0, d2 = q1 -- confirm in asm.S.
+    TRANSPOSE4x4_16 d18, d16, d0, d2
+
+    h264_loop_filter_chroma_intra width=8
+
+    vtrn.16         d16, d0                     @ pair p0 and q0 units so each 32-bit lane is one row
+
+    sub             r0,  r0,  r1, lsl #2        @ rewind the 4 row increments
+    add             r0,  r0,  #2                @ r0 = pix - 2: only p0 and q0 are written back
+    vst1.32         {d16[0]}, [r0], r1          @ row 0
+    vst1.32         {d0[0]},  [r0], r1          @ row 1
+    vst1.32         {d16[1]}, [r0], r1          @ row 2
+    vst1.32         {d0[1]},  [r0]              @ row 3
+
+    bx              lr
+endfunc
+
+@ Compute deblocking boundary strengths from the nnz, ref and mv caches.
+@ C prototype (deblock.h):
+@   void x264_deblock_strength_neon( uint8_t nnz[], int8_t ref[2][], int16_t mv[2][][2],
+@                                    uint8_t bs[2][8][4], int mvy_limit, int bframe )
+@ In:   r0 = nnz, r1 = ref, r2 = mv, r3 = bs, [sp] = mvy_limit, [sp, #4] = bframe
+@ Out:  16 bytes stored at bs (bs[0]) and 16 bytes at bs + 32 (bs[1]).
+@       Each byte is 2 if either block of the compared pair has a nonzero
+@       nnz byte, else 1 if their ref bytes differ or an mv component
+@       differs by at least its limit, else 0.
+@       bs[0] compares each block with its left neighbour, bs[1] with the
+@       block one row above (rows are 8 bytes apart in nnz and ref).
+@ The ref/mv pass runs once, and a second time on the following list when
+@ bframe is 1 (bframe is presumably always 0 or 1 -- confirm with callers).
+@ Clobbers: r0-r2, ip, q0-q3, q8-q15, flags. r3 is moved and restored.
+function deblock_strength_neon
+    ldr             ip,  [sp]                   @ ip = mvy_limit
+    vmov.i8         q8,  #0                     @ q8 accumulates the bs[0] flags
+    lsl             ip,  ip,  #8
+    add             r3,  r3,  #32               @ r3 = address of bs[1]
+    sub             ip,  ip,  #(1<<8)-3         @ ip = ((mvy_limit-1) << 8) + 3
+    vmov.i8         q9,  #0                     @ q9 accumulates the bs[1] flags
+    vdup.16         q10, ip                     @ byte pairs { 3, mvy_limit-1 }: per-component limit minus 1
+    ldr             ip,  [sp, #4]               @ ip = bframe, used as the loop control
+
+    @ NOTE(review): this is a plain named label rather than a numeric local
+    @ label, so it is presumably kept as a symbol in the object file.
+lists:
+    @ load bytes ref
+    vld1.8          {d31}, [r1]!                @ row above the block (8 bytes)
+    add             r2,  r2,  #16               @ skip the first 4 mvs of the list
+    vld1.8          {q1},  [r1]!                @ rows 0 and 1
+    vmov.i8         q0,  #0
+    vld1.8          {q2},  [r1]!                @ rows 2 and 3; r1 now points at the next list
+    vext.8          q3,  q0,  q1,  #15          @ rows 0-1 shifted right by one byte
+    vext.8          q0,  q0,  q2,  #15          @ rows 2-3 shifted right by one byte
+    vuzp.32         q1,  q2                     @ q2 = bytes 4-7 of each row: the 16 current blocks
+    vuzp.32         q3,  q0                     @ q0 = their left neighbours
+    vext.8          q1,  q15, q2,  #12          @ q1 = their top neighbours (bytes 4-7 of d31, then rows 0-2)
+
+    veor            q0,  q0,  q2                @ nonzero where ref differs from the left neighbour
+    veor            q1,  q1,  q2                @ nonzero where ref differs from the top neighbour
+    vorr            q8,  q8,  q0
+    vorr            q9,  q9,  q1
+
+    vld1.16         {q11}, [r2,:128]!   @ mv + 0x10
+    vld1.16         {q3},  [r2,:128]!   @ mv + 0x20
+    vld1.16         {q12}, [r2,:128]!   @ mv + 0x30
+    vld1.16         {q2},  [r2,:128]!   @ mv + 0x40
+    vld1.16         {q13}, [r2,:128]!   @ mv + 0x50
+    vext.8          q3,  q3,  q12, #12          @ left-neighbour mvs of row 0
+    vext.8          q2,  q2,  q13, #12          @ left-neighbour mvs of row 1
+    vabd.s16        q0,  q12, q3                @ abs mv difference to the left, row 0
+    vld1.16         {q3},  [r2,:128]!   @ mv + 0x60
+    vabd.s16        q1,  q13, q2                @ abs mv difference to the left, row 1
+    vld1.16         {q14}, [r2,:128]!   @ mv + 0x70
+    vqmovn.u16      d0,  q0                     @ saturate the differences to bytes
+    vld1.16         {q2},  [r2,:128]!   @ mv + 0x80
+    vld1.16         {q15}, [r2,:128]!   @ mv + 0x90
+    vqmovn.u16      d1,  q1
+    vext.8          q3,  q3,  q14, #12          @ left-neighbour mvs of row 2
+    vext.8          q2,  q2,  q15, #12          @ left-neighbour mvs of row 3
+    vabd.s16        q3,  q14, q3
+    vabd.s16        q2,  q15, q2
+    vqmovn.u16      d2,  q3
+    vqmovn.u16      d3,  q2
+
+    vqsub.u8        q0,  q0,  q10               @ nonzero where a component difference reaches its limit
+    vqsub.u8        q1,  q1,  q10
+    vqmovn.u16      d0,  q0                     @ fold the two components: one byte per block
+    vqmovn.u16      d1,  q1
+
+    vabd.s16        q1,  q12, q13               @ abs mv difference, row 1 vs row 0
+    vorr            q8,  q8,  q0
+
+    vabd.s16        q0,  q11, q12               @ row 0 vs the row above
+    vabd.s16        q2,  q13, q14               @ row 2 vs row 1
+    vabd.s16        q3,  q14, q15               @ row 3 vs row 2
+    vqmovn.u16      d0,  q0
+    vqmovn.u16      d1,  q1
+    vqmovn.u16      d2,  q2
+    vqmovn.u16      d3,  q3
+
+    vqsub.u8        q0,  q0,  q10
+    vqsub.u8        q1,  q1,  q10
+    vqmovn.u16      d0,  q0
+    vqmovn.u16      d1,  q1
+    subs            ip,  ip,  #1
+    vorr            q9,  q9,  q0
+    beq             lists                       @ ip reached 0 (bframe was 1): one more pass
+
+    mov             ip,  #-32                   @ post-index step from bs[1] back to bs[0]
+    @ load bytes nnz
+    vld1.8          {d31}, [r0]!
+    vld1.8          {q1},  [r0]!
+    vmov.i8         q0,  #0
+    vld1.8          {q2},  [r0]
+    vext.8          q3,  q0,  q1,  #15
+    vext.8          q0,  q0,  q2,  #15
+    vuzp.32         q1,  q2                     @ q2 = nnz of the 16 current blocks
+    vuzp.32         q3,  q0                     @ q0 = nnz of their left neighbours
+    vext.8          q1,  q15, q2,  #12          @ q1 = nnz of their top neighbours
+
+    vorr            q0,  q0,  q2                @ nonzero if either block of the pair has nnz set
+    vorr            q1,  q1,  q2
+    vmov.u8         q10, #1
+    vmin.u8         q0,  q0,  q10
+    vmin.u8         q1,  q1,  q10
+    vmin.u8         q8,  q8,  q10       @ mv ? 1 : 0
+    vmin.u8         q9,  q9,  q10
+    vadd.u8         q0,  q0,  q0        @ nnz ? 2 : 0
+    vadd.u8         q1,  q1,  q1
+    vmax.u8         q8,  q8,  q0
+    vmax.u8         q9,  q9,  q1
+    vzip.16         d16, d17                    @ vzip.16, vtrn.8 and vtrn.32 together transpose q8 as a 4x4 byte matrix
+    vst1.8          {q9}, [r3,:128], ip @ bs[1]
+    vtrn.8          d16, d17
+    vtrn.32         d16, d17
+
+    vst1.8          {q8}, [r3,:128]     @ bs[0]
+    bx              lr
+endfunc
--- a/common/arm/deblock.h
+++ b/common/arm/deblock.h
@@ -0,0 +1,58 @@
+/*****************************************************************************
+ * deblock.h: arm deblocking
+ *****************************************************************************
+ * Copyright (C) 2017-2025 x264 project
+ *
+ * Authors: Anton Mitrofanov <BugMaster@narod.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_ARM_DEBLOCK_H
+#define X264_ARM_DEBLOCK_H
+
+/* Prototypes for the ARM NEON deblocking routines.
+ * Every public name is first mapped through x264_template(), which is defined
+ * outside this header, so each #define must stay ahead of its prototype.
+ * "v" variants filter across a horizontal edge, "h" variants across a vertical
+ * edge; pix points at the first sample on the far side of the edge. */
+
+/* Normal (non-intra) edge filters; tc0 supplies the clipping values. */
+#define x264_deblock_v_luma_neon x264_template(deblock_v_luma_neon)
+void x264_deblock_v_luma_neon  ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+#define x264_deblock_h_luma_neon x264_template(deblock_h_luma_neon)
+void x264_deblock_h_luma_neon  ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+#define x264_deblock_v_chroma_neon x264_template(deblock_v_chroma_neon)
+void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+#define x264_deblock_h_chroma_neon x264_template(deblock_h_chroma_neon)
+void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+/* Boundary-strength computation: fills bs[0] and bs[1] from the nnz/ref/mv caches.
+ * mvy_limit is the vertical mv difference threshold; bframe selects whether the
+ * second reference list is examined as well. */
+#define x264_deblock_strength_neon x264_template(deblock_strength_neon)
+void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+                                 int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+                                 int mvy_limit, int bframe );
+/* Further normal chroma variants (4:2:2 and MBAFF). */
+#define x264_deblock_h_chroma_422_neon x264_template(deblock_h_chroma_422_neon)
+void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+#define x264_deblock_h_chroma_mbaff_neon x264_template(deblock_h_chroma_mbaff_neon)
+void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+/* Intra edge filters: no tc0, the filter is driven by alpha and beta only. */
+#define x264_deblock_h_chroma_intra_mbaff_neon x264_template(deblock_h_chroma_intra_mbaff_neon)
+void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
+#define x264_deblock_h_chroma_intra_neon x264_template(deblock_h_chroma_intra_neon)
+void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
+#define x264_deblock_h_chroma_422_intra_neon x264_template(deblock_h_chroma_422_intra_neon)
+void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
+#define x264_deblock_v_chroma_intra_neon x264_template(deblock_v_chroma_intra_neon)
+void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
+#define x264_deblock_h_luma_intra_neon x264_template(deblock_h_luma_intra_neon)
+void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
+#define x264_deblock_v_luma_intra_neon x264_template(deblock_v_luma_intra_neon)
+void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
+
+#endif
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
--- a/common/arm/mc-c.c
+++ b/common/arm/mc-c.c
@@ -0,0 +1,366 @@
+/*****************************************************************************
+ * mc-c.c: arm motion compensation
+ *****************************************************************************
+ * Copyright (C) 2009-2025 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *          Janne Grunau <janne-x264@jannau.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "mc.h"
+
+/* Prototypes for routines that are not defined in this file (presumably the
+ * hand-written assembly in mc-a.S -- confirm). Each name is mapped through
+ * x264_template() before it is declared, so every #define must precede its
+ * prototype. */
+
+/* Cache prefetch helpers; installed for any ARMv6-capable CPU. */
+#define x264_prefetch_ref_arm x264_template(prefetch_ref_arm)
+void x264_prefetch_ref_arm( uint8_t *, intptr_t, int );
+#define x264_prefetch_fenc_arm x264_template(prefetch_fenc_arm)
+void x264_prefetch_fenc_arm( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+
+/* Aligned memory copy and zero fill. */
+#define x264_memcpy_aligned_neon x264_template(memcpy_aligned_neon)
+void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n );
+#define x264_memzero_aligned_neon x264_template(memzero_aligned_neon)
+void x264_memzero_aligned_neon( void *dst, size_t n );
+
+/* Fixed-size averaging kernels, one per block size; installed into pf->avg[]. */
+#define x264_pixel_avg_16x16_neon x264_template(pixel_avg_16x16_neon)
+void x264_pixel_avg_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+#define x264_pixel_avg_16x8_neon x264_template(pixel_avg_16x8_neon)
+void x264_pixel_avg_16x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+#define x264_pixel_avg_8x16_neon x264_template(pixel_avg_8x16_neon)
+void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+#define x264_pixel_avg_8x8_neon x264_template(pixel_avg_8x8_neon)
+void x264_pixel_avg_8x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+#define x264_pixel_avg_8x4_neon x264_template(pixel_avg_8x4_neon)
+void x264_pixel_avg_8x4_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+#define x264_pixel_avg_4x16_neon x264_template(pixel_avg_4x16_neon)
+void x264_pixel_avg_4x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+#define x264_pixel_avg_4x8_neon x264_template(pixel_avg_4x8_neon)
+void x264_pixel_avg_4x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+#define x264_pixel_avg_4x4_neon x264_template(pixel_avg_4x4_neon)
+void x264_pixel_avg_4x4_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+#define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon)
+void x264_pixel_avg_4x2_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+
+/* Two-source averaging by width; called in this file as
+ * ( dst, dst_stride, src1, src_stride, src2, height ), where src2 shares
+ * src1's stride. */
+#define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon)
+void x264_pixel_avg2_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+#define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon)
+void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+#define x264_pixel_avg2_w16_neon x264_template(pixel_avg2_w16_neon)
+void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+#define x264_pixel_avg2_w20_neon x264_template(pixel_avg2_w20_neon)
+void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+
+/* Plane copy kernels. The *_core_* variants are not referenced by name in this
+ * file; presumably they are wrapped by the PLANE_COPY* macro instantiations
+ * further down -- confirm where those macros are defined. */
+#define x264_plane_copy_core_neon x264_template(plane_copy_core_neon)
+void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
+                                pixel *src, intptr_t i_src, int w, int h );
+#define x264_plane_copy_deinterleave_neon x264_template(plane_copy_deinterleave_neon)
+void x264_plane_copy_deinterleave_neon(  pixel *dstu, intptr_t i_dstu,
+                                         pixel *dstv, intptr_t i_dstv,
+                                         pixel *src,  intptr_t i_src, int w, int h );
+#define x264_plane_copy_deinterleave_rgb_neon x264_template(plane_copy_deinterleave_rgb_neon)
+void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
+                                            pixel *dstb, intptr_t i_dstb,
+                                            pixel *dstc, intptr_t i_dstc,
+                                            pixel *src,  intptr_t i_src, int pw, int w, int h );
+#define x264_plane_copy_interleave_core_neon x264_template(plane_copy_interleave_core_neon)
+void x264_plane_copy_interleave_core_neon( pixel *dst,  intptr_t i_dst,
+                                           pixel *srcu, intptr_t i_srcu,
+                                           pixel *srcv, intptr_t i_srcv, int w, int h );
+#define x264_plane_copy_swap_core_neon x264_template(plane_copy_swap_core_neon)
+void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst,
+                                     pixel *src, intptr_t i_src, int w, int h );
+
+/* Chroma store/load between separate U and V buffers and an interleaved plane. */
+#define x264_store_interleave_chroma_neon x264_template(store_interleave_chroma_neon)
+void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
+#define x264_load_deinterleave_chroma_fdec_neon x264_template(load_deinterleave_chroma_fdec_neon)
+void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
+#define x264_load_deinterleave_chroma_fenc_neon x264_template(load_deinterleave_chroma_fenc_neon)
+void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
+
+/* Template-mapped names of the weighted-prediction kernels: four widths
+ * (4, 8, 16, 20) times four flavours (general, nodenom, offsetadd, offsetsub).
+ * These mappings are made unconditionally; the prototypes and tables below
+ * exist only in builds where HIGH_BIT_DEPTH is 0. */
+#define x264_mc_weight_w16_neon x264_template(mc_weight_w16_neon)
+#define x264_mc_weight_w16_nodenom_neon x264_template(mc_weight_w16_nodenom_neon)
+#define x264_mc_weight_w16_offsetadd_neon x264_template(mc_weight_w16_offsetadd_neon)
+#define x264_mc_weight_w16_offsetsub_neon x264_template(mc_weight_w16_offsetsub_neon)
+#define x264_mc_weight_w20_neon x264_template(mc_weight_w20_neon)
+#define x264_mc_weight_w20_nodenom_neon x264_template(mc_weight_w20_nodenom_neon)
+#define x264_mc_weight_w20_offsetadd_neon x264_template(mc_weight_w20_offsetadd_neon)
+#define x264_mc_weight_w20_offsetsub_neon x264_template(mc_weight_w20_offsetsub_neon)
+#define x264_mc_weight_w4_neon x264_template(mc_weight_w4_neon)
+#define x264_mc_weight_w4_nodenom_neon x264_template(mc_weight_w4_nodenom_neon)
+#define x264_mc_weight_w4_offsetadd_neon x264_template(mc_weight_w4_offsetadd_neon)
+#define x264_mc_weight_w4_offsetsub_neon x264_template(mc_weight_w4_offsetsub_neon)
+#define x264_mc_weight_w8_neon x264_template(mc_weight_w8_neon)
+#define x264_mc_weight_w8_nodenom_neon x264_template(mc_weight_w8_nodenom_neon)
+#define x264_mc_weight_w8_offsetadd_neon x264_template(mc_weight_w8_offsetadd_neon)
+#define x264_mc_weight_w8_offsetsub_neon x264_template(mc_weight_w8_offsetsub_neon)
+#if !HIGH_BIT_DEPTH
+/* MC_WEIGHT(func) declares the four width-specific kernels of one flavour and
+ * builds the table mc<func>_wtab_neon from them. The table is indexed by
+ * i_width>>2 (as done in mc_luma_neon and get_ref_neon): slots 0 and 1 use the
+ * w4 kernel, slot 2 w8, slots 3 and 4 w16, and slot 5 w20. */
+#define MC_WEIGHT(func)\
+void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+\
+static weight_fn_t mc##func##_wtab_neon[6] =\
+{\
+    x264_mc_weight_w4##func##_neon,\
+    x264_mc_weight_w4##func##_neon,\
+    x264_mc_weight_w8##func##_neon,\
+    x264_mc_weight_w16##func##_neon,\
+    x264_mc_weight_w16##func##_neon,\
+    x264_mc_weight_w20##func##_neon,\
+};
+
+/* Produces mc_wtab_neon, mc_nodenom_wtab_neon, mc_offsetadd_wtab_neon and
+ * mc_offsetsub_wtab_neon. */
+MC_WEIGHT()
+MC_WEIGHT(_nodenom)
+MC_WEIGHT(_offsetadd)
+MC_WEIGHT(_offsetsub)
+#endif
+
+/* More prototypes for routines that are not defined in this file (presumably
+ * assembly -- confirm). As above, each #define must precede its prototype. */
+
+/* Block copy by width; called in this file as
+ * ( dst, dst_stride, src, src_stride, height ). */
+#define x264_mc_copy_w4_neon x264_template(mc_copy_w4_neon)
+void x264_mc_copy_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+#define x264_mc_copy_w8_neon x264_template(mc_copy_w8_neon)
+void x264_mc_copy_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+#define x264_mc_copy_w16_neon x264_template(mc_copy_w16_neon)
+void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+#define x264_mc_copy_w16_aligned_neon x264_template(mc_copy_w16_aligned_neon)
+void x264_mc_copy_w16_aligned_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+
+/* Chroma motion compensation and low-resolution frame generation. */
+#define x264_mc_chroma_neon x264_template(mc_chroma_neon)
+void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
+#define x264_frame_init_lowres_core_neon x264_template(frame_init_lowres_core_neon)
+void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );
+
+/* Per-row half-pel filter stages, driven by hpel_filter_neon in this file:
+ * v( dstv, src, buf, stride, width ), c( dstc, buf, width ), h( dsth, src, width ). */
+#define x264_hpel_filter_v_neon x264_template(hpel_filter_v_neon)
+void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, intptr_t, int );
+#define x264_hpel_filter_c_neon x264_template(hpel_filter_c_neon)
+void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
+#define x264_hpel_filter_h_neon x264_template(hpel_filter_h_neon)
+void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
+
+/* Integral image helpers. */
+#define x264_integral_init4h_neon x264_template(integral_init4h_neon)
+void x264_integral_init4h_neon( uint16_t *, uint8_t *, intptr_t );
+#define x264_integral_init4v_neon x264_template(integral_init4v_neon)
+void x264_integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
+#define x264_integral_init8h_neon x264_template(integral_init8h_neon)
+void x264_integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
+#define x264_integral_init8v_neon x264_template(integral_init8v_neon)
+void x264_integral_init8v_neon( uint16_t *, intptr_t );
+
+/* Macroblock-tree cost propagation. */
+#define x264_mbtree_propagate_cost_neon x264_template(mbtree_propagate_cost_neon)
+void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
+
+/* Macroblock-tree conversion between float and 16-bit packed values. */
+#define x264_mbtree_fix8_pack_neon x264_template(mbtree_fix8_pack_neon)
+void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count );
+#define x264_mbtree_fix8_unpack_neon x264_template(mbtree_fix8_unpack_neon)
+void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count );
+
+#if !HIGH_BIT_DEPTH
+/* Pick the NEON weighting table that matches the parameters in *w.
+ *
+ *   scale != 1<<denom, denom == 0  -> mc_nodenom_wtab_neon
+ *   scale != 1<<denom, denom != 0  -> mc_wtab_neon
+ *   scale == 1<<denom, offset <  0 -> mc_offsetsub_wtab_neon, cachea[0] = -offset
+ *   scale == 1<<denom, offset >= 0 -> mc_offsetadd_wtab_neon, cachea[0] =  offset
+ *
+ * h is unused; it is part of the weight_cache callback signature. */
+static void weight_cache_neon( x264_t *h, x264_weight_t *w )
+{
+    /* Real scaling needed: only the function table has to be selected. */
+    if( w->i_scale != (1 << w->i_denom) )
+    {
+        w->weightfn = w->i_denom ? mc_wtab_neon : mc_nodenom_wtab_neon;
+        return;
+    }
+
+    /* Scale is exactly one, so weighting reduces to adding or subtracting
+     * the offset; its magnitude is cached in cachea[0]. */
+    if( w->i_offset >= 0 )
+    {
+        w->cachea[0] = w->i_offset;
+        w->weightfn = mc_offsetadd_wtab_neon;
+    }
+    else
+    {
+        w->cachea[0] = -w->i_offset;
+        w->weightfn = mc_offsetsub_wtab_neon;
+    }
+}
+
+/* Two-source averaging kernels, indexed by i_width>>2. Slot 0 is unused. */
+static void (* const pixel_avg_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) =
+{
+    [0] = NULL,
+    [1] = x264_pixel_avg2_w4_neon,
+    [2] = x264_pixel_avg2_w8_neon,
+    [3] = x264_pixel_avg2_w16_neon, /* width 12: w16 is no slower than w12, so no point in a separate function */
+    [4] = x264_pixel_avg2_w16_neon,
+    [5] = x264_pixel_avg2_w20_neon,
+};
+
+/* Plain copy kernels, indexed by i_width>>2. Only widths 4, 8 and 16 have an
+ * entry; slots 0 and 3 are NULL. */
+static void (* const mc_copy_wtab_neon[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) =
+{
+    [0] = NULL,
+    [1] = x264_mc_copy_w4_neon,
+    [2] = x264_mc_copy_w8_neon,
+    [3] = NULL,
+    [4] = x264_mc_copy_w16_neon,
+};
+
+/* Luma motion compensation into dst.
+ *
+ * src[4] holds the four reference planes selected through x264_hpel_ref0 and
+ * x264_hpel_ref1; mvx and mvy are in quarter-pel units. When the position
+ * needs quarter-pel interpolation, two planes are averaged into dst and any
+ * weighting is then applied in place. Otherwise the block is either weighted
+ * or copied straight from the reference. All kernels are looked up by
+ * i_width>>2. */
+static void mc_luma_neon( uint8_t *dst,    intptr_t i_dst_stride,
+                          uint8_t *src[4], intptr_t i_src_stride,
+                          int mvx, int mvy,
+                          int i_width, int i_height, const x264_weight_t *weight )
+{
+    const int wtab_idx = i_width>>2;
+    const int qpel_idx = ((mvy&3)<<2) + (mvx&3);
+    const int interpolate = qpel_idx & 5; /* qpel interpolation needed */
+    const intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
+
+    uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
+    if( (mvy&3) == 3 )             // explicit if() to force conditional add
+        src1 += i_src_stride;
+
+    /* Source handed to the weighting kernel: the reference itself, or dst
+     * once the two planes have been averaged into it. */
+    uint8_t *wsrc = src1;
+    intptr_t wstride = i_src_stride;
+
+    if( interpolate )
+    {
+        uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
+        pixel_avg_wtab_neon[wtab_idx]( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
+        wsrc = dst;
+        wstride = i_dst_stride;
+    }
+
+    if( weight->weightfn )
+        weight->weightfn[wtab_idx]( dst, i_dst_stride, wsrc, wstride, weight, i_height );
+    else if( !interpolate )
+        mc_copy_wtab_neon[wtab_idx]( dst, i_dst_stride, src1, i_src_stride, i_height );
+}
+
+/* Return a pointer to the motion-compensated luma block.
+ *
+ * If neither quarter-pel interpolation nor weighting is needed, nothing is
+ * copied: the function returns a pointer into the reference plane and sets
+ * *i_dst_stride to i_src_stride. Otherwise the block is rendered into dst
+ * with the caller-supplied *i_dst_stride and dst is returned. */
+static uint8_t *get_ref_neon( uint8_t *dst,   intptr_t *i_dst_stride,
+                              uint8_t *src[4], intptr_t i_src_stride,
+                              int mvx, int mvy,
+                              int i_width, int i_height, const x264_weight_t *weight )
+{
+    const int wtab_idx = i_width>>2;
+    const int qpel_idx = ((mvy&3)<<2) + (mvx&3);
+    const int interpolate = qpel_idx & 5; /* qpel interpolation needed */
+    const intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
+
+    uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
+    if( (mvy&3) == 3 )             // explicit if() to force conditional add
+        src1 += i_src_stride;
+
+    /* Zero-copy case: hand back the reference plane directly. */
+    if( !interpolate && !weight->weightfn )
+    {
+        *i_dst_stride = i_src_stride;
+        return src1;
+    }
+
+    if( interpolate )
+    {
+        uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
+        pixel_avg_wtab_neon[wtab_idx]( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
+        if( weight->weightfn )
+            weight->weightfn[wtab_idx]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
+    }
+    else
+        weight->weightfn[wtab_idx]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
+
+    return dst;
+}
+
+/* Produce the three half-pel planes (dsth, dstv, dstc) from src, row by row.
+ *
+ * src is first rounded down to a 16-byte boundary; the three destination
+ * pointers move back by the same number of bytes and width grows by it, so
+ * each row kernel is called with a 16-byte aligned src pointer. buf is
+ * scratch space: the vertical stage fills buf+8 and the centre stage reads
+ * it back. */
+static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
+                              intptr_t stride, int width, int height, int16_t *buf )
+{
+    const intptr_t misalign = (intptr_t)src & 15;
+    int16_t *tmp = buf+8;
+
+    dsth -= misalign;
+    dstv -= misalign;
+    dstc -= misalign;
+    src  -= misalign;
+    width += misalign;
+
+    for( ; height; height-- )
+    {
+        x264_hpel_filter_v_neon( dstv, src, tmp, stride, width );
+        x264_hpel_filter_c_neon( dstc, tmp, width );
+        x264_hpel_filter_h_neon( dsth, src, width );
+
+        src  += stride;
+        dstc += stride;
+        dstv += stride;
+        dsth += stride;
+    }
+}
+
+/* Instantiate helper functions from macros that are defined outside this file.
+ * NOTE(review): presumably these expand to plane_copy_neon, plane_copy_swap_neon,
+ * plane_copy_interleave_neon and mbtree_propagate_list_neon, which
+ * x264_mc_init_arm installs and which have no other definition visible here.
+ * Confirm the expansions, and the meaning of the 16 argument, where the
+ * macros are defined. */
+PLANE_COPY(16, neon)
+PLANE_COPY_SWAP(16, neon)
+PLANE_INTERLEAVE(neon)
+PROPAGATE_LIST(neon)
+#endif // !HIGH_BIT_DEPTH
+
+/* Install the ARM-optimised motion-compensation routines into *pf.
+ *
+ * cpu is a bitmask of X264_CPU_* flags:
+ *   - without X264_CPU_ARMV6 nothing is changed;
+ *   - with ARMv6 the prefetch helpers are installed (8-bit builds only);
+ *   - with NEON as well, the full NEON set is installed (8-bit builds only),
+ *     followed by the aligned memcpy/memzero helpers. */
+void x264_mc_init_arm( uint32_t cpu, x264_mc_functions_t *pf )
+{
+    const int have_armv6 = !!(cpu & X264_CPU_ARMV6);
+    const int have_neon  = !!(cpu & X264_CPU_NEON);
+
+    if( !have_armv6 )
+        return;
+
+#if !HIGH_BIT_DEPTH
+    /* ARMv6: cache prefetch only. */
+    pf->prefetch_ref      = x264_prefetch_ref_arm;
+    pf->prefetch_fenc_420 = x264_prefetch_fenc_arm;
+    pf->prefetch_fenc_422 = x264_prefetch_fenc_arm; /* FIXME */
+#endif // !HIGH_BIT_DEPTH
+
+    if( !have_neon )
+        return;
+
+#if !HIGH_BIT_DEPTH
+    /* Block copies. */
+    pf->copy[PIXEL_16x16]    = x264_mc_copy_w16_aligned_neon;
+    pf->copy[PIXEL_8x8]      = x264_mc_copy_w8_neon;
+    pf->copy[PIXEL_4x4]      = x264_mc_copy_w4_neon;
+    pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;
+
+    /* Whole-plane copies and (de)interleaving. */
+    pf->plane_copy                  = plane_copy_neon;
+    pf->plane_copy_swap             = plane_copy_swap_neon;
+    pf->plane_copy_interleave       = plane_copy_interleave_neon;
+    pf->plane_copy_deinterleave     = x264_plane_copy_deinterleave_neon;
+    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
+
+    /* Chroma store/load for the macroblock caches. */
+    pf->store_interleave_chroma       = x264_store_interleave_chroma_neon;
+    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
+    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;
+
+    /* Averaging, one kernel per block size. */
+    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
+    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_neon;
+    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_neon;
+    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_neon;
+    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_neon;
+    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_neon;
+    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_neon;
+    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_neon;
+    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_neon;
+
+    /* Weighted prediction. */
+    pf->weight       = mc_wtab_neon;
+    pf->offsetadd    = mc_offsetadd_wtab_neon;
+    pf->offsetsub    = mc_offsetsub_wtab_neon;
+    pf->weight_cache = weight_cache_neon;
+
+    /* Motion compensation, half-pel planes and lowres frames. */
+    pf->mc_luma                = mc_luma_neon;
+    pf->mc_chroma              = x264_mc_chroma_neon;
+    pf->get_ref                = get_ref_neon;
+    pf->hpel_filter            = hpel_filter_neon;
+    pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
+
+    /* Integral images. */
+    pf->integral_init4h = x264_integral_init4h_neon;
+    pf->integral_init4v = x264_integral_init4v_neon;
+    pf->integral_init8h = x264_integral_init8h_neon;
+    pf->integral_init8v = x264_integral_init8v_neon;
+
+    /* Macroblock tree. */
+    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
+    pf->mbtree_propagate_list = mbtree_propagate_list_neon;
+    pf->mbtree_fix8_pack      = x264_mbtree_fix8_pack_neon;
+    pf->mbtree_fix8_unpack    = x264_mbtree_fix8_unpack_neon;
+#endif // !HIGH_BIT_DEPTH
+
+// Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
+#ifndef SYS_MACOSX
+    pf->memcpy_aligned  = x264_memcpy_aligned_neon;
+#endif
+    pf->memzero_aligned = x264_memzero_aligned_neon;
+}
--- a/common/arm/mc.h
+++ b/common/arm/mc.h
@@ -0,0 +1,32 @@
+/*****************************************************************************
+ * mc.h: arm motion compensation
+ *****************************************************************************
+ * Copyright (C) 2009-2025 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_ARM_MC_H
+#define X264_ARM_MC_H
+
+/* Fill *pf with the ARM motion-compensation routines that the CPU described
+ * by the X264_CPU_* bitmask cpu supports; entries without an ARM version are
+ * left untouched. The name is mapped through x264_template(), which is
+ * defined outside this header. */
+#define x264_mc_init_arm x264_template(mc_init_arm)
+void x264_mc_init_arm( uint32_t cpu, x264_mc_functions_t *pf );
+
+#endif
--- a/common/arm/pixel-a.S
+++ b/common/arm/pixel-a.S
--- a/common/arm/pixel.h
+++ b/common/arm/pixel.h
@@ -0,0 +1,160 @@
+/*****************************************************************************
+ * pixel.h: arm pixel metrics
+ *****************************************************************************
+ * Copyright (C) 2009-2025 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_ARM_PIXEL_H
+#define X264_ARM_PIXEL_H
+
+#define x264_pixel_avg2_w16_neon x264_template(pixel_avg2_w16_neon) // this list routes every pixel symbol name through x264_template(), which is defined outside this header
+#define x264_pixel_avg2_w20_neon x264_template(pixel_avg2_w20_neon)
+#define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon)
+#define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon)
+#define x264_pixel_avg_16x16_neon x264_template(pixel_avg_16x16_neon)
+#define x264_pixel_avg_16x8_neon x264_template(pixel_avg_16x8_neon)
+#define x264_pixel_avg_4x16_neon x264_template(pixel_avg_4x16_neon)
+#define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon)
+#define x264_pixel_avg_4x4_neon x264_template(pixel_avg_4x4_neon)
+#define x264_pixel_avg_4x8_neon x264_template(pixel_avg_4x8_neon)
+#define x264_pixel_avg_8x16_neon x264_template(pixel_avg_8x16_neon)
+#define x264_pixel_avg_8x4_neon x264_template(pixel_avg_8x4_neon)
+#define x264_pixel_avg_8x8_neon x264_template(pixel_avg_8x8_neon)
+#define x264_pixel_sad_16x16_neon x264_template(pixel_sad_16x16_neon)
+#define x264_pixel_sad_16x8_neon x264_template(pixel_sad_16x8_neon)
+#define x264_pixel_sad_4x4_armv6 x264_template(pixel_sad_4x4_armv6)
+#define x264_pixel_sad_4x4_neon x264_template(pixel_sad_4x4_neon)
+#define x264_pixel_sad_4x8_armv6 x264_template(pixel_sad_4x8_armv6)
+#define x264_pixel_sad_4x8_neon x264_template(pixel_sad_4x8_neon)
+#define x264_pixel_sad_8x16_neon x264_template(pixel_sad_8x16_neon)
+#define x264_pixel_sad_8x4_neon x264_template(pixel_sad_8x4_neon)
+#define x264_pixel_sad_8x8_neon x264_template(pixel_sad_8x8_neon)
+#define x264_pixel_sad_aligned_16x16_neon x264_template(pixel_sad_aligned_16x16_neon)
+#define x264_pixel_sad_aligned_16x16_neon_dual x264_template(pixel_sad_aligned_16x16_neon_dual)
+#define x264_pixel_sad_aligned_16x8_neon x264_template(pixel_sad_aligned_16x8_neon)
+#define x264_pixel_sad_aligned_16x8_neon_dual x264_template(pixel_sad_aligned_16x8_neon_dual)
+#define x264_pixel_sad_aligned_4x4_neon x264_template(pixel_sad_aligned_4x4_neon)
+#define x264_pixel_sad_aligned_4x8_neon x264_template(pixel_sad_aligned_4x8_neon)
+#define x264_pixel_sad_aligned_8x16_neon x264_template(pixel_sad_aligned_8x16_neon)
+#define x264_pixel_sad_aligned_8x16_neon_dual x264_template(pixel_sad_aligned_8x16_neon_dual)
+#define x264_pixel_sad_aligned_8x4_neon x264_template(pixel_sad_aligned_8x4_neon)
+#define x264_pixel_sad_aligned_8x4_neon_dual x264_template(pixel_sad_aligned_8x4_neon_dual)
+#define x264_pixel_sad_aligned_8x8_neon x264_template(pixel_sad_aligned_8x8_neon)
+#define x264_pixel_sad_aligned_8x8_neon_dual x264_template(pixel_sad_aligned_8x8_neon_dual)
+#define x264_pixel_sad_x3_16x16_neon x264_template(pixel_sad_x3_16x16_neon)
+#define x264_pixel_sad_x3_16x8_neon x264_template(pixel_sad_x3_16x8_neon)
+#define x264_pixel_sad_x3_4x4_neon x264_template(pixel_sad_x3_4x4_neon)
+#define x264_pixel_sad_x3_4x8_neon x264_template(pixel_sad_x3_4x8_neon)
+#define x264_pixel_sad_x3_8x16_neon x264_template(pixel_sad_x3_8x16_neon)
+#define x264_pixel_sad_x3_8x4_neon x264_template(pixel_sad_x3_8x4_neon)
+#define x264_pixel_sad_x3_8x8_neon x264_template(pixel_sad_x3_8x8_neon)
+#define x264_pixel_sad_x4_16x16_neon x264_template(pixel_sad_x4_16x16_neon)
+#define x264_pixel_sad_x4_16x8_neon x264_template(pixel_sad_x4_16x8_neon)
+#define x264_pixel_sad_x4_4x4_neon x264_template(pixel_sad_x4_4x4_neon)
+#define x264_pixel_sad_x4_4x8_neon x264_template(pixel_sad_x4_4x8_neon)
+#define x264_pixel_sad_x4_8x16_neon x264_template(pixel_sad_x4_8x16_neon)
+#define x264_pixel_sad_x4_8x4_neon x264_template(pixel_sad_x4_8x4_neon)
+#define x264_pixel_sad_x4_8x8_neon x264_template(pixel_sad_x4_8x8_neon)
+#define x264_pixel_satd_16x16_neon x264_template(pixel_satd_16x16_neon)
+#define x264_pixel_satd_16x8_neon x264_template(pixel_satd_16x8_neon)
+#define x264_pixel_satd_4x4_neon x264_template(pixel_satd_4x4_neon)
+#define x264_pixel_satd_4x8_neon x264_template(pixel_satd_4x8_neon)
+#define x264_pixel_satd_8x16_neon x264_template(pixel_satd_8x16_neon)
+#define x264_pixel_satd_8x4_neon x264_template(pixel_satd_8x4_neon)
+#define x264_pixel_satd_8x8_neon x264_template(pixel_satd_8x8_neon)
+#define x264_pixel_ssd_16x16_neon x264_template(pixel_ssd_16x16_neon)
+#define x264_pixel_ssd_16x8_neon x264_template(pixel_ssd_16x8_neon)
+#define x264_pixel_ssd_4x4_neon x264_template(pixel_ssd_4x4_neon)
+#define x264_pixel_ssd_4x8_neon x264_template(pixel_ssd_4x8_neon)
+#define x264_pixel_ssd_8x16_neon x264_template(pixel_ssd_8x16_neon)
+#define x264_pixel_ssd_8x4_neon x264_template(pixel_ssd_8x4_neon)
+#define x264_pixel_ssd_8x8_neon x264_template(pixel_ssd_8x8_neon)
+#define DECL_PIXELS( ret, name, suffix, args ) /* emits one prototype x264_pixel_<name>_<WxH>_<suffix> per block size: 16x16, 16x8, 8x16, 8x8, 8x4, 4x8, 4x4 */ \
+    ret x264_pixel_##name##_16x16_##suffix args;\
+    ret x264_pixel_##name##_16x8_##suffix args;\
+    ret x264_pixel_##name##_8x16_##suffix args;\
+    ret x264_pixel_##name##_8x8_##suffix args;\
+    ret x264_pixel_##name##_8x4_##suffix args;\
+    ret x264_pixel_##name##_4x8_##suffix args;\
+    ret x264_pixel_##name##_4x4_##suffix args;\
+
+#define DECL_X1( name, suffix ) /* declares int x264_pixel_<name>_<WxH>_<suffix>() for all seven block sizes of DECL_PIXELS */ \
+    DECL_PIXELS( int, name, suffix, ( uint8_t *, intptr_t, uint8_t *, intptr_t ) ) /* second and fourth parameters are intptr_t, matching the hand-written prototypes in this header (e.g. x264_pixel_sad_4x4_armv6) */
+
+#define DECL_X4( name, suffix ) /* declares the void-returning <name>_x3 and <name>_x4 families for all seven block sizes */ \
+    DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\
+    DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) ) /* x4 takes one more uint8_t * than x3; both end with an intptr_t and an int * (presumably stride and score array - confirm) */
+
+int x264_pixel_sad_4x4_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t ); // armv6 prototypes exist only for the 4x4 and 4x8 sizes, so they are written out by hand
+int x264_pixel_sad_4x8_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t );
+
+DECL_X1( sad, neon ) // x264_pixel_sad_<WxH>_neon for all seven sizes
+DECL_X1( sad_aligned, neon ) // x264_pixel_sad_aligned_<WxH>_neon
+DECL_X1( sad_aligned, neon_dual ) // x264_pixel_sad_aligned_<WxH>_neon_dual
+DECL_X4( sad, neon ) // x264_pixel_sad_x3_<WxH>_neon and x264_pixel_sad_x4_<WxH>_neon
+DECL_X1( satd, neon ) // x264_pixel_satd_<WxH>_neon
+DECL_X1( ssd, neon ) // x264_pixel_ssd_<WxH>_neon
+
+#define x264_pixel_ssd_nv12_core_neon x264_template(pixel_ssd_nv12_core_neon)
+void x264_pixel_ssd_nv12_core_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, uint64_t *, uint64_t * ); // returns nothing; results presumably come back through the two uint64_t pointers - confirm against the asm
+
+#define x264_pixel_vsad_neon x264_template(pixel_vsad_neon)
+int x264_pixel_vsad_neon( uint8_t *, intptr_t, int ); // single buffer plus intptr_t and int; presumably (src, stride, height) - confirm
+
+#define x264_pixel_sa8d_8x8_neon x264_template(pixel_sa8d_8x8_neon)
+int x264_pixel_sa8d_8x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t ); // same parameter list as the armv6 SAD prototypes above
+#define x264_pixel_sa8d_16x16_neon x264_template(pixel_sa8d_16x16_neon)
+int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
+#define x264_pixel_sa8d_satd_16x16_neon x264_template(pixel_sa8d_satd_16x16_neon)
+uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t ); // 64-bit return; presumably packs the sa8d and satd scores together - confirm packing order
+
+#define x264_pixel_var_8x8_neon x264_template(pixel_var_8x8_neon)
+uint64_t x264_pixel_var_8x8_neon  ( uint8_t *, intptr_t ); // the var family takes one buffer and returns 64 bits; presumably sum and sum of squares packed - confirm
+#define x264_pixel_var_8x16_neon x264_template(pixel_var_8x16_neon)
+uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
+#define x264_pixel_var_16x16_neon x264_template(pixel_var_16x16_neon)
+uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
+#define x264_pixel_var2_8x8_neon x264_template(pixel_var2_8x8_neon)
+int x264_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * ); // var2 takes two buffers and no stride argument; the int * is presumably an output - confirm
+#define x264_pixel_var2_8x16_neon x264_template(pixel_var2_8x16_neon)
+int x264_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * );
+
+#define x264_pixel_hadamard_ac_8x8_neon x264_template(pixel_hadamard_ac_8x8_neon)
+uint64_t x264_pixel_hadamard_ac_8x8_neon  ( uint8_t *, intptr_t ); // the hadamard_ac family takes one buffer plus an intptr_t and returns 64 bits
+#define x264_pixel_hadamard_ac_8x16_neon x264_template(pixel_hadamard_ac_8x16_neon)
+uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
+#define x264_pixel_hadamard_ac_16x8_neon x264_template(pixel_hadamard_ac_16x8_neon)
+uint64_t x264_pixel_hadamard_ac_16x8_neon ( uint8_t *, intptr_t );
+#define x264_pixel_hadamard_ac_16x16_neon x264_template(pixel_hadamard_ac_16x16_neon)
+uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, intptr_t );
+
+#define x264_pixel_ssim_4x4x2_core_neon x264_template(pixel_ssim_4x4x2_core_neon)
+void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t, // both inputs are const; the only writable argument is sums
+                                      const uint8_t *, intptr_t,
+                                      int sums[2][4] );
+#define x264_pixel_ssim_end4_neon x264_template(pixel_ssim_end4_neon)
+float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width ); // the only float-returning routine declared in this header
+
+#define x264_pixel_asd8_neon x264_template(pixel_asd8_neon)
+int x264_pixel_asd8_neon( uint8_t *, intptr_t,  uint8_t *, intptr_t, int ); // the SAD-style parameter list plus a trailing int (presumably a height - confirm)
+
+#endif
--- a/common/arm/predict-a.S
+++ b/common/arm/predict-a.S
@@ -0,0 +1,808 @@
+/*****************************************************************************
+ * predict.S: arm intra prediction
+ *****************************************************************************
+ * Copyright (C) 2009-2025 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *          Mans Rullgard <mans@mansr.com>
+ *          Martin Storsjo <martin@martin.st>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+const p16weight, align=4
+.short 1,2,3,4,5,6,7,8 // eight 16-bit weights 1..8; loaded via movrel by the predict_*_p_neon plane predictors in this file
+endconst
+
+.text
+
+.macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
+.if \n == 8 || \hi == 0 // lanes 0-3 are loaded when n is 8, or when hi is 0
+    vld1.8          {\rd[0]}, [\rs], \rt // gather one byte per lane of \rd; \rs is post-incremented by \rt after every load
+    vld1.8          {\rd[1]}, [\rs], \rt
+    vld1.8          {\rd[2]}, [\rs], \rt
+    vld1.8          {\rd[3]}, [\rs], \rt
+.endif
+.if \n == 8 || \hi == 1 // lanes 4-7 are loaded when n is 8, or when hi is 1
+    vld1.8          {\rd[4]}, [\rs], \rt
+    vld1.8          {\rd[5]}, [\rs], \rt
+    vld1.8          {\rd[6]}, [\rs], \rt
+    vld1.8          {\rd[7]}, [\rs], \rt // on exit \rs has advanced by \rt once per loaded lane
+.endif
+.endm
+
+.macro ldcol.16  rd1,  rd2,  rs,  rt,  ru
+    add             \ru, \rs, \rt, lsl #3 // \ru = \rs + 8*\rt: second pointer, eight steps further on
+    vld1.8          {\rd1[0]}, [\rs], \rt // \rd1 lane k <- byte at step k; loads alternate between the two pointers
+    vld1.8          {\rd2[0]}, [\ru], \rt // \rd2 lane k <- byte at step 8+k
+    vld1.8          {\rd1[1]}, [\rs], \rt
+    vld1.8          {\rd2[1]}, [\ru], \rt
+    vld1.8          {\rd1[2]}, [\rs], \rt
+    vld1.8          {\rd2[2]}, [\ru], \rt
+    vld1.8          {\rd1[3]}, [\rs], \rt
+    vld1.8          {\rd2[3]}, [\ru], \rt
+    vld1.8          {\rd1[4]}, [\rs], \rt
+    vld1.8          {\rd2[4]}, [\ru], \rt
+    vld1.8          {\rd1[5]}, [\rs], \rt
+    vld1.8          {\rd2[5]}, [\ru], \rt
+    vld1.8          {\rd1[6]}, [\rs], \rt
+    vld1.8          {\rd2[6]}, [\ru], \rt
+    vld1.8          {\rd1[7]}, [\rs], \rt
+    vld1.8          {\rd2[7]}, [\ru], \rt // both pointers end up 8*\rt past where they started
+.endm
+
+.macro add16x8  dq,  dl,  dh,  rl,  rh
+    vaddl.u8        \dq, \rl, \rh // widen to 16 bit and add the two 8-byte inputs lane by lane
+    vadd.u16        \dl, \dl, \dh // fold 8 partial sums into 4 (callers in this file pass the two halves of \dq as \dl and \dh)
+    vpadd.u16       \dl, \dl, \dl // 4 -> 2 distinct sums
+    vpadd.u16       \dl, \dl, \dl // every 16-bit lane of \dl now holds the same total
+.endm
+
+
+// because gcc doesn't believe in using the free shift in add
+function predict_4x4_h_armv6 // fill each of the 4 rows at r0 with the byte immediately to its left
+    ldrb    r1, [r0, #0*FDEC_STRIDE-1] // r1, r2, r3, ip = left-neighbour byte of rows 0, 1, 2, 3
+    ldrb    r2, [r0, #1*FDEC_STRIDE-1]
+    ldrb    r3, [r0, #2*FDEC_STRIDE-1]
+    ldrb    ip, [r0, #3*FDEC_STRIDE-1]
+    add     r1, r1, r1, lsl #8 // b -> b * 0x0101
+    add     r2, r2, r2, lsl #8
+    add     r3, r3, r3, lsl #8
+    add     ip, ip, ip, lsl #8
+    add     r1, r1, r1, lsl #16 // -> b * 0x01010101: the byte repeated four times
+    str     r1, [r0, #0*FDEC_STRIDE] // one 32-bit store writes a whole row
+    add     r2, r2, r2, lsl #16
+    str     r2, [r0, #1*FDEC_STRIDE]
+    add     r3, r3, r3, lsl #16
+    str     r3, [r0, #2*FDEC_STRIDE]
+    add     ip, ip, ip, lsl #16
+    str     ip, [r0, #3*FDEC_STRIDE]
+    bx      lr
+endfunc
+
+function predict_4x4_v_armv6 // copy the 4 bytes of the row above r0 into rows 0..3
+    ldr     r1,  [r0, #-FDEC_STRIDE]  // r1 = the word one row above the block
+    str     r1,  [r0, #0*FDEC_STRIDE]
+    str     r1,  [r0, #1*FDEC_STRIDE]
+    str     r1,  [r0, #2*FDEC_STRIDE]
+    str     r1,  [r0, #3*FDEC_STRIDE]
+    bx      lr
+endfunc
+
+function predict_4x4_dc_armv6 // fill the 4x4 block at r0 with (sum of 4 bytes above + 4 bytes to the left + 4) >> 3
+    mov     ip, #0 // zero operand so usad8 below yields a plain byte sum
+    ldr     r1, [r0, #-FDEC_STRIDE] // r1 = the 4 bytes of the row above
+    ldrb    r2, [r0, #0*FDEC_STRIDE-1] // left-neighbour bytes are loaded one at a time
+    ldrb    r3, [r0, #1*FDEC_STRIDE-1]
+    usad8   r1, r1, ip // r1 = sum of the 4 bytes above (|x - 0| summed)
+    add     r2, r2, #4 // rounding term for the final >> 3
+    ldrb    ip, [r0, #2*FDEC_STRIDE-1]
+    add     r2, r2, r3
+    ldrb    r3, [r0, #3*FDEC_STRIDE-1]
+    add     r2, r2, ip
+    add     r2, r2, r3 // r2 = sum of the 4 left bytes + 4
+    add     r1, r1, r2
+    lsr     r1, r1, #3 // r1 = rounded mean of the 8 neighbours
+    add     r1, r1, r1, lsl #8
+    add     r1, r1, r1, lsl #16 // replicate the mean into all 4 bytes
+    str     r1, [r0, #0*FDEC_STRIDE]
+    str     r1, [r0, #1*FDEC_STRIDE]
+    str     r1, [r0, #2*FDEC_STRIDE]
+    str     r1, [r0, #3*FDEC_STRIDE]
+    bx      lr
+endfunc
+
+// Fill the 4x4 block at r0 with (sum of the 4 bytes in the row above + 2) >> 2.
+function predict_4x4_dc_top_neon
+    sub         r1,  r0,  #FDEC_STRIDE      // r1 -> row above the block
+    mov         ip,  #FDEC_STRIDE
+    vld1.32     {d1[]}, [r1,:32]            // the 4 top bytes, duplicated into both halves of d1
+    vpaddl.u8   d1,  d1                     // pairwise byte sums as 16-bit lanes
+    vpadd.u16   d1,  d1,  d1                // every 16-bit lane = sum of the 4 bytes
+    vrshr.u16   d1,  d1,  #2                // rounded divide by 4
+    vdup.8      d1,  d1[0]                  // splat the DC byte
+.rept 4
+    vst1.32     {d1[0]}, [r0,:32], ip       // 4 bytes per row
+.endr
+    bx          lr
+endfunc
+
+// return a1 = (a1+2*b1+c1+2)>>2  a2 = (a2+2*b2+c2+2)>>2
+.macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
+    uhadd8  \a1, \a1, \c1 // per byte: a = (a + c) >> 1
+    uhadd8  \a2, \a2, \c2
+    uhadd8  \c1, \a1, \b1 // c = (a + b) >> 1, truncating; \c1 and \c2 are clobbered
+    uhadd8  \c2, \a2, \b2
+    eor     \a1, \a1, \b1 // low bit of each byte = the bit the truncating halving add dropped
+    eor     \a2, \a2, \b2
+    and     \a1, \a1, \pb_1 // keep only that bit; \pb_1 is the per-byte mask (the caller in this file passes 0x01010101)
+    and     \a2, \a2, \pb_1
+    uadd8   \a1, \a1, \c1 // add the dropped bit back: the second halving becomes a rounding one
+    uadd8   \a2, \a2, \c2 // results are left in \a1 and \a2
+.endm
+
+function predict_4x4_ddr_armv6 // 4x4 diagonal down-right prediction from the row above, the corner byte and the column to the left
+    ldr     r1, [r0, # -FDEC_STRIDE] // r1 = 4 bytes above the block (t0..t3, t0 in the low byte)
+    ldrb    r2, [r0, # -FDEC_STRIDE-1] // corner byte (above-left)
+    ldrb    r3, [r0, #0*FDEC_STRIDE-1] // left byte of row 0
+    push    {r4-r6,lr} // r4-r6 are used as scratch below
+    add     r2, r2, r1, lsl #8 // r2 = {corner, t0, t1, t2}
+    ldrb    r4, [r0, #1*FDEC_STRIDE-1]
+    add     r3, r3, r2, lsl #8 // r3 = {l0, corner, t0, t1}
+    ldrb    r5, [r0, #2*FDEC_STRIDE-1]
+    ldrb    r6, [r0, #3*FDEC_STRIDE-1]
+    add     r4, r4, r3, lsl #8 // r4 = {l1, l0, corner, t0}
+    add     r5, r5, r4, lsl #8 // r5 = {l2, l1, l0, corner}
+    add     r6, r6, r5, lsl #8 // r6 = {l3, l2, l1, l0}
+    ldr     ip, =0x01010101 // per-byte mask needed by the lowpass macro
+    PRED4x4_LOWPASS r1, r2, r3, r4, r5, r6, ip // r1 = filtered row 0; r4 = filtered bytes along the left edge
+    str     r1, [r0, #0*FDEC_STRIDE]
+    lsl     r2, r1, #8 // each following row is row 0 shifted up by one more byte ...
+    lsl     r3, r1, #16
+    lsl     r4, r4, #8
+    lsl     r5, r1, #24
+    add     r2, r2, r4, lsr #24 // ... with the vacated low bytes taken from the filtered left-edge word
+    str     r2, [r0, #1*FDEC_STRIDE]
+    add     r3, r3, r4, lsr #16
+    str     r3, [r0, #2*FDEC_STRIDE]
+    add     r5, r5, r4, lsr #8
+    str     r5, [r0, #3*FDEC_STRIDE]
+    pop     {r4-r6,pc} // restore scratch registers and return
+endfunc
+
+function predict_4x4_ddl_neon // 4x4 diagonal down-left prediction from the 8 bytes above the block
+    sub         r0, #FDEC_STRIDE // point at the row above
+    mov         ip, #FDEC_STRIDE
+    vld1.64     {d0}, [r0], ip // d0 = t0..t7; the post-increment brings r0 back to row 0
+    vdup.8      d3, d0[7] // t7 replicated, used as padding past the end
+    vext.8      d1, d0, d0, #1 // t[i+1] (lane 7 wraps round and is never stored)
+    vext.8      d2, d0, d3, #2 // t[i+2], padded with t7
+    vhadd.u8    d0, d0, d2 // (t[i] + t[i+2]) >> 1
+    vrhadd.u8   d0, d0, d1 // rounding average with t[i+1]: the 3-tap (1,2,1) filter
+    vst1.32     {d0[0]}, [r0,:32], ip // row 0 = filtered bytes 0..3
+    vext.8      d1, d0, d0, #1
+    vext.8      d2, d0, d0, #2
+    vst1.32     {d1[0]}, [r0,:32], ip // row 1 = bytes 1..4
+    vext.8      d3, d0, d0, #3
+    vst1.32     {d2[0]}, [r0,:32], ip // row 2 = bytes 2..5
+    vst1.32     {d3[0]}, [r0,:32], ip // row 3 = bytes 3..6
+    bx          lr
+endfunc
+
+function predict_8x8_dc_neon // r0 = 8x8 destination, r1 = edge byte array; fills the block with the rounded mean of edge[7..14] and edge[16..23]
+    mov     ip, #0 // zero operand so usad8/usada8 act as byte sums
+    ldrd    r2, r3, [r1, #8] // r2 = edge[8..11], r3 = edge[12..15]
+    push    {r4-r5,lr}
+    ldrd    r4, r5, [r1, #16] // r4 = edge[16..19], r5 = edge[20..23]
+    lsl     r3, r3, #8 // shift edge[15] out of the word so it is not summed
+    ldrb    lr, [r1, #7] // edge[7] is summed instead
+    usad8   r2, r2, ip
+    usad8   r3, r3, ip
+    usada8  r2, r4, ip, r2
+    add     lr, lr, #8 // rounding term for the final >> 4
+    usada8  r3, r5, ip, r3
+    add     r2, r2, lr
+    mov     ip, #FDEC_STRIDE
+    add     r2, r2, r3 // r2 = sum of the 16 edge bytes + 8
+    lsr     r2, r2, #4
+
+    vdup.8  d0, r2 // splat the DC byte across 8 lanes
+.rept 8
+    vst1.64 {d0}, [r0,:64], ip // one 8-byte row per store
+.endr
+    pop     {r4-r5,pc}
+endfunc
+
+function predict_8x8_h_neon // r0 = 8x8 destination, r1 = edge byte array; row k is filled with edge[14-k]
+    add         r1, r1, #7
+    mov         ip, #FDEC_STRIDE
+    vld1.64     {d16}, [r1] // d16 = edge[7..14]
+    vdup.8      d0, d16[7] // the highest lane feeds row 0: the bytes are used in reverse order
+    vdup.8      d1, d16[6]
+    vst1.64     {d0}, [r0,:64], ip // stores are interleaved with the splat for the following rows
+    vdup.8      d2, d16[5]
+    vst1.64     {d1}, [r0,:64], ip
+    vdup.8      d3, d16[4]
+    vst1.64     {d2}, [r0,:64], ip
+    vdup.8      d4, d16[3]
+    vst1.64     {d3}, [r0,:64], ip
+    vdup.8      d5, d16[2]
+    vst1.64     {d4}, [r0,:64], ip
+    vdup.8      d6, d16[1]
+    vst1.64     {d5}, [r0,:64], ip
+    vdup.8      d7, d16[0]
+    vst1.64     {d6}, [r0,:64], ip
+    vst1.64     {d7}, [r0,:64], ip // row 7 = edge[7]
+    bx          lr
+endfunc
+
+function predict_8x8_v_neon // r0 = 8x8 destination, r1 = edge byte array; every row becomes edge[16..23]
+    mov         ip,  #FDEC_STRIDE
+    add         r1,  #16                    // r1 -> edge[16]
+    vld1.8      {d0}, [r1,:64]              // the 8 bytes to replicate
+.rept 8
+    vst1.8      {d0}, [r0,:64], ip          // same 8 bytes into each row
+.endr
+    bx          lr
+endfunc
+
+function predict_8x8_ddl_neon // r0 = 8x8 destination, r1 = edge byte array; diagonal down-left prediction from edge[16..31]
+    add         r1, #16
+    vld1.8      {d0, d1}, [r1,:128] // q0 = edge[16..31]
+    vmov.i8     q3, #0
+    vrev64.8    d2, d1 // d2[0] = edge[31], used to pad past the end
+    vext.8      q8, q3, q0, #15 // q8 = q0 shifted up one lane (lane 0 = 0; that lane is never stored)
+    vext.8      q2, q0, q1, #1 // q2 = q0 shifted down one lane, last lane = edge[31]
+    vhadd.u8    q8, q2 // (previous + next) >> 1
+    mov         r12, #FDEC_STRIDE
+    vrhadd.u8   q0, q8 // rounding average with the centre byte: the 3-tap (1,2,1) filter
+    vext.8      d2, d0, d1, #1 // row k takes filtered lanes k+1 .. k+8
+    vext.8      d3, d0, d1, #2
+    vst1.8      d2, [r0,:64], r12
+    vext.8      d2, d0, d1, #3
+    vst1.8      d3, [r0,:64], r12
+    vext.8      d3, d0, d1, #4
+    vst1.8      d2, [r0,:64], r12
+    vext.8      d2, d0, d1, #5
+    vst1.8      d3, [r0,:64], r12
+    vext.8      d3, d0, d1, #6
+    vst1.8      d2, [r0,:64], r12
+    vext.8      d2, d0, d1, #7
+    vst1.8      d3, [r0,:64], r12
+    vst1.8      d2, [r0,:64], r12
+    vst1.8      d1, [r0,:64], r12 // row 7 = lanes 8..15
+    bx          lr
+endfunc
+
+function predict_8x8_ddr_neon // r0 = 8x8 destination, r1 = edge byte array; diagonal down-right prediction
+    vld1.8      {d0-d3}, [r1,:128] // q0:q1 = edge[0..31]
+    vext.8      q2, q0, q1, #7 // edge[7..22]
+    vext.8      q3, q0, q1, #9 // edge[9..24]
+
+    vhadd.u8    q2, q2, q3 // (edge[i-1] + edge[i+1]) >> 1 for i = 8..23
+    vrhadd.u8   d0, d1, d4 // rounding average with edge[8..15]: the 3-tap (1,2,1) filter
+    vrhadd.u8   d1, d2, d5 // ... and with edge[16..23]; q0 = filtered values for i = 8..23
+
+    add         r0, #7*FDEC_STRIDE // start at the bottom row
+    mov         r12, #-1*FDEC_STRIDE // and walk upwards
+
+    vext.8      d2, d0, d1, #1
+    vst1.8      {d0}, [r0,:64], r12 // row 7 = filtered lanes 0..7
+    vext.8      d4, d0, d1, #2
+    vst1.8      {d2}, [r0,:64], r12 // each row further up starts one lane later
+    vext.8      d5, d0, d1, #3
+    vst1.8      {d4}, [r0,:64], r12
+    vext.8      d4, d0, d1, #4
+    vst1.8      {d5}, [r0,:64], r12
+    vext.8      d5, d0, d1, #5
+    vst1.8      {d4}, [r0,:64], r12
+    vext.8      d4, d0, d1, #6
+    vst1.8      {d5}, [r0,:64], r12
+    vext.8      d5, d0, d1, #7
+    vst1.8      {d4}, [r0,:64], r12
+    vst1.8      {d5}, [r0,:64], r12 // row 0 = filtered lanes 7..14
+    bx          lr
+endfunc
+
+function predict_8x8_vl_neon // r0 = 8x8 destination, r1 = edge byte array; vertical-left prediction from edge[16..31]
+    add         r1, #16
+    mov         r12, #FDEC_STRIDE
+
+    vld1.8      {d0, d1}, [r1,:128] // q0 = edge[16..31]
+    vext.8      q1, q1, q0, #15 // q0 shifted up one lane; lane 0 comes from the old q1 and is never stored
+    vext.8      q2, q0, q2, #1 // q0 shifted down one lane; lane 15 comes from the old q2 and is never stored
+
+    vrhadd.u8   q3, q0, q2 // q3 = 2-tap rounding average of neighbouring bytes (even rows)
+
+    vhadd.u8    q1, q1, q2
+    vrhadd.u8   q0, q0, q1 // q0 = 3-tap (1,2,1) filter (odd rows)
+
+    vext.8      d2, d0, d1, #1
+    vst1.8      {d6}, [r0,:64], r12 // row 0: 2-tap lanes 0..7
+    vext.8      d3, d6, d7, #1
+    vst1.8      {d2}, [r0,:64], r12 // row 1: 3-tap lanes 1..8
+    vext.8      d2, d0, d1, #2
+    vst1.8      {d3}, [r0,:64], r12 // rows alternate between the two filters, one lane further every second row
+    vext.8      d3, d6, d7, #2
+    vst1.8      {d2}, [r0,:64], r12
+    vext.8      d2, d0, d1, #3
+    vst1.8      {d3}, [r0,:64], r12
+    vext.8      d3, d6, d7, #3
+    vst1.8      {d2}, [r0,:64], r12
+    vext.8      d2, d0, d1, #4
+    vst1.8      {d3}, [r0,:64], r12
+    vst1.8      {d2}, [r0,:64], r12 // row 7: 3-tap lanes 4..11
+    bx          lr
+endfunc
+
+function predict_8x8_vr_neon // r0 = 8x8 destination, r1 = edge byte array; vertical-right prediction from edge[8..23]
+    add         r1, #8
+    mov         r12, #FDEC_STRIDE
+    vld1.8      {d4,d5}, [r1,:64] // q2 = edge[8..23]
+
+    vext.8      q1, q2, q2, #14 // q2 rotated by two lanes: lane i = edge[6+i] for i >= 2
+    vext.8      q0, q2, q2, #15 // q2 rotated by one lane: lane i = edge[7+i] for i >= 1
+
+    vhadd.u8    q3, q2, q1
+    vrhadd.u8   q2, q2, q0 // q2 = 2-tap rounding average of neighbouring bytes
+    vrhadd.u8   q0, q0, q3 // q0 = 3-tap (1,2,1) filter
+
+    vmov        d2, d0
+
+    vst1.8      {d5}, [r0,:64], r12 // row 0 = upper half of the 2-tap result
+    vuzp.8      d2, d0 // split the low 3-tap half into even lanes (d2) and odd lanes (d0)
+    vst1.8      {d1}, [r0,:64], r12 // row 1 = upper half of the 3-tap result
+    vext.8      d6, d0, d5, #7 // later rows reuse rows 0/1 shifted right, fed one byte at a time from d0/d2
+    vext.8      d3, d2, d1, #7
+    vst1.8      {d6}, [r0,:64], r12
+    vst1.8      {d3}, [r0,:64], r12
+    vext.8      d6, d0, d5, #6
+    vext.8      d3, d2, d1, #6
+    vst1.8      {d6}, [r0,:64], r12
+    vst1.8      {d3}, [r0,:64], r12
+    vext.8      d6, d0, d5, #5
+    vext.8      d3, d2, d1, #5
+    vst1.8      {d6}, [r0,:64], r12
+    vst1.8      {d3}, [r0,:64], r12
+    bx          lr
+endfunc
+
+function predict_8x8_hd_neon // r0 = 8x8 destination, r1 = edge byte array; horizontal-down prediction from edge[7..22]
+    mov         r12, #FDEC_STRIDE
+    add         r1, #7
+
+    vld1.8      {d2,d3}, [r1] // q1 = edge[7..22]
+    vext.8      q3, q1, q1, #1 // next byte (rotated, so the last lane wraps)
+    vext.8      q2, q1, q1, #2 // byte after next
+
+    vrhadd.u8   q8, q1, q3 // q8 = 2-tap rounding average
+
+    vhadd.u8    q1, q2
+    vrhadd.u8   q0, q1, q3 // q0 = 3-tap (1,2,1) filter
+
+    vzip.8      d16, d0 // interleave the low halves: 2-tap and 3-tap bytes alternate in d16:d0
+
+    vext.8      d2, d0, d1, #6 // each row is an 8-byte window, sliding down two bytes per row
+    vext.8      d3, d0, d1, #4
+    vst1.8      {d2}, [r0,:64], r12
+    vext.8      d2, d0, d1, #2
+    vst1.8      {d3}, [r0,:64], r12
+    vst1.8      {d2}, [r0,:64], r12
+    vext.8      d2, d16, d0, #6
+    vst1.8      {d0}, [r0,:64], r12
+    vext.8      d3, d16, d0, #4
+    vst1.8      {d2}, [r0,:64], r12
+    vext.8      d2, d16, d0, #2
+    vst1.8      {d3}, [r0,:64], r12
+    vst1.8      {d2}, [r0,:64], r12
+    vst1.8      {d16}, [r0,:64], r12 // row 7 = first 8 interleaved bytes
+
+    bx          lr
+endfunc
+
+function predict_8x8_hu_neon // r0 = 8x8 destination, r1 = edge byte array; horizontal-up prediction from edge[7..14]
+    mov         r12, #FDEC_STRIDE
+    add         r1, #7
+    vld1.8      {d7}, [r1] // d7 = edge[7..14]
+    vdup.8      d6, d7[0] // edge[7] replicated, used as padding past the end
+    vrev64.8    d7, d7 // reverse so that lane 0 = edge[14]
+
+    vext.8      d4, d7, d6, #2 // byte after next, padded
+    vext.8      d2, d7, d6, #1 // next byte, padded
+
+    vhadd.u8    d16, d7, d4
+    vrhadd.u8   d0, d2, d7 // d0 = 2-tap rounding average
+    vrhadd.u8   d1, d16, d2 // d1 = 3-tap (1,2,1) filter
+
+    vzip.8      d0, d1 // interleave: 2-tap and 3-tap bytes alternate in q0
+
+    vdup.16     q1, d1[3] // last interleaved pair replicated, to pad the lower rows
+
+    vext.8      q2, q0, q1, #2 // each row starts two bytes later than the one above
+    vext.8      q3, q0, q1, #4
+    vext.8      q8, q0, q1, #6
+    vst1.8      {d0}, [r0,:64], r12 // rows 0..3 come from the low halves
+    vst1.8      {d4}, [r0,:64], r12
+    vst1.8      {d6}, [r0,:64], r12
+    vst1.8      {d16}, [r0,:64], r12
+
+    vst1.8      {d1}, [r0,:64], r12 // rows 4..7 come from the high halves
+    vst1.8      {d5}, [r0,:64], r12
+    vst1.8      {d7}, [r0,:64], r12
+    vst1.8      {d17}, [r0,:64]
+    bx          lr
+endfunc
+
+function predict_8x8c_dc_top_neon // 8x8 DC prediction at r0 from the row above only: each 4-pixel-wide half gets its own mean
+    sub         r2,  r0,  #FDEC_STRIDE
+    mov         r1,  #FDEC_STRIDE // r1 = row pitch, also used by the shared store tail
+    vld1.8      {d0}, [r2,:64] // the 8 bytes above the block
+    vpaddl.u8   d0,  d0
+    vpadd.u16   d0,  d0,  d0 // lane 0 = sum of bytes 0..3, lane 1 = sum of bytes 4..7
+    vrshrn.u16  d0,  q0,  #2 // rounded divide by 4, narrowed to bytes
+    vdup.8      d1,  d0[1] // right-half DC
+    vdup.8      d0,  d0[0] // left-half DC
+    vtrn.32     d0,  d1 // d0 = d1 = {left DC x4, right DC x4}
+    b           pred8x8_dc_end // shared store loop inside predict_8x8c_dc_neon: d0 -> rows 0..3, d1 -> rows 4..7
+endfunc
+
+function predict_8x8c_dc_left_neon // 8x8 DC prediction at r0 from the left column only: rows 0..3 and rows 4..7 get separate means
+    mov         r1,  #FDEC_STRIDE // r1 = row pitch, also used by the shared store tail
+    sub         r2,  r0,  #1
+    ldcol.8     d0,  r2,  r1 // d0 = the 8 bytes left of rows 0..7
+    vpaddl.u8   d0,  d0
+    vpadd.u16   d0,  d0,  d0 // lane 0 = sum for rows 0..3, lane 1 = sum for rows 4..7
+    vrshrn.u16  d0,  q0,  #2 // rounded divide by 4, narrowed to bytes
+    vdup.8      d1,  d0[1] // DC for rows 4..7
+    vdup.8      d0,  d0[0] // DC for rows 0..3
+    b           pred8x8_dc_end // shared store loop inside predict_8x8c_dc_neon: d0 -> rows 0..3, d1 -> rows 4..7
+endfunc
+
+function predict_8x8c_dc_neon // 8x8 DC prediction at r0 from the row above and the left column, one DC per 4x4 quadrant
+    sub         r2,  r0,  #FDEC_STRIDE
+    mov         r1,  #FDEC_STRIDE
+    vld1.8      {d0}, [r2,:64] // 8 bytes above
+    sub         r2,  r0,  #1
+    ldcol.8     d1,  r2,  r1 // 8 bytes to the left
+    vtrn.32     d0,  d1 // d0 = {top 0..3, left 0..3}, d1 = {top 4..7, left 4..7}
+    vpaddl.u8   q0,  q0
+    vpadd.u16   d0,  d0,  d1 // d0 = {s0, s2, s1, s3}: s0/s1 = top halves, s2/s3 = left halves
+    vpadd.u16   d1,  d0,  d0 // d1 = {s0+s2, s1+s3, ...}
+    vrshrn.u16  d2,  q0,  #3 // rounded /8, for the two-sided sums
+    vrshrn.u16  d3,  q0,  #2 // rounded /4, for the one-sided sums
+    vdup.8      d0,  d2[4] // top-left quadrant: (s0 + s2 + 4) >> 3
+    vdup.8      d1,  d3[3] // bottom-left quadrant: (s3 + 2) >> 2
+    vdup.8      d4,  d3[2] // top-right quadrant: (s1 + 2) >> 2
+    vdup.8      d5,  d2[5] // bottom-right quadrant: (s1 + s3 + 4) >> 3
+    vtrn.32     q0,  q2 // d0 = {top-left x4, top-right x4}, d1 = {bottom-left x4, bottom-right x4}
+pred8x8_dc_end: // shared tail, also entered from dc_top and dc_left; expects r0 = dst, r1 = pitch, d0/d1 = row patterns
+    add         r2,  r0,  r1,  lsl #2 // r2 -> row 4
+.rept 4
+    vst1.8      {d0}, [r0,:64], r1 // rows 0..3
+    vst1.8      {d1}, [r2,:64], r1 // rows 4..7
+.endr
+    bx          lr
+endfunc
+
+function predict_8x8c_h_neon // fill each of the 8 rows at r0 with the byte immediately to its left
+    mov         r12, #FDEC_STRIDE
+    sub         r1,  r0,  #1                // r1 walks down the column left of the block
+.rept 4                                     // two rows per pass
+    vld1.8      {d0[]}, [r1], r12           // splat the left byte of the first row of the pair
+    vld1.8      {d2[]}, [r1], r12           // ... and of the second
+    vst1.64     {d0}, [r0,:64], r12
+    vst1.64     {d2}, [r0,:64], r12
+.endr
+    bx          lr
+endfunc
+
+function predict_8x8c_v_neon // copy the 8 bytes above r0 into all 8 rows
+    mov         r12, #FDEC_STRIDE
+    sub         r0,  #FDEC_STRIDE           // step back to the row above
+    vld1.64     {d0}, [r0,:64], r12         // fetch it; the post-increment returns r0 to row 0
+.rept 8
+    vst1.64     {d0}, [r0,:64], r12
+.endr
+    bx          lr
+endfunc
+
+function predict_8x8c_p_neon // 8x8 plane prediction at r0 from the row above, the corner byte and the left column
+    sub         r3,  r0,  #FDEC_STRIDE
+    mov         r1,  #FDEC_STRIDE
+    add         r2,  r3,  #4 // r2 -> top[4]
+    sub         r3,  r3,  #1 // r3 -> corner byte (above-left)
+    vld1.32     {d0[0]}, [r3] // d0 low = corner, top[0..2]
+    vld1.32     {d2[0]}, [r2,:32], r1 // d2 low = top[4..7]
+    ldcol.8     d0,  r3,  r1,  4,  hi=1 // d0 high = corner, left[0..2]
+    add         r3,  r3,  r1 // skip left[3]
+    ldcol.8     d3,  r3,  r1,  4 // d3 low = left[4..7]
+    vaddl.u8    q8,  d2,  d3 // 16-bit lane 3 of d16 = top[7] + left[7]
+    vrev32.8    d0,  d0 // reverse each 4-byte group so lane x pairs with its mirror sample
+    vtrn.32     d2,  d3 // d2 = {top[4..7], left[4..7]}
+    vsubl.u8    q2,  d2,  d0 // d4 = top[4+x] - top[2-x], d5 = left[4+y] - left[2-y]
+    movrel      r3,  p16weight
+    vld1.16     {q0}, [r3,:128] // weights 1..8
+    vmul.s16    d4,  d4,  d0 // weight by 1..4
+    vmul.s16    d5,  d5,  d0
+    vpadd.i16   d4,  d4,  d5
+    vpaddl.s16  d4,  d4 // d4 = {H, V} as 32-bit sums
+    vshl.i32    d5,  d4,  #4
+    vadd.s32    d4,  d4,  d5 // {17*H, 17*V}
+    vrshrn.s32  d4,  q2,  #5 // d4[0] = b = (17*H + 16) >> 5, d4[1] = c = (17*V + 16) >> 5
+    mov         r3,  #0
+    vtrn.16     d4,  d5 // d4[0] = b, d5[0] = c
+    vadd.i16    d2,  d4,  d5 // b + c
+    vshl.i16    d3,  d2,  #2
+    vrev64.16   d16, d16 // bring top[7] + left[7] down to lane 0
+    vsub.i16    d3,  d3,  d2 // d3[0] = 3 * (b + c)
+    vadd.i16    d16, d16, d0 // lane 0: + 1 (first weight)
+    vshl.i16    d2,  d16, #4 // d2[0] = 16 * (top[7] + left[7] + 1) = a + 16
+    vsub.i16    d2,  d2,  d3 // d2[0] = value at the top-left pixel = a + 16 - 3 * (b + c)
+    vext.16     q0,  q0,  q0,  #7 // rotate weights to {8,1,2,...,7}
+    vmov.16     d0[0], r3 // -> {0,1,2,...,7}
+    vmul.i16    q0,  q0,  d4[0] // per-column offsets x * b
+    vdup.16     q1,  d2[0]
+    vdup.16     q3,  d5[0] // q3 = c, the per-row increment
+    vadd.i16    q1,  q1,  q0 // q1 = unscaled values of row 0
+    mov         r3,  #8 // 8 rows
+1:
+    vqshrun.s16 d0,  q1,  #5 // >> 5 with saturation to 0..255
+    vadd.i16    q1,  q1,  q3 // advance to the next row
+    vst1.8      {d0}, [r0,:64], r1
+    subs        r3,  r3,  #1
+    bne         1b
+    bx          lr
+endfunc
+
+
+function predict_8x16c_dc_top_neon // 8x16 DC prediction at r0 from the row above only: each 4-pixel-wide half gets its own mean
+    sub         r2,  r0,  #FDEC_STRIDE
+    mov         r1,  #FDEC_STRIDE
+    vld1.8      {d0}, [r2,:64] // the 8 bytes above the block
+    vpaddl.u8   d0,  d0
+    vpadd.u16   d0,  d0,  d0 // lane 0 = sum of bytes 0..3, lane 1 = sum of bytes 4..7
+    vrshrn.u16  d0,  q0,  #2 // rounded divide by 4, narrowed to bytes
+    vdup.8      d1,  d0[1]
+    vdup.8      d0,  d0[0]
+    vtrn.32     d0,  d1 // d0 = d1 = {left DC x4, right DC x4}
+
+    add         r2,  r0,  r1,  lsl #2 // r2 -> row 4
+.rept 4
+    vst1.8      {d0}, [r0,:64], r1 // rows 0..3
+    vst1.8      {d1}, [r2,:64], r1 // rows 4..7
+.endr
+    add         r2,  r2,  r1,  lsl #2 // r2: row 8 -> row 12
+    add         r0,  r0,  r1,  lsl #2 // r0: row 4 -> row 8
+.rept 4
+    vst1.8      {d0}, [r0,:64], r1 // rows 8..11
+    vst1.8      {d1}, [r2,:64], r1 // rows 12..15
+.endr
+    bx          lr
+endfunc
+
+function predict_8x16c_h_neon // fill each of the 16 rows at r0 with the byte immediately to its left
+    mov         r12, #FDEC_STRIDE
+    sub         r1,  r0,  #1                // r1 walks down the column left of the block
+.rept 8                                     // two rows per pass
+    vld1.8      {d0[]}, [r1], r12           // splat the left byte of the first row of the pair
+    vld1.8      {d2[]}, [r1], r12           // ... and of the second
+    vst1.64     {d0}, [r0,:64], r12
+    vst1.64     {d2}, [r0,:64], r12
+.endr
+    bx          lr
+endfunc
+
+function predict_8x16c_p_neon  @ 8x16 plane prediction at r0 from the row above, the corner byte and the left column
+    sub         r3,  r0,  #FDEC_STRIDE
+    mov         r1,  #FDEC_STRIDE
+    add         r2,  r3,  #4   @ r2 -> top[4]
+    sub         r3,  r3,  #1   @ r3 -> corner byte (above-left)
+    vld1.32     {d0[0]}, [r3]  @ d0 low = corner, top[0..2]
+    vld1.32     {d2[0]}, [r2,:32], r1  @ d2 low = top[4..7]
+    ldcol.8     d1,  r3,  r1   @ d1 = corner, left[0..6]
+    add         r3,  r3,  r1   @ skip left[7]
+    ldcol.8     d3,  r3,  r1   @ d3 = left[8..15]
+    vrev64.32   d16, d3        @ d16 low = left[12..15]
+    vaddl.u8    q8,  d2,  d16  @ 16-bit lane 3 of d16 = top[7] + left[15]
+    vrev32.8    d0,  d0        @ mirror so lane x pairs top[4+x] with top[2-x]
+    vsubl.u8    q2,  d2,  d0   @ d4 = top[4+x] - top[2-x]
+    vrev64.8    d1,  d1        @ mirror so lane y pairs left[8+y] with left[6-y]
+    vsubl.u8    q3,  d3,  d1   @ q3 = left[8+y] - left[6-y]
+    movrel      r3,  p16weight
+    vld1.16     {q0}, [r3,:128]  @ weights 1..8
+    vmul.s16    d4,  d4,  d0
+    vmul.s16    q3,  q3,  q0
+    vpadd.i16   d4,  d4,  d5
+    vpadd.i16   d6,  d6,  d7
+    vpaddl.s16  d4,  d4        @ d4[0] = H
+    vpaddl.s16  d6,  d6
+    vpadd.s32   d6,  d6        @ d6[0] = V
+    vshl.i32    d5,  d4,  #4
+    vadd.s32    d4,  d4,  d5   @ d4[0] = 17*H
+    vshl.i32    d7,  d6,  #2
+    vrshrn.s32  d4,  q2,  #5   @ d4[0] = b
+    vadd.s32    d6,  d6,  d7   @ d6[0] = 5*V
+    vrshrn.s32  d6,  q3,  #6   @ d6[0] = c
+    mov         r3,  #0
+    vshl.i16    d3,  d4,  #2
+    vsub.i16    d3,  d3,  d4   @ d3[0] = 3 * b
+    vshl.i16    d2,  d6,  #3
+    vadd.i16    d3,  d3,  d2   @ d3[0] = 3 * b + 8 * c
+    vsub.i16    d3,  d3,  d6   @ d3[0] = 3 * b + 7 * c
+    vrev64.16   d16, d16
+    vadd.i16    d16, d16, d0   @ d16[0] = src[]+src[] + 1
+    vshl.i16    d2,  d16, #4   @ d2[0] = a + 16
+    vsub.i16    d2,  d2,  d3   @ d2[0] = i00 = a + 16 - 3 * b - 7 * c
+    vext.16     q0,  q0,  q0,  #7  @ rotate weights to {8,1,2,...,7}
+    vmov.16     d0[0], r3      @ -> {0,1,2,...,7}
+    vmul.i16    q0,  q0,  d4[0]  @ per-column offsets x * b
+    vdup.16     q1,  d2[0]
+    vdup.16     q3,  d6[0]     @ q3 = c, the per-row increment
+    vadd.i16    q1,  q1,  q0   @ q1 = unscaled values of row 0
+    mov         r3,  #16       @ 16 rows
+1:
+    vqshrun.s16 d0,  q1,  #5   @ >> 5 with saturation to 0..255
+    vadd.i16    q1,  q1,  q3   @ advance to the next row
+    vst1.8      {d0}, [r0,:64], r1
+    subs        r3,  r3,  #1
+    bne         1b
+    bx          lr
+endfunc
+
+
+function predict_16x16_dc_top_neon // fill the 16x16 block at r0 with (sum of the 16 bytes above + 8) >> 4
+    sub         r2,  r0,  #FDEC_STRIDE
+    mov         r1,  #FDEC_STRIDE // r1 = row pitch, also used by the shared store tail
+    vld1.8      {q0}, [r2,:128] // the 16 bytes above the block
+    add16x8     q0,  d0,  d1,  d0,  d1 // every 16-bit lane of d0 = their sum
+    vrshrn.u16  d0,  q0,  #4 // rounded divide by 16, narrowed to bytes
+    vdup.8      q0,  d0[0]
+    b           pred16x16_dc_end // shared 16-row store loop inside predict_16x16_dc_neon
+endfunc
+
+function predict_16x16_dc_left_neon // fill the 16x16 block at r0 with (sum of the 16 bytes to its left + 8) >> 4
+    mov         r1,  #FDEC_STRIDE // r1 = row pitch, also used by the shared store tail
+    sub         r2,  r0,  #1
+    ldcol.8     d0,  r2,  r1 // left bytes of rows 0..7
+    ldcol.8     d1,  r2,  r1 // left bytes of rows 8..15 (r2 keeps advancing)
+    add16x8     q0,  d0,  d1,  d0,  d1 // every 16-bit lane of d0 = their sum
+    vrshrn.u16  d0,  q0,  #4 // rounded divide by 16, narrowed to bytes
+    vdup.8      q0,  d0[0]
+    b           pred16x16_dc_end // shared 16-row store loop inside predict_16x16_dc_neon
+endfunc
+
+function predict_16x16_dc_neon // fill the 16x16 block at r0 with (sum of 16 bytes above + 16 bytes to the left + 16) >> 5
+    sub         r3, r0, #FDEC_STRIDE
+    sub         r0, r0, #1 // r0 temporarily walks the left column
+    vld1.64     {d0-d1}, [r3,:128] // the 16 bytes above
+    ldrb        ip, [r0], #FDEC_STRIDE // left bytes are summed in ip with scalar loads, interleaved with the NEON top-row sum
+    vaddl.u8    q0, d0, d1
+    ldrb        r1, [r0], #FDEC_STRIDE
+    vadd.u16    d0, d0, d1
+    vpadd.u16   d0, d0, d0
+    vpadd.u16   d0, d0, d0 // every 16-bit lane of d0 = sum of the top row
+.rept 4
+    ldrb        r2, [r0], #FDEC_STRIDE // each load is issued ahead of the add that consumes the previous one
+    add         ip, ip, r1
+    ldrb        r3, [r0], #FDEC_STRIDE
+    add         ip, ip, r2
+    ldrb        r1, [r0], #FDEC_STRIDE
+    add         ip, ip, r3
+.endr
+    ldrb        r2, [r0], #FDEC_STRIDE
+    add         ip, ip, r1
+    ldrb        r3, [r0], #FDEC_STRIDE // 16th and last left byte
+    add         ip, ip, r2
+
+    sub         r0, r0, #FDEC_STRIDE*16 // undo the 16 post-increments
+    add         ip, ip, r3 // ip = sum of the left column
+    vdup.16     d1, ip
+    vadd.u16    d0, d0, d1 // top sum + left sum
+    mov         r1, #FDEC_STRIDE
+    add         r0, r0, #1 // r0 is the block origin again
+    vrshr.u16   d0, d0, #5 // rounded divide by 32
+    vdup.8      q0, d0[0]
+pred16x16_dc_end: // shared tail, also entered from dc_top and dc_left; expects r0 = dst, r1 = pitch, q0 = row pattern
+.rept 16
+    vst1.64     {d0-d1}, [r0,:128], r1
+.endr
+    bx          lr
+endfunc
+
+function predict_16x16_h_neon // fill each of the 16 rows at r0 with the byte immediately to its left
+    mov         r12, #FDEC_STRIDE
+    sub         r1,  r0,  #1                // r1 walks down the column left of the block
+.rept 8                                     // two rows per pass
+    vld1.8      {d0[]}, [r1], r12           // d0 = 8 copies of the left byte of the first row
+    vmov        d1,  d0                     // widen the splat to 16 bytes
+    vld1.8      {d2[]}, [r1], r12           // same for the second row of the pair
+    vmov        d3,  d2
+    vst1.64     {d0, d1}, [r0,:128], r12
+    vst1.64     {d2, d3}, [r0,:128], r12
+.endr
+    bx          lr
+endfunc
+
+function predict_16x16_v_neon // copy the 16 bytes above r0 into all 16 rows
+    mov         r12, #FDEC_STRIDE
+    sub         r0,  #FDEC_STRIDE           // step back to the row above
+    vld1.64     {d0, d1}, [r0,:128], r12    // fetch it; the post-increment returns r0 to row 0
+.rept 16
+    vst1.64     {d0, d1}, [r0,:128], r12
+.endr
+    bx          lr
+endfunc
+
+function predict_16x16_p_neon // 16x16 plane prediction at r0 from the row above, the corner byte and the left column
+    sub         r3,  r0,  #FDEC_STRIDE
+    mov         r1,  #FDEC_STRIDE
+    add         r2,  r3,  #8 // r2 -> top[8]
+    sub         r3,  r3,  #1 // r3 -> corner byte (above-left)
+    vld1.8      {d0}, [r3] // d0 = corner, top[0..6]
+    vld1.8      {d2}, [r2,:64], r1 // d2 = top[8..15]
+    ldcol.8     d1,  r3,  r1 // d1 = corner, left[0..6]
+    add         r3,  r3,  r1 // skip left[7]
+    ldcol.8     d3,  r3,  r1 // d3 = left[8..15]
+    vrev64.8    q0,  q0 // mirror so lane x pairs sample 8+x with sample 6-x
+    vaddl.u8    q8,  d2,  d3 // last 16-bit lane of q8 = top[15] + left[15]
+    vsubl.u8    q2,  d2,  d0 // top[8+x] - top[6-x]
+    vsubl.u8    q3,  d3,  d1 // left[8+y] - left[6-y]
+    movrel      r3,  p16weight
+    vld1.16     {q0}, [r3,:128] // weights 1..8; 16-bit element size matches the .short table and the other plane predictors
+    vmul.s16    q2,  q2,  q0
+    vmul.s16    q3,  q3,  q0
+    vadd.i16    d4,  d4,  d5
+    vadd.i16    d5,  d6,  d7
+    vpadd.i16   d4,  d4,  d5
+    vpadd.i16   d4,  d4,  d4 // d4 = {H, V, H, V}
+    vshll.s16   q3,  d4,  #2
+    vaddw.s16   q2,  q3,  d4 // 5*H, 5*V as 32-bit values
+    vrshrn.s32  d4,  q2,  #6 // d4 = {b, c, b, c} with b = (5*H + 32) >> 6, c = (5*V + 32) >> 6
+    mov         r3,  #0
+    vtrn.16     d4,  d5 // d4[0] = b, d5[0] = c
+    vadd.i16    d2,  d4,  d5 // b + c
+    vshl.i16    d3,  d2,  #3
+    vrev64.16   d16, d17 // bring top[15] + left[15] down to lane 0
+    vsub.i16    d3,  d3,  d2 // d3[0] = 7 * (b + c)
+    vadd.i16    d16, d16, d0 // lane 0: + 1 (first weight)
+    vshl.i16    d2,  d16, #4 // d2[0] = 16 * (top[15] + left[15] + 1) = a + 16
+    vsub.i16    d2,  d2,  d3 // d2[0] = value at the top-left pixel = a + 16 - 7 * (b + c)
+    vshl.i16    d3,  d4,  #4 // 16 * b
+    vext.16     q0,  q0,  q0,  #7 // rotate weights to {8,1,2,...,7}
+    vsub.i16    d6,  d5,  d3 // c - 16 * b
+    vmov.16     d0[0], r3 // -> {0,1,2,...,7}
+    vmul.i16    q0,  q0,  d4[0] // per-column offsets x * b
+    vdup.16     q1,  d2[0]
+    vdup.16     q2,  d4[0]
+    vdup.16     q3,  d6[0]
+    vshl.i16    q2,  q2,  #3 // q2 = 8 * b: step from the left to the right half of a row
+    vadd.i16    q1,  q1,  q0 // q1 = unscaled values of row 0, columns 0..7
+    vadd.i16    q3,  q3,  q2 // q3 = c - 8 * b: step from the right half back to the next row
+    mov         r3,  #16 // 16 rows
+1:
+    vqshrun.s16 d0,  q1,  #5 // columns 0..7: >> 5 with saturation to 0..255
+    vadd.i16    q1,  q1,  q2
+    vqshrun.s16 d1,  q1,  #5 // columns 8..15
+    vadd.i16    q1,  q1,  q3
+    vst1.8      {q0}, [r0,:128], r1
+    subs        r3,  r3,  #1
+    bne         1b
+    bx          lr
+endfunc
--- a/common/arm/predict-c.c
+++ b/common/arm/predict-c.c
@@ -0,0 +1,108 @@
+/*****************************************************************************
+ * predict-c.c: arm intra prediction
+ *****************************************************************************
+ * Copyright (C) 2009-2025 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "predict.h"
+#include "pixel.h"
+
+void x264_predict_4x4_init_arm( uint32_t cpu, x264_predict_t pf[12] )
+{
+    /* Install the ARM 4x4 predictors allowed by cpu: the ARMv6 set first, NEON routines on top. */
+    if( cpu&X264_CPU_ARMV6 )
+    {
+#if !HIGH_BIT_DEPTH
+        pf[I_PRED_4x4_H]   = x264_predict_4x4_h_armv6;
+        pf[I_PRED_4x4_V]   = x264_predict_4x4_v_armv6;
+        pf[I_PRED_4x4_DC]  = x264_predict_4x4_dc_armv6;
+        pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_armv6;
+        if( cpu&X264_CPU_NEON )
+        {
+            pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_neon;
+            pf[I_PRED_4x4_DDL]    = x264_predict_4x4_ddl_neon;
+        }
+#endif // !HIGH_BIT_DEPTH
+    }
+}
+
+void x264_predict_8x8c_init_arm( uint32_t cpu, x264_predict_t pf[7] )
+{
+    if( cpu&X264_CPU_NEON ) /* every 8x8 chroma predictor installed here is a NEON routine */
+    {
+#if !HIGH_BIT_DEPTH
+        pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_neon;
+        pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_neon;
+        pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon;
+        pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_neon;
+        pf[I_PRED_CHROMA_V]       = x264_predict_8x8c_v_neon;
+        pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_neon;
+#endif // !HIGH_BIT_DEPTH
+    }
+}
+
+void x264_predict_8x16c_init_arm( uint32_t cpu, x264_predict_t pf[7] )
+{
+    if( cpu&X264_CPU_NEON ) /* only NEON routines exist for 8x16 chroma */
+    {
+#if !HIGH_BIT_DEPTH
+        /* Only these three are overridden: the others weren't faster than C (gcc 4.7.3) on Cortex A8 and A9. */
+        pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_neon;
+        pf[I_PRED_CHROMA_H]      = x264_predict_8x16c_h_neon;
+        pf[I_PRED_CHROMA_P]      = x264_predict_8x16c_p_neon;
+#endif // !HIGH_BIT_DEPTH
+    }
+}
+
+void x264_predict_8x8_init_arm( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
+{
+    if( cpu&X264_CPU_NEON ) /* predict_filter is left untouched; only pf[] entries are replaced */
+    {
+#if !HIGH_BIT_DEPTH
+        pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_neon;
+        pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_neon;
+        pf[I_PRED_8x8_VL]  = x264_predict_8x8_vl_neon;
+        pf[I_PRED_8x8_VR]  = x264_predict_8x8_vr_neon;
+        pf[I_PRED_8x8_DC]  = x264_predict_8x8_dc_neon;
+        pf[I_PRED_8x8_H]   = x264_predict_8x8_h_neon;
+        pf[I_PRED_8x8_HD]  = x264_predict_8x8_hd_neon;
+        pf[I_PRED_8x8_HU]  = x264_predict_8x8_hu_neon;
+        pf[I_PRED_8x8_V]   = x264_predict_8x8_v_neon;
+#endif // !HIGH_BIT_DEPTH
+    }
+}
+
+void x264_predict_16x16_init_arm( uint32_t cpu, x264_predict_t pf[7] )
+{
+    if( cpu&X264_CPU_NEON ) /* every 16x16 predictor installed here is a NEON routine */
+    {
+#if !HIGH_BIT_DEPTH
+        pf[I_PRED_16x16_DC]      = x264_predict_16x16_dc_neon;
+        pf[I_PRED_16x16_DC_TOP]  = x264_predict_16x16_dc_top_neon;
+        pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_neon;
+        pf[I_PRED_16x16_H]       = x264_predict_16x16_h_neon;
+        pf[I_PRED_16x16_V]       = x264_predict_16x16_v_neon;
+        pf[I_PRED_16x16_P]       = x264_predict_16x16_p_neon;
+#endif // !HIGH_BIT_DEPTH
+    }
+}
--- a/common/arm/predict.h
+++ b/common/arm/predict.h
@@ -0,0 +1,105 @@
+/*****************************************************************************
+ * predict.h: arm intra prediction
+ *****************************************************************************
+ * Copyright (C) 2009-2025 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_ARM_PREDICT_H
+#define X264_ARM_PREDICT_H
+/* 4x4 predictors. NOTE(review): x264_template() is defined elsewhere; presumably it adds the build-specific symbol prefix. */
+#define x264_predict_4x4_dc_armv6 x264_template(predict_4x4_dc_armv6)
+void x264_predict_4x4_dc_armv6( uint8_t *src );
+#define x264_predict_4x4_dc_top_neon x264_template(predict_4x4_dc_top_neon)
+void x264_predict_4x4_dc_top_neon( uint8_t *src );
+#define x264_predict_4x4_v_armv6 x264_template(predict_4x4_v_armv6)
+void x264_predict_4x4_v_armv6( uint8_t *src );
+#define x264_predict_4x4_h_armv6 x264_template(predict_4x4_h_armv6)
+void x264_predict_4x4_h_armv6( uint8_t *src );
+#define x264_predict_4x4_ddr_armv6 x264_template(predict_4x4_ddr_armv6)
+void x264_predict_4x4_ddr_armv6( uint8_t *src );
+#define x264_predict_4x4_ddl_neon x264_template(predict_4x4_ddl_neon)
+void x264_predict_4x4_ddl_neon( uint8_t *src );
+/* 8x8 chroma predictors. */
+#define x264_predict_8x8c_dc_neon x264_template(predict_8x8c_dc_neon)
+void x264_predict_8x8c_dc_neon( uint8_t *src );
+#define x264_predict_8x8c_dc_top_neon x264_template(predict_8x8c_dc_top_neon)
+void x264_predict_8x8c_dc_top_neon( uint8_t *src );
+#define x264_predict_8x8c_dc_left_neon x264_template(predict_8x8c_dc_left_neon)
+void x264_predict_8x8c_dc_left_neon( uint8_t *src );
+#define x264_predict_8x8c_h_neon x264_template(predict_8x8c_h_neon)
+void x264_predict_8x8c_h_neon( uint8_t *src );
+#define x264_predict_8x8c_v_neon x264_template(predict_8x8c_v_neon)
+void x264_predict_8x8c_v_neon( uint8_t *src );
+#define x264_predict_8x8c_p_neon x264_template(predict_8x8c_p_neon)
+void x264_predict_8x8c_p_neon( uint8_t *src );
+/* 8x16 chroma predictors; only the three routines registered by x264_predict_8x16c_init_arm are declared. */
+#define x264_predict_8x16c_h_neon x264_template(predict_8x16c_h_neon)
+void x264_predict_8x16c_h_neon( uint8_t *src );
+#define x264_predict_8x16c_dc_top_neon x264_template(predict_8x16c_dc_top_neon)
+void x264_predict_8x16c_dc_top_neon( uint8_t *src );
+#define x264_predict_8x16c_p_neon x264_template(predict_8x16c_p_neon)
+void x264_predict_8x16c_p_neon( uint8_t *src );
+/* 8x8 predictors; unlike the other groups these also receive a 36-byte edge buffer. */
+#define x264_predict_8x8_dc_neon x264_template(predict_8x8_dc_neon)
+void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
+#define x264_predict_8x8_ddl_neon x264_template(predict_8x8_ddl_neon)
+void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
+#define x264_predict_8x8_ddr_neon x264_template(predict_8x8_ddr_neon)
+void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
+#define x264_predict_8x8_vl_neon x264_template(predict_8x8_vl_neon)
+void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
+#define x264_predict_8x8_vr_neon x264_template(predict_8x8_vr_neon)
+void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
+#define x264_predict_8x8_v_neon x264_template(predict_8x8_v_neon)
+void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
+#define x264_predict_8x8_h_neon x264_template(predict_8x8_h_neon)
+void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
+#define x264_predict_8x8_hd_neon x264_template(predict_8x8_hd_neon)
+void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
+#define x264_predict_8x8_hu_neon x264_template(predict_8x8_hu_neon)
+void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
+/* 16x16 predictors. */
+#define x264_predict_16x16_dc_neon x264_template(predict_16x16_dc_neon)
+void x264_predict_16x16_dc_neon( uint8_t *src );
+#define x264_predict_16x16_dc_top_neon x264_template(predict_16x16_dc_top_neon)
+void x264_predict_16x16_dc_top_neon( uint8_t *src );
+#define x264_predict_16x16_dc_left_neon x264_template(predict_16x16_dc_left_neon)
+void x264_predict_16x16_dc_left_neon( uint8_t *src );
+#define x264_predict_16x16_h_neon x264_template(predict_16x16_h_neon)
+void x264_predict_16x16_h_neon( uint8_t *src );
+#define x264_predict_16x16_v_neon x264_template(predict_16x16_v_neon)
+void x264_predict_16x16_v_neon( uint8_t *src );
+#define x264_predict_16x16_p_neon x264_template(predict_16x16_p_neon)
+void x264_predict_16x16_p_neon( uint8_t *src );
+/* Init functions (predict-c.c): overwrite entries of pf[] with the routines above when the cpu flags permit. */
+#define x264_predict_4x4_init_arm x264_template(predict_4x4_init_arm)
+void x264_predict_4x4_init_arm( uint32_t cpu, x264_predict_t pf[12] );
+#define x264_predict_8x8_init_arm x264_template(predict_8x8_init_arm)
+void x264_predict_8x8_init_arm( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
+#define x264_predict_8x8c_init_arm x264_template(predict_8x8c_init_arm)
+void x264_predict_8x8c_init_arm( uint32_t cpu, x264_predict_t pf[7] );
+#define x264_predict_8x16c_init_arm x264_template(predict_8x16c_init_arm)
+void x264_predict_8x16c_init_arm( uint32_t cpu, x264_predict_t pf[7] );
+#define x264_predict_16x16_init_arm x264_template(predict_16x16_init_arm)
+void x264_predict_16x16_init_arm( uint32_t cpu, x264_predict_t pf[7] );
+
+#endif /* X264_ARM_PREDICT_H */
--- a/common/arm/quant-a.S
+++ b/common/arm/quant-a.S
@@ -0,0 +1,574 @@
+/*****************************************************************************
+ * quant-a.S: arm quantization and level-run
+ *****************************************************************************
+ * Copyright (C) 2009-2025 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *          Janne Grunau <janne-x264@jannau.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+const pmovmskb_byte, align=4                // byte lane i of each 8-byte half holds bit i; loaded by coeff_last64_neon
+.byte 1,2,4,8,16,32,64,128
+.byte 1,2,4,8,16,32,64,128
+endconst
+
+const mask_2bit, align=4                    // a distinct 2-bit field per byte, repeating every 4 bytes; loaded by the decimate_score_1x functions
+.byte 3,12,48,192,3,12,48,192
+.byte 3,12,48,192,3,12,48,192
+endconst
+
+const mask_1bit, align=4                    // one bit per byte, highest bit first within each 8-byte half; loaded by decimate_score64_neon
+.byte 128,64,32,16,8,4,2,1
+.byte 128,64,32,16,8,4,2,1
+endconst
+
+.text
+
+.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 mask load_mf=no   // quantize 16 coefs; expects q8/q9 = |coef|, q14/q15 = signed coef, r0 = output pointer
+    vadd.u16    q8,  q8,  \bias0            // |coef| + bias
+    vadd.u16    q9,  q9,  \bias1
+.ifc \load_mf, yes
+    vld1.64     {\mf0-\mf3}, [r1,:128]!     // load the next 16 mf values only after the bias adds, so mf may reuse a bias register
+.endif
+    vmull.u16   q10, d16, \mf0              // 32-bit products (|coef| + bias) * mf
+    vmull.u16   q11, d17, \mf1
+    vmull.u16   q12, d18, \mf2
+    vmull.u16   q13, d19, \mf3
+    vshr.s16    q14, q14, #15               // sign masks: -1 for negative coefs, 0 otherwise
+    vshr.s16    q15, q15, #15
+    vshrn.u32   d16, q10, #16               // keep the high 16 bits of each product
+    vshrn.u32   d17, q11, #16
+    vshrn.u32   d18, q12, #16
+    vshrn.u32   d19, q13, #16
+    veor        q8,  q8,  q14               // (x ^ sign) - sign puts the original sign back
+    veor        q9,  q9,  q15
+    vsub.s16    q8,  q8,  q14
+    vsub.s16    q9,  q9,  q15
+    vorr        \mask, q8,  q9              // mask register is nonzero iff any of the 16 results is nonzero
+    vst1.64     {d16-d19}, [r0,:128]!       // store the results and advance r0 past these 16 coefs
+.endm
+
+.macro QUANT_END d                          // function tail: return 1 in r0 if the 64-bit register d has any bit set, else 0
+    vmov        r2,  r3,  \d                // split the 64-bit value into two words
+    orrs        r0,  r2,  r3                // r0 = 0 (and Z set) when both words are zero
+    movne       r0,  #1
+    bx          lr
+.endm
+
+// quant_2x2_dc( int16_t dct[4], int mf, int bias )
+function quant_2x2_dc_neon                  // quantizes in place; returns 1 if any result is nonzero, else 0
+    vld1.64     {d0}, [r0,:64]              // d0 = the four coefs
+    vabs.s16    d3,  d0                     // d3 = |coef|
+    vdup.16     d2,  r2                     // d2 = bias in every lane
+    vdup.16     d1,  r1                     // d1 = mf in every lane
+    vadd.u16    d3,  d3,  d2                // |coef| + bias
+    vmull.u16   q3,  d3,  d1                // 32-bit products with mf
+    vshr.s16    d0,  d0,  #15               // sign masks: -1 for negative coefs
+    vshrn.u32   d3,  q3,  #16               // keep the high 16 bits
+    veor        d3,  d3,  d0                // (x ^ sign) - sign puts the original sign back
+    vsub.s16    d3,  d3,  d0
+    vst1.64     {d3}, [r0,:64]              // write back over the input
+    QUANT_END   d3
+endfunc
+
+// quant_4x4_dc( int16_t dct[16], int mf, int bias )
+function quant_4x4_dc_neon                  // quantizes in place; returns 1 if any result is nonzero, else 0
+    vld1.64     {d28-d31}, [r0,:128]        // q14/q15 = the 16 coefs (kept for their signs)
+    vabs.s16    q8,  q14                    // q8/q9 = |coef|
+    vabs.s16    q9,  q15
+    vdup.16     q0,  r2                     // q0 = bias in every lane
+    vdup.16     q2,  r1                     // q2 (d4/d5) = mf in every lane
+    QUANT_TWO   q0,  q0,  d4,  d5,  d4,  d5,  q0   // q0 is reused for the nonzero mask once the bias has been added
+    vorr        d0,  d0,  d1                // fold the mask to 64 bits
+    QUANT_END   d0
+endfunc
+
+// quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
+function quant_4x4_neon                     // quantizes in place; returns 1 if any result is nonzero, else 0
+    vld1.64     {d28-d31}, [r0,:128]        // q14/q15 = the 16 coefs (kept for their signs)
+    vabs.s16    q8,  q14                    // q8/q9 = |coef|
+    vabs.s16    q9,  q15
+    vld1.64     {d0-d3}, [r2,:128]          // q0/q1 = bias[0..15]
+    vld1.64     {d4-d7}, [r1,:128]          // q2/q3 = mf[0..15]
+    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7, q0    // q0 is reused for the nonzero mask once the bias has been added
+    vorr        d0,  d0,  d1                // fold the mask to 64 bits
+    QUANT_END   d0
+endfunc
+
+// quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
+function quant_4x4x4_neon                   // quantizes four blocks in place; returns a 4-bit mask, bit n set if block n has a nonzero result
+    vpush       {d8-d15}                    // q4-q7 collect the per-block masks; they are callee-saved, so preserve them
+    vld1.64     {d28-d31}, [r0,:128]        // block 0
+    vabs.s16    q8,  q14
+    vabs.s16    q9,  q15
+    vld1.64     {d0-d3},   [r2,:128]        // q0/q1 = bias[0..15], shared by all four blocks
+    vld1.64     {d4-d7},   [r1,:128]        // q2/q3 = mf[0..15], shared by all four blocks
+    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7,  q4
+    vld1.64     {d28-d31}, [r0,:128]        // block 1 (QUANT_TWO advanced r0)
+    vabs.s16    q8,  q14
+    vabs.s16    q9,  q15
+    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7,  q5
+    vld1.64     {d28-d31}, [r0,:128]        // block 2
+    vabs.s16    q8,  q14
+    vabs.s16    q9,  q15
+    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7,  q6
+    vld1.64     {d28-d31}, [r0,:128]        // block 3
+    vabs.s16    q8,  q14
+    vabs.s16    q9,  q15
+    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7,  q7
+    vorr        d8,  d8,  d9                // fold each block mask to 64 bits
+    vorr        d10, d10, d11
+    vorr        d12, d12, d13
+    vorr        d14, d14, d15
+    vmov        r0,  r1,  d8                // the pointers are no longer needed, so r0-r3 and ip become scratch
+    vmov        r2,  r3, d10
+    orrs        r0,  r1                     // r0 stays 0 when block 0 is all zero
+    movne       r0,  #1                     // bit 0: block 0
+    orrs        r2,  r3
+    orrne       r0,  #2                     // bit 1: block 1
+    vmov        r1,  r2, d12
+    vmov        r3,  ip, d14
+    orrs        r1,  r2
+    orrne       r0,  #4                     // bit 2: block 2
+    orrs        r3,  ip
+    orrne       r0,  #8                     // bit 3: block 3
+    vpop        {d8-d15}
+    bx          lr
+endfunc
+
+// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
+function quant_8x8_neon                     // quantizes in place, 16 coefs per step; returns 1 if any result is nonzero, else 0
+    vld1.64     {d28-d31}, [r0,:128]        // coefs 0..15 (kept in q14/q15 for their signs)
+    vabs.s16    q8,  q14
+    vabs.s16    q9,  q15
+    vld1.64     {d0-d3},   [r2,:128]!       // q0/q1 = bias[0..15]
+    vld1.64     {d4-d7},   [r1,:128]!       // q2/q3 = mf[0..15]
+    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7,  q0   // q0 becomes the running nonzero mask
+.rept 3
+    vld1.64     {d28-d31}, [r0,:128]        // next 16 coefs (QUANT_TWO advanced r0)
+    vabs.s16    q8,  q14
+    vabs.s16    q9,  q15
+    vld1.64     {d2-d5},   [r2,:128]!       // next 16 bias values into q1/q2, leaving the mask in q0 alone
+    QUANT_TWO   q1,  q2,  d4,  d5,  d6,  d7,  q1, yes   // mf is loaded into d4-d7 inside the macro, after q2 has been used as bias
+    vorr        q0,  q0,  q1                // accumulate the nonzero mask
+.endr
+    vorr        d0,  d0,  d1                // fold the mask to 64 bits
+    QUANT_END   d0
+endfunc
+
+.macro DEQUANT_START mf_size offset dc=no   // in: r1 = dequant_mf, r2 = i_qp; out: r1 = selected row (or its first entry when dc is set), r3 = i_qp/6 - offset with the flags set by that subtraction
+    mov         r3,  #0x2b
+    mul         r3,  r3,  r2
+    lsr         r3,  r3,  #8            // i_qbits = i_qp / 6, computed as (i_qp * 43) >> 8
+    add         ip,  r3,  r3,  lsl #1       // ip = 3 * i_qbits
+    sub         r2,  r2,  ip,  lsl #1   // i_mf = i_qp % 6
+.ifc \dc,no
+    add         r1,  r1,  r2, lsl #\mf_size  // dequant_mf[i_mf]
+.else
+    ldr         r1, [r1,  r2, lsl #\mf_size] // dequant_mf[i_mf][0][0]
+.endif
+    subs        r3,  r3,  #\offset      // 6 for 8x8; the caller branches on the sign of the result
+.endm
+
+// dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp ) and dequant_8x8( int16_t dct[64], int dequant_mf[6][64], int i_qp )
+.macro DEQUANT size bits                    // size selects 4x4 or 8x8; bits is the shift offset (4 or 6 as instantiated below)
+function dequant_\size\()_neon
+    DEQUANT_START \bits+2, \bits            // r1 = dequant_mf row for i_qp%6 (row size is 1 << (bits+2) bytes), r3 = i_qp/6 - bits
+.ifc \size, 8x8
+    mov         r2,  #4                     // four passes of 16 coefs; mov leaves the flags from DEQUANT_START intact
+.endif
+    blt         dequant_\size\()_rshift     // negative shift count: take the rounding right-shift path
+
+    vdup.16     q15, r3                     // q15 = left-shift count in every 16-bit lane
+dequant_\size\()_lshift_loop:
+.ifc \size, 8x8
+    subs        r2,  r2,  #1                // flags are consumed by the bgt at the bottom of the loop
+.endif
+    vld1.32     {d16-d17}, [r1,:128]!       // load 16 32-bit mf values, narrowing each to 16 bits as it arrives
+    vld1.32     {d18-d19}, [r1,:128]!
+    vmovn.s32   d4,  q8
+    vld1.32     {d20-d21}, [r1,:128]!
+    vmovn.s32   d5,  q9
+    vld1.32     {d22-d23}, [r1,:128]!
+    vmovn.s32   d6,  q10
+    vld1.16     {d0-d3},   [r0,:128]        // 16 coefs
+    vmovn.s32   d7,  q11
+    vmul.s16    q0,  q0,  q2                // coef * mf
+    vmul.s16    q1,  q1,  q3
+    vshl.s16    q0,  q0,  q15               // << shift count
+    vshl.s16    q1,  q1,  q15
+    vst1.16     {d0-d3},   [r0,:128]!       // write back in place and advance
+.ifc \size, 8x8
+    bgt         dequant_\size\()_lshift_loop
+.endif
+    bx          lr
+
+dequant_\size\()_rshift:
+    vdup.32     q15, r3                     // negative count: the vshl.s32 below shifts right
+    rsb         r3,  r3,  #0                // r3 = right-shift amount
+    mov         ip,  #1
+    sub         r3,  r3,  #1
+    lsl         ip,  ip,  r3                // ip = 1 << (shift - 1), the rounding term
+
+.ifc \size, 8x8
+dequant_\size\()_rshift_loop:
+    subs        r2,  r2,  #1                // flags are consumed by the bgt at the bottom of the loop
+.endif
+    vdup.32     q10, ip                     // accumulators q10-q13 start at the rounding term
+    vld1.32     {d16-d17}, [r1,:128]!       // load 16 32-bit mf values, narrowing each to 16 bits as it arrives
+    vdup.32     q11, ip
+    vld1.32     {d18-d19}, [r1,:128]!
+    vmovn.s32   d4,  q8
+    vld1.32     {d16-d17}, [r1,:128]!
+    vmovn.s32   d5,  q9
+    vld1.32     {d18-d19}, [r1,:128]!
+    vmovn.s32   d6,  q8
+    vld1.16     {d0-d3},   [r0,:128]        // 16 coefs
+    vmovn.s32   d7,  q9
+    vdup.32     q12, ip
+    vdup.32     q13, ip
+
+    vmlal.s16   q10, d0,  d4                // rounding term + coef * mf, as 32-bit
+    vmlal.s16   q11, d1,  d5
+    vmlal.s16   q12, d2,  d6
+    vmlal.s16   q13, d3,  d7
+    vshl.s32    q10, q10, q15               // arithmetic shift right by the (negative) count
+    vshl.s32    q11, q11, q15
+    vshl.s32    q12, q12, q15
+    vshl.s32    q13, q13, q15
+
+    vmovn.s32   d0,  q10                    // narrow back to 16 bits
+    vmovn.s32   d1,  q11
+    vmovn.s32   d2,  q12
+    vmovn.s32   d3,  q13
+    vst1.16     {d0-d3},   [r0,:128]!       // write back in place and advance
+.ifc \size, 8x8
+    bgt         dequant_\size\()_rshift_loop
+.endif
+    bx          lr
+endfunc
+.endm
+
+DEQUANT 4x4, 4
+DEQUANT 8x8, 6
+
+// dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
+function dequant_4x4_dc_neon                // scales all 16 coefs in place by the single value dequant_mf[i_qp%6][0]
+    DEQUANT_START 6, 6, yes                 // r1 = dequant_mf[i_qp%6][0], r3 = i_qp/6 - 6 with the flags set
+    blt         dequant_4x4_dc_rshift       // negative shift count: take the rounding right-shift path
+
+    lsl         r1,  r1,  r3                // fold the left shift into the scalar multiplier
+    vdup.16     q2,  r1                     // q2 = (mf << shift) in every lane
+    vld1.16     {d0-d3},   [r0,:128]        // the 16 coefs
+    // No per-lane shift vector is set up on this path: the shift is already part of q2.
+
+    vmul.s16    q0,  q0,  q2
+    vmul.s16    q1,  q1,  q2
+    vst1.16     {d0-d3},   [r0,:128]        // write back in place
+    bx          lr
+
+dequant_4x4_dc_rshift:
+    vdup.16     d4,  r1                     // d4 = mf in every lane
+    vdup.32     q15, r3                     // negative count: the vshl.s32 below shifts right
+    rsb         r3,  r3,  #0                // r3 = right-shift amount
+    mov         ip,  #1
+    sub         r3,  r3,  #1
+    lsl         ip,  ip,  r3                // ip = 1 << (shift - 1), the rounding term
+
+    vdup.32     q10, ip                     // accumulators q10-q13 start at the rounding term
+    vdup.32     q11, ip
+    vld1.16     {d0-d3},   [r0,:128]        // the 16 coefs
+    vdup.32     q12, ip
+    vdup.32     q13, ip
+
+    vmlal.s16   q10, d0,  d4                // rounding term + coef * mf, as 32-bit
+    vmlal.s16   q11, d1,  d4
+    vmlal.s16   q12, d2,  d4
+    vmlal.s16   q13, d3,  d4
+    vshl.s32    q10, q10, q15               // arithmetic shift right by the (negative) count
+    vshl.s32    q11, q11, q15
+    vshl.s32    q12, q12, q15
+    vshl.s32    q13, q13, q15
+
+    vmovn.s32   d0,  q10                    // narrow back to 16 bits
+    vmovn.s32   d1,  q11
+    vmovn.s32   d2,  q12
+    vmovn.s32   d3,  q13
+    vst1.16     {d0-d3},   [r0,:128]        // write back in place
+    bx          lr
+endfunc
+
+.macro decimate_score_1x size               // instantiated below for 15 and 16; r0 = pointer to 16 coefs, result returned in r0
+function decimate_score\size\()_neon
+    vld1.16     {q0, q1}, [r0, :128]        // 16 coefs
+    movrel      r3, mask_2bit
+    vmov.s8     q3,  #0x01
+    vqmovn.s16  d0,  q0                     // saturate to 8 bits: q0 = 16 bytes, one per coef
+    vqmovn.s16  d1,  q1
+    vqabs.s8    q2,  q0                     // q2 = |coef|
+    vld1.8      {q8}, [r3, :128]            // q8 = mask_2bit
+    vceq.s8     q1,  q0,  #0                // 0xff where coef == 0
+    vcgt.s8     q2,  q2,  q3                // 0xff where |coef| > 1
+    vand.u8     q1,  q1,  q8                // each zero coef keeps only its own 2-bit field
+    vshrn.u16   d4,  q2,  #4                // squeeze the |coef| > 1 flags into 64 bits
+    vpadd.u8    d2,  d2,  d3                // merge the fields: two coefs per byte
+    vpadd.u8    d4,  d4,  d4
+    vpadd.u8    d2,  d2,  d2                // four coefs per byte: 32 bits cover all 16 coefs
+    vmov.32     r2,  d4[0]                  // r2 != 0 when some |coef| > 1
+    vmov.32     r1,  d2[0]                  // r1: bits 2i and 2i+1 are set where coef i == 0
+    cmp         r2,  #0
+    beq         0f
+    mov         r0,  #9                     // some |coef| > 1: return 9
+    bx          lr
+0:
+    mvns        r1,  r1                     // invert: bits now mark the nonzero coefs
+    mov         r0,  #0                     // mov leaves the flags from mvns intact
+    bxeq        lr                          // every coef is zero: return 0
+.ifc \size, 15
+    lsr         r1,  r1,  #2                // discard the field belonging to coef 0
+.endif
+    rbit        r1,  r1                     // the lowest remaining coef now sits in the top bits, so clz measures a run of zeros
+    movrelx     r3,  X264(decimate_table4), r2
+1:
+    clz         r2,  r1                     // 2 * (number of zero coefs before the next nonzero one)
+    lsl         r1,  r1,  r2
+    lsr         r12, r2,  #1                // run length
+    ldrb        r2,  [r3, r12]              // table entry for that run
+    lsls        r1,  r1,  #2                // consume the nonzero coef; Z is set when none remain
+    add         r0,  r0,  r2
+    bne         1b
+    bx          lr
+endfunc
+.endm
+
+decimate_score_1x 15
+decimate_score_1x 16
+
+function decimate_score64_neon              // r0 = pointer to 64 coefs; result returned in r0
+    push        {lr}                        // lr is used as a scratch counter below
+    vld1.16     {q8,  q9},  [r0, :128]!     // coefs 0..15
+    vld1.16     {q10, q11}, [r0, :128]!     // coefs 16..31
+    vld1.16     {q12, q13}, [r0, :128]!     // coefs 32..47
+    vld1.16     {q14, q15}, [r0, :128]      // coefs 48..63
+    movrel      r3, mask_1bit
+    vmov.s8     q3,  #0x01
+    vqmovn.s16  d17, q8                     // saturate to 8 bits; the two halves of each pair are stored swapped on purpose
+    vqmovn.s16  d16, q9
+    vqmovn.s16  d19, q10
+    vqmovn.s16  d18, q11
+    vqmovn.s16  d21, q12
+    vqmovn.s16  d20, q13
+    vqmovn.s16  d23, q14
+    vqmovn.s16  d22, q15
+    vqabs.s8    q12, q8                     // |coef| for all 64 coefs
+    vqabs.s8    q13, q9
+    vqabs.s8    q14, q10
+    vqabs.s8    q15, q11
+    vld1.8      {q2}, [r3, :128]            // q2 = mask_1bit
+    vceq.s8     q8,  q8,  #0                // 0xff where coef == 0
+    vceq.s8     q9,  q9,  #0
+    vceq.s8     q10, q10, #0
+    vceq.s8     q11, q11, #0
+    vmax.s8     q12, q12, q13               // reduce |coef| towards a single maximum vector
+    vmax.s8     q14, q14, q15
+    vand.u8     q8,  q8,  q2                // each zero coef keeps only its own bit
+    vand.u8     q9,  q9,  q2
+    vand.u8     q10, q10, q2
+    vand.u8     q11, q11, q2
+    vmax.s8     q12, q12, q14
+    vpadd.u8    d18, d18, d19               // pairwise adds merge the single bits into a 64-bit map
+    vpadd.u8    d19, d16, d17
+    vcgt.s8     q12, q12, q3                // 0xff where the maximum |coef| exceeds 1
+    vpadd.u8    d22, d22, d23
+    vpadd.u8    d23, d20, d21
+    vshrn.u16   d24, q12, #4                // squeeze the > 1 flags into 64 bits
+    vpadd.u8    d16, d22, d23
+    vpadd.u8    d17, d18, d19
+    vpadd.u8    d24, d24, d24
+    vpadd.u8    d16, d16, d17               // d16: bit 63 = coef 0 ... bit 0 = coef 63, set where the coef is zero
+    vmov.32     r2,  d24[0]                 // r2 != 0 when some |coef| > 1
+    vmov        r12, r1,  d16               // r1 = coefs 0..31 (coef 0 in bit 31), r12 = coefs 32..63
+    cmp         r2,  #0
+    beq         0f
+    mov         r0,  #9                     // some |coef| > 1: return 9
+    pop         {pc}
+0:
+    mvns        r1,  r1                     // invert: bits now mark the nonzero coefs
+    mvn         r12, r12
+    mov         r0,  #0
+    mov         lr,  #32                    // lr = bits of r1 not yet consumed
+    movrelx     r3,  X264(decimate_table8), r2
+    beq         2f                          // no nonzero coef in 0..31; relies on the flags from mvns (assumes movrelx leaves them alone - TODO confirm)
+1:
+    clz         r2,  r1                     // zero coefs before the next nonzero one
+    lsl         r1,  r1,  r2
+    sub         lr,  lr,  r2
+    ldrb        r2,  [r3, r2]               // table entry for that run
+    lsls        r1,  r1,  #1                // consume the nonzero coef; Z is set when none remain
+    sub         lr,  lr,  #1
+    add         r0,  r0,  r2
+    bne         1b
+2:
+    cmp         r12, #0
+    popeq       {pc}                        // no nonzero coef in 32..63: done
+
+    clz         r2,  r12
+    lsl         r1,  r12, r2
+    add         r2,  r2,  lr                // the first run continues the zeros left over at the end of coefs 0..31
+    ldrb        r2,  [r3, r2]
+    lsls        r1,  r1,  #1
+    add         r0,  r0,  r2
+    popeq       {pc}
+3:
+    clz         r2,  r1                     // same loop as 1:, without the leftover counter
+    lsl         r1,  r1,  r2
+    ldrb        r2,  [r3, r2]
+    lsls        r1,  r1,  #1
+    add         r0,  r0,  r2
+    bne         3b
+    pop         {pc}
+endfunc
+
+// int coeff_last( int16_t *l ): index of the last nonzero coefficient, 0 if there is none
+function coeff_last4_arm
+    ldrd        r2,  r3,  [r0]              // r2 = coefs 0-1, r3 = coefs 2-3 (two 16-bit coefs per word; assumes little-endian layout)
+    subs        r0,  r3,  #0                // r0 = r3, setting Z when coefs 2-3 are both zero
+    movne       r0,  #2                     // the last nonzero coef is in the upper pair
+    movne       r2,  r3                     // examine that pair below
+    lsrs        r2,  r2,  #16               // nonzero when the second coef of the chosen pair is nonzero
+    addne       r0,  r0,  #1
+    bx          lr
+endfunc
+
+function coeff_last8_arm                    // r0 = l; returns the index of the last nonzero of 8 coefs, 0 if there is none
+    ldrd        r2,  r3,  [r0, #8]          // r2 = coefs 4-5, r3 = coefs 6-7 (assumes little-endian layout)
+    orrs        ip,  r2,  r3                // Z set when coefs 4..7 are all zero
+    movne       r0,  #4                     // answer lies in 4..7; the pointer is no longer needed
+    ldrdeq      r2,  r3,  [r0]              // otherwise fall back to coefs 0..3
+    moveq       r0,  #0
+    tst         r3,  r3                     // is the upper pair of the chosen four nonzero?
+    addne       r0,  #2
+    movne       r2,  r3
+    lsrs        r2,  r2,  #16               // nonzero when the second coef of the chosen pair is nonzero
+    addne       r0,  r0,  #1
+    bx          lr
+endfunc
+
+.macro COEFF_LAST_1x size                   // instantiated below for 15 and 16; returns the index of the last nonzero coef, 0 if there is none
+function coeff_last\size\()_neon
+.if \size == 15
+    sub         r0,  r0,  #2                // start one coef early, so the 16-coef load covers l[-1..14]
+.endif
+    vld1.64     {d0-d3}, [r0,:128]
+    vtst.16     q0,  q0                     // 0xffff where the coef is nonzero
+    vtst.16     q1,  q1
+    vshrn.u16   d0,  q0,  #8                // one byte per coef
+    vshrn.u16   d1,  q1,  #8
+    vshrn.u16   d0,  q0,  #4                // one nibble per coef: 64 bits in d0
+    vclz.i32    d0,  d0                     // leading zero bits of each 32-bit half
+    mov         ip,  #7
+    mov         r3,  #\size - 9
+    vmov        r0,  r1,  d0                // r0 = clz for the first 8 loaded coefs, r1 = clz for the last 8
+
+    subs        r1,  ip,  r1,  lsr #2       // 7 - clz/4 = last nonzero position in the upper half, negative if none
+    addge       r0,  r1,  #\size - 8        // found: rebase to a coef index
+    subslt      r0,  r3,  r0,  lsr #2       // otherwise repeat for the lower half
+    movlt       r0,  #0                     // nothing found there either
+    bx          lr
+endfunc
+.endm
+
+COEFF_LAST_1x 15
+COEFF_LAST_1x 16
+
+function coeff_last64_neon                  // r0 = l; returns the index of the last nonzero of 64 coefs, 0 if there is none
+    vld1.64     {d16-d19}, [r0,:128]!       // coefs 0..15
+    vqmovn.u16  d16, q8                     // unsigned saturating narrow: a nonzero coef stays nonzero
+    vqmovn.u16  d17, q9
+    vld1.64     {d20-d23}, [r0,:128]!       // coefs 16..31
+    vqmovn.u16  d18, q10
+    vqmovn.u16  d19, q11
+    vld1.64     {d24-d27}, [r0,:128]!       // coefs 32..47
+    vqmovn.u16  d20, q12
+    vqmovn.u16  d21, q13
+    vld1.64     {d28-d31}, [r0,:128]!       // coefs 48..63
+    vqmovn.u16  d22, q14
+    vqmovn.u16  d23, q15
+
+    movrel      r1, pmovmskb_byte
+    vld1.64     {d0-d1}, [r1,:128]          // q0 = per-lane bit masks
+
+    vtst.8      q8,  q8                     // 0xff where the coef is nonzero
+    vtst.8      q9,  q9
+    vtst.8      q10, q10
+    vtst.8      q11, q11
+
+    vand        q8,  q8,  q0                // each nonzero coef keeps only its own bit
+    vand        q9,  q9,  q0
+    vand        q10, q10, q0
+    vand        q11, q11, q0
+
+    vpadd.u8    d0,  d16, d17               // three rounds of pairwise adds merge the bits
+    vpadd.u8    d1,  d18, d19
+    vpadd.u8    d2,  d20, d21
+    vpadd.u8    d3,  d22, d23
+    vpadd.u8    d0,  d0,  d1
+    vpadd.u8    d1,  d2,  d3
+    vpadd.u8    d0,  d0,  d1                // d0: bit i is set when coef i is nonzero
+    vclz.i32    d0,  d0                     // leading zero bits of each 32-bit half
+    mov         ip,  #31
+    vmov        r0,  r1,  d0                // r0 = clz for coefs 0..31, r1 = clz for coefs 32..63
+
+    subs        r1,  ip,  r1                // 31 - clz = last nonzero position in the upper half, negative if none
+    addge       r0,  r1,  #32               // found: rebase to a coef index
+    subslt      r0,  ip,  r0                // otherwise repeat for the lower half
+    movlt       r0,  #0                     // nothing found there either
+    bx          lr
+endfunc
+
+function denoise_dct_neon                   // r0 = coefs, r1 = 32-bit running sums, r2 = 16-bit offsets, r3 = coef count; 16 coefs per iteration
+1:  subs        r3,  r3,  #16               // flags are consumed by the bgt at the bottom of the loop
+    vld1.16     {q0,  q1},  [r0]            // 16 coefs
+    vld1.32     {q12, q13}, [r1]!           // 16 running sums
+    vld1.32     {q14, q15}, [r1]
+    sub         r1,  #32                    // rewind to the first sum for the stores below
+    vabs.s16    q8,  q0                     // |coef|
+    vabs.s16    q9,  q1
+    vld1.16     {q2, q3}, [r2]!             // 16 offsets
+    vclt.s16    q10, q0,  #0                // all-ones where coef < 0
+    vclt.s16    q11, q1,  #0
+    vaddw.u16   q12, q12, d16               // sum[i] += |coef[i]|
+    vaddw.u16   q13, q13, d17
+    vqsub.u16   q0,  q8,  q2                // |coef| - offset, saturating at 0
+    vqsub.u16   q1,  q9,  q3
+    vaddw.u16   q14, q14, d18
+    vaddw.u16   q15, q15, d19
+    vneg.s16    q8,  q0                     // negated copy for the coefs that were negative
+    vneg.s16    q9,  q1
+    vbsl        q10, q8,  q0                // pick the negated value where coef < 0, the positive one elsewhere
+    vbsl        q11, q9,  q1
+    vst1.32     {q12, q13}, [r1]!           // write the sums back and advance
+    vst1.32     {q14, q15}, [r1]!
+    vst1.16     {q10, q11}, [r0]!           // write the denoised coefs back and advance
+    bgt         1b
+    bx          lr
+endfunc
--- a/common/arm/quant.h
+++ b/common/arm/quant.h
@@ -0,0 +1,71 @@
+/*****************************************************************************
+ * quant.h: arm quantization and level-run
+ *****************************************************************************
+ * Copyright (C) 2005-2025 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_ARM_QUANT_H
+#define X264_ARM_QUANT_H
+/* Prototypes for the routines in quant-a.S. NOTE(review): quant-a.S defines no quant_2x2_dc_armv6; confirm the next prototype is still needed. */
+#define x264_quant_2x2_dc_armv6 x264_template(quant_2x2_dc_armv6)
+int x264_quant_2x2_dc_armv6( int16_t dct[4], int mf, int bias );
+/* Quantization, in place. The NEON versions return 1 if any result is nonzero, else 0; quant_4x4x4 returns one such bit per block (bit n = block n). */
+#define x264_quant_2x2_dc_neon x264_template(quant_2x2_dc_neon)
+int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias );
+#define x264_quant_4x4_dc_neon x264_template(quant_4x4_dc_neon)
+int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias );
+#define x264_quant_4x4_neon x264_template(quant_4x4_neon)
+int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+#define x264_quant_4x4x4_neon x264_template(quant_4x4x4_neon)
+int x264_quant_4x4x4_neon( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] );
+#define x264_quant_8x8_neon x264_template(quant_8x8_neon)
+int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
+/* Dequantization, in place; the dequant_mf row is i_qp % 6 and the shift is derived from i_qp / 6. */
+#define x264_dequant_4x4_dc_neon x264_template(dequant_4x4_dc_neon)
+void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+#define x264_dequant_4x4_neon x264_template(dequant_4x4_neon)
+void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+#define x264_dequant_8x8_neon x264_template(dequant_8x8_neon)
+void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+/* Decimate scores: 9 as soon as any |coef| > 1, otherwise a sum of table entries indexed by zero-run length. */
+#define x264_decimate_score15_neon x264_template(decimate_score15_neon)
+int x264_decimate_score15_neon( int16_t * );
+#define x264_decimate_score16_neon x264_template(decimate_score16_neon)
+int x264_decimate_score16_neon( int16_t * );
+#define x264_decimate_score64_neon x264_template(decimate_score64_neon)
+int x264_decimate_score64_neon( int16_t * );
+/* coeff_last: index of the last nonzero coefficient, 0 if there is none. */
+#define x264_coeff_last4_arm x264_template(coeff_last4_arm)
+int x264_coeff_last4_arm( int16_t * );
+#define x264_coeff_last8_arm x264_template(coeff_last8_arm)
+int x264_coeff_last8_arm( int16_t * );
+#define x264_coeff_last15_neon x264_template(coeff_last15_neon)
+int x264_coeff_last15_neon( int16_t * );
+#define x264_coeff_last16_neon x264_template(coeff_last16_neon)
+int x264_coeff_last16_neon( int16_t * );
+#define x264_coeff_last64_neon x264_template(coeff_last64_neon)
+int x264_coeff_last64_neon( int16_t * );
+/* denoise_dct( coefs, running sums, offsets, count ): adds |coef| to the sums and shrinks each coef towards zero by its offset. */
+#define x264_denoise_dct_neon x264_template(denoise_dct_neon)
+void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int );
+
+#endif /* X264_ARM_QUANT_H */