x264 source for verification 2026-05-22

This commit is contained in:
2026-05-22 16:45:04 +08:00
commit 4647f166e5
270 changed files with 166522 additions and 0 deletions

263
common/arm/asm.S Normal file
View File

@@ -0,0 +1,263 @@
/*****************************************************************************
* asm.S: arm utility macros
*****************************************************************************
* Copyright (C) 2008-2025 x264 project
*
* Authors: Mans Rullgard <mans@mansr.com>
* David Conrad <lessen42@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
// Build configuration. The HAVE_* / SYS_* / PIC / PREFIX / BIT_DEPTH macros
// tested in this file are not defined here (presumably they come from config.h
// or the compiler command line).
#include "config.h"
.syntax unified
// The explicit architecture/FPU selection is only emitted for ELF targets.
#ifdef __ELF__
.arch armv7-a
.fpu neon
#endif
// Two-level paste: JOIN expands its arguments before GLUE concatenates them.
#define GLUE(a, b) a ## b
#define JOIN(a, b) GLUE(a, b)
// With PREFIX defined, symbol names gain a leading underscore.
#ifdef PREFIX
# define BASE _x264_
# define SYM_PREFIX _
#else
# define BASE x264_
# define SYM_PREFIX
#endif
// EXTERN_ASM is the name prefix used for exported asm symbols: BASE, followed
// by BIT_DEPTH and an underscore when BIT_DEPTH is defined.
#ifdef BIT_DEPTH
# define EXTERN_ASM JOIN(JOIN(BASE, BIT_DEPTH), _)
#else
# define EXTERN_ASM BASE
#endif
// X(s):    s with the bit-depth specific prefix (EXTERN_ASM).
// X264(s): s with the plain BASE prefix.
// EXT(s):  s with only the platform symbol prefix.
#define X(s) JOIN(EXTERN_ASM, s)
#define X264(s) JOIN(BASE, s)
#define EXT(s) JOIN(SYM_PREFIX, s)
// Line-prefix switches: each expands either to nothing (the rest of the line
// is assembled) or to the comment character @ (the rest of the line is ignored).
// ELF: keep the line only on ELF targets.
#ifdef __ELF__
# define ELF
#else
# define ELF @
#endif
// MACH / NONMACH: keep the line only on / only off Mach-O targets.
#ifdef __MACH__
# define MACH
# define NONMACH @
#else
# define MACH @
# define NONMACH
#endif
// FUNC: keep the line only when HAVE_AS_FUNC is set (used for .func/.endfunc).
#if HAVE_AS_FUNC
# define FUNC
#else
# define FUNC @
#endif
// Whether relocatable constants may be placed in .data.rel.ro (see the const macro).
#if SYS_LINUX || SYS_OPENBSD
#define HAVE_SECTION_DATA_REL_RO 1
#else
#define HAVE_SECTION_DATA_REL_RO 0
#endif
// require8 [val]: on ELF targets, emit EABI build attribute 24 with the given
// value (default 1). Expands to nothing on non-ELF targets.
// NOTE(review): attribute 24 is presumably Tag_ABI_align_needed - confirm.
.macro require8, val=1
ELF .eabi_attribute 24, \val
.endm
// preserve8 [val]: on ELF targets, emit EABI build attribute 25 with the given
// value (default 1). Expands to nothing on non-ELF targets.
// NOTE(review): attribute 25 is presumably Tag_ABI_align_preserved - confirm.
.macro preserve8, val=1
ELF .eabi_attribute 25, \val
.endm
// function name, export=1
// Opens a function in .text with ".align 2" and defines its entry label.
//   export == 1: the label is EXTERN_ASM\name and is declared .global
//                (additionally .hidden and typed %function on ELF).
//   otherwise:   the label is the bare \name, not .global (file-local helper).
// It also defines a one-shot macro "endfunc" that must be used to close the
// function: it emits the ELF .size for the same symbol, the optional .endfunc,
// and then purges itself so the next "function" can define it again.
.macro function name, export=1
.macro endfunc
// Use exactly the same test as the label definition below, so that .size
// always refers to the symbol that was actually defined.
.if \export == 1
ELF .size EXTERN_ASM\name, . - EXTERN_ASM\name
.else
ELF .size \name, . - \name
.endif
FUNC .endfunc
.purgem endfunc
.endm
.text
.align 2
.if \export == 1
.global EXTERN_ASM\name
ELF .hidden EXTERN_ASM\name
ELF .type EXTERN_ASM\name, %function
FUNC .func EXTERN_ASM\name
EXTERN_ASM\name:
.else
ELF .hidden \name
ELF .type \name, %function
FUNC .func \name
\name:
.endif
.endm
// const name, align=2, relocate=0
// Opens a constant data object labelled \name, aligned with ".align \align".
// Section selection:
//   - .data.rel.ro when both HAVE_SECTION_DATA_REL_RO and \relocate are nonzero,
//   - otherwise .rodata (non Mach-O) or .const_data (Mach-O).
// Also defines a one-shot "endconst" macro that emits the ELF .size of the
// object and purges itself. Note that the section stays selected afterwards;
// code following a constant has to switch back (e.g. with .text) itself.
.macro const name, align=2, relocate=0
.macro endconst
ELF .size \name, . - \name
.purgem endconst
.endm
.if HAVE_SECTION_DATA_REL_RO && \relocate
.section .data.rel.ro
.else
NONMACH .section .rodata
MACH .const_data
.endif
.align \align
\name:
.endm
// movrel rd, val: load the address of symbol \val into register \rd.
//   PIC:        a pc-relative offset is stored inline after the load (and
//               branched over), then pc is added to it. The "+ 8" accounts for
//               the value read from pc by the add at label 2 in ARM state; as
//               the FIXME notes, Thumb is not handled.
//   ARMv6T2:    movw/movt pair with the absolute address.
//   otherwise:  literal-pool load.
.macro movrel rd, val
#if defined(PIC)
ldr \rd, 1f
b 2f
1:
@ FIXME: thumb
.word \val - (2f + 8)
2:
add \rd, \rd, pc
#elif HAVE_ARMV6T2
movw \rd, #:lower16:\val
movt \rd, #:upper16:\val
#else
ldr \rd, =\val
#endif
.endm
// movrelx rd, val, got: load the address of symbol \val into \rd through an
// indirection, for use in position-independent code.
//   PIC + ELF:   \got receives the address of _GLOBAL_OFFSET_TABLE_ (computed
//                pc-relative), \rd first receives the \val(GOT) offset and is
//                then replaced by the word loaded from [\got + \rd].
//                \got is clobbered.
//   PIC + Apple: a non-lazy symbol pointer for \val is emitted and \rd is
//                loaded from it pc-relative; \got is not used.
//   otherwise:   identical to "movrel \rd, \val"; \got is not used.
.macro movrelx rd, val, got
#if defined(PIC) && defined(__ELF__)
ldr \got, 2f
ldr \rd, 1f
b 3f
1:
@ FIXME: thumb
.word \val(GOT)
2:
.word _GLOBAL_OFFSET_TABLE_ - (3f + 8)
3:
add \got, \got, pc
ldr \rd, [\got, \rd]
#elif defined(PIC) && defined(__APPLE__)
ldr \rd, 1f
b 2f
1:
@ FIXME: thumb
.word 3f - (2f + 8)
2:
ldr \rd, [pc, \rd]
.non_lazy_symbol_pointer
3:
.indirect_symbol \val
.word 0
.text
#else
movrel \rd, \val
#endif
.endm
// movconst rd, val: load the constant \val into \rd.
// With HAVE_ARMV6T2 the low 16 bits are set with movw and movt is emitted only
// when the upper 16 bits of \val are nonzero; otherwise a literal-pool load is
// used.
.macro movconst rd, val
#if HAVE_ARMV6T2
movw \rd, #:lower16:\val
.if \val >> 16
movt \rd, #:upper16:\val
.endif
#else
ldr \rd, =\val
#endif
.endm
// Row strides in bytes. In the code of this file FENC_STRIDE steps the first
// pixel pointer (pix1) and FDEC_STRIDE the second pixel pointer / destination.
#define FENC_STRIDE 16
#define FDEC_STRIDE 32
// HORIZ_ADD dest, a [, b]
// Horizontal sum of unsigned 16-bit lanes: if \b is given it is first added to
// \a lane-wise, then \a is pairwise-widened to 32-bit sums (in place) and
// finally to 64-bit sums written to \dest. \a is clobbered.
.macro HORIZ_ADD dest, a, b
.ifnb \b
vadd.u16 \a, \a, \b
.endif
vpaddl.u16 \a, \a
vpaddl.u32 \dest, \a
.endm
// SUMSUB_AB sum, diff, a, b:  sum = a + b,  diff = a - b  (16-bit lanes).
// \sum is written before \diff is computed, so \sum must not alias \a or \b;
// \diff may alias either input (several callers in dct-a.S rely on that).
.macro SUMSUB_AB sum, diff, a, b
vadd.s16 \sum, \a, \b
vsub.s16 \diff, \a, \b
.endm
// SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
// Two butterflies in sequence: (s1, d1) = (a + b, a - b), then
// (s2, d2) = (c + d, c - d). The aliasing rules of SUMSUB_AB apply to each pair.
.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
SUMSUB_AB \s1, \d1, \a, \b
SUMSUB_AB \s2, \d2, \c, \d
.endm
// ABS2 a, b: replace \a and \b by their lane-wise absolute values (signed 16-bit).
.macro ABS2 a b
vabs.s16 \a, \a
vabs.s16 \b, \b
.endm
// HADAMARD dist, op, d1, d2, s1, s2
// One Hadamard stage on the source pair s1/s2 (both sources are modified).
//   dist   element distance of the pass (0 for the vertical pass, 1 or 2 for
//          the horizontal passes): 1 interleaves 16-bit elements with vtrn.16,
//          any other value interleaves 32-bit elements with vtrn.32
//   op     sumsub -> d1 = s1 + s2, d2 = s1 - s2
//          anything else (amax) -> d1 = max(|s1|, |s2|), d2 is left untouched
//   d1/d2  destination registers
//   s1/s2  source registers
.macro HADAMARD dist, op, d1, d2, s1, s2
.if \dist != 1
vtrn.32 \s1, \s2
.else
vtrn.16 \s1, \s2
.endif
.ifc \op, sumsub
SUMSUB_AB \d1, \d2, \s1, \s2
.else
ABS2 \s1, \s2
vmax.s16 \d1, \s1, \s2
.endif
.endm
// TRANSPOSE8x8 r0..r7
// In-place transpose built from three vtrn stages of decreasing element size:
// 32-bit between registers 4 apart, 16-bit between registers 2 apart, and 8-bit
// between neighbouring registers. Within a stage the pairs are disjoint, so the
// eight arguments must be distinct registers.
// NOTE(review): presumably meant for eight D registers holding an 8x8 byte
// block - confirm against the callers.
.macro TRANSPOSE8x8 r0 r1 r2 r3 r4 r5 r6 r7
vtrn.32 \r0, \r4
vtrn.32 \r1, \r5
vtrn.32 \r2, \r6
vtrn.32 \r3, \r7
vtrn.16 \r0, \r2
vtrn.16 \r1, \r3
vtrn.16 \r4, \r6
vtrn.16 \r5, \r7
vtrn.8 \r0, \r1
vtrn.8 \r2, \r3
vtrn.8 \r4, \r5
vtrn.8 \r6, \r7
.endm
// TRANSPOSE4x4 r0..r3
// In-place transpose of 8-bit elements in two vtrn stages: 16-bit between
// registers 2 apart, then 8-bit between neighbouring registers.
.macro TRANSPOSE4x4 r0 r1 r2 r3
vtrn.16 \r0, \r2
vtrn.16 \r1, \r3
vtrn.8 \r0, \r1
vtrn.8 \r2, \r3
.endm
// TRANSPOSE4x4_16 d0..d3
// In-place transpose of 16-bit elements in two vtrn stages: 32-bit between
// registers 2 apart, then 16-bit between neighbouring registers. With D
// registers this transposes one 4x4 block; dct-a.S also invokes it with Q
// registers, which transposes the low and high halves as two separate blocks.
.macro TRANSPOSE4x4_16 d0 d1 d2 d3
vtrn.32 \d0, \d2
vtrn.32 \d1, \d3
vtrn.16 \d0, \d1
vtrn.16 \d2, \d3
.endm

84
common/arm/bitstream-a.S Normal file
View File

@@ -0,0 +1,84 @@
/*****************************************************************************
* bitstream-a.S: arm bitstream functions
*****************************************************************************
* Copyright (C) 2014-2025 x264 project
*
* Authors: Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
// uint8_t *nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end )
// (prototype as declared in bitstream.h)
//
// Copies the bytes [src, end) to dst and inserts a byte 3 in front of every
// byte with value <= 3 that directly follows two zero bytes in the output
// history. The advanced dst pointer is left in r0 on return.
//
// Register use:
//   r0 = dst write cursor        r1 = src read cursor       r2 = end
//   r3 = 3 (the inserted byte)   r4 = current byte
//   r5 = history of the most recently written bytes (only the low 16 bits,
//        i.e. the last two bytes, take part in the test)
//   lr = negative number of bytes left in the current scalar run
//   q0 = previous 16-byte chunk; its last two bytes seed r5
function nal_escape_neon
push {r4-r5,lr}
// Seed the "previous chunk" with 0xff so that the first bytes are never seen
// as preceded by zeros.
vmov.u8 q0, #0xff
// q8 = 4 in every lane: threshold for the "byte < 4" test.
vmov.u8 q8, #4
mov r3, #3
// lr = src - end; return immediately if there is no input.
subs lr, r1, r2
beq 99f
0:
// lr + 15 < 0  <=>  at least 16 bytes remain: take the vector path.
cmn lr, #15
blt 16f
// Fewer than 16 bytes left: handle them bytewise, addressed as end + lr.
mov r1, r2
b 100f
16:
// Vector path: q1 = next 16 input bytes.
vld1.8 {q1}, [r1]!
// q2 / q3 = the same bytes delayed by two / one positions, with the tail of
// the previous chunk (q0) shifted in.
vext.8 q2, q0, q1, #14
vext.8 q3, q0, q1, #15
// q11 = (byte < 4), q9 = (byte two back == 0), q10 = (byte one back == 0)
vcgt.u8 q11, q8, q1
vceq.u8 q9, q2, #0
vceq.u8 q10, q3, #0
vand q9, q9, q11
vand q9, q9, q10
// Narrow the mask to 64 bits so it fits two core registers and test for any hit.
vshrn.u16 d22, q9, #4
vmov ip, lr, d22
orrs ip, ip, lr
// No candidate in this chunk: store it unchanged (second label 16 below).
beq 16f
// Otherwise redo these 16 bytes one at a time (r1 already points past them).
mov lr, #-16
100:
// r5 = last two bytes of the previous chunk (byte 14 in bits 15:8, byte 15 in bits 7:0).
vmov.u8 r5, d1[6]
vmov.u8 r4, d1[7]
orr r5, r4, r5, lsl #8
101:
ldrb r4, [r1, lr]
// ip <= 3 exactly when the previous two bytes are zero and the current byte is <= 3.
orr ip, r4, r5, lsl #16
cmp ip, #3
bhi 102f
// Insert the escape byte; it becomes part of the history as well.
strb r3, [r0], #1
orr r5, r3, r5, lsl #8
102:
// The flags of this adds are consumed by the blt below (strb/orr leave them alone).
adds lr, lr, #1
strb r4, [r0], #1
orr r5, r4, r5, lsl #8
blt 101b
// Scalar run finished: write the last two bytes back into q0 for the next
// vector chunk and continue while src < end.
subs lr, r1, r2
lsr ip, r5, #8
vmov.u8 d1[6], ip
vmov.u8 d1[7], r5
blt 0b
pop {r4-r5,pc}
16:
// Fast path: chunk needs no escaping; copy it and keep it as history.
subs lr, r1, r2
vst1.8 {q1}, [r0]!
vmov q0, q1
blt 0b
99:
pop {r4-r5,pc}
endfunc

32
common/arm/bitstream.h Normal file
View File

@@ -0,0 +1,32 @@
/*****************************************************************************
* bitstream.h: arm bitstream functions
*****************************************************************************
* Copyright (C) 2017-2025 x264 project
*
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_ARM_BITSTREAM_H
#define X264_ARM_BITSTREAM_H
/* The symbol name is routed through x264_template() (defined elsewhere). */
#define x264_nal_escape_neon x264_template(nal_escape_neon)
/* NEON NAL escaping, implemented in bitstream-a.S: copies [src, end) to dst,
 * inserting a byte 3 before any byte <= 3 that follows two zero bytes.
 * The assembly leaves the advanced dst pointer in r0, which is presumably
 * what is returned here (one past the last byte written). */
uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end );
#endif

108
common/arm/cpu-a.S Normal file
View File

@@ -0,0 +1,108 @@
/*****************************************************************************
* cpu-a.S: arm cpu detection
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
.align 2
// done in gas because .fpu neon overrides the refusal to assemble
// instructions the selected -march/-mcpu doesn't support
//
// cpu_neon_test: executes a single NEON instruction (q0 += q0) and returns.
// NOTE(review): presumably the caller detects missing NEON support through the
// resulting fault - that handling is not visible in this file.
function cpu_neon_test
vadd.i16 q0, q0, q0
bx lr
endfunc
// cpu_enable_armv7_counter (file-local)
// Enables the performance counters in full resolution and the cycle counter.
// return: 0 on success
// 1 if counters were already enabled
// 9 if lo-res counters were already enabled
// Clobbers r2 and the flags. PMNC is only rewritten when the counters were
// not enabled before; CNTENS is written unconditionally.
function cpu_enable_armv7_counter, export=0
mrc p15, 0, r2, c9, c12, 0 // read PMNC
// r0 = 0 if the enable bit was clear, else PMNC & 9 (enable bit + lo-res bit)
ands r0, r2, #1
andne r0, r2, #9
orr r2, r2, #1 // enable counters
bic r2, r2, #8 // full resolution
mcreq p15, 0, r2, c9, c12, 0 // write PMNC
mov r2, #1 << 31 // enable cycle counter
mcr p15, 0, r2, c9, c12, 1 // write CNTENS
bx lr
endfunc
// cpu_disable_armv7_counter (file-local)
// Clears the enable bit (bit 0) of PMNC. Clobbers r0, which holds the value
// written back to PMNC on return.
function cpu_disable_armv7_counter, export=0
mrc p15, 0, r0, c9, c12, 0 // read PMNC
bic r0, r0, #1 // disable counters
mcr p15, 0, r0, c9, c12, 0 // write PMNC
bx lr
endfunc
// READ_TIME r: read coprocessor register p15, c9, c13, 0 into \r.
// NOTE(review): presumably the cycle counter enabled by
// cpu_enable_armv7_counter - confirm against the architecture manual.
.macro READ_TIME r
mrc p15, 0, \r, c9, c13, 0
.endm
// cpu_fast_neon_mrc_test
// Times NEON -> ARM register transfers with the performance counters.
// return: 0 if a neon -> arm transfer takes more than 10 cycles (also returned
//         when the user-access register reads as 0),
// nonzero otherwise
//
// Register use inside the measurement:
//   r0 = result of cpu_enable_armv7_counter   r5 = inner loop count (1, or 64
//   r3 = accumulated time                          if lo-res counters were on)
//   ip = measurements still to accept (4)     r4 = inner loop counter
//   r6 = attempts during which a slow         r1/r2 = start/end time
//        measurement may still be rejected (4)
function cpu_fast_neon_mrc_test
// check for user access to performance counters
mrc p15, 0, r0, c9, c14, 0
cmp r0, #0
bxeq lr
push {r4-r6,lr}
bl cpu_enable_armv7_counter
// Bit 3 of the return value is set when lo-res counters were already enabled;
// the moveq/movne below still see the flags of this ands.
ands r1, r0, #8
mov r3, #0
mov ip, #4
mov r6, #4
moveq r5, #1
movne r5, #64
average_loop:
mov r4, r5
READ_TIME r1
// Each inner iteration performs 8 dependent neon -> arm transfers.
1: subs r4, r4, #1
.rept 8
vmov.u32 lr, d0[0]
add lr, lr, lr
.endr
bgt 1b
READ_TIME r2
// While attempts remain (r6 > 0) a measurement above the threshold is
// discarded; otherwise it is added to r3 and counted in ip.
subs r6, r6, #1
sub r2, r2, r1
cmpgt r2, #30 << 3 // assume context switch if it took over 30 cycles
addle r3, r3, r2
subsle ip, ip, #1
bgt average_loop
// disable counters if we enabled them
ands r0, r0, #1
bleq cpu_disable_armv7_counter
// r3 sums 4 measurements of 8 transfers each, hence the division by 32.
lsr r0, r3, #5
cmp r0, #10
movgt r0, #0
pop {r4-r6,pc}
endfunc

764
common/arm/dct-a.S Normal file
View File

@@ -0,0 +1,764 @@
/****************************************************************************
* dct-a.S: arm transform and zigzag
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Martin Storsjo <martin@martin.st>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
// Byte-shuffle table for zigzag_scan_4x4_frame_neon: one row of 8 byte indices
// per output D register, each pair of bytes selecting one 16-bit coefficient.
// The indices are relative to the first register of the register list used by
// the corresponding vtbl in that function, not to the start of the block.
const scan4x4_frame, align=4
.byte 0,1, 8,9, 2,3, 4,5
.byte 2,3, 8,9, 16,17, 10,11
.byte 12,13, 6,7, 14,15, 20,21
.byte 10,11, 12,13, 6,7, 14,15
endconst
// Switch back to the code section after the constant table above.
.text
// sum = a + (b>>shift) sub = (a>>shift) - b
// t0/t1 are scratch registers. Both shifts are taken before \sum and \sub are
// written, so \sum and \sub may alias \a or \b; \sum must not alias \t1.
.macro SUMSUB_SHR shift sum sub a b t0 t1
vshr.s16 \t0, \b, #\shift
vshr.s16 \t1, \a, #\shift
vadd.s16 \sum, \a, \t0
vsub.s16 \sub, \t1, \b
.endm
// sum = (a>>shift) + b sub = a - (b>>shift)
// t0/t1 are scratch registers. \sum is written before \sub is computed from
// \a, so \sum must not alias \a (or \t1).
.macro SUMSUB_SHR2 shift sum sub a b t0 t1
vshr.s16 \t0, \a, #\shift
vshr.s16 \t1, \b, #\shift
vadd.s16 \sum, \t0, \b
vsub.s16 \sub, \a, \t1
.endm
// a += 1.5*ma b -= 1.5*mb
// The factor 1.5 is formed as m + (m >> 1) in the scratch registers t0/t1,
// which are clobbered.
.macro SUMSUB_15 a b ma mb t0 t1
vshr.s16 \t0, \ma, #1
vshr.s16 \t1, \mb, #1
vadd.s16 \t0, \t0, \ma
vadd.s16 \t1, \t1, \mb
vadd.s16 \a, \a, \t0
vsub.s16 \b, \b, \t1
.endm
// void dct4x4dc_neon( int16_t d[16] )   (prototype in dct.h)
// In-place 4x4 transform of the 16 coefficients at r0, built from add/sub
// butterflies only, with a rounded halving in the last stage. The accesses
// carry a 128-bit alignment hint.
function dct4x4dc_neon
vld1.64 {d0-d3}, [r0,:128]
// Two butterfly stages across the four rows d0-d3.
SUMSUB_ABCD d4, d5, d6, d7, d0, d1, d2, d3
SUMSUB_ABCD d0, d2, d3, d1, d4, d6, d5, d7
// d31 = 1 in every lane, used as rounding term for the halving subtractions.
vmov.s16 d31, #1
// First stage within the rows (16-bit interleave + butterfly).
HADAMARD 1, sumsub, q2, q3, q0, q1
// Last stage with halving: vrhadd gives (a + b + 1) >> 1; adding 1 to a
// before vhsub gives (a + 1 - b) >> 1.
vtrn.32 d4, d5
vadd.s16 d16, d4, d31
vtrn.32 d6, d7
vadd.s16 d17, d6, d31
vrhadd.s16 d0, d4, d5
vhsub.s16 d1, d16, d5
vhsub.s16 d2, d17, d7
vrhadd.s16 d3, d6, d7
vst1.64 {d0-d3}, [r0,:128]
bx lr
endfunc
// void idct4x4dc_neon( int16_t d[16] )   (prototype in dct.h)
// In-place 4x4 transform of the 16 coefficients at r0 using the same butterfly
// structure as dct4x4dc_neon, but without the final halving.
function idct4x4dc_neon
vld1.64 {d0-d3}, [r0,:128]
// Two butterfly stages across the four rows d0-d3.
SUMSUB_ABCD d4, d5, d6, d7, d0, d1, d2, d3
SUMSUB_ABCD d0, d2, d3, d1, d4, d6, d5, d7
// Two stages within the rows: 16-bit interleave, then 32-bit interleave.
HADAMARD 1, sumsub, q2, q3, q0, q1
HADAMARD 2, sumsub, d0, d1, d4, d5
HADAMARD 2, sumsub, d3, d2, d6, d7
vst1.64 {d0-d3}, [r0,:128]
bx lr
endfunc
// DCT_1D d0 d1 d2 d3  d4 d5 d6 d7
// One-dimensional 4-point transform. Inputs i0..i3 are passed in d4..d7
// (all four are clobbered), outputs are written to d0..d3:
//   d0 = (i0 + i3) + (i1 + i2)
//   d1 = 2*(i0 - i3) + (i1 - i2)
//   d2 = (i0 + i3) - (i1 + i2)
//   d3 = (i0 - i3) - 2*(i1 - i2)
.macro DCT_1D d0 d1 d2 d3 d4 d5 d6 d7
SUMSUB_AB \d1, \d6, \d5, \d6
SUMSUB_AB \d3, \d7, \d4, \d7
vadd.s16 \d0, \d3, \d1
vadd.s16 \d4, \d7, \d7
vadd.s16 \d5, \d6, \d6
vsub.s16 \d2, \d3, \d1
vadd.s16 \d1, \d4, \d6
vsub.s16 \d3, \d7, \d5
.endm
// void sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 )
// (prototype in dct.h)
// Computes the 4x4 transform of the difference pix1 - pix2 and stores the 16
// coefficients at r0. r1 = pix1 is stepped by FENC_STRIDE, r2 = pix2 by
// FDEC_STRIDE; four pixels are read per row. Clobbers r3 and ip.
function sub4x4_dct_neon
mov r3, #FENC_STRIDE
mov ip, #FDEC_STRIDE
// Each load replicates one 32-bit word (4 pixels) to all lanes; after the
// widening subtraction only the low D half of each result is used below.
vld1.32 {d0[]}, [r1,:32], r3
vld1.32 {d1[]}, [r2,:32], ip
vld1.32 {d2[]}, [r1,:32], r3
vsubl.u8 q8, d0, d1
vld1.32 {d3[]}, [r2,:32], ip
vld1.32 {d4[]}, [r1,:32], r3
vsubl.u8 q9, d2, d3
vld1.32 {d5[]}, [r2,:32], ip
vld1.32 {d6[]}, [r1,:32], r3
vsubl.u8 q10, d4, d5
vld1.32 {d7[]}, [r2,:32], ip
vsubl.u8 q11, d6, d7
// First pass, transpose, second pass.
DCT_1D d0, d1, d2, d3, d16, d18, d20, d22
TRANSPOSE4x4_16 d0, d1, d2, d3
DCT_1D d4, d5, d6, d7, d0, d1, d2, d3
vst1.64 {d4-d7}, [r0,:128]
bx lr
endfunc
// sub8x4_dct_neon (file-local helper)
// Transforms the difference of an 8x4 pixel area as two 4x4 blocks.
// In:   r0 = coefficient output, r1 = pix1, r2 = pix2,
//       r3 = pix1 stride and ip = pix2 stride (must be set by the caller, see
//       sub8x8_dct_neon).
// Out:  64 bytes written at r0; r0 advanced by 64 bytes, r1 and r2 advanced by
//       four rows. The first 32 bytes come from the low D halves (first 4x4
//       block), the next 32 bytes from the high D halves (second 4x4 block).
// Uses no stack and no core registers besides the ones listed.
function sub8x4_dct_neon, export=0
vld1.64 {d0}, [r1,:64], r3
vld1.64 {d1}, [r2,:64], ip
vsubl.u8 q8, d0, d1
vld1.64 {d2}, [r1,:64], r3
vld1.64 {d3}, [r2,:64], ip
vsubl.u8 q9, d2, d3
vld1.64 {d4}, [r1,:64], r3
vld1.64 {d5}, [r2,:64], ip
vsubl.u8 q10, d4, d5
vld1.64 {d6}, [r1,:64], r3
vld1.64 {d7}, [r2,:64], ip
vsubl.u8 q11, d6, d7
// First pass on both blocks at once (Q registers), then transpose each half.
DCT_1D q0, q1, q2, q3, q8, q9, q10, q11
TRANSPOSE4x4_16 q0, q1, q2, q3
// Second pass: the butterflies are shared, the final combination is done per
// D register and interleaved with the stores.
SUMSUB_AB q8, q12, q0, q3
SUMSUB_AB q9, q10, q1, q2
vadd.i16 q13, q12, q12
vadd.i16 q11, q10, q10
vadd.i16 d0, d16, d18
vadd.i16 d1, d26, d20
vsub.i16 d2, d16, d18
vsub.i16 d3, d24, d22
vst1.64 {d0-d1}, [r0,:128]!
vadd.i16 d4, d17, d19
vadd.i16 d5, d27, d21
vst1.64 {d2-d3}, [r0,:128]!
vsub.i16 d6, d17, d19
vsub.i16 d7, d25, d23
vst1.64 {d4-d5}, [r0,:128]!
vst1.64 {d6-d7}, [r0,:128]!
bx lr
endfunc
// void sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 )
// (prototype in dct.h)
// Transforms an 8x8 difference block as two 8x4 halves. sub8x4_dct_neon
// advances r0, r1 and r2 itself, so the second half needs no pointer fix-up.
// lr is saved on the stack because ip carries the pix2 stride.
function sub8x8_dct_neon
push {lr}
mov r3, #FENC_STRIDE
mov ip, #FDEC_STRIDE
bl sub8x4_dct_neon
pop {lr}
// Tail call: the helper returns directly to our caller.
b sub8x4_dct_neon
endfunc
// void sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 )
// (prototype in dct.h)
// Transforms a 16x16 difference block with eight calls of sub8x4_dct_neon.
// Every pair of calls covers one 8x8 quadrant (8 rows); the pointer
// adjustments in between visit the quadrants in the order top-left, top-right,
// bottom-left, bottom-right. r0 is advanced by the helper only.
function sub16x16_dct_neon
push {lr}
mov r3, #FENC_STRIDE
mov ip, #FDEC_STRIDE
bl sub8x4_dct_neon
bl sub8x4_dct_neon
// 8 rows up, 8 pixels right -> top-right quadrant
sub r1, r1, #8*FENC_STRIDE-8
sub r2, r2, #8*FDEC_STRIDE-8
bl sub8x4_dct_neon
bl sub8x4_dct_neon
// 8 pixels left (already 8 rows down) -> bottom-left quadrant
sub r1, r1, #8
sub r2, r2, #8
bl sub8x4_dct_neon
bl sub8x4_dct_neon
// 8 rows up, 8 pixels right -> bottom-right quadrant
sub r1, r1, #8*FENC_STRIDE-8
sub r2, r2, #8*FDEC_STRIDE-8
bl sub8x4_dct_neon
pop {lr}
// Tail call for the last 8x4 area.
b sub8x4_dct_neon
endfunc
// DCT8_1D type
// One-dimensional 8-point transform on fixed registers: the eight inputs are
// expected in q8..q15 and the eight outputs are produced in q8..q15; q0..q3
// are used as scratch. The \type argument (row/col at the call sites) is not
// referenced in the body, so both passes expand to the same code.
.macro DCT8_1D type
SUMSUB_AB q2, q1, q11, q12 // s34/d34
SUMSUB_AB q3, q11, q10, q13 // s25/d25
SUMSUB_AB q13, q10, q9, q14 // s16/d16
SUMSUB_AB q14, q8, q8, q15 // s07/d07
SUMSUB_AB q9, q2, q14, q2 // a0/a2
SUMSUB_AB q12, q14, q13, q3 // a1/a3
SUMSUB_AB q3, q13, q8, q1 // a6/a5
// q0 = 1.5 * q10 and q15 = 1.5 * q11, formed as x + (x >> 1)
vshr.s16 q0, q10, #1
vshr.s16 q15, q11, #1
vadd.s16 q0, q0, q10
vadd.s16 q15, q15, q11
vsub.s16 q3, q3, q0
vsub.s16 q13, q13, q15
SUMSUB_AB q0, q15, q10, q11 // a4/a7
// q10 = 1.5 * q8 and q11 = 1.5 * q1, formed the same way
vshr.s16 q10, q8, #1
vshr.s16 q11, q1, #1
vadd.s16 q10, q10, q8
vadd.s16 q11, q11, q1
vadd.s16 q10, q0, q10
vadd.s16 q15, q15, q11
// Final combinations into the output registers.
SUMSUB_AB q8, q12, q9, q12
SUMSUB_SHR 2, q9, q15, q10, q15, q0, q1
SUMSUB_SHR 1, q10, q14, q2, q14, q0, q1
SUMSUB_SHR2 2, q11, q13, q3, q13, q0, q1
.endm
// void sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 )
// (prototype in dct.h)
// Computes the 8x8 transform of the difference pix1 - pix2 and stores the 64
// coefficients at r0. On return r0 has advanced by 128 bytes and r1/r2 by
// eight rows (sub16x16_dct8_neon relies on that). Clobbers r3 and ip.
function sub8x8_dct8_neon
mov r3, #FENC_STRIDE
mov ip, #FDEC_STRIDE
// Load eight rows of both sources; row n of the widened difference ends up in
// q(8+n).
vld1.64 {d16}, [r1,:64], r3
vld1.64 {d17}, [r2,:64], ip
vsubl.u8 q8, d16, d17
vld1.64 {d18}, [r1,:64], r3
vld1.64 {d19}, [r2,:64], ip
vsubl.u8 q9, d18, d19
vld1.64 {d20}, [r1,:64], r3
vld1.64 {d21}, [r2,:64], ip
vsubl.u8 q10, d20, d21
vld1.64 {d22}, [r1,:64], r3
vld1.64 {d23}, [r2,:64], ip
vsubl.u8 q11, d22, d23
vld1.64 {d24}, [r1,:64], r3
vld1.64 {d25}, [r2,:64], ip
vsubl.u8 q12, d24, d25
vld1.64 {d26}, [r1,:64], r3
vld1.64 {d27}, [r2,:64], ip
vsubl.u8 q13, d26, d27
vld1.64 {d28}, [r1,:64], r3
vld1.64 {d29}, [r2,:64], ip
vsubl.u8 q14, d28, d29
vld1.64 {d30}, [r1,:64], r3
vld1.64 {d31}, [r2,:64], ip
vsubl.u8 q15, d30, d31
DCT8_1D row
// 8x8 transpose of 16-bit elements: 64-bit swaps between registers 4 apart,
// then 32-bit and 16-bit vtrn stages.
vswp d17, d24 // 8, 12
vswp d21, d28 // 10,14
vtrn.32 q8, q10
vtrn.32 q12, q14
vswp d19, d26 // 9, 13
vswp d23, d30 // 11,15
vtrn.32 q9, q11
vtrn.32 q13, q15
vtrn.16 q10, q11
vtrn.16 q12, q13
vtrn.16 q8, q9
vtrn.16 q14, q15
DCT8_1D col
vst1.64 {d16-d19}, [r0,:128]!
vst1.64 {d20-d23}, [r0,:128]!
vst1.64 {d24-d27}, [r0,:128]!
vst1.64 {d28-d31}, [r0,:128]!
bx lr
endfunc
// void sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 )
// (prototype in dct.h)
// Transforms a 16x16 difference block as four 8x8 blocks in the order
// top-left, top-right, bottom-left, bottom-right. sub8x8_dct8_neon advances
// r0 by one block and r1/r2 by eight rows; only the pixel pointers need
// adjusting between calls.
function sub16x16_dct8_neon
push {lr}
bl X(sub8x8_dct8_neon)
// 8 rows up, 8 pixels right
sub r1, r1, #FENC_STRIDE*8 - 8
sub r2, r2, #FDEC_STRIDE*8 - 8
bl X(sub8x8_dct8_neon)
// 8 pixels left (already 8 rows down)
sub r1, r1, #8
sub r2, r2, #8
bl X(sub8x8_dct8_neon)
pop {lr}
// 8 rows up, 8 pixels right, then tail call for the last block
sub r1, r1, #FENC_STRIDE*8 - 8
sub r2, r2, #FDEC_STRIDE*8 - 8
b X(sub8x8_dct8_neon)
endfunc
// First part of the 4-point IDCT; the final butterfly is left to the caller,
// which applies SUMSUB_AB to the results.
// Inputs are d0..d3 (not modified), outputs d4..d7:
//   d4 = d0 + d2          d5 = d0 - d2
//   d6 = (d3 >> 1) + d1   d7 = (d1 >> 1) - d3
.macro IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3
SUMSUB_AB \d4, \d5, \d0, \d2
vshr.s16 \d7, \d1, #1
vshr.s16 \d6, \d3, #1
vsub.s16 \d7, \d7, \d3
vadd.s16 \d6, \d6, \d1
.endm
// void add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] )   (prototype in dct.h)
// Inverse-transforms the 16 coefficients at r1 and adds the result, rounded
// and shifted right by 6, to the 4x4 pixels at r0 (row stride FDEC_STRIDE)
// with saturation to 8 bits. The pixel loads are interleaved with the
// arithmetic. On return r0 points four rows below its entry value.
//
// After each SUMSUB_AB q0, q1, q2, q3 the rows are held as d0 = row 0,
// d1 = row 1, d3 = row 2, d2 = row 3. That is why d2/d3 appear swapped in the
// transpose and second pass, and why rows 2 and 3 use lanes [1] and [0].
function add4x4_idct_neon
mov r2, #FDEC_STRIDE
vld1.64 {d0-d3}, [r1,:128]
IDCT_1D d4, d5, d6, d7, d0, d1, d2, d3
vld1.32 {d30[0]}, [r0,:32], r2
SUMSUB_AB q0, q1, q2, q3
TRANSPOSE4x4_16 d0, d1, d3, d2
IDCT_1D d4, d5, d6, d7, d0, d1, d3, d2
vld1.32 {d30[1]}, [r0,:32], r2
SUMSUB_AB q0, q1, q2, q3
vrshr.s16 q0, q0, #6
vld1.32 {d31[1]}, [r0,:32], r2
vrshr.s16 q1, q1, #6
vld1.32 {d31[0]}, [r0,:32], r2
// Rewind the destination pointer by the four rows just read.
sub r0, r0, r2, lsl #2
vaddw.u8 q0, q0, d30
vaddw.u8 q1, q1, d31
vqmovun.s16 d0, q0
vqmovun.s16 d2, q1
vst1.32 {d0[0]}, [r0,:32], r2
vst1.32 {d0[1]}, [r0,:32], r2
vst1.32 {d2[1]}, [r0,:32], r2
vst1.32 {d2[0]}, [r0,:32], r2
bx lr
endfunc
// add8x4_idct_neon (file-local helper)
// Inverse-transforms two consecutive 4x4 coefficient blocks and adds them to
// an 8x4 pixel area.
// In:   r0 = destination pixels, r1 = coefficients, r2 = destination stride
//       (must be set by the caller, see add8x8_idct_neon).
// Out:  r1 advanced by 64 bytes, r0 advanced by four rows.
// Touches no other core register, so callers may keep lr in ip across calls.
function add8x4_idct_neon, export=0
// First pass: one 4x4 block per load, results paired into Q registers.
vld1.64 {d0-d3}, [r1,:128]!
IDCT_1D d16, d18, d20, d22, d0, d1, d2, d3
vld1.64 {d4-d7}, [r1,:128]!
IDCT_1D d17, d19, d21, d23, d4, d5, d6, d7
SUMSUB_AB q0, q3, q8, q10
SUMSUB_AB q1, q2, q9, q11
TRANSPOSE4x4_16 q0, q1, q2, q3
// Second pass on both blocks at once.
IDCT_1D q8, q9, q10, q11, q0, q1, q2, q3
SUMSUB_AB q0, q3, q8, q10
SUMSUB_AB q1, q2, q9, q11
// Round and shift by 6 while loading the four destination rows.
vrshr.s16 q0, q0, #6
vld1.32 {d28}, [r0,:64], r2
vrshr.s16 q1, q1, #6
vld1.32 {d29}, [r0,:64], r2
vrshr.s16 q2, q2, #6
vld1.32 {d30}, [r0,:64], r2
vrshr.s16 q3, q3, #6
vld1.32 {d31}, [r0,:64], r2
// Rewind the destination pointer by the four rows just read.
sub r0, r0, r2, lsl #2
vaddw.u8 q0, q0, d28
vaddw.u8 q1, q1, d29
vaddw.u8 q2, q2, d30
vaddw.u8 q3, q3, d31
// Saturate to 8 bits and store.
vqmovun.s16 d0, q0
vqmovun.s16 d1, q1
vst1.32 {d0}, [r0,:64], r2
vqmovun.s16 d2, q2
vst1.32 {d1}, [r0,:64], r2
vqmovun.s16 d3, q3
vst1.32 {d2}, [r0,:64], r2
vst1.32 {d3}, [r0,:64], r2
bx lr
endfunc
// void add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] )   (prototype in dct.h)
// Adds the inverse transform of four 4x4 blocks to an 8x8 pixel area as two
// 8x4 halves. The return address is kept in ip, which add8x4_idct_neon does
// not touch; the helper advances r0 and r1 itself.
function add8x8_idct_neon
mov r2, #FDEC_STRIDE
mov ip, lr
bl add8x4_idct_neon
mov lr, ip
// Tail call for the lower half.
b add8x4_idct_neon
endfunc
// void add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] )   (prototype in dct.h)
// Adds the inverse transform of sixteen 4x4 blocks to a 16x16 pixel area with
// eight calls of add8x4_idct_neon. Every pair of calls covers one 8x8
// quadrant; the adjustments of r0 visit the quadrants in the order top-left,
// top-right, bottom-left, bottom-right. The return address is kept in ip.
function add16x16_idct_neon
mov r2, #FDEC_STRIDE
mov ip, lr
bl add8x4_idct_neon
bl add8x4_idct_neon
// 8 rows up, 8 pixels right
sub r0, r0, #8*FDEC_STRIDE-8
bl add8x4_idct_neon
bl add8x4_idct_neon
// 8 pixels left (already 8 rows down)
sub r0, r0, #8
bl add8x4_idct_neon
bl add8x4_idct_neon
// 8 rows up, 8 pixels right
sub r0, r0, #8*FDEC_STRIDE-8
bl add8x4_idct_neon
mov lr, ip
// Tail call for the last 8x4 area.
b add8x4_idct_neon
endfunc
// IDCT8_1D type
// One-dimensional 8-point inverse transform on fixed registers; results are
// produced in q8..q15, q0..q3 are scratch. \type selects work that is
// interleaved with the arithmetic:
//   row: the last 32 bytes of coefficients (q14/q15) are loaded from r1 inside
//        the macro (so only q8..q13 must be loaded beforehand), and the first
//        16-bit transpose step (q8/q9) is issued near the end.
//   col: three 64-bit swaps (d21/d28, d19/d26, d23/d30) are performed at the
//        start; together with the swap done by the caller they appear to
//        finish the transpose begun after the row pass - confirm when editing.
// The exact instruction order matters: several registers are reused as soon as
// their previous value is dead.
.macro IDCT8_1D type
.ifc \type, col
vswp d21, d28
.endif
SUMSUB_AB q0, q1, q8, q12 // a0/a2
.ifc \type, row
vld1.64 {d28-d31}, [r1,:128]!
.else
vswp d19, d26
.endif
SUMSUB_SHR 1, q2, q3, q10, q14, q8, q12 // a6/a4
.ifc \type, col
vswp d23, d30
.endif
SUMSUB_AB q8, q10, q13, q11
SUMSUB_15 q8, q10, q9, q15, q12, q14 // a7/a1
SUMSUB_AB q14, q15, q15, q9
SUMSUB_15 q15, q14, q13, q11, q12, q9 // a5/a3
SUMSUB_SHR 2, q13, q14, q14, q15, q11, q9 // b3/b5
SUMSUB_SHR2 2, q12, q15, q8, q10, q11, q9 // b1/b7
SUMSUB_AB q10, q2, q0, q2 // b0/b6
SUMSUB_AB q11, q3, q1, q3 // b2/b4
SUMSUB_AB q8, q15, q10, q15
SUMSUB_AB q9, q14, q11, q14
SUMSUB_AB q10, q13, q3, q13
.ifc \type, row
vtrn.16 q8, q9
.endif
SUMSUB_AB q11, q12, q2, q12
.endm
// void add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] )   (prototype in dct.h)
// Inverse-transforms the 64 coefficients at r1 and adds the result, rounded
// and shifted right by 6, to the 8x8 pixels at r0 (row stride FDEC_STRIDE)
// with saturation to 8 bits.
// On return r1 has advanced by 128 bytes and r0 by eight rows
// (add16x16_idct8_neon relies on that). Clobbers r2; ip and lr are untouched.
function add8x8_idct8_neon
mov r2, #FDEC_STRIDE
// Load the first 96 bytes; the remaining 32 are loaded inside IDCT8_1D row.
vld1.64 {d16-d19}, [r1,:128]!
vld1.64 {d20-d23}, [r1,:128]!
vld1.64 {d24-d27}, [r1,:128]!
IDCT8_1D row
// Remaining transpose steps (the q8/q9 vtrn.16 was issued by the macro; the
// other 64-bit swaps are issued by IDCT8_1D col).
vtrn.16 q10, q11
vtrn.16 q12, q13
vtrn.16 q14, q15
vtrn.32 q8, q10
vtrn.32 q9, q11
vtrn.32 q12, q14
vtrn.32 q13, q15
vswp d17, d24
IDCT8_1D col
// Round and shift by 6 while loading the eight destination rows into d0-d7.
vld1.64 {d0}, [r0,:64], r2
vrshr.s16 q8, q8, #6
vld1.64 {d1}, [r0,:64], r2
vrshr.s16 q9, q9, #6
vld1.64 {d2}, [r0,:64], r2
vrshr.s16 q10, q10, #6
vld1.64 {d3}, [r0,:64], r2
vrshr.s16 q11, q11, #6
vld1.64 {d4}, [r0,:64], r2
vrshr.s16 q12, q12, #6
vld1.64 {d5}, [r0,:64], r2
vrshr.s16 q13, q13, #6
vld1.64 {d6}, [r0,:64], r2
vrshr.s16 q14, q14, #6
vld1.64 {d7}, [r0,:64], r2
vrshr.s16 q15, q15, #6
// Rewind the destination pointer by the eight rows just read.
sub r0, r0, r2, lsl #3
// Add the pixels, saturate to 8 bits and store, interleaved per row.
vaddw.u8 q8, q8, d0
vaddw.u8 q9, q9, d1
vaddw.u8 q10, q10, d2
vqmovun.s16 d0, q8
vqmovun.s16 d1, q9
vqmovun.s16 d2, q10
vaddw.u8 q11, q11, d3
vst1.64 {d0}, [r0,:64], r2
vaddw.u8 q12, q12, d4
vst1.64 {d1}, [r0,:64], r2
vaddw.u8 q13, q13, d5
vst1.64 {d2}, [r0,:64], r2
vqmovun.s16 d3, q11
vqmovun.s16 d4, q12
vaddw.u8 q14, q14, d6
vaddw.u8 q15, q15, d7
vst1.64 {d3}, [r0,:64], r2
vqmovun.s16 d5, q13
vst1.64 {d4}, [r0,:64], r2
vqmovun.s16 d6, q14
vqmovun.s16 d7, q15
vst1.64 {d5}, [r0,:64], r2
vst1.64 {d6}, [r0,:64], r2
vst1.64 {d7}, [r0,:64], r2
bx lr
endfunc
// void add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] )   (prototype in dct.h)
// Adds the inverse transform of four 8x8 blocks to a 16x16 pixel area in the
// order top-left, top-right, bottom-left, bottom-right. add8x8_idct8_neon
// advances r1 by one block and r0 by eight rows; the return address is kept
// in ip, which that function does not touch.
function add16x16_idct8_neon
mov ip, lr
bl X(add8x8_idct8_neon)
// 8 rows up, 8 pixels right
sub r0, r0, #8*FDEC_STRIDE-8
bl X(add8x8_idct8_neon)
// 8 pixels left (already 8 rows down)
sub r0, r0, #8
bl X(add8x8_idct8_neon)
// 8 rows up, 8 pixels right, then tail call for the last block
sub r0, r0, #8*FDEC_STRIDE-8
mov lr, ip
b X(add8x8_idct8_neon)
endfunc
// void add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] )   (prototype in dct.h)
// Adds four DC values, each rounded and shifted right by 6, to the four 4x4
// quadrants of the 8x8 pixel area at r0 (row stride FDEC_STRIDE), saturating
// to 8 bits: dct[0]/dct[1] apply to the left/right half of rows 0-3,
// dct[2]/dct[3] to the left/right half of rows 4-7.
// A signed DC is applied with unsigned saturating arithmetic by splitting it
// into max(dc, 0), which is added, and max(-dc, 0), which is subtracted.
function add8x8_idct_dc_neon
mov r2, #FDEC_STRIDE
vld1.64 {d16}, [r1,:64]
vrshr.s16 d16, d16, #6
// Load the eight pixel rows into d0-d7 while broadcasting the DC values.
vld1.64 {d0}, [r0,:64], r2
vmov.i16 q15, #0
vld1.64 {d1}, [r0,:64], r2
vld1.64 {d2}, [r0,:64], r2
vdup.16 d20, d16[0]
vld1.64 {d3}, [r0,:64], r2
vdup.16 d21, d16[1]
vld1.64 {d4}, [r0,:64], r2
vdup.16 d22, d16[2]
vld1.64 {d5}, [r0,:64], r2
vdup.16 d23, d16[3]
vld1.64 {d6}, [r0,:64], r2
// q12/q13 = negated DC values
vsub.s16 q12, q15, q10
vld1.64 {d7}, [r0,:64], r2
vsub.s16 q13, q15, q11
// Rewind the destination pointer by the eight rows just read.
sub r0, r0, #8*FDEC_STRIDE
// Saturating narrow to unsigned bytes: d20/d22 = positive parts,
// d24/d26 = magnitudes of the negative parts.
vqmovun.s16 d20, q10
vqmovun.s16 d22, q11
vqmovun.s16 d24, q12
vqmovun.s16 d26, q13
// Duplicate each 8-byte pattern so one Q register covers two rows.
vmov d21, d20
vqadd.u8 q0, q0, q10
vmov d23, d22
vqadd.u8 q1, q1, q10
vmov d25, d24
vqadd.u8 q2, q2, q11
vmov d27, d26
vqadd.u8 q3, q3, q11
vqsub.u8 q0, q0, q12
vqsub.u8 q1, q1, q12
vqsub.u8 q2, q2, q13
vst1.64 {d0}, [r0,:64], r2
vqsub.u8 q3, q3, q13
vst1.64 {d1}, [r0,:64], r2
vst1.64 {d2}, [r0,:64], r2
vst1.64 {d3}, [r0,:64], r2
vst1.64 {d4}, [r0,:64], r2
vst1.64 {d5}, [r0,:64], r2
vst1.64 {d6}, [r0,:64], r2
vst1.64 {d7}, [r0,:64], r2
bx lr
endfunc
// ADD16x4_IDCT_DC dc
// Adds the four 16-bit DC values held in D register \dc to four rows of 16
// pixels: lane n of \dc applies to pixels 4n..4n+3 of every row.
// Expects: r0 = read pointer, r2 = write pointer, r3 = row stride, q15 = 0.
// r0 and r2 each advance by four rows. Clobbers q2, q3 and q8-q13.
// A signed DC is applied with unsigned saturating arithmetic by adding
// max(dc, 0) and subtracting max(-dc, 0).
.macro ADD16x4_IDCT_DC dc
vld1.64 {d16-d17}, [r0,:128], r3
vld1.64 {d18-d19}, [r0,:128], r3
vdup.16 d4, \dc[0]
vdup.16 d5, \dc[1]
vld1.64 {d20-d21}, [r0,:128], r3
vdup.16 d6, \dc[2]
vdup.16 d7, \dc[3]
vld1.64 {d22-d23}, [r0,:128], r3
// q12/q13 = negated DC values
vsub.s16 q12, q15, q2
vsub.s16 q13, q15, q3
// q2 = positive parts as 16 bytes, q3 = magnitudes of the negative parts
vqmovun.s16 d4, q2
vqmovun.s16 d5, q3
vqmovun.s16 d6, q12
vqmovun.s16 d7, q13
vqadd.u8 q8, q8, q2
vqadd.u8 q9, q9, q2
vqadd.u8 q10, q10, q2
vqadd.u8 q11, q11, q2
vqsub.u8 q8, q8, q3
vqsub.u8 q9, q9, q3
vqsub.u8 q10, q10, q3
vst1.64 {d16-d17}, [r2,:128], r3
vqsub.u8 q11, q11, q3
vst1.64 {d18-d19}, [r2,:128], r3
vst1.64 {d20-d21}, [r2,:128], r3
vst1.64 {d22-d23}, [r2,:128], r3
.endm
// void add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] )   (prototype in dct.h)
// Adds sixteen DC values, each rounded and shifted right by 6, to the sixteen
// 4x4 blocks of the 16x16 pixel area at r0 (row stride FDEC_STRIDE).
// r0 is the read cursor and r2 the write cursor over the same pixels; q15 is
// the zero vector required by ADD16x4_IDCT_DC. Each D register d0..d3 carries
// the four DC values for one band of four pixel rows.
function add16x16_idct_dc_neon
mov r2, r0
mov r3, #FDEC_STRIDE
vmov.i16 q15, #0
vld1.64 {d0-d3}, [r1,:64]
vrshr.s16 q0, q0, #6
vrshr.s16 q1, q1, #6
// One macro expansion per band of four rows, top to bottom.
.irp dc, d0, d1, d2, d3
ADD16x4_IDCT_DC \dc
.endr
bx lr
endfunc
// void sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 )
// (prototype in dct.h)
// Computes four DC coefficients from the 8x8 difference pix1 - pix2 and stores
// them at r0. r1 is stepped by FENC_STRIDE, r2 by FDEC_STRIDE.
// The row differences are accumulated per column (rows 0-3 in q0, rows 4-7 in
// q1) while the remaining rows are still being loaded; butterflies and
// pairwise adds then reduce the partial sums to four values.
// Clobbers r3 and ip.
function sub8x8_dct_dc_neon
mov r3, #FENC_STRIDE
mov ip, #FDEC_STRIDE
vld1.64 {d16}, [r1,:64], r3
vld1.64 {d17}, [r2,:64], ip
vsubl.u8 q8, d16, d17
vld1.64 {d18}, [r1,:64], r3
vld1.64 {d19}, [r2,:64], ip
vsubl.u8 q9, d18, d19
vld1.64 {d20}, [r1,:64], r3
vld1.64 {d21}, [r2,:64], ip
vsubl.u8 q10, d20, d21
vld1.64 {d22}, [r1,:64], r3
vadd.s16 q0, q8, q9
vld1.64 {d23}, [r2,:64], ip
vsubl.u8 q11, d22, d23
vld1.64 {d24}, [r1,:64], r3
vadd.s16 q0, q0, q10
vld1.64 {d25}, [r2,:64], ip
vsubl.u8 q12, d24, d25
vld1.64 {d26}, [r1,:64], r3
vadd.s16 q0, q0, q11
vld1.64 {d27}, [r2,:64], ip
vsubl.u8 q13, d26, d27
vld1.64 {d28}, [r1,:64], r3
vld1.64 {d29}, [r2,:64], ip
vsubl.u8 q14, d28, d29
vld1.64 {d30}, [r1,:64], r3
vadd.s16 q1, q12, q13
vld1.64 {d31}, [r2,:64], ip
vsubl.u8 q15, d30, d31
vadd.s16 q1, q1, q14
// Butterflies between the left (low D) and right (high D) column sums of each
// half, then between the two halves.
vadd.s16 d4, d0, d1
vadd.s16 q1, q1, q15
vsub.s16 d5, d0, d1
vadd.s16 d6, d2, d3
vsub.s16 d7, d2, d3
vadd.s16 q0, q2, q3
vsub.s16 q1, q2, q3
// Pairwise adds collapse the lanes to one value per coefficient.
vpadd.s16 d0, d0, d2
vpadd.s16 d1, d1, d3
vpadd.s16 d0, d0, d1
vst1.64 {d0}, [r0,:64]
bx lr
endfunc
// void sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 )
// (prototype in dct.h)
// Computes eight DC coefficients from the 8x16 difference pix1 - pix2 and
// stores them (16 bytes) at r0. r1 is stepped by FENC_STRIDE, r2 by
// FDEC_STRIDE. The sixteen row differences are accumulated per column in four
// groups of four rows (q0, q1, q2, q3 in top-to-bottom order); the butterflies
// annotated below and the final pairwise adds reduce them to eight values.
// Clobbers r3 and ip.
function sub8x16_dct_dc_neon
mov r3, #FENC_STRIDE
mov ip, #FDEC_STRIDE
// Rows 0-7: sums of rows 0-3 in q0, rows 4-7 in q1.
vld1.64 {d16}, [r1,:64], r3
vld1.64 {d17}, [r2,:64], ip
vsubl.u8 q8, d16, d17
vld1.64 {d18}, [r1,:64], r3
vld1.64 {d19}, [r2,:64], ip
vsubl.u8 q9, d18, d19
vld1.64 {d20}, [r1,:64], r3
vld1.64 {d21}, [r2,:64], ip
vsubl.u8 q10, d20, d21
vld1.64 {d22}, [r1,:64], r3
vadd.s16 q0, q8, q9
vld1.64 {d23}, [r2,:64], ip
vsubl.u8 q11, d22, d23
vld1.64 {d24}, [r1,:64], r3
vadd.s16 q0, q0, q10
vld1.64 {d25}, [r2,:64], ip
vsubl.u8 q12, d24, d25
vld1.64 {d26}, [r1,:64], r3
vadd.s16 q0, q0, q11
vld1.64 {d27}, [r2,:64], ip
vsubl.u8 q13, d26, d27
vld1.64 {d28}, [r1,:64], r3
vld1.64 {d29}, [r2,:64], ip
vsubl.u8 q14, d28, d29
vld1.64 {d30}, [r1,:64], r3
vadd.s16 q1, q12, q13
vld1.64 {d31}, [r2,:64], ip
vsubl.u8 q15, d30, d31
// Rows 8-15 (q8-q15 are reused): sums of rows 8-11 in q2, rows 12-15 in q3.
vld1.64 {d16}, [r1,:64], r3
vadd.s16 q1, q1, q14
vld1.64 {d17}, [r2,:64], ip
vadd.s16 q1, q1, q15
vld1.64 {d18}, [r1,:64], r3
vsubl.u8 q8, d16, d17
vld1.64 {d19}, [r2,:64], ip
vsubl.u8 q9, d18, d19
vld1.64 {d20}, [r1,:64], r3
vld1.64 {d21}, [r2,:64], ip
vsubl.u8 q10, d20, d21
vld1.64 {d22}, [r1,:64], r3
vadd.s16 q2, q8, q9
vld1.64 {d23}, [r2,:64], ip
vsubl.u8 q11, d22, d23
vld1.64 {d24}, [r1,:64], r3
vadd.s16 q2, q2, q10
vld1.64 {d25}, [r2,:64], ip
vsubl.u8 q12, d24, d25
vld1.64 {d26}, [r1,:64], r3
vadd.s16 q2, q2, q11
vld1.64 {d27}, [r2,:64], ip
vsubl.u8 q13, d26, d27
vld1.64 {d28}, [r1,:64], r3
vld1.64 {d29}, [r2,:64], ip
vsubl.u8 q14, d28, d29
vld1.64 {d30}, [r1,:64], r3
vadd.s16 q3, q12, q13
vld1.64 {d31}, [r2,:64], ip
vsubl.u8 q15, d30, d31
vadd.s16 q3, q3, q14
// Butterflies between the left and right column sums of each row group, then
// between the groups.
vadd.s16 d16, d0, d1 @ b0
vadd.s16 q3, q3, q15
vsub.s16 d17, d0, d1 @ b4
vadd.s16 d18, d2, d3 @ b1
vsub.s16 d19, d2, d3 @ b5
vadd.s16 d20, d4, d5 @ b2
vsub.s16 d21, d4, d5 @ b6
vadd.s16 d22, d6, d7 @ b3
vsub.s16 d23, d6, d7 @ b7
vadd.s16 q0, q8, q9 @ b0 + b1, b4 + b5; a0, a2
vsub.s16 q1, q8, q9 @ b0 - b1, b4 - b5; a4, a6
vadd.s16 q2, q10, q11 @ b2 + b3, b6 + b7; a1, a3
vsub.s16 q3, q10, q11 @ b2 - b3, b6 - b7; a5, a7
vadd.s16 q8, q0, q2 @ a0 + a1, a2 + a3
vsub.s16 q9, q0, q2 @ a0 - a1, a2 - a3
vsub.s16 q10, q1, q3 @ a4 - a5, a6 - a7
vadd.s16 q11, q1, q3 @ a4 + a5, a6 + a7
// Pairwise adds collapse the lanes to one value per coefficient.
vpadd.s16 d0, d16, d17
vpadd.s16 d1, d18, d19
vpadd.s16 d2, d20, d21
vpadd.s16 d3, d22, d23
vpadd.s16 d0, d0, d1
vpadd.s16 d1, d2, d3
vst1.64 {q0}, [r0,:64]
bx lr
endfunc
// void zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] )
// (prototype in dct.h)
// Reorders the 16 coefficients at r1 into r0 using the byte-shuffle table
// scan4x4_frame: one vtbl per output D register. Each table row indexes bytes
// relative to the first register of the register list given to its vtbl.
// Clobbers r2.
function zigzag_scan_4x4_frame_neon
movrel r2, scan4x4_frame
vld1.64 {d0-d3}, [r1,:128]
vld1.64 {d16-d19}, [r2,:128]
vtbl.8 d4, {d0-d1}, d16
vtbl.8 d5, {d1-d3}, d17
vtbl.8 d6, {d0-d2}, d18
vtbl.8 d7, {d2-d3}, d19
vst1.64 {d4-d7}, [r0,:128]
bx lr
endfunc

70
common/arm/dct.h Normal file
View File

@@ -0,0 +1,70 @@
/*****************************************************************************
* dct.h: arm transform and zigzag
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
/* Prototypes of the NEON transform routines implemented in dct-a.S.
 * Every name is routed through x264_template() (defined elsewhere) before it
 * is declared. */
#ifndef X264_ARM_DCT_H
#define X264_ARM_DCT_H
/* In-place 4x4 transforms of DC coefficients. */
#define x264_dct4x4dc_neon x264_template(dct4x4dc_neon)
void x264_dct4x4dc_neon( int16_t d[16] );
#define x264_idct4x4dc_neon x264_template(idct4x4dc_neon)
void x264_idct4x4dc_neon( int16_t d[16] );
/* Forward 4x4-based transforms of the difference pix1 - pix2.
 * The assembly steps pix1 by FENC_STRIDE and pix2 by FDEC_STRIDE. */
#define x264_sub4x4_dct_neon x264_template(sub4x4_dct_neon)
void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub8x8_dct_neon x264_template(sub8x8_dct_neon)
void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub16x16_dct_neon x264_template(sub16x16_dct_neon)
void x264_sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
/* Inverse 4x4-based transforms, added to the pixels at p_dst
 * (row stride FDEC_STRIDE in the assembly) with saturation. */
#define x264_add4x4_idct_neon x264_template(add4x4_idct_neon)
void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] );
#define x264_add8x8_idct_neon x264_template(add8x8_idct_neon)
void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );
#define x264_add16x16_idct_neon x264_template(add16x16_idct_neon)
void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );
/* DC-only variants: one DC value per 4x4 block. */
#define x264_add8x8_idct_dc_neon x264_template(add8x8_idct_dc_neon)
void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );
#define x264_add16x16_idct_dc_neon x264_template(add16x16_idct_dc_neon)
void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );
#define x264_sub8x8_dct_dc_neon x264_template(sub8x8_dct_dc_neon)
void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub8x16_dct_dc_neon x264_template(sub8x16_dct_dc_neon)
void x264_sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 );
/* 8x8 transforms. */
#define x264_sub8x8_dct8_neon x264_template(sub8x8_dct8_neon)
void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub16x16_dct8_neon x264_template(sub16x16_dct8_neon)
void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
#define x264_add8x8_idct8_neon x264_template(add8x8_idct8_neon)
void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] );
#define x264_add16x16_idct8_neon x264_template(add16x16_idct8_neon)
void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] );
/* Coefficient reordering of one 4x4 block (dct -> level). */
#define x264_zigzag_scan_4x4_frame_neon x264_template(zigzag_scan_4x4_frame_neon)
void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] );
#endif

795
common/arm/deblock-a.S Normal file
View File

@@ -0,0 +1,795 @@
/*****************************************************************************
* deblock-a.S: arm deblocking
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: Mans Rullgard <mans@mansr.com>
* Martin Storsjo <martin@martin.st>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
@ h264_loop_filter_start: shared prologue of the tc0-based (non-intra) filters.
@ Must be expanded before anything is pushed, because it reads the fifth
@ C argument (int8_t *tc0 in deblock.h) straight from [sp].
@ In:       [sp] = tc0, pointer to four signed bytes
@ Out:      d24  = the four tc0 bytes, present in both 32-bit lanes
@ Clobbers: ip, flags
@ Returns to the caller immediately when all four tc0 bytes are negative.
.macro h264_loop_filter_start
ldr ip, [sp]
ldr ip, [ip]
vdup.32 d24, ip
and ip, ip, ip, lsl #16
ands ip, ip, ip, lsl #8 @ bit 31 = AND of the four tc0 sign bits
@ ANDS updates N, Z and C but leaves V as the caller left it, so the exit
@ condition has to be N alone (mi); lt (N != V) would depend on a stale V.
bxmi lr
.endm
@ align_push_regs: spill the NEON registers d8-d15 (q4-q7) into a 16-byte
@ aligned area below the current stack pointer.
@ Out:      ip = (sp & 15) + 32; align_pop_regs needs this value, so ip must
@           stay intact until align_pop_regs has run.
@ Clobbers: ip; sp is lowered by 64 + (sp & 15) bytes and ends up 16-byte aligned.
.macro align_push_regs
and ip, sp, #15 @ current misalignment of sp
add ip, ip, #32 @ plus room for d12-d15
sub sp, sp, ip @ sp is 16-byte aligned from here on
vst1.64 {d12-d15}, [sp,:128]
sub sp, sp, #32 @ room for d8-d11
vst1.64 {d8-d11}, [sp,:128]
.endm
@ align_pop_regs: counterpart of align_push_regs.
@ In:  sp and ip exactly as align_push_regs left them
@ Out: d8-d15 reloaded; sp back at its value from before align_push_regs
.macro align_pop_regs
vld1.64 {d8-d11}, [sp,:128]! @ writeback advances sp by the 32 bytes loaded
vld1.64 {d12-d15}, [sp,:128], ip @ then sp += ip
.endm
@ h264_loop_filter_luma: tc0-based luma edge filter, 16 pixels per call.
@ In:   q10 = p2, q9 = p1, q8 = p0, q0 = q0, q1 = q1, q2 = q2 (one byte per pixel)
@       r2 = alpha, r3 = beta
@       d24 = the four tc0 bytes as left by h264_loop_filter_start
@ Out:  q4 = filtered p1, q8 = filtered p0, q0 = filtered q0, q5 = filtered q1
@ Clobbers: q2, q6, q7, q10-q15. q4-q7 are written as well, which is why the
@       users of this macro bracket it with align_push_regs / align_pop_regs.
.macro h264_loop_filter_luma
vdup.8 q11, r2 @ alpha
@ The vmovl / vsli steps on q12 replicate every tc0 byte over four adjacent
@ byte lanes, giving one tc0 value per pixel.
vmovl.u8 q12, d24
vabd.u8 q6, q8, q0 @ abs(p0 - q0)
vmovl.u16 q12, d24
vabd.u8 q14, q9, q8 @ abs(p1 - p0)
vsli.16 q12, q12, #8
vabd.u8 q15, q1, q0 @ abs(q1 - q0)
vsli.32 q12, q12, #16
vclt.u8 q6, q6, q11 @ < alpha
vdup.8 q11, r3 @ beta
vclt.s8 q7, q12, #0 @ lanes with tc0 < 0
vclt.u8 q14, q14, q11 @ < beta
vclt.u8 q15, q15, q11 @ < beta
vbic q6, q6, q7 @ lanes with tc0 < 0 are never filtered
vabd.u8 q4, q10, q8 @ abs(p2 - p0)
vand q6, q6, q14
vabd.u8 q5, q2, q0 @ abs(q2 - q0)
vclt.u8 q4, q4, q11 @ < beta
vand q6, q6, q15 @ q6 = filter mask (all conditions above)
vclt.u8 q5, q5, q11 @ < beta
vand q4, q4, q6 @ q4 = mask of lanes whose p1 is rewritten
vand q5, q5, q6 @ q5 = mask of lanes whose q1 is rewritten
vand q12, q12, q6 @ tc0, zeroed in lanes that are not filtered
vrhadd.u8 q14, q8, q0 @ (p0 + q0 + 1) >> 1
vsub.i8 q6, q12, q4 @ tc = tc0 + 1 where the p1 mask (-1) is set
vqadd.u8 q7, q9, q12 @ p1 + tc0, saturated
vhadd.u8 q10, q10, q14 @ (p2 + ((p0 + q0 + 1) >> 1)) >> 1
vsub.i8 q6, q6, q5 @ tc += 1 where the q1 mask (-1) is set
vhadd.u8 q14, q2, q14 @ (q2 + ((p0 + q0 + 1) >> 1)) >> 1
vmin.u8 q7, q7, q10
vqsub.u8 q11, q9, q12 @ p1 - tc0, saturated
vqadd.u8 q2, q1, q12 @ q1 + tc0, saturated
vmax.u8 q7, q7, q11 @ q7 = p1 candidate clipped to [p1 - tc0, p1 + tc0]
vqsub.u8 q11, q1, q12 @ q1 - tc0, saturated
vmin.u8 q14, q2, q14
vmovl.u8 q2, d0
vmax.u8 q14, q14, q11 @ q14 = q1 candidate clipped to [q1 - tc0, q1 + tc0]
vmovl.u8 q10, d1
@ q2 / q10 (16-bit lanes): ((q0 - p0) << 2) + p1 - q1, then a rounding
@ shift right by 3, narrowed to bytes in d4 / d5 (delta).
vsubw.u8 q2, q2, d16
vsubw.u8 q10, q10, d17
vshl.i16 q2, q2, #2
vshl.i16 q10, q10, #2
vaddw.u8 q2, q2, d18
vaddw.u8 q10, q10, d19
vsubw.u8 q2, q2, d2
vsubw.u8 q10, q10, d3
vrshrn.i16 d4, q2, #3
vrshrn.i16 d5, q10, #3
vbsl q4, q7, q9 @ q4 = clipped p1 candidate where its mask is set, else p1
vbsl q5, q14, q1 @ q5 = clipped q1 candidate where its mask is set, else q1
vneg.s8 q7, q6 @ -tc
vmovl.u8 q14, d16
vmin.s8 q2, q2, q6
vmovl.u8 q6, d17
vmax.s8 q2, q2, q7 @ delta clamped to [-tc, tc]
vmovl.u8 q11, d0
vmovl.u8 q12, d1
vaddw.s8 q14, q14, d4 @ p0 + delta
vaddw.s8 q6, q6, d5
vsubw.s8 q11, q11, d4 @ q0 - delta
vsubw.s8 q12, q12, d5
vqmovun.s16 d16, q14 @ saturate back to unsigned bytes
vqmovun.s16 d17, q6
vqmovun.s16 d0, q11
vqmovun.s16 d1, q12
.endm
@ C prototype (deblock.h):
@   void x264_deblock_v_luma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
@ Filters 16 pixels across the edge between the rows at pix - stride and pix.
@ In: r0 = pix, r1 = stride, r2 = alpha, r3 = beta, [sp] = tc0
@ The :128 qualifiers require every accessed row to be 16-byte aligned.
function deblock_v_luma_neon
h264_loop_filter_start
vld1.64 {d0, d1}, [r0,:128], r1 @ q0 row (at pix)
vld1.64 {d2, d3}, [r0,:128], r1 @ q1 row
vld1.64 {d4, d5}, [r0,:128], r1 @ q2 row
sub r0, r0, r1, lsl #2
sub r0, r0, r1, lsl #1 @ r0 = pix - 3*stride
vld1.64 {d20,d21}, [r0,:128], r1 @ p2 row
vld1.64 {d18,d19}, [r0,:128], r1 @ p1 row
vld1.64 {d16,d17}, [r0,:128], r1 @ p0 row; r0 is back at pix
align_push_regs
h264_loop_filter_luma
sub r0, r0, r1, lsl #1 @ r0 = pix - 2*stride
vst1.64 {d8, d9}, [r0,:128], r1 @ filtered p1
vst1.64 {d16,d17}, [r0,:128], r1 @ filtered p0
vst1.64 {d0, d1}, [r0,:128], r1 @ filtered q0
vst1.64 {d10,d11}, [r0,:128] @ filtered q1
align_pop_regs
bx lr
endfunc
@ C prototype (deblock.h):
@   void x264_deblock_h_luma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
@ Filters the edge between the columns at pix - 1 and pix over 16 rows.
@ In: r0 = pix, r1 = stride, r2 = alpha, r3 = beta, [sp] = tc0
@ TRANSPOSE8x8 and TRANSPOSE4x4 are macros defined outside this section
@ (presumably asm.S); they are assumed to turn the 8-byte rows into one
@ register per pixel column and back.
function deblock_h_luma_neon
h264_loop_filter_start
sub r0, r0, #4 @ start four pixels left of the edge
@ Rows 0-7 go to the low d halves, rows 8-15 to the high d halves.
vld1.64 {d6}, [r0], r1
vld1.64 {d20}, [r0], r1
vld1.64 {d18}, [r0], r1
vld1.64 {d16}, [r0], r1
vld1.64 {d0}, [r0], r1
vld1.64 {d2}, [r0], r1
vld1.64 {d4}, [r0], r1
vld1.64 {d26}, [r0], r1
vld1.64 {d7}, [r0], r1
vld1.64 {d21}, [r0], r1
vld1.64 {d19}, [r0], r1
vld1.64 {d17}, [r0], r1
vld1.64 {d1}, [r0], r1
vld1.64 {d3}, [r0], r1
vld1.64 {d5}, [r0], r1
vld1.64 {d27}, [r0], r1
TRANSPOSE8x8 q3, q10, q9, q8, q0, q1, q2, q13
align_push_regs
h264_loop_filter_luma
TRANSPOSE4x4 q4, q8, q0, q5
sub r0, r0, r1, lsl #4 @ back to row 0
add r0, r0, #2 @ r0 = pix - 2: only p1, p0, q0, q1 are written back
vst1.32 {d8[0]}, [r0], r1
vst1.32 {d16[0]}, [r0], r1
vst1.32 {d0[0]}, [r0], r1
vst1.32 {d10[0]}, [r0], r1
vst1.32 {d8[1]}, [r0], r1
vst1.32 {d16[1]}, [r0], r1
vst1.32 {d0[1]}, [r0], r1
vst1.32 {d10[1]}, [r0], r1
vst1.32 {d9[0]}, [r0], r1
vst1.32 {d17[0]}, [r0], r1
vst1.32 {d1[0]}, [r0], r1
vst1.32 {d11[0]}, [r0], r1
vst1.32 {d9[1]}, [r0], r1
vst1.32 {d17[1]}, [r0], r1
vst1.32 {d1[1]}, [r0], r1
vst1.32 {d11[1]}, [r0], r1
align_pop_regs
bx lr
endfunc
@ h264_loop_filter_luma_intra: intra (strong) luma edge filter, 16 pixels per call.
@ In:   q11 = p3, q10 = p2, q9 = p1, q8 = p0, q0 = q0, q1 = q1, q2 = q2, q3 = q3
@       r2 = alpha, r3 = beta
@       sp must be 16-byte aligned (32 bytes of scratch are used with :128),
@       which holds when the macro follows align_push_regs.
@ Out:  q10, q9, q8 = filtered p2, p1, p0; q0, q1, q2 = filtered q0, q1, q2
@ Clobbers: r2, lr, q4-q7, q12-q15, flags. lr is used as scratch, so the
@       surrounding function has to save it beforehand.
@ Branches forward to local label 9, which the code following the macro has
@ to provide, when no lane passes the if_1 test; sp is untouched in that case.
.macro h264_loop_filter_luma_intra
vdup.8 q14, r2 @ alpha
vabd.u8 q4, q8, q0 @ abs(p0 - q0)
vabd.u8 q5, q9, q8 @ abs(p1 - p0)
vabd.u8 q6, q1, q0 @ abs(q1 - q0)
vdup.8 q15, r3 @ beta
vmov.u8 q13, #2
vclt.u8 q7, q4, q14 @ < alpha
vshr.u8 q14, q14, #2 @ alpha >> 2
vclt.u8 q5, q5, q15 @ < beta
vadd.u8 q14, q14, q13 @ (alpha >> 2) + 2
vand q7, q7, q5
vclt.u8 q6, q6, q15 @ < beta
vclt.u8 q13, q4, q14 @ < (alpha >> 2) + 2 if_2
vand q12, q7, q6 @ if_1
@ Compress the 128-bit if_1 mask to 64 bits and test it in core registers.
vshrn.u16 d28, q12, #4
vmov r2, lr, d28
orrs r2, r2, lr
beq 9f
@ Park if_1 (q12) and if_2 (q13) on the stack; both registers are reused below.
sub sp, sp, #32
vst1.8 {q12-q13}, [sp,:128]
vshll.u8 q4, d18, #1 @ 2*p1
vshll.u8 q5, d19, #1
vaddw.u8 q4, q4, d16 @ 2*p1 + p0
vaddw.u8 q5, q5, d17
vaddw.u8 q4, q4, d2 @ 2*p1 + p0 + q1
vaddw.u8 q5, q5, d3
vrshrn.u16 d24, q4, #2 @ q12 = (2*p1 + p0 + q1 + 2) >> 2
vrshrn.u16 d25, q5, #2
vaddl.u8 q6, d20, d16 @ p2 + p0
vaddl.u8 q7, d21, d17
vaddw.u8 q6, q6, d0 @ p2 + p0 + q0
vaddw.u8 q7, q7, d1
vadd.u16 q4, q4, q6 @ p2 + 2*p1 + 2*p0 + q0 + q1
vadd.u16 q5, q5, q7
vaddw.u8 q4, q4, d0 @ p2 + 2*p1 + 2*p0 + 2*q0 + q1
vaddw.u8 q5, q5, d1
vrshrn.u16 d26, q4, #3 @ p0'_2
vrshrn.u16 d27, q5, #3
vaddw.u8 q6, q6, d18 @ p2 + p1 + p0 + q0
vaddw.u8 q7, q7, d19
vrshrn.u16 d28, q6, #2 @ p1'_2
vrshrn.u16 d29, q7, #2
vaddl.u8 q4, d22, d20 @ p3 + p2
vaddl.u8 q5, d23, d21
vshl.u16 q4, q4, #1 @ 2*p3 + 2*p2
vshl.u16 q5, q5, #1
vadd.u16 q4, q4, q6 @ 2*p3 + 3*p2 + p1 + p0 + q0
vadd.u16 q5, q5, q7
vrshrn.u16 d30, q4, #3 @ p2'_2
vrshrn.u16 d31, q5, #3
vdup.8 q4, r3 @ beta
vabd.u8 q5, q10, q8 @ abs(p2 - p0)
vld1.8 {q6-q7}, [sp,:128] @ if_1, if_2
vclt.u8 q5, q5, q4 @ < beta if_3
vand q7, q7, q5 @ if_2 && if_3
vmvn q4, q7
vand q7, q7, q6 @ if_1 && if_2 && if_3
vand q6, q4, q6 @ if_1 && !(if_2 && if_3)
@ copy p0 to q15 so it can be clobbered
vbit q10, q15, q7
vmov q15, q8
vbit q8, q12, q6
@ wait for q9 to clobber
vshll.u8 q4, d2, #1 @ 2*q1
vshll.u8 q5, d3, #1
@ (a second, identical vbit q8, q12, q6 used to sit here; VBIT with unchanged
@ operands is idempotent, so it was removed)
vaddw.u8 q4, q4, d0 @ 2*q1 + q0
vaddw.u8 q5, q5, d1
vbit q8, q13, q7
vaddw.u8 q4, q4, d18 @ 2*q1 + q0 + p1
vaddw.u8 q5, q5, d19
vbit q9, q14, q7
vrshrn.u16 d24, q4, #2 @ q12 = (2*q1 + q0 + p1 + 2) >> 2
vrshrn.u16 d25, q5, #2
vaddl.u8 q6, d4, d0 @ q2 + q0
vaddl.u8 q7, d5, d1
vaddw.u8 q6, q6, d30 @ q2 + q0 + p0
vaddw.u8 q7, q7, d31
vadd.u16 q4, q4, q6 @ q2 + 2*q1 + 2*q0 + p0 + p1
vadd.u16 q5, q5, q7
vaddw.u8 q4, q4, d30 @ q2 + 2*q1 + 2*q0 + 2*p0 + p1
vaddw.u8 q5, q5, d31
vrshrn.u16 d26, q4, #3 @ q0'_2
vrshrn.u16 d27, q5, #3
vaddw.u8 q6, q6, d2 @ q2 + q1 + q0 + p0
vaddw.u8 q7, q7, d3
vrshrn.u16 d28, q6, #2 @ q1'_2
vrshrn.u16 d29, q7, #2
vaddl.u8 q4, d6, d4 @ q3 + q2
vaddl.u8 q5, d7, d5
vshl.u16 q4, q4, #1 @ 2*q3 + 2*q2
vshl.u16 q5, q5, #1
vadd.u16 q4, q4, q6 @ 2*q3 + 3*q2 + q1 + q0 + p0
vadd.u16 q5, q5, q7
vrshrn.u16 d30, q4, #3 @ q2'_2
vrshrn.u16 d31, q5, #3
vdup.8 q4, r3 @ beta
vabd.u8 q5, q2, q0 @ abs(q2 - q0)
vld1.8 {q6-q7}, [sp,:128]! @ if_1, if_2
vclt.u8 q5, q5, q4 @ < beta if_4
vand q7, q7, q5 @ if_2 && if_4
vmvn q4, q7
vand q7, q6, q7 @ if_1 && if_2 && if_4
vand q6, q6, q4 @ if_1 && !(if_2 && if_4)
vbit q0, q12, q6
vbit q1, q14, q7
vbit q0, q13, q7
vbit q2, q15, q7
.endm
@ C prototype (deblock.h):
@   void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta )
@ Intra-filters 16 pixels across the edge between the rows at pix - stride and pix.
@ In: r0 = pix, r1 = stride, r2 = alpha, r3 = beta
@ The :128 qualifiers require every accessed row to be 16-byte aligned.
function deblock_v_luma_intra_neon
push {lr} @ the filter macro uses lr as scratch
vld1.64 {d0, d1}, [r0,:128], r1 @ q0 row (at pix)
vld1.64 {d2, d3}, [r0,:128], r1 @ q1 row
vld1.64 {d4, d5}, [r0,:128], r1 @ q2 row
vld1.64 {d6, d7}, [r0,:128], r1 @ q3 row
sub r0, r0, r1, lsl #3 @ r0 = pix - 4*stride
vld1.64 {d22,d23}, [r0,:128], r1 @ p3 row
vld1.64 {d20,d21}, [r0,:128], r1 @ p2 row
vld1.64 {d18,d19}, [r0,:128], r1 @ p1 row
vld1.64 {d16,d17}, [r0,:128] @ p0 row; no writeback, r0 = pix - stride
align_push_regs
h264_loop_filter_luma_intra
sub r0, r0, r1, lsl #1 @ r0 = pix - 3*stride
vst1.64 {d20,d21}, [r0,:128], r1 @ filtered p2
vst1.64 {d18,d19}, [r0,:128], r1 @ filtered p1
vst1.64 {d16,d17}, [r0,:128], r1 @ filtered p0
vst1.64 {d0, d1}, [r0,:128], r1 @ filtered q0
vst1.64 {d2, d3}, [r0,:128], r1 @ filtered q1
vst1.64 {d4, d5}, [r0,:128] @ filtered q2
@ Early-exit target of h264_loop_filter_luma_intra: nothing is stored.
9:
align_pop_regs
pop {pc}
endfunc
@ C prototype (deblock.h):
@   void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta )
@ Intra-filters the edge between the columns at pix - 1 and pix over 16 rows.
@ In: r0 = pix, r1 = stride, r2 = alpha, r3 = beta
@ TRANSPOSE8x8 is a macro defined outside this section (presumably asm.S);
@ it is assumed to turn the 8-byte rows into one register per pixel column
@ and, applied a second time, back into rows.
function deblock_h_luma_intra_neon
push {lr} @ the filter macro uses lr as scratch
sub r0, r0, #4 @ start four pixels left of the edge
@ Rows 0-7 go to the low d halves, rows 8-15 to the high d halves.
vld1.64 {d22}, [r0], r1
vld1.64 {d20}, [r0], r1
vld1.64 {d18}, [r0], r1
vld1.64 {d16}, [r0], r1
vld1.64 {d0}, [r0], r1
vld1.64 {d2}, [r0], r1
vld1.64 {d4}, [r0], r1
vld1.64 {d6}, [r0], r1
vld1.64 {d23}, [r0], r1
vld1.64 {d21}, [r0], r1
vld1.64 {d19}, [r0], r1
vld1.64 {d17}, [r0], r1
vld1.64 {d1}, [r0], r1
vld1.64 {d3}, [r0], r1
vld1.64 {d5}, [r0], r1
vld1.64 {d7}, [r0], r1
TRANSPOSE8x8 q11, q10, q9, q8, q0, q1, q2, q3
align_push_regs
h264_loop_filter_luma_intra
TRANSPOSE8x8 q11, q10, q9, q8, q0, q1, q2, q3
sub r0, r0, r1, lsl #4 @ back to row 0 (r0 = pix - 4)
@ All eight bytes of every row are written back.
vst1.64 {d22}, [r0], r1
vst1.64 {d20}, [r0], r1
vst1.64 {d18}, [r0], r1
vst1.64 {d16}, [r0], r1
vst1.64 {d0}, [r0], r1
vst1.64 {d2}, [r0], r1
vst1.64 {d4}, [r0], r1
vst1.64 {d6}, [r0], r1
vst1.64 {d23}, [r0], r1
vst1.64 {d21}, [r0], r1
vst1.64 {d19}, [r0], r1
vst1.64 {d17}, [r0], r1
vst1.64 {d1}, [r0], r1
vst1.64 {d3}, [r0], r1
vst1.64 {d5}, [r0], r1
vst1.64 {d7}, [r0], r1
@ Early-exit target of h264_loop_filter_luma_intra: nothing is stored.
9:
align_pop_regs
pop {pc}
endfunc
// h264_loop_filter_chroma: tc-based chroma edge filter on 16 byte lanes.
// In:   q9 = p1, q8 = p0, q0 = q0, q1 = q1
//       r2 = alpha, r3 = beta
//       d24 = the four tc bytes as left by h264_loop_filter_start
// Out:  q8 = filtered p0, q0 = filtered q0
// Clobbers: q2, q3, q10-q15 (q4-q7 are not touched)
// Lanes whose tc byte is negative, or that fail an alpha/beta test, are left
// unchanged.
.macro h264_loop_filter_chroma
vdup.8 q11, r2 // alpha
// The two vmovl.u8 / vsli.16 pairs on q12 replicate every tc byte over four
// adjacent byte lanes.
vmovl.u8 q12, d24
vabd.u8 q13, q8, q0 // abs(p0 - q0)
vabd.u8 q14, q9, q8 // abs(p1 - p0)
vsubl.u8 q2, d0, d16 // q0 - p0
vsubl.u8 q3, d1, d17
vsli.16 q12, q12, #8
vshl.i16 q2, q2, #2 // (q0 - p0) << 2
vshl.i16 q3, q3, #2
vabd.u8 q15, q1, q0 // abs(q1 - q0)
vmovl.u8 q12, d24
vaddw.u8 q2, q2, d18 // + p1
vaddw.u8 q3, q3, d19
vclt.u8 q13, q13, q11 // < alpha
vsubw.u8 q2, q2, d2 // - q1
vsubw.u8 q3, q3, d3
vsli.16 q12, q12, #8
vdup.8 q11, r3 // beta
vclt.s8 q10, q12, #0 // lanes with tc < 0
vrshrn.i16 d4, q2, #3 // delta = (((q0 - p0) << 2) + p1 - q1 + 4) >> 3
vrshrn.i16 d5, q3, #3
vclt.u8 q14, q14, q11 // < beta
// Zero tc itself where it is negative. Clearing only the filter mask is not
// enough: delta is masked to 0 before the vmax below, and max(0, -tc) would
// then be positive in those lanes and modify pixels that must stay untouched.
vbic q12, q12, q10
vclt.u8 q15, q15, q11 // < beta
vand q13, q13, q14
vneg.s8 q10, q12 // -tc
vand q13, q13, q15 // q13 = filter mask
vmin.s8 q2, q2, q12 // delta = min(delta, tc)
vmovl.u8 q14, d16
vand q2, q2, q13 // delta = 0 in lanes that are not filtered
vmovl.u8 q15, d17
vmax.s8 q2, q2, q10 // delta = max(delta, -tc)
vmovl.u8 q11, d0
vmovl.u8 q12, d1
vaddw.s8 q14, q14, d4 // p0 + delta
vaddw.s8 q15, q15, d5
vsubw.s8 q11, q11, d4 // q0 - delta
vsubw.s8 q12, q12, d5
vqmovun.s16 d16, q14 // saturate back to unsigned bytes
vqmovun.s16 d17, q15
vqmovun.s16 d0, q11
vqmovun.s16 d1, q12
.endm
@ C prototype (deblock.h):
@   void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
@ Filters 16 bytes across the edge between the rows at pix - stride and pix.
@ In: r0 = pix, r1 = stride, r2 = alpha, r3 = beta, [sp] = tc0
@ The :128 qualifiers require every accessed row to be 16-byte aligned.
function deblock_v_chroma_neon
h264_loop_filter_start
sub r0, r0, r1, lsl #1 @ r0 = pix - 2*stride
vld1.8 {d18,d19}, [r0,:128], r1 @ p1 row
vld1.8 {d16,d17}, [r0,:128], r1 @ p0 row
vld1.8 {d0, d1}, [r0,:128], r1 @ q0 row
vld1.8 {d2, d3}, [r0,:128] @ q1 row; no writeback, r0 = pix + stride
h264_loop_filter_chroma
sub r0, r0, r1, lsl #1 @ r0 = pix - stride
vst1.8 {d16,d17}, [r0,:128], r1 @ filtered p0
vst1.8 {d0, d1}, [r0,:128], r1 @ filtered q0
bx lr
endfunc
@ C prototype (deblock.h):
@   void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
@ Filters a vertical chroma edge over 8 rows, reading 8 bytes per row from
@ pix - 4 and writing the middle 4 bytes of each row back.
@ In: r0 = pix, r1 = stride, r2 = alpha, r3 = beta, [sp] = tc0
@ TRANSPOSE4x4_16 is a macro defined outside this section (presumably asm.S).
function deblock_h_chroma_neon
h264_loop_filter_start
sub r0, r0, #4
@ Secondary entry point used by deblock_h_chroma_422_neon.
@ Expects r0 = pix - 4 and d24 = tc0 bytes; returns with bx lr and
@ r0 = entry value + 2 + 8*r1, leaving r1-r3 unchanged.
deblock_h_chroma:
vld1.8 {d18}, [r0], r1
vld1.8 {d16}, [r0], r1
vld1.8 {d0}, [r0], r1
vld1.8 {d2}, [r0], r1
vld1.8 {d19}, [r0], r1
vld1.8 {d17}, [r0], r1
vld1.8 {d1}, [r0], r1
vld1.8 {d3}, [r0], r1
TRANSPOSE4x4_16 q9, q8, q0, q1
h264_loop_filter_chroma
vtrn.16 q8, q0 @ regroup filtered p0 / q0 into per-row 32-bit words
sub r0, r0, r1, lsl #3 @ back to row 0
add r0, r0, #2 @ skip the two p1 bytes, which are not rewritten
vst1.32 {d16[0]}, [r0], r1
vst1.32 {d0[0]}, [r0], r1
vst1.32 {d16[1]}, [r0], r1
vst1.32 {d0[1]}, [r0], r1
vst1.32 {d17[0]}, [r0], r1
vst1.32 {d1[0]}, [r0], r1
vst1.32 {d17[1]}, [r0], r1
vst1.32 {d1[1]}, [r0], r1
bx lr
endfunc
@ C prototype (deblock.h):
@   void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
@ Filters a vertical chroma edge over 16 rows by running the 8-row
@ deblock_h_chroma body twice with a doubled stride: first on rows
@ 0, 2, ..., 14, then on rows 1, 3, ..., 15, both times with the same tc0.
@ In: r0 = pix, r1 = stride, r2 = alpha, r3 = beta, [sp] = tc0
function deblock_h_chroma_422_neon
h264_loop_filter_start
push {lr}
sub r0, r0, #4
add r1, r1, r1 @ r1 = 2*stride
bl deblock_h_chroma
@ The filter macro overwrote q12, so fetch tc0 again. It now sits at
@ [sp, #4] because lr was pushed.
ldr ip, [sp, #4]
ldr ip, [ip]
vdup.32 d24, ip
sub r0, r0, r1, lsl #3 @ undo the 8 double-stride steps
add r0, r0, r1, lsr #1 @ advance by one original row
sub r0, r0, #2 @ r0 = pix - 4 + stride
pop {lr}
b deblock_h_chroma @ tail call; returns to our caller
endfunc
@ h264_loop_filter_chroma8: 8-lane (d register) variant of the tc-based
@ chroma edge filter.
@ In:   d18 = p1, d16 = p0, d0 = q0, d2 = q1
@       r2 = alpha, r3 = beta
@       d24 = the four tc bytes as left by h264_loop_filter_start
@ Out:  d16 = filtered p0, d0 = filtered q0
@ Clobbers: q2, d20, q11-q15
@ Lanes whose tc byte is negative, or that fail an alpha/beta test, are left
@ unchanged.
.macro h264_loop_filter_chroma8
vdup.8 d22, r2 @ alpha
vmovl.u8 q12, d24
vabd.u8 d26, d16, d0 @ abs(p0 - q0)
vabd.u8 d28, d18, d16 @ abs(p1 - p0)
vsubl.u8 q2, d0, d16 @ q0 - p0
vsli.16 d24, d24, #8 @ every tc byte now fills two adjacent byte lanes
vshl.i16 q2, q2, #2 @ (q0 - p0) << 2
vabd.u8 d30, d2, d0 @ abs(q1 - q0)
vaddw.u8 q2, q2, d18 @ + p1
vclt.u8 d26, d26, d22 @ < alpha
vsubw.u8 q2, q2, d2 @ - q1
vdup.8 d22, r3 @ beta
vclt.s8 d20, d24, #0 @ lanes with tc < 0
vrshrn.i16 d4, q2, #3 @ delta = (((q0 - p0) << 2) + p1 - q1 + 4) >> 3
vclt.u8 d28, d28, d22 @ < beta
@ Zero tc itself where it is negative. Clearing only the filter mask is not
@ enough: delta is masked to 0 before the vmax below, and max(0, -tc) would
@ then be positive in those lanes and modify pixels that must stay untouched.
vbic d24, d24, d20
vclt.u8 d30, d30, d22 @ < beta
vand d26, d26, d28
vneg.s8 d20, d24 @ -tc
vand d26, d26, d30 @ d26 = filter mask
vmin.s8 d4, d4, d24 @ delta = min(delta, tc)
vmovl.u8 q14, d16
vand d4, d4, d26 @ delta = 0 in lanes that are not filtered
vmax.s8 d4, d4, d20 @ delta = max(delta, -tc)
vmovl.u8 q11, d0
vaddw.s8 q14, q14, d4 @ p0 + delta
vsubw.s8 q11, q11, d4 @ q0 - delta
vqmovun.s16 d16, q14 @ saturate back to unsigned bytes
vqmovun.s16 d0, q11
.endm
@ C prototype (deblock.h):
@   void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
@ Four-row variant of deblock_h_chroma_neon: reads 8 bytes per row from
@ pix - 4 and writes the middle 4 bytes of each row back.
@ In: r0 = pix, r1 = stride, r2 = alpha, r3 = beta, [sp] = tc0
@ TRANSPOSE4x4_16 is a macro defined outside this section (presumably asm.S).
function deblock_h_chroma_mbaff_neon
h264_loop_filter_start
sub r0, r0, #4
vld1.8 {d18}, [r0], r1
vld1.8 {d16}, [r0], r1
vld1.8 {d0}, [r0], r1
vld1.8 {d2}, [r0], r1
TRANSPOSE4x4_16 d18, d16, d0, d2
h264_loop_filter_chroma8
vtrn.16 d16, d0 @ regroup filtered p0 / q0 into per-row 32-bit words
sub r0, r0, r1, lsl #2 @ back to row 0
add r0, r0, #2 @ skip the two p1 bytes, which are not rewritten
vst1.32 {d16[0]}, [r0], r1
vst1.32 {d0[0]}, [r0], r1
vst1.32 {d16[1]}, [r0], r1
vst1.32 {d0[1]}, [r0]
bx lr
endfunc
@ h264_loop_filter_chroma_intra: intra chroma edge filter.
@ width=16 (default) processes all 16 byte lanes of the q registers; any
@ other value (width=8 is the one used in this file) skips the arithmetic
@ for the high d halves.
@ In:   q9 = p1, q8 = p0, q0 = q0, q1 = q1; r2 = alpha, r3 = beta
@ Out:  q8 = filtered p0, q0 = filtered q0
@ Clobbers: q1, q2, q11, q13-q15, plus q3, q10, q12 when width is 16
.macro h264_loop_filter_chroma_intra, width=16
vdup.8 q11, r2 @ alpha
vabd.u8 q13, q8, q0 @ abs(p0 - q0)
vabd.u8 q14, q9, q8 @ abs(p1 - p0)
vabd.u8 q15, q1, q0 @ abs(q1 - q0)
vclt.u8 q13, q13, q11 @ < alpha
vdup.8 q11, r3 @ beta
vclt.u8 q14, q14, q11 @ < beta
vclt.u8 q15, q15, q11 @ < beta
vand q13, q13, q14
vand q13, q13, q15 @ q13 = filter mask
vshll.u8 q14, d18, #1 @ 2*p1
vshll.u8 q2, d2, #1 @ 2*q1
.ifc \width, 16
vshll.u8 q15, d19, #1
vshll.u8 q3, d3, #1
vaddl.u8 q12, d17, d3
vaddl.u8 q10, d1, d19
.endif
vaddl.u8 q11, d16, d2 @ p0 + q1
vaddl.u8 q1, d18, d0 @ or vaddw q2, to not clobber q1
vadd.u16 q14, q14, q11 @ 2*p1 + p0 + q1
vadd.u16 q2, q2, q1 @ 2*q1 + q0 + p1
.ifc \width, 16
vadd.u16 q15, q15, q12
vadd.u16 q3, q3, q10
.endif
vqrshrn.u16 d28, q14, #2 @ rounding shift by 2, narrowed to bytes
vqrshrn.u16 d4, q2, #2
.ifc \width, 16
vqrshrn.u16 d29, q15, #2
vqrshrn.u16 d5, q3, #2
.endif
vbit q8, q14, q13 @ take the new p0 only where the mask is set
vbit q0, q2, q13 @ take the new q0 only where the mask is set
.endm
@ C prototype (deblock.h):
@   void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta )
@ Intra-filters 16 bytes across the edge between the rows at pix - stride and pix.
@ In: r0 = pix, r1 = stride, r2 = alpha, r3 = beta
@ Rows are loaded with vld2.8 (even bytes to the first register, odd bytes to
@ the second) and stored with the matching vst2.8, so the byte order in
@ memory is preserved.
@ The :128 qualifiers require every accessed row to be 16-byte aligned.
function deblock_v_chroma_intra_neon
sub r0, r0, r1, lsl #1 @ r0 = pix - 2*stride
vld2.8 {d18,d19}, [r0,:128], r1 @ p1 row
vld2.8 {d16,d17}, [r0,:128], r1 @ p0 row
vld2.8 {d0, d1}, [r0,:128], r1 @ q0 row
vld2.8 {d2, d3}, [r0,:128] @ q1 row; no writeback, r0 = pix + stride
h264_loop_filter_chroma_intra
sub r0, r0, r1, lsl #1 @ r0 = pix - stride
vst2.8 {d16,d17}, [r0,:128], r1 @ filtered p0
vst2.8 {d0, d1}, [r0,:128], r1 @ filtered q0
bx lr
endfunc
@ C prototype (deblock.h):
@   void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta )
@ Intra-filters a vertical chroma edge over 8 rows, reading 8 bytes per row
@ from pix - 4 and writing the middle 4 bytes of each row back.
@ In:  r0 = pix, r1 = stride, r2 = alpha, r3 = beta
@ Out: r0 = pix - 2 + 8*stride; r1-r3 are left unchanged, which
@      deblock_h_chroma_422_intra_neon relies on.
@ TRANSPOSE4x4_16 is a macro defined outside this section (presumably asm.S).
function deblock_h_chroma_intra_neon
sub r0, r0, #4
vld1.8 {d18}, [r0], r1
vld1.8 {d16}, [r0], r1
vld1.8 {d0}, [r0], r1
vld1.8 {d2}, [r0], r1
vld1.8 {d19}, [r0], r1
vld1.8 {d17}, [r0], r1
vld1.8 {d1}, [r0], r1
vld1.8 {d3}, [r0], r1
TRANSPOSE4x4_16 q9, q8, q0, q1
h264_loop_filter_chroma_intra
vtrn.16 q8, q0 @ regroup filtered p0 / q0 into per-row 32-bit words
sub r0, r0, r1, lsl #3 @ back to row 0
add r0, r0, #2 @ skip the two p1 bytes, which are not rewritten
vst1.32 {d16[0]}, [r0], r1
vst1.32 {d0[0]}, [r0], r1
vst1.32 {d16[1]}, [r0], r1
vst1.32 {d0[1]}, [r0], r1
vst1.32 {d17[0]}, [r0], r1
vst1.32 {d1[0]}, [r0], r1
vst1.32 {d17[1]}, [r0], r1
vst1.32 {d1[1]}, [r0], r1
bx lr
endfunc
@ C prototype (deblock.h):
@   void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta )
@ Intra-filters a vertical chroma edge over 16 rows by running
@ deblock_h_chroma_intra_neon on rows 0-7 and then on rows 8-15.
@ In: r0 = pix, r1 = stride, r2 = alpha, r3 = beta
function deblock_h_chroma_422_intra_neon
push {lr}
bl X(deblock_h_chroma_intra_neon) @ returns with r0 = pix - 2 + 8*stride
add r0, r0, #2 @ r0 = pix + 8*stride
pop {lr}
b X(deblock_h_chroma_intra_neon) @ tail call; returns to our caller
endfunc
@ C prototype (deblock.h):
@   void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta )
@ Four-row variant of deblock_h_chroma_intra_neon: reads 8 bytes per row from
@ pix - 4 and writes the middle 4 bytes of each row back.
@ In: r0 = pix, r1 = stride, r2 = alpha, r3 = beta
@ TRANSPOSE4x4_16 is a macro defined outside this section (presumably asm.S).
function deblock_h_chroma_intra_mbaff_neon
sub r0, r0, #4
vld1.8 {d18}, [r0], r1
vld1.8 {d16}, [r0], r1
vld1.8 {d0}, [r0], r1
vld1.8 {d2}, [r0], r1
TRANSPOSE4x4_16 d18, d16, d0, d2
h264_loop_filter_chroma_intra width=8
vtrn.16 d16, d0 @ regroup filtered p0 / q0 into per-row 32-bit words
sub r0, r0, r1, lsl #2 @ back to row 0
add r0, r0, #2 @ skip the two p1 bytes, which are not rewritten
vst1.32 {d16[0]}, [r0], r1
vst1.32 {d0[0]}, [r0], r1
vst1.32 {d16[1]}, [r0], r1
vst1.32 {d0[1]}, [r0]
bx lr
endfunc
@ C prototype (deblock.h):
@   void x264_deblock_strength_neon( uint8_t nnz[], int8_t ref[2][], int16_t mv[2][][2],
@                                    uint8_t bs[2][8][4], int mvy_limit, int bframe )
@ In:  r0 = nnz, r1 = ref, r2 = mv, r3 = bs, [sp] = mvy_limit, [sp, #4] = bframe
@ Out: 16 bytes stored at bs (bs[0]) and 16 bytes at bs + 32 (bs[1]); each
@      byte is 2 if either compared nnz value is non-zero, else 1 if the refs
@      differ or an mv difference exceeds its threshold, else 0.
@ q8 accumulates the comparisons that end up in bs[0], q9 those for bs[1].
@ Clobbers: r0-r3, ip, q0-q3, q8-q15, flags (q4-q7 are not touched)
@ The :128 qualifiers require mv + 16 and bs to be 16-byte aligned.
function deblock_strength_neon
ldr ip, [sp] @ mvy_limit
vmov.i8 q8, #0
lsl ip, ip, #8
add r3, r3, #32 @ r3 = bs[1]
sub ip, ip, #(1<<8)-3 @ ip = ((mvy_limit - 1) << 8) + 3
vmov.i8 q9, #0
@ Per-byte thresholds: 3 in every even byte and mvy_limit - 1 in every odd
@ byte (presumably the x and y mv components, respectively).
vdup.16 q10, ip
ldr ip, [sp, #4] @ bframe
@ One pass per reference list; each pass consumes 40 bytes of ref and
@ 160 bytes of mv.
lists:
@ load bytes ref
vld1.8 {d31}, [r1]!
add r2, r2, #16 @ skip the first 16 bytes of this list mv data
vld1.8 {q1}, [r1]!
vmov.i8 q0, #0
vld1.8 {q2}, [r1]!
vext.8 q3, q0, q1, #15 @ q1 moved up by one byte (preceding ref)
vext.8 q0, q0, q2, #15 @ q2 moved up by one byte
vuzp.32 q1, q2 @ q2 = odd 32-bit words of the two loads
vuzp.32 q3, q0 @ q0 = odd 32-bit words of the shifted copies
vext.8 q1, q15, q2, #12 @ q2 delayed by one word, led by the high word of d31
veor q0, q0, q2 @ non-zero where ref differs from the preceding byte
veor q1, q1, q2 @ non-zero where ref differs from the preceding word
vorr q8, q8, q0
vorr q9, q9, q1
vld1.16 {q11}, [r2,:128]! @ mv + 0x10
vld1.16 {q3}, [r2,:128]! @ mv + 0x20
vld1.16 {q12}, [r2,:128]! @ mv + 0x30
vld1.16 {q2}, [r2,:128]! @ mv + 0x40
vld1.16 {q13}, [r2,:128]! @ mv + 0x50
@ Each vext below prepends the last 4 bytes of the preceding 16-byte load to
@ a row, so the vabd compares every mv with the mv stored 4 bytes before it.
vext.8 q3, q3, q12, #12
vext.8 q2, q2, q13, #12
vabd.s16 q0, q12, q3
vld1.16 {q3}, [r2,:128]! @ mv + 0x60
vabd.s16 q1, q13, q2
vld1.16 {q14}, [r2,:128]! @ mv + 0x70
vqmovn.u16 d0, q0
vld1.16 {q2}, [r2,:128]! @ mv + 0x80
vld1.16 {q15}, [r2,:128]! @ mv + 0x90
vqmovn.u16 d1, q1
vext.8 q3, q3, q14, #12
vext.8 q2, q2, q15, #12
vabd.s16 q3, q14, q3
vabd.s16 q2, q15, q2
vqmovn.u16 d2, q3
vqmovn.u16 d3, q2
@ Saturating subtract leaves a non-zero byte only where the difference is
@ above its threshold; the second narrowing merges each byte pair into one.
vqsub.u8 q0, q0, q10
vqsub.u8 q1, q1, q10
vqmovn.u16 d0, q0
vqmovn.u16 d1, q1
vabd.s16 q1, q12, q13
vorr q8, q8, q0
@ Same test between consecutive rows (q11..q15), accumulated into q9.
vabd.s16 q0, q11, q12
vabd.s16 q2, q13, q14
vabd.s16 q3, q14, q15
vqmovn.u16 d0, q0
vqmovn.u16 d1, q1
vqmovn.u16 d2, q2
vqmovn.u16 d3, q3
vqsub.u8 q0, q0, q10
vqsub.u8 q1, q1, q10
vqmovn.u16 d0, q0
vqmovn.u16 d1, q1
subs ip, ip, #1
vorr q9, q9, q0
beq lists @ a second pass runs only when bframe == 1
mov ip, #-32 @ post-index step that moves r3 from bs[1] back to bs[0]
@ load bytes nnz
vld1.8 {d31}, [r0]!
vld1.8 {q1}, [r0]!
vmov.i8 q0, #0
vld1.8 {q2}, [r0]
@ Same neighbour shuffling as for ref above, but the values are OR-ed.
vext.8 q3, q0, q1, #15
vext.8 q0, q0, q2, #15
vuzp.32 q1, q2
vuzp.32 q3, q0
vext.8 q1, q15, q2, #12
vorr q0, q0, q2
vorr q1, q1, q2
vmov.u8 q10, #1
vmin.u8 q0, q0, q10
vmin.u8 q1, q1, q10
vmin.u8 q8, q8, q10 @ mv ? 1 : 0
vmin.u8 q9, q9, q10
vadd.u8 q0, q0, q0 @ nnz ? 2 : 0
vadd.u8 q1, q1, q1
vmax.u8 q8, q8, q0
vmax.u8 q9, q9, q1
@ The vzip / vtrn steps reorder the 16 bytes of q8 before they are stored
@ as bs[0]; q9 is stored as it is.
vzip.16 d16, d17
vst1.8 {q9}, [r3,:128], ip @ bs[1]
vtrn.8 d16, d17
vtrn.32 d16, d17
vst1.8 {q8}, [r3,:128] @ bs[0]
bx lr
endfunc

58
common/arm/deblock.h Normal file
View File

@@ -0,0 +1,58 @@
/*****************************************************************************
* deblock.h: arm deblocking
*****************************************************************************
* Copyright (C) 2017-2025 x264 project
*
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_ARM_DEBLOCK_H
#define X264_ARM_DEBLOCK_H
/* Prototypes of the NEON deblocking routines.
 * Every public name is first mapped through x264_template(), which is
 * defined outside this header. */

/* tc0-based edge filters. tc0 points to four signed bytes; the NEON code
 * returns without touching pix when all four of them are negative. */
#define x264_deblock_v_luma_neon x264_template(deblock_v_luma_neon)
void x264_deblock_v_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_luma_neon x264_template(deblock_h_luma_neon)
void x264_deblock_h_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_v_chroma_neon x264_template(deblock_v_chroma_neon)
void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_chroma_neon x264_template(deblock_h_chroma_neon)
void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
/* Writes the boundary strengths (values 0, 1 or 2) for both edge directions into bs. */
#define x264_deblock_strength_neon x264_template(deblock_strength_neon)
void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
/* 16-row (4:2:2) and 4-row (MBAFF) variants of the horizontal chroma filter. */
#define x264_deblock_h_chroma_422_neon x264_template(deblock_h_chroma_422_neon)
void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_chroma_mbaff_neon x264_template(deblock_h_chroma_mbaff_neon)
void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
/* Intra edge filters: same layout as above but without a tc0 argument. */
#define x264_deblock_h_chroma_intra_mbaff_neon x264_template(deblock_h_chroma_intra_mbaff_neon)
void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_chroma_intra_neon x264_template(deblock_h_chroma_intra_neon)
void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_chroma_422_intra_neon x264_template(deblock_h_chroma_422_intra_neon)
void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_v_chroma_intra_neon x264_template(deblock_v_chroma_intra_neon)
void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_luma_intra_neon x264_template(deblock_h_luma_intra_neon)
void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_v_luma_intra_neon x264_template(deblock_v_luma_intra_neon)
void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#endif

1938
common/arm/mc-a.S Normal file

File diff suppressed because it is too large Load Diff

366
common/arm/mc-c.c Normal file
View File

@@ -0,0 +1,366 @@
/*****************************************************************************
* mc-c.c: arm motion compensation
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common/common.h"
#include "mc.h"
/* Prototypes of the ARM / NEON routines referenced by this file. Every name
 * is first mapped through x264_template(), which is defined outside this
 * file; the routines themselves are not defined in this file. */

/* Prefetch and aligned memory helpers. */
#define x264_prefetch_ref_arm x264_template(prefetch_ref_arm)
void x264_prefetch_ref_arm( uint8_t *, intptr_t, int );
#define x264_prefetch_fenc_arm x264_template(prefetch_fenc_arm)
void x264_prefetch_fenc_arm( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_memcpy_aligned_neon x264_template(memcpy_aligned_neon)
void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n );
#define x264_memzero_aligned_neon x264_template(memzero_aligned_neon)
void x264_memzero_aligned_neon( void *dst, size_t n );
/* pixel_avg_WxH: one routine per block size. */
#define x264_pixel_avg_16x16_neon x264_template(pixel_avg_16x16_neon)
void x264_pixel_avg_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_16x8_neon x264_template(pixel_avg_16x8_neon)
void x264_pixel_avg_16x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_8x16_neon x264_template(pixel_avg_8x16_neon)
void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_8x8_neon x264_template(pixel_avg_8x8_neon)
void x264_pixel_avg_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_8x4_neon x264_template(pixel_avg_8x4_neon)
void x264_pixel_avg_8x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x16_neon x264_template(pixel_avg_4x16_neon)
void x264_pixel_avg_4x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x8_neon x264_template(pixel_avg_4x8_neon)
void x264_pixel_avg_4x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x4_neon x264_template(pixel_avg_4x4_neon)
void x264_pixel_avg_4x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon)
void x264_pixel_avg_4x2_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
/* pixel_avg2_wN: called below as ( dst, dst stride, src1, src1 stride, src2, height );
 * these fill pixel_avg_wtab_neon. */
#define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon)
void x264_pixel_avg2_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon)
void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w16_neon x264_template(pixel_avg2_w16_neon)
void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w20_neon x264_template(pixel_avg2_w20_neon)
void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
/* Plane copy, interleave / deinterleave and chroma load / store helpers. */
#define x264_plane_copy_core_neon x264_template(plane_copy_core_neon)
void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
pixel *src, intptr_t i_src, int w, int h );
#define x264_plane_copy_deinterleave_neon x264_template(plane_copy_deinterleave_neon)
void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
pixel *src, intptr_t i_src, int w, int h );
#define x264_plane_copy_deinterleave_rgb_neon x264_template(plane_copy_deinterleave_rgb_neon)
void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
pixel *dstb, intptr_t i_dstb,
pixel *dstc, intptr_t i_dstc,
pixel *src, intptr_t i_src, int pw, int w, int h );
#define x264_plane_copy_interleave_core_neon x264_template(plane_copy_interleave_core_neon)
void x264_plane_copy_interleave_core_neon( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
#define x264_plane_copy_swap_core_neon x264_template(plane_copy_swap_core_neon)
void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst,
pixel *src, intptr_t i_src, int w, int h );
#define x264_store_interleave_chroma_neon x264_template(store_interleave_chroma_neon)
void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
#define x264_load_deinterleave_chroma_fdec_neon x264_template(load_deinterleave_chroma_fdec_neon)
void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
#define x264_load_deinterleave_chroma_fenc_neon x264_template(load_deinterleave_chroma_fenc_neon)
void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
/* Name mappings for the weighted-prediction routines; their prototypes are
 * generated by MC_WEIGHT. */
#define x264_mc_weight_w16_neon x264_template(mc_weight_w16_neon)
#define x264_mc_weight_w16_nodenom_neon x264_template(mc_weight_w16_nodenom_neon)
#define x264_mc_weight_w16_offsetadd_neon x264_template(mc_weight_w16_offsetadd_neon)
#define x264_mc_weight_w16_offsetsub_neon x264_template(mc_weight_w16_offsetsub_neon)
#define x264_mc_weight_w20_neon x264_template(mc_weight_w20_neon)
#define x264_mc_weight_w20_nodenom_neon x264_template(mc_weight_w20_nodenom_neon)
#define x264_mc_weight_w20_offsetadd_neon x264_template(mc_weight_w20_offsetadd_neon)
#define x264_mc_weight_w20_offsetsub_neon x264_template(mc_weight_w20_offsetsub_neon)
#define x264_mc_weight_w4_neon x264_template(mc_weight_w4_neon)
#define x264_mc_weight_w4_nodenom_neon x264_template(mc_weight_w4_nodenom_neon)
#define x264_mc_weight_w4_offsetadd_neon x264_template(mc_weight_w4_offsetadd_neon)
#define x264_mc_weight_w4_offsetsub_neon x264_template(mc_weight_w4_offsetsub_neon)
#define x264_mc_weight_w8_neon x264_template(mc_weight_w8_neon)
#define x264_mc_weight_w8_nodenom_neon x264_template(mc_weight_w8_nodenom_neon)
#define x264_mc_weight_w8_offsetadd_neon x264_template(mc_weight_w8_offsetadd_neon)
#define x264_mc_weight_w8_offsetsub_neon x264_template(mc_weight_w8_offsetsub_neon)
#if !HIGH_BIT_DEPTH
/* MC_WEIGHT(func) declares the four x264_mc_weight_w{4,8,16,20}<func>_neon
 * routines and builds the table mc<func>_wtab_neon from them. Callers in this
 * file index the table with i_width>>2; entries 0 and 1 both hold the w4
 * routine and entries 3 and 4 both hold the w16 routine. */
#define MC_WEIGHT(func)\
void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
\
static weight_fn_t mc##func##_wtab_neon[6] =\
{\
x264_mc_weight_w4##func##_neon,\
x264_mc_weight_w4##func##_neon,\
x264_mc_weight_w8##func##_neon,\
x264_mc_weight_w16##func##_neon,\
x264_mc_weight_w16##func##_neon,\
x264_mc_weight_w20##func##_neon,\
};
/* Instantiated tables: mc_wtab_neon, mc_nodenom_wtab_neon,
 * mc_offsetadd_wtab_neon and mc_offsetsub_wtab_neon. */
MC_WEIGHT()
MC_WEIGHT(_nodenom)
MC_WEIGHT(_offsetadd)
MC_WEIGHT(_offsetsub)
#endif
/* mc_copy_wN: called below as ( dst, dst stride, src, src stride, height );
 * the w4, w8 and w16 variants fill mc_copy_wtab_neon. */
#define x264_mc_copy_w4_neon x264_template(mc_copy_w4_neon)
void x264_mc_copy_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_mc_copy_w8_neon x264_template(mc_copy_w8_neon)
void x264_mc_copy_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_mc_copy_w16_neon x264_template(mc_copy_w16_neon)
void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_mc_copy_w16_aligned_neon x264_template(mc_copy_w16_aligned_neon)
void x264_mc_copy_w16_aligned_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
/* Chroma motion compensation and lowres frame initialisation. */
#define x264_mc_chroma_neon x264_template(mc_chroma_neon)
void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
#define x264_frame_init_lowres_core_neon x264_template(frame_init_lowres_core_neon)
void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );
/* Half-pel filter stages. */
#define x264_hpel_filter_v_neon x264_template(hpel_filter_v_neon)
void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, intptr_t, int );
#define x264_hpel_filter_c_neon x264_template(hpel_filter_c_neon)
void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
#define x264_hpel_filter_h_neon x264_template(hpel_filter_h_neon)
void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
/* Integral image initialisation. */
#define x264_integral_init4h_neon x264_template(integral_init4h_neon)
void x264_integral_init4h_neon( uint16_t *, uint8_t *, intptr_t );
#define x264_integral_init4v_neon x264_template(integral_init4v_neon)
void x264_integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
#define x264_integral_init8h_neon x264_template(integral_init8h_neon)
void x264_integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
#define x264_integral_init8v_neon x264_template(integral_init8v_neon)
void x264_integral_init8v_neon( uint16_t *, intptr_t );
/* MB-tree helpers. */
#define x264_mbtree_propagate_cost_neon x264_template(mbtree_propagate_cost_neon)
void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
#define x264_mbtree_fix8_pack_neon x264_template(mbtree_fix8_pack_neon)
void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count );
#define x264_mbtree_fix8_unpack_neon x264_template(mbtree_fix8_unpack_neon)
void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count );
#if !HIGH_BIT_DEPTH
/* Select the NEON weight-function table that matches the parameters in w.
 *   i_scale == 1 << i_denom : one of the offset-only tables (offsetsub for a
 *                             negative offset, offsetadd otherwise), with the
 *                             magnitude of i_offset stored in cachea[0];
 *   i_denom == 0            : the nodenom table;
 *   anything else           : the generic table.
 * h is not used. */
static void weight_cache_neon( x264_t *h, x264_weight_t *w )
{
    if( w->i_scale != 1<<w->i_denom )
    {
        w->weightfn = w->i_denom ? mc_wtab_neon : mc_nodenom_wtab_neon;
        return;
    }

    if( w->i_offset >= 0 )
    {
        w->weightfn = mc_offsetadd_wtab_neon;
        w->cachea[0] = w->i_offset;
    }
    else
    {
        w->weightfn = mc_offsetsub_wtab_neon;
        w->cachea[0] = -w->i_offset;
    }
}
/* Two-source averaging routines, indexed by i_width>>2.
 * Slot 0 is intentionally empty. */
static void (* const pixel_avg_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) =
{
    [0] = NULL,
    [1] = x264_pixel_avg2_w4_neon,
    [2] = x264_pixel_avg2_w8_neon,
    [3] = x264_pixel_avg2_w16_neon, // no slower than w12, so no point in a separate function
    [4] = x264_pixel_avg2_w16_neon,
    [5] = x264_pixel_avg2_w20_neon,
};
/* Plain block-copy routines, indexed by i_width>>2.
 * Slots 0 and 3 are intentionally empty. */
static void (* const mc_copy_wtab_neon[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) =
{
    [0] = NULL,
    [1] = x264_mc_copy_w4_neon,
    [2] = x264_mc_copy_w8_neon,
    [3] = NULL,
    [4] = x264_mc_copy_w16_neon,
};
/* Luma motion compensation into dst for the quarter-pel vector (mvx, mvy).
 *
 * src[] holds the four reference planes selected through x264_hpel_ref0 /
 * x264_hpel_ref1. When (qpel_idx & 5) is non-zero two planes are averaged
 * into dst and, if a weight function is set, dst is then weighted in place.
 * Otherwise the single source is weighted into dst or, without a weight
 * function, copied. All function tables are indexed with i_width>>2. */
static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride,
                          uint8_t *src[4], intptr_t i_src_stride,
                          int mvx, int mvy,
                          int i_width, int i_height, const x264_weight_t *weight )
{
    const int frac_x = mvx&3;
    const int frac_y = mvy&3;
    const int qpel_idx = (frac_y<<2) + frac_x;
    const int tab_idx = i_width>>2;
    const intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);

    uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
    if( frac_y == 3 ) // explicit if() to force conditional add
        src1 += i_src_stride;

    if( !(qpel_idx & 5) )
    {
        /* No qpel interpolation needed: a single source is enough. */
        if( weight->weightfn )
            weight->weightfn[tab_idx]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
        else
            mc_copy_wtab_neon[tab_idx]( dst, i_dst_stride, src1, i_src_stride, i_height );
        return;
    }

    /* qpel interpolation needed: average the two selected planes. */
    uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + (frac_x == 3);
    pixel_avg_wtab_neon[tab_idx]( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
    if( weight->weightfn )
        weight->weightfn[tab_idx]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
}
/* Return a pointer to the motion-compensated reference block.
 *
 * When averaging or weighting is required the result is written to dst and dst
 * is returned with *i_dst_stride left unchanged. Otherwise no pixels are copied:
 * the function returns a pointer into the source plane and overwrites
 * *i_dst_stride with i_src_stride. */
static uint8_t *get_ref_neon( uint8_t *dst, intptr_t *i_dst_stride,
                              uint8_t *src[4], intptr_t i_src_stride,
                              int mvx, int mvy,
                              int i_width, int i_height, const x264_weight_t *weight )
{
    const int frac_x = mvx&3;
    const int frac_y = mvy&3;
    const int qpel_idx = (frac_y<<2) + frac_x;
    const int width_idx = i_width>>2;   /* index into the per-width tables */
    const intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);

    uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
    if( frac_y == 3 )             // explicit if() to force conditional add
        src1 += i_src_stride;

    if( qpel_idx & 5 )
    {
        /* qpel interpolation needed: average two planes, then weight in place. */
        uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + (frac_x == 3);
        pixel_avg_wtab_neon[width_idx]( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
        if( weight->weightfn )
            weight->weightfn[width_idx]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
        return dst;
    }

    if( weight->weightfn )
    {
        weight->weightfn[width_idx]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
        return dst;
    }

    /* Nothing to compute: hand back the source plane itself. */
    *i_dst_stride = i_src_stride;
    return src1;
}
/* Run the three half-pel filter kernels (vertical, centre, horizontal) row by row.
 *
 * All four plane pointers are moved back by the misalignment of src relative to
 * 16 bytes and width is enlarged by the same amount, so the kernels start on a
 * 16-byte boundary of src. The centre kernel consumes the intermediate data the
 * vertical kernel left in buf+8. */
static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
                              intptr_t stride, int width, int height, int16_t *buf )
{
    const intptr_t misalign = (intptr_t)src & 15;
    int16_t *const tmp = buf+8;

    dsth  -= misalign;
    dstv  -= misalign;
    dstc  -= misalign;
    src   -= misalign;
    width += misalign;

    for( ; height != 0; height-- )
    {
        x264_hpel_filter_v_neon( dstv, src, tmp, stride, width );
        x264_hpel_filter_c_neon( dstc, tmp, width );
        x264_hpel_filter_h_neon( dsth, src, width );
        src  += stride;
        dstc += stride;
        dstv += stride;
        dsth += stride;
    }
}
/* Instantiate helper functions from project macros defined outside this section.
 * x264_mc_init_arm references plane_copy_neon, plane_copy_swap_neon,
 * plane_copy_interleave_neon and mbtree_propagate_list_neon, which presumably
 * come from these expansions - confirm against the macro definitions. */
PLANE_COPY(16, neon)
PLANE_COPY_SWAP(16, neon)
PLANE_INTERLEAVE(neon)
PROPAGATE_LIST(neon)
#endif // !HIGH_BIT_DEPTH
/* Install ARM implementations into the motion-compensation function table pf.
 *
 * cpu is a bitmask of X264_CPU_* capability flags. Nothing is installed without
 * ARMV6; the prefetch helpers need only ARMV6; everything else needs NEON.
 * All pixel-processing entries are compiled out when HIGH_BIT_DEPTH is set;
 * memcpy_aligned/memzero_aligned are installed for any bit depth. */
void x264_mc_init_arm( uint32_t cpu, x264_mc_functions_t *pf )
{
if( !(cpu&X264_CPU_ARMV6) )
return;
#if !HIGH_BIT_DEPTH
/* ARMv6-level entries: the 4:2:2 prefetch currently reuses the 4:2:0 routine. */
pf->prefetch_fenc_420 = x264_prefetch_fenc_arm;
pf->prefetch_fenc_422 = x264_prefetch_fenc_arm; /* FIXME */
pf->prefetch_ref = x264_prefetch_ref_arm;
#endif // !HIGH_BIT_DEPTH
if( !(cpu&X264_CPU_NEON) )
return;
#if !HIGH_BIT_DEPTH
/* Block copies. */
pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_neon;
pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon;
pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon;
/* Whole-plane copies and chroma (de)interleaving. */
pf->plane_copy = plane_copy_neon;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
pf->plane_copy_interleave = plane_copy_interleave_neon;
pf->plane_copy_swap = plane_copy_swap_neon;
pf->store_interleave_chroma = x264_store_interleave_chroma_neon;
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;
/* Block averaging, one entry per partition size. */
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_neon;
pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_neon;
pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_neon;
pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_neon;
pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_neon;
pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_neon;
pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_neon;
pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_neon;
/* Weighted prediction tables and the selector that picks between them. */
pf->weight = mc_wtab_neon;
pf->offsetadd = mc_offsetadd_wtab_neon;
pf->offsetsub = mc_offsetsub_wtab_neon;
pf->weight_cache = weight_cache_neon;
/* Motion compensation and half-pel plane generation. */
pf->mc_chroma = x264_mc_chroma_neon;
pf->mc_luma = mc_luma_neon;
pf->get_ref = get_ref_neon;
pf->hpel_filter = hpel_filter_neon;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
/* Integral-image and mbtree helpers. */
pf->integral_init4h = x264_integral_init4h_neon;
pf->integral_init8h = x264_integral_init8h_neon;
pf->integral_init4v = x264_integral_init4v_neon;
pf->integral_init8v = x264_integral_init8v_neon;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
pf->mbtree_propagate_list = mbtree_propagate_list_neon;
pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_neon;
pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_neon;
#endif // !HIGH_BIT_DEPTH
// Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
#ifndef SYS_MACOSX
pf->memcpy_aligned = x264_memcpy_aligned_neon;
#endif
pf->memzero_aligned = x264_memzero_aligned_neon;
}

32
common/arm/mc.h Normal file
View File

@@ -0,0 +1,32 @@
/*****************************************************************************
* mc.h: arm motion compensation
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_ARM_MC_H
#define X264_ARM_MC_H
/* Fills pf with ARM motion-compensation routines according to the X264_CPU_*
 * capability bits in cpu. The symbol name is mapped through x264_template. */
#define x264_mc_init_arm x264_template(mc_init_arm)
void x264_mc_init_arm( uint32_t cpu, x264_mc_functions_t *pf );
#endif

1535
common/arm/pixel-a.S Normal file

File diff suppressed because it is too large Load Diff

160
common/arm/pixel.h Normal file
View File

@@ -0,0 +1,160 @@
/*****************************************************************************
* pixel.h: arm pixel metrics
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_ARM_PIXEL_H
#define X264_ARM_PIXEL_H
/* Map each ARM pixel routine name through x264_template. The list is kept in
 * alphabetical order; the matching prototypes follow later in this header or are
 * generated by the DECL_* macros. */
#define x264_pixel_avg2_w16_neon x264_template(pixel_avg2_w16_neon)
#define x264_pixel_avg2_w20_neon x264_template(pixel_avg2_w20_neon)
#define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon)
#define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon)
#define x264_pixel_avg_16x16_neon x264_template(pixel_avg_16x16_neon)
#define x264_pixel_avg_16x8_neon x264_template(pixel_avg_16x8_neon)
#define x264_pixel_avg_4x16_neon x264_template(pixel_avg_4x16_neon)
#define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon)
#define x264_pixel_avg_4x4_neon x264_template(pixel_avg_4x4_neon)
#define x264_pixel_avg_4x8_neon x264_template(pixel_avg_4x8_neon)
#define x264_pixel_avg_8x16_neon x264_template(pixel_avg_8x16_neon)
#define x264_pixel_avg_8x4_neon x264_template(pixel_avg_8x4_neon)
#define x264_pixel_avg_8x8_neon x264_template(pixel_avg_8x8_neon)
#define x264_pixel_sad_16x16_neon x264_template(pixel_sad_16x16_neon)
#define x264_pixel_sad_16x8_neon x264_template(pixel_sad_16x8_neon)
#define x264_pixel_sad_4x4_armv6 x264_template(pixel_sad_4x4_armv6)
#define x264_pixel_sad_4x4_neon x264_template(pixel_sad_4x4_neon)
#define x264_pixel_sad_4x8_armv6 x264_template(pixel_sad_4x8_armv6)
#define x264_pixel_sad_4x8_neon x264_template(pixel_sad_4x8_neon)
#define x264_pixel_sad_8x16_neon x264_template(pixel_sad_8x16_neon)
#define x264_pixel_sad_8x4_neon x264_template(pixel_sad_8x4_neon)
#define x264_pixel_sad_8x8_neon x264_template(pixel_sad_8x8_neon)
#define x264_pixel_sad_aligned_16x16_neon x264_template(pixel_sad_aligned_16x16_neon)
#define x264_pixel_sad_aligned_16x16_neon_dual x264_template(pixel_sad_aligned_16x16_neon_dual)
#define x264_pixel_sad_aligned_16x8_neon x264_template(pixel_sad_aligned_16x8_neon)
#define x264_pixel_sad_aligned_16x8_neon_dual x264_template(pixel_sad_aligned_16x8_neon_dual)
#define x264_pixel_sad_aligned_4x4_neon x264_template(pixel_sad_aligned_4x4_neon)
#define x264_pixel_sad_aligned_4x8_neon x264_template(pixel_sad_aligned_4x8_neon)
#define x264_pixel_sad_aligned_8x16_neon x264_template(pixel_sad_aligned_8x16_neon)
#define x264_pixel_sad_aligned_8x16_neon_dual x264_template(pixel_sad_aligned_8x16_neon_dual)
#define x264_pixel_sad_aligned_8x4_neon x264_template(pixel_sad_aligned_8x4_neon)
#define x264_pixel_sad_aligned_8x4_neon_dual x264_template(pixel_sad_aligned_8x4_neon_dual)
#define x264_pixel_sad_aligned_8x8_neon x264_template(pixel_sad_aligned_8x8_neon)
#define x264_pixel_sad_aligned_8x8_neon_dual x264_template(pixel_sad_aligned_8x8_neon_dual)
#define x264_pixel_sad_x3_16x16_neon x264_template(pixel_sad_x3_16x16_neon)
#define x264_pixel_sad_x3_16x8_neon x264_template(pixel_sad_x3_16x8_neon)
#define x264_pixel_sad_x3_4x4_neon x264_template(pixel_sad_x3_4x4_neon)
#define x264_pixel_sad_x3_4x8_neon x264_template(pixel_sad_x3_4x8_neon)
#define x264_pixel_sad_x3_8x16_neon x264_template(pixel_sad_x3_8x16_neon)
#define x264_pixel_sad_x3_8x4_neon x264_template(pixel_sad_x3_8x4_neon)
#define x264_pixel_sad_x3_8x8_neon x264_template(pixel_sad_x3_8x8_neon)
#define x264_pixel_sad_x4_16x16_neon x264_template(pixel_sad_x4_16x16_neon)
#define x264_pixel_sad_x4_16x8_neon x264_template(pixel_sad_x4_16x8_neon)
#define x264_pixel_sad_x4_4x4_neon x264_template(pixel_sad_x4_4x4_neon)
#define x264_pixel_sad_x4_4x8_neon x264_template(pixel_sad_x4_4x8_neon)
#define x264_pixel_sad_x4_8x16_neon x264_template(pixel_sad_x4_8x16_neon)
#define x264_pixel_sad_x4_8x4_neon x264_template(pixel_sad_x4_8x4_neon)
#define x264_pixel_sad_x4_8x8_neon x264_template(pixel_sad_x4_8x8_neon)
#define x264_pixel_satd_16x16_neon x264_template(pixel_satd_16x16_neon)
#define x264_pixel_satd_16x8_neon x264_template(pixel_satd_16x8_neon)
#define x264_pixel_satd_4x4_neon x264_template(pixel_satd_4x4_neon)
#define x264_pixel_satd_4x8_neon x264_template(pixel_satd_4x8_neon)
#define x264_pixel_satd_8x16_neon x264_template(pixel_satd_8x16_neon)
#define x264_pixel_satd_8x4_neon x264_template(pixel_satd_8x4_neon)
#define x264_pixel_satd_8x8_neon x264_template(pixel_satd_8x8_neon)
#define x264_pixel_ssd_16x16_neon x264_template(pixel_ssd_16x16_neon)
#define x264_pixel_ssd_16x8_neon x264_template(pixel_ssd_16x8_neon)
#define x264_pixel_ssd_4x4_neon x264_template(pixel_ssd_4x4_neon)
#define x264_pixel_ssd_4x8_neon x264_template(pixel_ssd_4x8_neon)
#define x264_pixel_ssd_8x16_neon x264_template(pixel_ssd_8x16_neon)
#define x264_pixel_ssd_8x4_neon x264_template(pixel_ssd_8x4_neon)
#define x264_pixel_ssd_8x8_neon x264_template(pixel_ssd_8x8_neon)
/* Declare one prototype per block size (16x16, 16x8, 8x16, 8x8, 8x4, 4x8, 4x4)
 * for the function family x264_pixel_<name>_<size>_<suffix>.
 *   ret  - return type shared by the whole family
 *   args - parenthesised parameter list shared by the whole family
 * The last line deliberately carries no trailing backslash: a continuation there
 * would splice whatever directive follows into this macro body. */
#define DECL_PIXELS( ret, name, suffix, args ) \
    ret x264_pixel_##name##_16x16_##suffix args;\
    ret x264_pixel_##name##_16x8_##suffix args;\
    ret x264_pixel_##name##_8x16_##suffix args;\
    ret x264_pixel_##name##_8x8_##suffix args;\
    ret x264_pixel_##name##_8x4_##suffix args;\
    ret x264_pixel_##name##_4x8_##suffix args;\
    ret x264_pixel_##name##_4x4_##suffix args;

/* Single-reference metrics: ( pix1, stride1, pix2, stride2 ) returning int.
 * Strides are intptr_t, matching the other prototypes in this header. */
#define DECL_X1( name, suffix ) \
    DECL_PIXELS( int, name, suffix, ( uint8_t *, intptr_t, uint8_t *, intptr_t ) )

/* Multi-reference metrics: declares both the _x3 and the _x4 family. Each takes
 * one pointer followed by three or four further pointers, a single intptr_t
 * stride and an int array for the results. */
#define DECL_X4( name, suffix ) \
    DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\
    DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )
/* ARMv6 SAD for the two 4-wide block sizes. */
int x264_pixel_sad_4x4_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t );
int x264_pixel_sad_4x8_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t );
/* Per-block-size prototype families generated by the DECL_* macros. */
DECL_X1( sad, neon )
DECL_X1( sad_aligned, neon )
DECL_X1( sad_aligned, neon_dual )
DECL_X4( sad, neon )
DECL_X1( satd, neon )
DECL_X1( ssd, neon )
/* Individually declared routines; each name is first mapped through x264_template. */
#define x264_pixel_ssd_nv12_core_neon x264_template(pixel_ssd_nv12_core_neon)
void x264_pixel_ssd_nv12_core_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, uint64_t *, uint64_t * );
#define x264_pixel_vsad_neon x264_template(pixel_vsad_neon)
int x264_pixel_vsad_neon( uint8_t *, intptr_t, int );
#define x264_pixel_sa8d_8x8_neon x264_template(pixel_sa8d_8x8_neon)
int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t );
#define x264_pixel_sa8d_16x16_neon x264_template(pixel_sa8d_16x16_neon)
int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
#define x264_pixel_sa8d_satd_16x16_neon x264_template(pixel_sa8d_satd_16x16_neon)
uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
#define x264_pixel_var_8x8_neon x264_template(pixel_var_8x8_neon)
uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t );
#define x264_pixel_var_8x16_neon x264_template(pixel_var_8x16_neon)
uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
#define x264_pixel_var_16x16_neon x264_template(pixel_var_16x16_neon)
uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
#define x264_pixel_var2_8x8_neon x264_template(pixel_var2_8x8_neon)
int x264_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * );
#define x264_pixel_var2_8x16_neon x264_template(pixel_var2_8x16_neon)
int x264_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * );
#define x264_pixel_hadamard_ac_8x8_neon x264_template(pixel_hadamard_ac_8x8_neon)
uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t );
#define x264_pixel_hadamard_ac_8x16_neon x264_template(pixel_hadamard_ac_8x16_neon)
uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
#define x264_pixel_hadamard_ac_16x8_neon x264_template(pixel_hadamard_ac_16x8_neon)
uint64_t x264_pixel_hadamard_ac_16x8_neon ( uint8_t *, intptr_t );
#define x264_pixel_hadamard_ac_16x16_neon x264_template(pixel_hadamard_ac_16x16_neon)
uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, intptr_t );
#define x264_pixel_ssim_4x4x2_core_neon x264_template(pixel_ssim_4x4x2_core_neon)
void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t,
const uint8_t *, intptr_t,
int sums[2][4] );
#define x264_pixel_ssim_end4_neon x264_template(pixel_ssim_end4_neon)
float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
#define x264_pixel_asd8_neon x264_template(pixel_asd8_neon)
int x264_pixel_asd8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#endif

808
common/arm/predict-a.S Normal file
View File

@@ -0,0 +1,808 @@
/*****************************************************************************
* predict.S: arm intra prediction
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Mans Rullgard <mans@mansr.com>
* Martin Storsjo <martin@martin.st>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
// Eight 16-bit weights 1..8. The plane predictors in this file load them with
// movrel + vld1 using a :128 alignment hint (const/endconst come from asm.S).
const p16weight, align=4
.short 1,2,3,4,5,6,7,8
endconst
.text
// ldcol.8: gather bytes into lanes of D register \rd, one byte per load.
//   rs = address register, post-incremented by \rt after every load
//   rt = step between consecutive bytes (callers pass the row stride)
//   n  = 8 loads all eight lanes; any other value loads only four lanes
//   hi = with n != 8: 0 selects lanes 0-3, 1 selects lanes 4-7
// Lanes that are not loaded keep their previous contents.
.macro ldcol.8 rd, rs, rt, n=8, hi=0
.if \n == 8 || \hi == 0
vld1.8 {\rd[0]}, [\rs], \rt
vld1.8 {\rd[1]}, [\rs], \rt
vld1.8 {\rd[2]}, [\rs], \rt
vld1.8 {\rd[3]}, [\rs], \rt
.endif
.if \n == 8 || \hi == 1
vld1.8 {\rd[4]}, [\rs], \rt
vld1.8 {\rd[5]}, [\rs], \rt
vld1.8 {\rd[6]}, [\rs], \rt
vld1.8 {\rd[7]}, [\rs], \rt
.endif
.endm
// ldcol.16: gather sixteen bytes spaced \rt apart into two D registers.
//   rd1 receives the bytes at \rs + 0..7 steps, rd2 those at \rs + 8..15 steps
//   ru  = scratch address register, set to \rs + 8*\rt
// The two streams are interleaved; both \rs and \ru advance by 8*\rt in total.
// No use of this macro appears in the visible part of the file.
.macro ldcol.16 rd1, rd2, rs, rt, ru
add \ru, \rs, \rt, lsl #3
vld1.8 {\rd1[0]}, [\rs], \rt
vld1.8 {\rd2[0]}, [\ru], \rt
vld1.8 {\rd1[1]}, [\rs], \rt
vld1.8 {\rd2[1]}, [\ru], \rt
vld1.8 {\rd1[2]}, [\rs], \rt
vld1.8 {\rd2[2]}, [\ru], \rt
vld1.8 {\rd1[3]}, [\rs], \rt
vld1.8 {\rd2[3]}, [\ru], \rt
vld1.8 {\rd1[4]}, [\rs], \rt
vld1.8 {\rd2[4]}, [\ru], \rt
vld1.8 {\rd1[5]}, [\rs], \rt
vld1.8 {\rd2[5]}, [\ru], \rt
vld1.8 {\rd1[6]}, [\rs], \rt
vld1.8 {\rd2[6]}, [\ru], \rt
vld1.8 {\rd1[7]}, [\rs], \rt
vld1.8 {\rd2[7]}, [\ru], \rt
.endm
// add16x8: sum sixteen unsigned bytes held in D registers \rl and \rh.
//   dq     = Q register receiving the eight widened pair sums (\rl[i] + \rh[i])
//   dl, dh = D registers that are combined; the callers in this file pass the
//            low and high halves of \dq
// On exit every 16-bit lane of \dl holds the total of all sixteen bytes.
.macro add16x8 dq, dl, dh, rl, rh
vaddl.u8 \dq, \rl, \rh
vadd.u16 \dl, \dl, \dh
vpadd.u16 \dl, \dl, \dl
vpadd.u16 \dl, \dl, \dl
.endm
// 4x4 horizontal prediction, ARMv6 integer code.
//   r0 = pointer to the top-left pixel of the block; rows are FDEC_STRIDE apart
// Every row is filled with the byte located immediately left of it (offset -1).
// Clobbers r1-r3 and ip.
// because gcc doesn't believe in using the free shift in add
function predict_4x4_h_armv6
ldrb r1, [r0, #0*FDEC_STRIDE-1]
ldrb r2, [r0, #1*FDEC_STRIDE-1]
ldrb r3, [r0, #2*FDEC_STRIDE-1]
ldrb ip, [r0, #3*FDEC_STRIDE-1]
// x + (x << 8): replicate the byte into the low halfword
add r1, r1, r1, lsl #8
add r2, r2, r2, lsl #8
add r3, r3, r3, lsl #8
add ip, ip, ip, lsl #8
// x + (x << 16): replicate the halfword into the whole word, then store the row
add r1, r1, r1, lsl #16
str r1, [r0, #0*FDEC_STRIDE]
add r2, r2, r2, lsl #16
str r2, [r0, #1*FDEC_STRIDE]
add r3, r3, r3, lsl #16
str r3, [r0, #2*FDEC_STRIDE]
add ip, ip, ip, lsl #16
str ip, [r0, #3*FDEC_STRIDE]
bx lr
endfunc
// 4x4 vertical prediction, ARMv6 integer code.
//   r0 = pointer to the top-left pixel of the block
// Copies the four bytes of the row above (one word at -FDEC_STRIDE) into rows 0-3.
// Clobbers r1.
function predict_4x4_v_armv6
ldr r1, [r0, #0 - 1 * FDEC_STRIDE]
str r1, [r0, #0 + 0 * FDEC_STRIDE]
str r1, [r0, #0 + 1 * FDEC_STRIDE]
str r1, [r0, #0 + 2 * FDEC_STRIDE]
str r1, [r0, #0 + 3 * FDEC_STRIDE]
bx lr
endfunc
// 4x4 DC prediction, ARMv6 integer code.
//   r0 = pointer to the top-left pixel of the block
// dc = (sum of the 4 bytes above + sum of the 4 bytes to the left + 4) >> 3,
// replicated into every pixel of the block. Clobbers r1-r3 and ip.
function predict_4x4_dc_armv6
mov ip, #0
ldr r1, [r0, #-FDEC_STRIDE]
ldrb r2, [r0, #0*FDEC_STRIDE-1]
ldrb r3, [r0, #1*FDEC_STRIDE-1]
// usad8 against zero adds up the four top bytes packed in r1
usad8 r1, r1, ip
// rounding term, added to the running sum of the left column
add r2, r2, #4
ldrb ip, [r0, #2*FDEC_STRIDE-1]
add r2, r2, r3
ldrb r3, [r0, #3*FDEC_STRIDE-1]
add r2, r2, ip
add r2, r2, r3
add r1, r1, r2
lsr r1, r1, #3
// replicate the DC byte into all four bytes of the word
add r1, r1, r1, lsl #8
add r1, r1, r1, lsl #16
str r1, [r0, #0*FDEC_STRIDE]
str r1, [r0, #1*FDEC_STRIDE]
str r1, [r0, #2*FDEC_STRIDE]
str r1, [r0, #3*FDEC_STRIDE]
bx lr
endfunc
// 4x4 DC prediction from the top row only (NEON).
//   r0 = pointer to the top-left pixel of the block (advanced by 4 rows on exit)
// dc = (sum of the 4 bytes above + 2) >> 2, replicated into the block.
// Clobbers r1, r12 and d1.
function predict_4x4_dc_top_neon
mov r12, #FDEC_STRIDE
sub r1, r0, #FDEC_STRIDE
// load the 4 top bytes into both halves of d1, then reduce with pairwise adds
vld1.32 d1[], [r1,:32]
vpaddl.u8 d1, d1
vpadd.u16 d1, d1, d1
// rounding shift: (sum + 2) >> 2
vrshr.u16 d1, d1, #2
vdup.8 d1, d1[0]
vst1.32 d1[0], [r0,:32], r12
vst1.32 d1[0], [r0,:32], r12
vst1.32 d1[0], [r0,:32], r12
vst1.32 d1[0], [r0,:32], r12
bx lr
endfunc
// return a1 = (a1+2*b1+c1+2)>>2 a2 = (a2+2*b2+c2+2)>>2
// Works on four packed bytes per register, for two independent triples at once.
//   pb_1 must hold 0x01010101. c1 and c2 are clobbered; b1 and b2 are preserved.
// Method: t = (a+c)>>1 per byte, then the rounded-up average of t and b, formed
// as ((t+b)>>1) + ((t^b)&1) so that no intermediate overflows a byte.
.macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
uhadd8 \a1, \a1, \c1
uhadd8 \a2, \a2, \c2
uhadd8 \c1, \a1, \b1
uhadd8 \c2, \a2, \b2
eor \a1, \a1, \b1
eor \a2, \a2, \b2
and \a1, \a1, \pb_1
and \a2, \a2, \pb_1
uadd8 \a1, \a1, \c1
uadd8 \a2, \a2, \c2
.endm
// 4x4 diagonal-down-right prediction, ARMv6 integer code.
//   r0 = pointer to the top-left pixel of the block
// Reads the word above the block, the byte above-left and the four bytes of the
// left column. r4-r6 and lr are saved on entry; the final pop loads pc to return.
function predict_4x4_ddr_armv6
ldr r1, [r0, # -FDEC_STRIDE]
ldrb r2, [r0, # -FDEC_STRIDE-1]
ldrb r3, [r0, #0*FDEC_STRIDE-1]
push {r4-r6,lr}
// Build sliding 4-byte windows over the edge: each register is the previous
// window shifted up by one byte with the next edge pixel placed in the low byte.
add r2, r2, r1, lsl #8
ldrb r4, [r0, #1*FDEC_STRIDE-1]
add r3, r3, r2, lsl #8
ldrb r5, [r0, #2*FDEC_STRIDE-1]
ldrb r6, [r0, #3*FDEC_STRIDE-1]
add r4, r4, r3, lsl #8
add r5, r5, r4, lsl #8
add r6, r6, r5, lsl #8
ldr ip, =0x01010101
// r1 = lowpass(r1, r2, r3), r4 = lowpass(r4, r5, r6); r3 and r6 are clobbered
PRED4x4_LOWPASS r1, r2, r3, r4, r5, r6, ip
// Row 0 is r1. Rows 1-3 are r1 shifted up by 1, 2 and 3 bytes, with the vacated
// low bytes filled from the upper bytes of the second filtered word.
str r1, [r0, #0*FDEC_STRIDE]
lsl r2, r1, #8
lsl r3, r1, #16
lsl r4, r4, #8
lsl r5, r1, #24
add r2, r2, r4, lsr #24
str r2, [r0, #1*FDEC_STRIDE]
add r3, r3, r4, lsr #16
str r3, [r0, #2*FDEC_STRIDE]
add r5, r5, r4, lsr #8
str r5, [r0, #3*FDEC_STRIDE]
pop {r4-r6,pc}
endfunc
// 4x4 diagonal-down-left prediction (NEON).
//   r0 = pointer to the top-left pixel of the block (advanced by 4 rows on exit)
// Reads eight bytes from the row above, applies the (a + 2b + c + 2) >> 2 filter
// and writes row k as four filtered bytes starting at lane k.
// Clobbers ip and d0-d3.
function predict_4x4_ddl_neon
sub r0, #FDEC_STRIDE
mov ip, #FDEC_STRIDE
// the post-increment brings r0 back to row 0 of the block
vld1.64 {d0}, [r0], ip
// d1 = top shifted by one byte, d2 = shifted by two and padded with the last byte
vdup.8 d3, d0[7]
vext.8 d1, d0, d0, #1
vext.8 d2, d0, d3, #2
// truncating average of the outer taps, then rounding average with the centre tap
vhadd.u8 d0, d0, d2
vrhadd.u8 d0, d0, d1
vst1.32 {d0[0]}, [r0,:32], ip
vext.8 d1, d0, d0, #1
vext.8 d2, d0, d0, #2
vst1.32 {d1[0]}, [r0,:32], ip
vext.8 d3, d0, d0, #3
vst1.32 {d2[0]}, [r0,:32], ip
vst1.32 {d3[0]}, [r0,:32], ip
bx lr
endfunc
// 8x8 DC prediction.
//   r0 = pointer to the top-left pixel of the block (advanced by 8 rows on exit)
//   r1 = byte buffer holding the neighbouring pixels (presumably the x264 edge
//        array - confirm against the C caller)
// dc = (bytes r1[7..14] + bytes r1[16..23] + 8) >> 4, replicated into the block.
// The byte at r1[15] is excluded. r4, r5 and lr are saved because they are used
// as scratch; the final pop loads pc to return.
function predict_8x8_dc_neon
mov ip, #0
ldrd r2, r3, [r1, #8]
push {r4-r5,lr}
ldrd r4, r5, [r1, #16]
// drop the top byte of the word loaded from r1+12, i.e. r1[15] on little-endian
lsl r3, r3, #8
ldrb lr, [r1, #7]
// usad8/usada8 against zero add up the four bytes packed in each word
usad8 r2, r2, ip
usad8 r3, r3, ip
usada8 r2, r4, ip, r2
// rounding term
add lr, lr, #8
usada8 r3, r5, ip, r3
add r2, r2, lr
mov ip, #FDEC_STRIDE
add r2, r2, r3
lsr r2, r2, #4
vdup.8 d0, r2
.rept 8
vst1.64 {d0}, [r0,:64], ip
.endr
pop {r4-r5,pc}
endfunc
// 8x8 horizontal prediction.
//   r0 = pointer to the top-left pixel of the block (advanced by 8 rows on exit)
//   r1 = byte buffer holding the neighbouring pixels; bytes r1[7..14] are read
// Row k is filled with byte r1[14-k], i.e. the buffer is consumed in reverse.
// Clobbers r1, ip, d0-d7 and d16.
function predict_8x8_h_neon
add r1, r1, #7
mov ip, #FDEC_STRIDE
vld1.64 {d16}, [r1]
vdup.8 d0, d16[7]
vdup.8 d1, d16[6]
vst1.64 {d0}, [r0,:64], ip
vdup.8 d2, d16[5]
vst1.64 {d1}, [r0,:64], ip
vdup.8 d3, d16[4]
vst1.64 {d2}, [r0,:64], ip
vdup.8 d4, d16[3]
vst1.64 {d3}, [r0,:64], ip
vdup.8 d5, d16[2]
vst1.64 {d4}, [r0,:64], ip
vdup.8 d6, d16[1]
vst1.64 {d5}, [r0,:64], ip
vdup.8 d7, d16[0]
vst1.64 {d6}, [r0,:64], ip
vst1.64 {d7}, [r0,:64], ip
bx lr
endfunc
// 8x8 vertical prediction.
//   r0 = pointer to the top-left pixel of the block (advanced by 8 rows on exit)
//   r1 = byte buffer holding the neighbouring pixels; bytes r1[16..23] are read
// Copies those eight bytes into each of the eight rows. Clobbers r1, r12 and d0.
function predict_8x8_v_neon
add r1, r1, #16
mov r12, #FDEC_STRIDE
vld1.8 {d0}, [r1,:64]
.rept 8
vst1.8 {d0}, [r0,:64], r12
.endr
bx lr
endfunc
// 8x8 diagonal-down-left prediction.
//   r0 = pointer to the top-left pixel of the block (advanced by 8 rows on exit)
//   r1 = byte buffer holding the neighbouring pixels; bytes r1[16..31] are read
// The sixteen bytes are filtered with (prev + 2*cur + next + 2) >> 2 and row k
// is written as filtered lanes k+1 .. k+8. Clobbers r1, r12, q0-q3 and q8.
function predict_8x8_ddl_neon
add r1, #16
vld1.8 {d0, d1}, [r1,:128]
vmov.i8 q3, #0
// d2[0] = last input byte, used as the "next" neighbour of lane 15
vrev64.8 d2, d1
// q8 = previous neighbours (lane 0 gets zero, but filtered lane 0 is never stored)
vext.8 q8, q3, q0, #15
// q2 = next neighbours
vext.8 q2, q0, q1, #1
vhadd.u8 q8, q2
mov r12, #FDEC_STRIDE
vrhadd.u8 q0, q8
vext.8 d2, d0, d1, #1
vext.8 d3, d0, d1, #2
vst1.8 d2, [r0,:64], r12
vext.8 d2, d0, d1, #3
vst1.8 d3, [r0,:64], r12
vext.8 d3, d0, d1, #4
vst1.8 d2, [r0,:64], r12
vext.8 d2, d0, d1, #5
vst1.8 d3, [r0,:64], r12
vext.8 d3, d0, d1, #6
vst1.8 d2, [r0,:64], r12
vext.8 d2, d0, d1, #7
vst1.8 d3, [r0,:64], r12
vst1.8 d2, [r0,:64], r12
// last row: filtered lanes 8..15
vst1.8 d1, [r0,:64], r12
bx lr
endfunc
// 8x8 diagonal-down-right prediction.
//   r0 = pointer to the top-left pixel of the block
//   r1 = byte buffer holding the neighbouring pixels; bytes r1[0..31] are loaded
//        (128-bit alignment hint) and bytes 7..24 contribute to the result
// Bytes 8..23 are filtered with (prev + 2*cur + next + 2) >> 2. Rows are written
// from the bottom row upwards: row 7 gets filtered lanes 0..7, row 0 lanes 7..14.
// Clobbers r0, r12 and q0-q3.
function predict_8x8_ddr_neon
vld1.8 {d0-d3}, [r1,:128]
// q2 = bytes 7..22 (previous neighbours), q3 = bytes 9..24 (next neighbours)
vext.8 q2, q0, q1, #7
vext.8 q3, q0, q1, #9
vhadd.u8 q2, q2, q3
// centre taps are bytes 8..15 (d1) and 16..23 (d2)
vrhadd.u8 d0, d1, d4
vrhadd.u8 d1, d2, d5
// start at the last row and step backwards one row per store
add r0, #7*FDEC_STRIDE
mov r12, #-1*FDEC_STRIDE
vext.8 d2, d0, d1, #1
vst1.8 {d0}, [r0,:64], r12
vext.8 d4, d0, d1, #2
vst1.8 {d2}, [r0,:64], r12
vext.8 d5, d0, d1, #3
vst1.8 {d4}, [r0,:64], r12
vext.8 d4, d0, d1, #4
vst1.8 {d5}, [r0,:64], r12
vext.8 d5, d0, d1, #5
vst1.8 {d4}, [r0,:64], r12
vext.8 d4, d0, d1, #6
vst1.8 {d5}, [r0,:64], r12
vext.8 d5, d0, d1, #7
vst1.8 {d4}, [r0,:64], r12
vst1.8 {d5}, [r0,:64], r12
bx lr
endfunc
// 8x8 vertical-left prediction.
//   r0 = pointer to the top-left pixel of the block (advanced by 8 rows on exit)
//   r1 = byte buffer holding the neighbouring pixels; bytes r1[16..31] are read
// q3 holds the two-tap rounded averages and q0 the three-tap filtered values;
// even rows come from q3 and odd rows from q0, each pair shifted one lane further.
// Clobbers r1, r12 and q0-q3.
function predict_8x8_vl_neon
add r1, #16
mov r12, #FDEC_STRIDE
vld1.8 {d0, d1}, [r1,:128]
// NOTE: q1 and q2 are read here before being written. Only lane 0 of the
// "previous" vector and lane 15 of the "next" vector pick up those stale bytes,
// and the stores below use two-tap lanes 0..10 and three-tap lanes 1..11 only.
vext.8 q1, q1, q0, #15
vext.8 q2, q0, q2, #1
vrhadd.u8 q3, q0, q2
vhadd.u8 q1, q1, q2
vrhadd.u8 q0, q0, q1
vext.8 d2, d0, d1, #1
vst1.8 {d6}, [r0,:64], r12
vext.8 d3, d6, d7, #1
vst1.8 {d2}, [r0,:64], r12
vext.8 d2, d0, d1, #2
vst1.8 {d3}, [r0,:64], r12
vext.8 d3, d6, d7, #2
vst1.8 {d2}, [r0,:64], r12
vext.8 d2, d0, d1, #3
vst1.8 {d3}, [r0,:64], r12
vext.8 d3, d6, d7, #3
vst1.8 {d2}, [r0,:64], r12
vext.8 d2, d0, d1, #4
vst1.8 {d3}, [r0,:64], r12
vst1.8 {d2}, [r0,:64], r12
bx lr
endfunc
// 8x8 vertical-right prediction.
//   r0 = pointer to the top-left pixel of the block (advanced by 8 rows on exit)
//   r1 = byte buffer holding the neighbouring pixels; bytes r1[8..23] are read
// Row 0 is the upper half of the two-tap averages (d5), row 1 the upper half of
// the three-tap filtered values (d1). Each later pair of rows is the pair above
// shifted right by one pixel, with the new leftmost pixel taken from the
// de-interleaved lower half of the three-tap result.
// Clobbers r1, r12 and q0-q3.
function predict_8x8_vr_neon
add r1, #8
mov r12, #FDEC_STRIDE
vld1.8 {d4,d5}, [r1,:64]
// q1 / q0 = input rotated so that lane i holds the byte 2 / 1 positions earlier
vext.8 q1, q2, q2, #14
vext.8 q0, q2, q2, #15
vhadd.u8 q3, q2, q1
vrhadd.u8 q2, q2, q0
vrhadd.u8 q0, q0, q3
vmov d2, d0
vst1.8 {d5}, [r0,:64], r12
// split the lower half of the three-tap result: even lanes to d2, odd lanes to d0
vuzp.8 d2, d0
vst1.8 {d1}, [r0,:64], r12
// vext #7 / #6 / #5 prepend the last 1 / 2 / 3 bytes of the first operand
vext.8 d6, d0, d5, #7
vext.8 d3, d2, d1, #7
vst1.8 {d6}, [r0,:64], r12
vst1.8 {d3}, [r0,:64], r12
vext.8 d6, d0, d5, #6
vext.8 d3, d2, d1, #6
vst1.8 {d6}, [r0,:64], r12
vst1.8 {d3}, [r0,:64], r12
vext.8 d6, d0, d5, #5
vext.8 d3, d2, d1, #5
vst1.8 {d6}, [r0,:64], r12
vst1.8 {d3}, [r0,:64], r12
bx lr
endfunc
// 8x8 horizontal-down prediction.
//   r0 = pointer to the top-left pixel of the block (advanced by 8 rows on exit)
//   r1 = byte buffer holding the neighbouring pixels; bytes r1[7..22] are read
// q8 holds two-tap rounded averages and q0 three-tap filtered values. Their low
// halves are interleaved by vzip, and each row is an 8-byte window into the
// resulting byte sequence, moving two bytes per row.
// Clobbers r1, r12, q0-q3 and q8.
function predict_8x8_hd_neon
mov r12, #FDEC_STRIDE
add r1, #7
vld1.8 {d2,d3}, [r1]
// q3 / q2 = input rotated so that lane i holds the byte 1 / 2 positions later
vext.8 q3, q1, q1, #1
vext.8 q2, q1, q1, #2
vrhadd.u8 q8, q1, q3
vhadd.u8 q1, q2
vrhadd.u8 q0, q1, q3
// interleave: d16 = pairs 0..3, d0 = pairs 4..7 (two-tap byte, three-tap byte)
vzip.8 d16, d0
vext.8 d2, d0, d1, #6
vext.8 d3, d0, d1, #4
vst1.8 {d2}, [r0,:64], r12
vext.8 d2, d0, d1, #2
vst1.8 {d3}, [r0,:64], r12
vst1.8 {d2}, [r0,:64], r12
vext.8 d2, d16, d0, #6
vst1.8 {d0}, [r0,:64], r12
vext.8 d3, d16, d0, #4
vst1.8 {d2}, [r0,:64], r12
vext.8 d2, d16, d0, #2
vst1.8 {d3}, [r0,:64], r12
vst1.8 {d2}, [r0,:64], r12
vst1.8 {d16}, [r0,:64], r12
bx lr
endfunc
// 8x8 horizontal-up prediction.
//   r0 = pointer to the top-left pixel of the block (left at row 7 on exit)
//   r1 = byte buffer holding the neighbouring pixels; bytes r1[7..14] are read
// The eight bytes are reversed, padded with copies of r1[7], and turned into
// two-tap (d0) and three-tap (d1) values that are interleaved by vzip. Row k is
// an 8-byte window starting 2*k bytes into that sequence; positions past its end
// are filled with the final byte pair. Clobbers r1, r12, q0-q3 and q8.
function predict_8x8_hu_neon
mov r12, #FDEC_STRIDE
add r1, #7
vld1.8 {d7}, [r1]
// d6 = padding byte (first loaded byte), d7 = input in reversed order
vdup.8 d6, d7[0]
vrev64.8 d7, d7
vext.8 d4, d7, d6, #2
vext.8 d2, d7, d6, #1
vhadd.u8 d16, d7, d4
vrhadd.u8 d0, d2, d7
vrhadd.u8 d1, d16, d2
vzip.8 d0, d1
// q1 = last interleaved pair replicated, used as fill for the lower rows
vdup.16 q1, d1[3]
vext.8 q2, q0, q1, #2
vext.8 q3, q0, q1, #4
vext.8 q8, q0, q1, #6
// rows 0-3 are the low halves of the shifted vectors, rows 4-7 the high halves
vst1.8 {d0}, [r0,:64], r12
vst1.8 {d4}, [r0,:64], r12
vst1.8 {d6}, [r0,:64], r12
vst1.8 {d16}, [r0,:64], r12
vst1.8 {d1}, [r0,:64], r12
vst1.8 {d5}, [r0,:64], r12
vst1.8 {d7}, [r0,:64], r12
vst1.8 {d17}, [r0,:64]
bx lr
endfunc
// 8x8 chroma DC prediction from the top row only.
//   r0 = pointer to the top-left pixel of the block
// dc0 = (top[0..3] + 2) >> 2 fills the left four columns and
// dc1 = (top[4..7] + 2) >> 2 the right four columns of all eight rows.
// Finishes in the shared store tail pred8x8_dc_end (inside predict_8x8c_dc_neon),
// which expects r0 = block, r1 = stride, d0 = rows 0-3, d1 = rows 4-7.
function predict_8x8c_dc_top_neon
sub r2, r0, #FDEC_STRIDE
mov r1, #FDEC_STRIDE
vld1.8 {d0}, [r2,:64]
vpaddl.u8 d0, d0
vpadd.u16 d0, d0, d0
// narrow with rounding; only result bytes 0 and 1 are used afterwards
vrshrn.u16 d0, q0, #2
vdup.8 d1, d0[1]
vdup.8 d0, d0[0]
// after the transpose both d0 and d1 hold dc0 x4 followed by dc1 x4
vtrn.32 d0, d1
b pred8x8_dc_end
endfunc
// 8x8 chroma DC prediction from the left column only.
//   r0 = pointer to the top-left pixel of the block
// Rows 0-3 are filled with (left[0..3] + 2) >> 2 and rows 4-7 with
// (left[4..7] + 2) >> 2. Finishes in the shared store tail pred8x8_dc_end
// (inside predict_8x8c_dc_neon) with d0 = rows 0-3 and d1 = rows 4-7.
function predict_8x8c_dc_left_neon
mov r1, #FDEC_STRIDE
sub r2, r0, #1
// gather the eight bytes of the column at offset -1
ldcol.8 d0, r2, r1
vpaddl.u8 d0, d0
vpadd.u16 d0, d0, d0
vrshrn.u16 d0, q0, #2
vdup.8 d1, d0[1]
vdup.8 d0, d0[0]
b pred8x8_dc_end
endfunc
// 8x8 chroma DC prediction from both the top row and the left column.
//   r0 = pointer to the top-left pixel of the block
// With s0 = top[0..3], s1 = top[4..7], s2 = left[0..3], s3 = left[4..7] (sums):
//   top-left quadrant     = (s0 + s2 + 4) >> 3
//   top-right quadrant    = (s1 + 2) >> 2
//   bottom-left quadrant  = (s3 + 2) >> 2
//   bottom-right quadrant = (s1 + s3 + 4) >> 3
// The label pred8x8_dc_end is a store tail shared with the dc_top and dc_left
// variants: it writes d0 to rows 0-3 and d1 to rows 4-7 using stride r1.
function predict_8x8c_dc_neon
sub r2, r0, #FDEC_STRIDE
mov r1, #FDEC_STRIDE
vld1.8 {d0}, [r2,:64]
sub r2, r0, #1
ldcol.8 d1, r2, r1
// d0 = top[0..3], left[0..3]   d1 = top[4..7], left[4..7]
vtrn.32 d0, d1
vpaddl.u8 q0, q0
// d0 = s0, s2, s1, s3          d1 = s0+s2, s1+s3, s0+s2, s1+s3
vpadd.u16 d0, d0, d1
vpadd.u16 d1, d0, d0
// d2 = all eight sums rounded and shifted by 3, d3 = the same shifted by 2
vrshrn.u16 d2, q0, #3
vrshrn.u16 d3, q0, #2
vdup.8 d0, d2[4]
vdup.8 d1, d3[3]
vdup.8 d4, d3[2]
vdup.8 d5, d2[5]
// d0 = top-left x4, top-right x4   d1 = bottom-left x4, bottom-right x4
vtrn.32 q0, q2
pred8x8_dc_end:
add r2, r0, r1, lsl #2
.rept 4
vst1.8 {d0}, [r0,:64], r1
vst1.8 {d1}, [r2,:64], r1
.endr
bx lr
endfunc
// 8x8 chroma horizontal prediction.
//   r0 = pointer to the top-left pixel of the block (advanced by 8 rows on exit)
// Each of the eight rows is filled with the byte immediately left of it.
// Clobbers r1, ip, d0 and d2.
function predict_8x8c_h_neon
sub r1, r0, #1
mov ip, #FDEC_STRIDE
.rept 4
// load-and-replicate the left byte of two consecutive rows, then store both rows
vld1.8 {d0[]}, [r1], ip
vld1.8 {d2[]}, [r1], ip
vst1.64 {d0}, [r0,:64], ip
vst1.64 {d2}, [r0,:64], ip
.endr
bx lr
endfunc
// 8x8 chroma vertical prediction.
//   r0 = pointer to the top-left pixel of the block (advanced by 8 rows on exit)
// Copies the eight bytes of the row above into each of the eight rows.
// Clobbers ip and d0.
function predict_8x8c_v_neon
sub r0, r0, #FDEC_STRIDE
mov ip, #FDEC_STRIDE
// the post-increment of the load brings r0 back to row 0
vld1.64 {d0}, [r0,:64], ip
.rept 8
vst1.64 {d0}, [r0,:64], ip
.endr
bx lr
endfunc
// 8x8 chroma plane prediction.
//   r0 = pointer to the top-left pixel of the block (advanced by 8 rows on exit)
// With H and V the weighted (1..4) sums of differences across the top row and
// the left column:
//   b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5,  a = 16 * (top[7] + left[7])
//   pixel(x, y) = clip8( (a + 16 + b*(x-3) + c*(y-3)) >> 5 )
// Clobbers r1-r3, q0-q3 and q8.
function predict_8x8c_p_neon
sub r3, r0, #FDEC_STRIDE
mov r1, #FDEC_STRIDE
add r2, r3, #4
sub r3, r3, #1
// d0 low word = top[-1..2], d2 low word = top[4..7]
vld1.32 {d0[0]}, [r3]
vld1.32 {d2[0]}, [r2,:32], r1
// d0 lanes 4-7 = column -1 at rows -1..2; d3 lanes 0-3 = column -1 at rows 4..7
ldcol.8 d0, r3, r1, 4, hi=1
add r3, r3, r1
ldcol.8 d3, r3, r1, 4
// d16 lane 3 = top[7] + left[7]
vaddl.u8 q8, d2, d3
// reverse each 4-byte group so the mirrored samples line up lane by lane
vrev32.8 d0, d0
vtrn.32 d2, d3
vsubl.u8 q2, d2, d0
movrel r3, p16weight
vld1.16 {q0}, [r3,:128]
// weight the top-row (d4) and left-column (d5) differences by 1..4
vmul.s16 d4, d4, d0
vmul.s16 d5, d5, d0
vpadd.i16 d4, d4, d5
vpaddl.s16 d4, d4 @ d4 = H, V as 32-bit lanes
vshl.i32 d5, d4, #4
vadd.s32 d4, d4, d5 @ 17*H, 17*V
vrshrn.s32 d4, q2, #5 @ d4[0] = b, d4[1] = c
mov r3, #0
vtrn.16 d4, d5 @ d4[0] = b, d5[0] = c
vadd.i16 d2, d4, d5
vshl.i16 d3, d2, #2
vrev64.16 d16, d16 @ d16[0] = top[7] + left[7]
vsub.i16 d3, d3, d2 @ d3[0] = 3 * (b + c)
vadd.i16 d16, d16, d0 @ lane 0 gains weight 1: + 1
vshl.i16 d2, d16, #4 @ d2[0] = a + 16
vsub.i16 d2, d2, d3 @ d2[0] = value at x = 0, y = 0 before the shift
// turn the weights 1..8 into 0..7 and scale by b: per-column increments
vext.16 q0, q0, q0, #7
vmov.16 d0[0], r3
vmul.i16 q0, q0, d4[0]
vdup.16 q1, d2[0]
vdup.16 q3, d5[0]
vadd.i16 q1, q1, q0
mov r3, #8
// one row per iteration: saturate-narrow (>> 5), then step all columns by c
1:
vqshrun.s16 d0, q1, #5
vadd.i16 q1, q1, q3
vst1.8 {d0}, [r0,:64], r1
subs r3, r3, #1
bne 1b
bx lr
endfunc
// 8x16 chroma DC prediction from the top row only.
//   r0 = pointer to the top-left pixel of the block
// dc0 = (top[0..3] + 2) >> 2 fills the left four columns and
// dc1 = (top[4..7] + 2) >> 2 the right four columns of all sixteen rows.
// Clobbers r0-r2, d0 and d1.
function predict_8x16c_dc_top_neon
sub r2, r0, #FDEC_STRIDE
mov r1, #FDEC_STRIDE
vld1.8 {d0}, [r2,:64]
vpaddl.u8 d0, d0
vpadd.u16 d0, d0, d0
vrshrn.u16 d0, q0, #2
vdup.8 d1, d0[1]
vdup.8 d0, d0[0]
// after the transpose both d0 and d1 hold dc0 x4 followed by dc1 x4
vtrn.32 d0, d1
// rows 0-3 through r0 and rows 4-7 through r2
add r2, r0, r1, lsl #2
.rept 4
vst1.8 {d0}, [r0,:64], r1
vst1.8 {d1}, [r2,:64], r1
.endr
// skip ahead four rows each: r0 -> row 8, r2 -> row 12
add r2, r2, r1, lsl #2
add r0, r0, r1, lsl #2
.rept 4
vst1.8 {d0}, [r0,:64], r1
vst1.8 {d1}, [r2,:64], r1
.endr
bx lr
endfunc
// 8x16 chroma horizontal prediction.
//   r0 = pointer to the top-left pixel of the block (advanced by 16 rows on exit)
// Each of the sixteen rows is filled with the byte immediately left of it.
// Clobbers r1, ip, d0 and d2.
function predict_8x16c_h_neon
sub r1, r0, #1
mov ip, #FDEC_STRIDE
.rept 8
// load-and-replicate the left byte of two consecutive rows, then store both rows
vld1.8 {d0[]}, [r1], ip
vld1.8 {d2[]}, [r1], ip
vst1.64 {d0}, [r0,:64], ip
vst1.64 {d2}, [r0,:64], ip
.endr
bx lr
endfunc
// 8x16 chroma plane prediction.
//   r0 = pointer to the top-left pixel of the block (advanced by 16 rows on exit)
// With H the weighted (1..4) sum of differences across the top row and V the
// weighted (1..8) sum down the left column:
//   b = (17*H + 16) >> 5,  c = (5*V + 32) >> 6,  a = 16 * (top[7] + left[15])
//   pixel(x, y) = clip8( (a + 16 + b*(x-3) + c*(y-7)) >> 5 )
// Clobbers r1-r3, q0-q3 and q8.
function predict_8x16c_p_neon
sub r3, r0, #FDEC_STRIDE
mov r1, #FDEC_STRIDE
add r2, r3, #4
sub r3, r3, #1
// d0 low word = top[-1..2], d2 low word = top[4..7]
vld1.32 {d0[0]}, [r3]
vld1.32 {d2[0]}, [r2,:32], r1
// d1 = column -1 at rows -1..6; row 7 is skipped; d3 = column -1 at rows 8..15
ldcol.8 d1, r3, r1
add r3, r3, r1
ldcol.8 d3, r3, r1
vrev64.32 d16, d3
vaddl.u8 q8, d2, d16 @ d16 lane 3 = top[7] + left[15]
vrev32.8 d0, d0
vsubl.u8 q2, d2, d0
vrev64.8 d1, d1
vsubl.u8 q3, d3, d1
movrel r3, p16weight
vld1.16 {q0}, [r3,:128]
vmul.s16 d4, d4, d0
vmul.s16 q3, q3, q0
vpadd.i16 d4, d4, d5
vpadd.i16 d6, d6, d7
vpaddl.s16 d4, d4 @ d4[0] = H
vpaddl.s16 d6, d6
vpadd.s32 d6, d6 @ d6[0] = V
vshl.i32 d5, d4, #4
vadd.s32 d4, d4, d5 @ d4[0] = 17*H
vshl.i32 d7, d6, #2
vrshrn.s32 d4, q2, #5 @ d4[0] = b
vadd.s32 d6, d6, d7 @ d6[0] = 5*V
vrshrn.s32 d6, q3, #6 @ d6[0] = c
mov r3, #0
vshl.i16 d3, d4, #2
vsub.i16 d3, d3, d4 @ d3[0] = 3 * b
vshl.i16 d2, d6, #3
vadd.i16 d3, d3, d2 @ d3[0] = 3 * b + 8 * c
vsub.i16 d3, d3, d6 @ d3[0] = 3 * b + 7 * c
vrev64.16 d16, d16
vadd.i16 d16, d16, d0 @ d16[0] = src[]+src[] + 1
vshl.i16 d2, d16, #4 @ d2[0] = a + 16
vsub.i16 d2, d2, d3 @ i00
// turn the weights 1..8 into 0..7 and scale by b: per-column increments
vext.16 q0, q0, q0, #7
vmov.16 d0[0], r3
vmul.i16 q0, q0, d4[0]
vdup.16 q1, d2[0]
vdup.16 q3, d6[0]
vadd.i16 q1, q1, q0
mov r3, #16
// one row per iteration: saturate-narrow (>> 5), then step all columns by c
1:
vqshrun.s16 d0, q1, #5
vadd.i16 q1, q1, q3
vst1.8 {d0}, [r0,:64], r1
subs r3, r3, #1
bne 1b
bx lr
endfunc
// 16x16 DC prediction from the top row only.
//   r0 = pointer to the top-left pixel of the block
// dc = (sum of the 16 bytes above + 8) >> 4. Finishes in the shared store tail
// pred16x16_dc_end (inside predict_16x16_dc_neon), which expects r0 = block,
// r1 = stride and q0 = the replicated DC value.
function predict_16x16_dc_top_neon
sub r2, r0, #FDEC_STRIDE
mov r1, #FDEC_STRIDE
vld1.8 {q0}, [r2,:128]
add16x8 q0, d0, d1, d0, d1
vrshrn.u16 d0, q0, #4
vdup.8 q0, d0[0]
b pred16x16_dc_end
endfunc
// 16x16 DC prediction from the left column only.
//   r0 = pointer to the top-left pixel of the block
// dc = (sum of the 16 bytes at column -1 + 8) >> 4. Finishes in the shared store
// tail pred16x16_dc_end (inside predict_16x16_dc_neon).
function predict_16x16_dc_left_neon
mov r1, #FDEC_STRIDE
sub r2, r0, #1
// two back-to-back gathers: r2 keeps advancing, so d0 = rows 0-7, d1 = rows 8-15
ldcol.8 d0, r2, r1
ldcol.8 d1, r2, r1
add16x8 q0, d0, d1, d0, d1
vrshrn.u16 d0, q0, #4
vdup.8 q0, d0[0]
b pred16x16_dc_end
endfunc
// 16x16 DC prediction from both the top row and the left column.
//   r0 = pointer to the top-left pixel of the block (advanced by 16 rows on exit)
// dc = (sum of the 16 bytes above + sum of the 16 bytes to the left + 16) >> 5.
// The top row is summed with NEON while the left column is summed with scalar
// loads interleaved between the vector instructions.
// The label pred16x16_dc_end is a store tail shared with the dc_top and dc_left
// variants: it writes q0 to sixteen rows using stride r1.
function predict_16x16_dc_neon
sub r3, r0, #FDEC_STRIDE
sub r0, r0, #1
vld1.64 {d0-d1}, [r3,:128]
ldrb ip, [r0], #FDEC_STRIDE
vaddl.u8 q0, d0, d1
ldrb r1, [r0], #FDEC_STRIDE
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0, d0
vpadd.u16 d0, d0, d0
// ip accumulates the left column; each add consumes a byte loaded earlier
.rept 4
ldrb r2, [r0], #FDEC_STRIDE
add ip, ip, r1
ldrb r3, [r0], #FDEC_STRIDE
add ip, ip, r2
ldrb r1, [r0], #FDEC_STRIDE
add ip, ip, r3
.endr
ldrb r2, [r0], #FDEC_STRIDE
add ip, ip, r1
ldrb r3, [r0], #FDEC_STRIDE
add ip, ip, r2
// undo the sixteen post-increments so r0 points at row 0 again (column -1)
sub r0, r0, #FDEC_STRIDE*16
add ip, ip, r3
vdup.16 d1, ip
vadd.u16 d0, d0, d1
mov r1, #FDEC_STRIDE
// back from column -1 to column 0
add r0, r0, #1
vrshr.u16 d0, d0, #5
vdup.8 q0, d0[0]
pred16x16_dc_end:
.rept 16
vst1.64 {d0-d1}, [r0,:128], r1
.endr
bx lr
endfunc
// 16x16 horizontal prediction.
//   r0 = pointer to the top-left pixel of the block (advanced by 16 rows on exit)
// Each of the sixteen rows is filled with the byte immediately left of it.
// Clobbers r1, ip and d0-d3.
function predict_16x16_h_neon
sub r1, r0, #1
mov ip, #FDEC_STRIDE
.rept 8
// replicate the left byte into a D register and copy it to the other half
vld1.8 {d0[]}, [r1], ip
vmov d1, d0
vld1.8 {d2[]}, [r1], ip
vmov d3, d2
vst1.64 {d0-d1}, [r0,:128], ip
vst1.64 {d2-d3}, [r0,:128], ip
.endr
bx lr
endfunc
// 16x16 vertical prediction.
//   r0 = pointer to the top-left pixel of the block (advanced by 16 rows on exit)
// Copies the sixteen bytes of the row above into each of the sixteen rows.
// Clobbers ip, d0 and d1.
function predict_16x16_v_neon
sub r0, r0, #FDEC_STRIDE
mov ip, #FDEC_STRIDE
// the post-increment of the load brings r0 back to row 0
vld1.64 {d0-d1}, [r0,:128], ip
.rept 16
vst1.64 {d0-d1}, [r0,:128], ip
.endr
bx lr
endfunc
// 16x16 plane prediction.
//   r0 = pointer to the top-left pixel of the block (advanced by 16 rows on exit)
// With H and V the weighted (1..8) sums of differences across the top row and
// down the left column:
//   b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6,  a = 16 * (top[15] + left[15])
//   pixel(x, y) = clip8( (a + 16 + b*(x-7) + c*(y-7)) >> 5 )
// Clobbers r1-r3, q0-q3 and q8.
function predict_16x16_p_neon
sub r3, r0, #FDEC_STRIDE
mov r1, #FDEC_STRIDE
add r2, r3, #8
sub r3, r3, #1
// d0 = top[-1..6], d2 = top[8..15]
vld1.8 {d0}, [r3]
vld1.8 {d2}, [r2,:64], r1
// d1 = column -1 at rows -1..6; row 7 is skipped; d3 = column -1 at rows 8..15
ldcol.8 d1, r3, r1
add r3, r3, r1
ldcol.8 d3, r3, r1
// reverse the near halves so the mirrored samples line up lane by lane
vrev64.8 q0, q0
vaddl.u8 q8, d2, d3 @ d17 lane 3 = top[15] + left[15]
vsubl.u8 q2, d2, d0
vsubl.u8 q3, d3, d1
movrel r3, p16weight
vld1.8 {q0}, [r3,:128]
vmul.s16 q2, q2, q0
vmul.s16 q3, q3, q0
vadd.i16 d4, d4, d5
vadd.i16 d5, d6, d7
vpadd.i16 d4, d4, d5
vpadd.i16 d4, d4, d4 @ d4 = H, V, H, V
vshll.s16 q3, d4, #2
vaddw.s16 q2, q3, d4 @ 5*H, 5*V
vrshrn.s32 d4, q2, #6 @ d4[0] = b, d4[1] = c
mov r3, #0
vtrn.16 d4, d5 @ d4[0] = b, d5[0] = c
vadd.i16 d2, d4, d5
vshl.i16 d3, d2, #3
vrev64.16 d16, d17 @ d16[0] = top[15] + left[15]
vsub.i16 d3, d3, d2 @ d3[0] = 7 * (b + c)
vadd.i16 d16, d16, d0 @ lane 0 gains weight 1: + 1
vshl.i16 d2, d16, #4 @ d2[0] = a + 16
vsub.i16 d2, d2, d3 @ d2[0] = value at x = 0, y = 0 before the shift
vshl.i16 d3, d4, #4 @ d3[0] = 16 * b
// turn the weights 1..8 into 0..7 and scale by b: per-column increments
vext.16 q0, q0, q0, #7
vsub.i16 d6, d5, d3 @ d6[0] = c - 16 * b
vmov.16 d0[0], r3
vmul.i16 q0, q0, d4[0]
vdup.16 q1, d2[0]
vdup.16 q2, d4[0]
vdup.16 q3, d6[0]
vshl.i16 q2, q2, #3 @ q2 = 8 * b: step from columns 0-7 to columns 8-15
vadd.i16 q1, q1, q0
vadd.i16 q3, q3, q2 @ q3 = c - 8 * b: step back to columns 0-7 of the next row
mov r3, #16
// one row per iteration: left half, step right by 8 columns, right half, next row
1:
vqshrun.s16 d0, q1, #5
vadd.i16 q1, q1, q2
vqshrun.s16 d1, q1, #5
vadd.i16 q1, q1, q3
vst1.8 {q0}, [r0,:128], r1
subs r3, r3, #1
bne 1b
bx lr
endfunc

108
common/arm/predict-c.c Normal file
View File

@@ -0,0 +1,108 @@
/*****************************************************************************
* predict.c: arm intra prediction
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common/common.h"
#include "predict.h"
#include "pixel.h"
/* Install the ARM assembly 4x4 intra predictors that `cpu` supports.
 * Nothing is written without ARMv6; the NEON entries additionally require
 * NEON. High bit depth builds install nothing. */
void x264_predict_4x4_init_arm( uint32_t cpu, x264_predict_t pf[12] )
{
    if( cpu&X264_CPU_ARMV6 )
    {
#if !HIGH_BIT_DEPTH
        pf[I_PRED_4x4_V]   = x264_predict_4x4_v_armv6;
        pf[I_PRED_4x4_H]   = x264_predict_4x4_h_armv6;
        pf[I_PRED_4x4_DC]  = x264_predict_4x4_dc_armv6;
        pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_armv6;

        if( cpu&X264_CPU_NEON )
        {
            pf[I_PRED_4x4_DDL]    = x264_predict_4x4_ddl_neon;
            pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_neon;
        }
#endif // !HIGH_BIT_DEPTH
    }
}
/* Install the NEON 8x8 chroma intra predictors when `cpu` reports NEON.
 * High bit depth builds install nothing. */
void x264_predict_8x8c_init_arm( uint32_t cpu, x264_predict_t pf[7] )
{
    if( cpu&X264_CPU_NEON )
    {
#if !HIGH_BIT_DEPTH
        pf[I_PRED_CHROMA_V]       = x264_predict_8x8c_v_neon;
        pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_neon;
        pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_neon;
        pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_neon;
        pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon;
        pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_neon;
#endif // !HIGH_BIT_DEPTH
    }
}
/* Install the NEON 8x16 chroma intra predictors when `cpu` reports NEON.
 * Only DC_TOP, H and P are replaced. High bit depth builds install nothing. */
void x264_predict_8x16c_init_arm( uint32_t cpu, x264_predict_t pf[7] )
{
    if( cpu&X264_CPU_NEON )
    {
#if !HIGH_BIT_DEPTH
        /* The other functions weren't faster than C (gcc 4.7.3) on Cortex A8 and A9. */
        pf[I_PRED_CHROMA_H]      = x264_predict_8x16c_h_neon;
        pf[I_PRED_CHROMA_P]      = x264_predict_8x16c_p_neon;
        pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_neon;
#endif // !HIGH_BIT_DEPTH
    }
}
/* Install the NEON 8x8 luma intra predictors when `cpu` reports NEON.
 * `predict_filter` is accepted but never written by this function.
 * High bit depth builds install nothing. */
void x264_predict_8x8_init_arm( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
{
    if( cpu&X264_CPU_NEON )
    {
#if !HIGH_BIT_DEPTH
        pf[I_PRED_8x8_V]   = x264_predict_8x8_v_neon;
        pf[I_PRED_8x8_H]   = x264_predict_8x8_h_neon;
        pf[I_PRED_8x8_DC]  = x264_predict_8x8_dc_neon;
        pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_neon;
        pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_neon;
        pf[I_PRED_8x8_VR]  = x264_predict_8x8_vr_neon;
        pf[I_PRED_8x8_HD]  = x264_predict_8x8_hd_neon;
        pf[I_PRED_8x8_VL]  = x264_predict_8x8_vl_neon;
        pf[I_PRED_8x8_HU]  = x264_predict_8x8_hu_neon;
#endif // !HIGH_BIT_DEPTH
    }
}
/* Install the NEON 16x16 luma intra predictors when `cpu` reports NEON.
 * High bit depth builds install nothing. */
void x264_predict_16x16_init_arm( uint32_t cpu, x264_predict_t pf[7] )
{
    if( cpu&X264_CPU_NEON )
    {
#if !HIGH_BIT_DEPTH
        pf[I_PRED_16x16_V]       = x264_predict_16x16_v_neon;
        pf[I_PRED_16x16_H]       = x264_predict_16x16_h_neon;
        pf[I_PRED_16x16_P]       = x264_predict_16x16_p_neon;
        pf[I_PRED_16x16_DC]      = x264_predict_16x16_dc_neon;
        pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_neon;
        pf[I_PRED_16x16_DC_TOP]  = x264_predict_16x16_dc_top_neon;
#endif // !HIGH_BIT_DEPTH
    }
}

105
common/arm/predict.h Normal file
View File

@@ -0,0 +1,105 @@
/*****************************************************************************
* predict.h: arm intra prediction
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_ARM_PREDICT_H
#define X264_ARM_PREDICT_H
/* 4x4 intra predictors (ARMv6 and NEON). Each takes only the block pointer
 * and predicts in place. Every name is routed through x264_template(), which
 * is defined elsewhere (presumably to decorate the symbol per build -- confirm). */
#define x264_predict_4x4_dc_armv6 x264_template(predict_4x4_dc_armv6)
void x264_predict_4x4_dc_armv6( uint8_t *src );
#define x264_predict_4x4_dc_top_neon x264_template(predict_4x4_dc_top_neon)
void x264_predict_4x4_dc_top_neon( uint8_t *src );
#define x264_predict_4x4_v_armv6 x264_template(predict_4x4_v_armv6)
void x264_predict_4x4_v_armv6( uint8_t *src );
#define x264_predict_4x4_h_armv6 x264_template(predict_4x4_h_armv6)
void x264_predict_4x4_h_armv6( uint8_t *src );
#define x264_predict_4x4_ddr_armv6 x264_template(predict_4x4_ddr_armv6)
void x264_predict_4x4_ddr_armv6( uint8_t *src );
#define x264_predict_4x4_ddl_neon x264_template(predict_4x4_ddl_neon)
void x264_predict_4x4_ddl_neon( uint8_t *src );
/* 8x8 chroma intra predictors (NEON): DC variants, horizontal, vertical and
 * plane. Each takes only the block pointer. */
#define x264_predict_8x8c_dc_neon x264_template(predict_8x8c_dc_neon)
void x264_predict_8x8c_dc_neon( uint8_t *src );
#define x264_predict_8x8c_dc_top_neon x264_template(predict_8x8c_dc_top_neon)
void x264_predict_8x8c_dc_top_neon( uint8_t *src );
#define x264_predict_8x8c_dc_left_neon x264_template(predict_8x8c_dc_left_neon)
void x264_predict_8x8c_dc_left_neon( uint8_t *src );
#define x264_predict_8x8c_h_neon x264_template(predict_8x8c_h_neon)
void x264_predict_8x8c_h_neon( uint8_t *src );
#define x264_predict_8x8c_v_neon x264_template(predict_8x8c_v_neon)
void x264_predict_8x8c_v_neon( uint8_t *src );
#define x264_predict_8x8c_p_neon x264_template(predict_8x8c_p_neon)
void x264_predict_8x8c_p_neon( uint8_t *src );
/* 8x16 chroma intra predictors (NEON). Only horizontal, DC-top and plane
 * variants are declared here. */
#define x264_predict_8x16c_h_neon x264_template(predict_8x16c_h_neon)
void x264_predict_8x16c_h_neon( uint8_t *src );
#define x264_predict_8x16c_dc_top_neon x264_template(predict_8x16c_dc_top_neon)
void x264_predict_8x16c_dc_top_neon( uint8_t *src );
#define x264_predict_8x16c_p_neon x264_template(predict_8x16c_p_neon)
void x264_predict_8x16c_p_neon( uint8_t *src );
/* 8x8 luma intra predictors (NEON). Unlike the other block sizes these take
 * a second argument, a 36-byte edge buffer (presumably the pre-filtered
 * neighbouring pixels -- confirm against the producer of `edge`). */
#define x264_predict_8x8_dc_neon x264_template(predict_8x8_dc_neon)
void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_ddl_neon x264_template(predict_8x8_ddl_neon)
void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_ddr_neon x264_template(predict_8x8_ddr_neon)
void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_vl_neon x264_template(predict_8x8_vl_neon)
void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_vr_neon x264_template(predict_8x8_vr_neon)
void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_v_neon x264_template(predict_8x8_v_neon)
void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_h_neon x264_template(predict_8x8_h_neon)
void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_hd_neon x264_template(predict_8x8_hd_neon)
void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_hu_neon x264_template(predict_8x8_hu_neon)
void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
/* 16x16 luma intra predictors (NEON): DC variants, horizontal, vertical and
 * plane. Each takes only the block pointer. */
#define x264_predict_16x16_dc_neon x264_template(predict_16x16_dc_neon)
void x264_predict_16x16_dc_neon( uint8_t *src );
#define x264_predict_16x16_dc_top_neon x264_template(predict_16x16_dc_top_neon)
void x264_predict_16x16_dc_top_neon( uint8_t *src );
#define x264_predict_16x16_dc_left_neon x264_template(predict_16x16_dc_left_neon)
void x264_predict_16x16_dc_left_neon( uint8_t *src );
#define x264_predict_16x16_h_neon x264_template(predict_16x16_h_neon)
void x264_predict_16x16_h_neon( uint8_t *src );
#define x264_predict_16x16_v_neon x264_template(predict_16x16_v_neon)
void x264_predict_16x16_v_neon( uint8_t *src );
#define x264_predict_16x16_p_neon x264_template(predict_16x16_p_neon)
void x264_predict_16x16_p_neon( uint8_t *src );
/* Table initialisers: each takes the CPU capability flags and the predictor
 * function table for one block size, and overwrites the entries for which an
 * ARM implementation is available. */
#define x264_predict_4x4_init_arm x264_template(predict_4x4_init_arm)
void x264_predict_4x4_init_arm( uint32_t cpu, x264_predict_t pf[12] );
#define x264_predict_8x8_init_arm x264_template(predict_8x8_init_arm)
void x264_predict_8x8_init_arm( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
#define x264_predict_8x8c_init_arm x264_template(predict_8x8c_init_arm)
void x264_predict_8x8c_init_arm( uint32_t cpu, x264_predict_t pf[7] );
#define x264_predict_8x16c_init_arm x264_template(predict_8x16c_init_arm)
void x264_predict_8x16c_init_arm( uint32_t cpu, x264_predict_t pf[7] );
#define x264_predict_16x16_init_arm x264_template(predict_16x16_init_arm)
void x264_predict_16x16_init_arm( uint32_t cpu, x264_predict_t pf[7] );
#endif

574
common/arm/quant-a.S Normal file
View File

@@ -0,0 +1,574 @@
/****************************************************************************
* quant.S: arm quantization and level-run
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
// One distinct bit per byte lane, lowest bit first. coeff_last64_neon ANDs
// this with per-byte nonzero masks and pairwise-adds the result into a bitmap.
const pmovmskb_byte, align=4
.byte 1,2,4,8,16,32,64,128
.byte 1,2,4,8,16,32,64,128
endconst
// Two bits per byte lane, four lanes per output byte. Used by the 15/16
// coefficient decimate_score functions.
const mask_2bit, align=4
.byte 3,12,48,192,3,12,48,192
.byte 3,12,48,192,3,12,48,192
endconst
// One bit per byte lane, highest bit first. Used by decimate_score64_neon.
const mask_1bit, align=4
.byte 128,64,32,16,8,4,2,1
.byte 128,64,32,16,8,4,2,1
endconst
.text
// QUANT_TWO: quantise 16 coefficients held in two q registers.
// Inputs:  q8/q9       = absolute values of the coefficients
//          q14/q15     = the original signed coefficients
//          bias0/bias1 = q registers with the 16-bit biases
//          mf0..mf3    = d registers with the 16-bit multipliers
//          r0          = destination pointer
//          r1          = multiplier pointer, only read when load_mf is yes
// Result:  each coefficient becomes sign * (((abs + bias) * mf) >> 16);
//          the 16 results are stored at [r0] and r0 is advanced past them.
//          mask receives the OR of the two result vectors.
// Clobbers q8-q15.
.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 mask load_mf=no
vadd.u16 q8, q8, \bias0
vadd.u16 q9, q9, \bias1
// optional reload of the multipliers; done after the bias add so the bias
// registers may overlap mf0..mf3
.ifc \load_mf, yes
vld1.64 {\mf0-\mf3}, [r1,:128]!
.endif
vmull.u16 q10, d16, \mf0
vmull.u16 q11, d17, \mf1
vmull.u16 q12, d18, \mf2
vmull.u16 q13, d19, \mf3
// q14/q15 = sign masks (all ones for negative inputs, zero otherwise)
vshr.s16 q14, q14, #15
vshr.s16 q15, q15, #15
vshrn.u32 d16, q10, #16
vshrn.u32 d17, q11, #16
vshrn.u32 d18, q12, #16
vshrn.u32 d19, q13, #16
// (x ^ sign) - sign negates x where the input was negative
veor q8, q8, q14
veor q9, q9, q15
vsub.s16 q8, q8, q14
vsub.s16 q9, q9, q15
vorr \mask, q8, q9
vst1.64 {d16-d19}, [r0,:128]!
.endm
// QUANT_END: common tail of the quant functions.
// Returns r0 = 1 if any bit of the 64-bit register d is set, else 0.
// Clobbers r2, r3 and the flags.
.macro QUANT_END d
vmov r2, r3, \d
orrs r0, r2, r3
movne r0, #1
bx lr
.endm
// quant_2x2_dc( int16_t dct[4], int mf, int bias )
// Quantises four coefficients in place with a single multiplier and bias:
// dct[i] = sign(dct[i]) * (((abs(dct[i]) + bias) * mf) >> 16).
// r0 = dct, r1 = mf, r2 = bias. Returns 1 if any result is nonzero, else 0.
function quant_2x2_dc_neon
vld1.64 {d0}, [r0,:64]
vabs.s16 d3, d0
vdup.16 d2, r2
vdup.16 d1, r1
vadd.u16 d3, d3, d2
vmull.u16 q3, d3, d1
// d0 = sign mask of the original coefficients
vshr.s16 d0, d0, #15
vshrn.u32 d3, q3, #16
// restore the sign: (x ^ sign) - sign
veor d3, d3, d0
vsub.s16 d3, d3, d0
vst1.64 {d3}, [r0,:64]
QUANT_END d3
endfunc
// quant_4x4_dc( int16_t dct[16], int mf, int bias )
// Quantises 16 coefficients in place with one scalar multiplier and bias.
// r0 = dct, r1 = mf, r2 = bias. Returns 1 if any result is nonzero, else 0.
function quant_4x4_dc_neon
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
// broadcast bias into q0 and mf into q2 (d4/d5 serve all four mf slots)
vdup.16 q0, r2
vdup.16 q2, r1
QUANT_TWO q0, q0, d4, d5, d4, d5, q0
// fold the 128-bit nonzero mask to 64 bits for QUANT_END
vorr d0, d0, d1
QUANT_END d0
endfunc
// quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
// Quantises 16 coefficients in place with per-coefficient multiplier and bias.
// r0 = dct, r1 = mf, r2 = bias. Returns 1 if any result is nonzero, else 0.
function quant_4x4_neon
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
// q0/q1 = bias, d4-d7 = mf
vld1.64 {d0-d3}, [r2,:128]
vld1.64 {d4-d7}, [r1,:128]
QUANT_TWO q0, q1, d4, d5, d6, d7, q0
// fold the 128-bit nonzero mask to 64 bits for QUANT_END
vorr d0, d0, d1
QUANT_END d0
endfunc
// quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
// Quantises four consecutive 4x4 blocks in place with the same mf and bias.
// r0 = dct, r1 = mf, r2 = bias.
// Returns a 4-bit mask: bit n is set if block n has a nonzero result.
function quant_4x4x4_neon
// q4-q7 collect the per-block nonzero masks, so d8-d15 are saved and restored
vpush {d8-d15}
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
// q0/q1 = bias, d4-d7 = mf, loaded once and reused for all four blocks
vld1.64 {d0-d3}, [r2,:128]
vld1.64 {d4-d7}, [r1,:128]
QUANT_TWO q0, q1, d4, d5, d6, d7, q4
// r0 was advanced by the store inside QUANT_TWO, so this loads the next block
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
QUANT_TWO q0, q1, d4, d5, d6, d7, q5
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
QUANT_TWO q0, q1, d4, d5, d6, d7, q6
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
QUANT_TWO q0, q1, d4, d5, d6, d7, q7
// fold each 128-bit mask to 64 bits
vorr d8, d8, d9
vorr d10, d10, d11
vorr d12, d12, d13
vorr d14, d14, d15
// build the return value: one bit per block
vmov r0, r1, d8
vmov r2, r3, d10
orrs r0, r1
movne r0, #1
orrs r2, r3
orrne r0, #2
vmov r1, r2, d12
vmov r3, ip, d14
orrs r1, r2
orrne r0, #4
orrs r3, ip
orrne r0, #8
vpop {d8-d15}
bx lr
endfunc
// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
// Quantises 64 coefficients in place, 16 at a time, with per-coefficient
// multiplier and bias. r0 = dct, r1 = mf, r2 = bias.
// Returns 1 if any result is nonzero, else 0.
function quant_8x8_neon
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
vld1.64 {d0-d3}, [r2,:128]!
vld1.64 {d4-d7}, [r1,:128]!
// first group: the nonzero mask lands in q0 and is accumulated below
QUANT_TWO q0, q1, d4, d5, d6, d7, q0
.rept 3
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
// bias goes into q1/q2 so q0 keeps the running mask; QUANT_TWO reloads
// mf into d4-d7 itself (load_mf=yes) after the bias has been consumed
vld1.64 {d2-d5}, [r2,:128]!
QUANT_TWO q1, q2, d4, d5, d6, d7, q1, yes
vorr q0, q0, q1
.endr
vorr d0, d0, d1
QUANT_END d0
endfunc
// DEQUANT_START: shared prologue of the dequant functions.
// In:  r1 = dequant_mf base, r2 = i_qp.
// Out: r3 = i_qp / 6 - offset, with the flags set by that subtraction so the
//      caller can branch on its sign;
//      r1 = address of dequant_mf[i_mf] when dc is no, otherwise the first
//      word loaded from that address;
//      r2 = i_qp % 6.
// mf_size is log2 of the byte size of one dequant_mf[] row.
// Clobbers ip.
.macro DEQUANT_START mf_size offset dc=no
mov r3, #0x2b
mul r3, r3, r2
lsr r3, r3, #8 // i_qbits = i_qp / 6
add ip, r3, r3, lsl #1
sub r2, r2, ip, lsl #1 // i_mf = i_qp % 6
.ifc \dc,no
add r1, r1, r2, lsl #\mf_size // dequant_mf[i_mf]
.else
ldr r1, [r1, r2, lsl #\mf_size] // dequant_mf[i_mf][0][0]
.endif
subs r3, r3, #\offset // 6 for 8x8
.endm
// dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
// DEQUANT generates dequant_<size>_neon; it is instantiated below for 4x4
// and 8x8. bits is the amount subtracted from i_qp / 6 to get the shift, and
// bits+2 is log2 of the byte size of one dequant_mf[] row.
// r0 = dct, r1 = dequant_mf, r2 = i_qp. Coefficients are processed in place,
// 16 per pass: one pass for 4x4, four passes (counter in r2) for 8x8.
.macro DEQUANT size bits
function dequant_\size\()_neon
DEQUANT_START \bits+2, \bits
.ifc \size, 8x8
mov r2, #4
.endif
// flags still come from the subs in DEQUANT_START: negative shift -> rshift
blt dequant_\size\()_rshift
// Left-shift path: dct[i] = (dct[i] * mf[i]) << shift, all in 16 bits
vdup.16 q15, r3
dequant_\size\()_lshift_loop:
.ifc \size, 8x8
subs r2, r2, #1
.endif
// load 16 32-bit multipliers and narrow them to 16 bits, interleaved with
// the coefficient load
vld1.32 {d16-d17}, [r1,:128]!
vld1.32 {d18-d19}, [r1,:128]!
vmovn.s32 d4, q8
vld1.32 {d20-d21}, [r1,:128]!
vmovn.s32 d5, q9
vld1.32 {d22-d23}, [r1,:128]!
vmovn.s32 d6, q10
vld1.16 {d0-d3}, [r0,:128]
vmovn.s32 d7, q11
vmul.s16 q0, q0, q2
vmul.s16 q1, q1, q3
vshl.s16 q0, q0, q15
vshl.s16 q1, q1, q15
vst1.16 {d0-d3}, [r0,:128]!
.ifc \size, 8x8
bgt dequant_\size\()_lshift_loop
.endif
bx lr
// Right-shift path: dct[i] = (dct[i] * mf[i] + rounding) >> -shift, in 32 bits
dequant_\size\()_rshift:
// q15 keeps the negative shift count; vshl by a negative count shifts right
vdup.32 q15, r3
// ip = 1 << (-shift - 1), the rounding term
rsb r3, r3, #0
mov ip, #1
sub r3, r3, #1
lsl ip, ip, r3
.ifc \size, 8x8
dequant_\size\()_rshift_loop:
subs r2, r2, #1
.endif
// q10-q13 start at the rounding term and accumulate the products
vdup.32 q10, ip
vld1.32 {d16-d17}, [r1,:128]!
vdup.32 q11, ip
vld1.32 {d18-d19}, [r1,:128]!
vmovn.s32 d4, q8
vld1.32 {d16-d17}, [r1,:128]!
vmovn.s32 d5, q9
vld1.32 {d18-d19}, [r1,:128]!
vmovn.s32 d6, q8
vld1.16 {d0-d3}, [r0,:128]
vmovn.s32 d7, q9
vdup.32 q12, ip
vdup.32 q13, ip
vmlal.s16 q10, d0, d4
vmlal.s16 q11, d1, d5
vmlal.s16 q12, d2, d6
vmlal.s16 q13, d3, d7
vshl.s32 q10, q10, q15
vshl.s32 q11, q11, q15
vshl.s32 q12, q12, q15
vshl.s32 q13, q13, q15
vmovn.s32 d0, q10
vmovn.s32 d1, q11
vmovn.s32 d2, q12
vmovn.s32 d3, q13
vst1.16 {d0-d3}, [r0,:128]!
.ifc \size, 8x8
bgt dequant_\size\()_rshift_loop
.endif
bx lr
endfunc
.endm
DEQUANT 4x4, 4
DEQUANT 8x8, 6
// dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
// Dequantises 16 DC coefficients in place with the single multiplier
// dequant_mf[i_qp % 6][0] and the shift i_qp / 6 - 6.
// r0 = dct, r1 = dequant_mf, r2 = i_qp.
// Clobbers r1-r3, ip, q0-q2 and, on the right-shift path, q10-q13 and q15.
function dequant_4x4_dc_neon
    DEQUANT_START 6, 6, yes
    // flags come from the subs in DEQUANT_START: negative shift -> rshift
    blt         dequant_4x4_dc_rshift

    // Left-shift path: fold the shift into the scalar multiplier, so no
    // vector shift (and no shift-count register) is needed.
    lsl         r1,  r1,  r3
    vdup.16     q2,  r1
    vld1.16     {d0-d3}, [r0,:128]
    vmul.s16    q0,  q0,  q2
    vmul.s16    q1,  q1,  q2
    vst1.16     {d0-d3}, [r0,:128]
    bx          lr

    // Right-shift path: dct[i] = (dct[i] * mf + rounding) >> -shift, in 32 bits
dequant_4x4_dc_rshift:
    vdup.16     d4,  r1
    // q15 keeps the negative shift count; vshl by a negative count shifts right
    vdup.32     q15, r3
    // ip = 1 << (-shift - 1), the rounding term
    rsb         r3,  r3,  #0
    mov         ip,  #1
    sub         r3,  r3,  #1
    lsl         ip,  ip,  r3

    // q10-q13 start at the rounding term and accumulate the products
    vdup.32     q10, ip
    vdup.32     q11, ip
    vld1.16     {d0-d3}, [r0,:128]
    vdup.32     q12, ip
    vdup.32     q13, ip

    vmlal.s16   q10, d0,  d4
    vmlal.s16   q11, d1,  d4
    vmlal.s16   q12, d2,  d4
    vmlal.s16   q13, d3,  d4
    vshl.s32    q10, q10, q15
    vshl.s32    q11, q11, q15
    vshl.s32    q12, q12, q15
    vshl.s32    q13, q13, q15

    vmovn.s32   d0,  q10
    vmovn.s32   d1,  q11
    vmovn.s32   d2,  q12
    vmovn.s32   d3,  q13
    vst1.16     {d0-d3}, [r0,:128]
    bx          lr
endfunc
// decimate_score_1x generates decimate_score<size>_neon for size 15 and 16.
// r0 = pointer to the coefficients (16 int16 values are loaded).
// Returns 9 as soon as any coefficient has magnitude above 1, 0 if all are
// zero, otherwise the sum of X264(decimate_table4)[] entries indexed by the
// zero-run lengths found between nonzero coefficients.
.macro decimate_score_1x size
function decimate_score\size\()_neon
vld1.16 {q0, q1}, [r0, :128]
movrel r3, mask_2bit
vmov.s8 q3, #0x01
// saturate to 8 bits so all 16 coefficients fit in q0
vqmovn.s16 d0, q0
vqmovn.s16 d1, q1
vqabs.s8 q2, q0
vld1.8 {q8}, [r3, :128]
// q1 = lanes that are zero, q2 = lanes with magnitude above 1
vceq.s8 q1, q0, #0
vcgt.s8 q2, q2, q3
// reduce the zero mask to two bits per coefficient in a 32-bit word
vand.u8 q1, q1, q8
vshrn.u16 d4, q2, #4
vpadd.u8 d2, d2, d3
vpadd.u8 d4, d4, d4
vpadd.u8 d2, d2, d2
vmov.32 r2, d4[0]
vmov.32 r1, d2[0]
// any magnitude above 1 -> fixed score of 9
cmp r2, #0
beq 0f
mov r0, #9
bx lr
0:
// invert: bits are now set for nonzero coefficients; all zero -> score 0
mvns r1, r1
mov r0, #0
bxeq lr
.ifc \size, 15
// drop the lowest 2-bit field for the 15-coefficient variant
lsr r1, r1, #2
.endif
// reverse so that clz walks the fields starting from the lowest one
rbit r1, r1
movrelx r3, X264(decimate_table4), r2
1:
// clz / 2 = number of zero coefficients before the next nonzero one
clz r2, r1
lsl r1, r1, r2
lsr r12, r2, #1
ldrb r2, [r3, r12]
// shift out the 2-bit field just consumed; loop while bits remain
lsls r1, r1, #2
add r0, r0, r2
bne 1b
bx lr
endfunc
.endm
decimate_score_1x 15
decimate_score_1x 16
// decimate_score64( int16_t * )
// r0 = pointer to 64 coefficients.
// Returns 9 as soon as any coefficient has magnitude above 1, otherwise the
// sum of X264(decimate_table8)[] entries indexed by the zero-run lengths
// found between nonzero coefficients. lr is used as scratch, hence push/pop.
function decimate_score64_neon
push {lr}
vld1.16 {q8, q9}, [r0, :128]!
vld1.16 {q10, q11}, [r0, :128]!
vld1.16 {q12, q13}, [r0, :128]!
vld1.16 {q14, q15}, [r0, :128]
movrel r3, mask_1bit
vmov.s8 q3, #0x01
// saturate to 8 bits; note the swapped d-register order within each pair
vqmovn.s16 d17, q8
vqmovn.s16 d16, q9
vqmovn.s16 d19, q10
vqmovn.s16 d18, q11
vqmovn.s16 d21, q12
vqmovn.s16 d20, q13
vqmovn.s16 d23, q14
vqmovn.s16 d22, q15
vqabs.s8 q12, q8
vqabs.s8 q13, q9
vqabs.s8 q14, q10
vqabs.s8 q15, q11
vld1.8 {q2}, [r3, :128]
// q8-q11 = lanes that are zero
vceq.s8 q8, q8, #0
vceq.s8 q9, q9, #0
vceq.s8 q10, q10, #0
vceq.s8 q11, q11, #0
// q12 = lane-wise maximum magnitude over all four vectors
vmax.s8 q12, q12, q13
vmax.s8 q14, q14, q15
// keep one bit per lane, then pairwise-add down to a 64-bit bitmap in d16
vand.u8 q8, q8, q2
vand.u8 q9, q9, q2
vand.u8 q10, q10, q2
vand.u8 q11, q11, q2
vmax.s8 q12, q12, q14
vpadd.u8 d18, d18, d19
vpadd.u8 d19, d16, d17
// q12 = lanes whose maximum magnitude is above 1
vcgt.s8 q12, q12, q3
vpadd.u8 d22, d22, d23
vpadd.u8 d23, d20, d21
vshrn.u16 d24, q12, #4
vpadd.u8 d16, d22, d23
vpadd.u8 d17, d18, d19
vpadd.u8 d24, d24, d24
vpadd.u8 d16, d16, d17
vmov.32 r2, d24[0]
// r12 = low word, r1 = high word of the zero bitmap
vmov r12, r1, d16
// any magnitude above 1 -> fixed score of 9
cmp r2, #0
beq 0f
mov r0, #9
pop {pc}
0:
// invert both words: bits are now set for nonzero coefficients.
// The beq below consumes the flags from this mvns (assumes movrelx leaves
// the flags untouched -- confirm against its definition).
mvns r1, r1
mvn r12, r12
mov r0, #0
// lr = bits of the first word not yet consumed, carried into the second word
mov lr, #32
movrelx r3, X264(decimate_table8), r2
beq 2f
// first word: score each zero run, tracking the remaining bit count in lr
1:
clz r2, r1
lsl r1, r1, r2
sub lr, lr, r2
ldrb r2, [r3, r2]
lsls r1, r1, #1
sub lr, lr, #1
add r0, r0, r2
bne 1b
2:
cmp r12, #0
popeq {pc}
// first run of the second word also includes the zeros left over in lr
clz r2, r12
lsl r1, r12, r2
add r2, r2, lr
ldrb r2, [r3, r2]
lsls r1, r1, #1
add r0, r0, r2
popeq {pc}
// remaining runs of the second word
3:
clz r2, r1
lsl r1, r1, r2
ldrb r2, [r3, r2]
lsls r1, r1, #1
add r0, r0, r2
bne 3b
pop {pc}
endfunc
// int coeff_last( int16_t *l )
// coeff_last4: r0 = pointer to four coefficients.
// Returns 2 or 3 if the second word is nonzero, otherwise 0 or 1; the low
// bit is set when the upper halfword of the selected word is nonzero
// (the upper halfword is treated as the higher-indexed coefficient).
function coeff_last4_arm
// r2 = coefficients 0-1, r3 = coefficients 2-3
ldrd r2, r3, [r0]
// r0 = r3 just to set the flags; 0 is already the right base if r3 is zero
subs r0, r3, #0
movne r0, #2
movne r2, r3
// test the upper halfword of the selected word
lsrs r2, r2, #16
addne r0, r0, #1
bx lr
endfunc
// coeff_last8: r0 = pointer to eight coefficients.
// Picks the upper four if any of them is nonzero, else the lower four, then
// narrows down to a word and a halfword the same way coeff_last4_arm does.
// Returns an index in 0..7.
function coeff_last8_arm
// r2/r3 = coefficients 4-7
ldrd r2, r3, [r0, #8]
orrs ip, r2, r3
movne r0, #4
// upper half all zero: r0 is still the pointer, reload coefficients 0-3
ldrdeq r2, r3, [r0]
moveq r0, #0
tst r3, r3
addne r0, #2
movne r2, r3
// test the upper halfword of the selected word
lsrs r2, r2, #16
addne r0, r0, #1
bx lr
endfunc
// COEFF_LAST_1x generates coeff_last<size>_neon for size 15 and 16.
// r0 = pointer to the coefficients. Returns the index of the last nonzero
// coefficient, or 0 if none is found.
.macro COEFF_LAST_1x size
function coeff_last\size\()_neon
.if \size == 15
// step back one coefficient so that a 16-element load can be used;
// the index arithmetic below compensates via the size-dependent constants
sub r0, r0, #2
.endif
vld1.64 {d0-d3}, [r0,:128]
// all-ones per nonzero coefficient, then narrow twice: 16 bits -> 8 -> 4,
// giving a 64-bit value in d0 with one nibble per coefficient
vtst.16 q0, q0
vtst.16 q1, q1
vshrn.u16 d0, q0, #8
vshrn.u16 d1, q1, #8
vshrn.u16 d0, q0, #4
// leading zeros per 32-bit half; clz / 4 = zero coefficients at the top
vclz.i32 d0, d0
mov ip, #7
mov r3, #\size - 9
// r0 = clz of the lower eight, r1 = clz of the upper eight
vmov r0, r1, d0
// upper half has a nonzero coefficient when 7 - clz/4 is not negative
subs r1, ip, r1, lsr #2
addge r0, r1, #\size - 8
// otherwise use the lower half, clamping to 0 when it is empty too
subslt r0, r3, r0, lsr #2
movlt r0, #0
bx lr
endfunc
.endm
COEFF_LAST_1x 15
COEFF_LAST_1x 16
// coeff_last64: r0 = pointer to 64 coefficients.
// Returns the index of the last nonzero coefficient, or 0 if all are zero.
function coeff_last64_neon
// load 64 coefficients and narrow to bytes; unsigned saturation keeps any
// nonzero 16-bit pattern nonzero
vld1.64 {d16-d19}, [r0,:128]!
vqmovn.u16 d16, q8
vqmovn.u16 d17, q9
vld1.64 {d20-d23}, [r0,:128]!
vqmovn.u16 d18, q10
vqmovn.u16 d19, q11
vld1.64 {d24-d27}, [r0,:128]!
vqmovn.u16 d20, q12
vqmovn.u16 d21, q13
vld1.64 {d28-d31}, [r0,:128]!
vqmovn.u16 d22, q14
vqmovn.u16 d23, q15
movrel r1, pmovmskb_byte
vld1.64 {d0-d1}, [r1,:128]
// all-ones per nonzero byte
vtst.8 q8, q8
vtst.8 q9, q9
vtst.8 q10, q10
vtst.8 q11, q11
// keep one distinct bit per lane, then pairwise-add three times to pack the
// 64 lanes into a 64-bit bitmap in d0
vand q8, q8, q0
vand q9, q9, q0
vand q10, q10, q0
vand q11, q11, q0
vpadd.u8 d0, d16, d17
vpadd.u8 d1, d18, d19
vpadd.u8 d2, d20, d21
vpadd.u8 d3, d22, d23
vpadd.u8 d0, d0, d1
vpadd.u8 d1, d2, d3
vpadd.u8 d0, d0, d1
vclz.i32 d0, d0
mov ip, #31
// r0 = clz of coefficients 0-31, r1 = clz of coefficients 32-63
vmov r0, r1, d0
// upper word has a set bit when 31 - clz is not negative
subs r1, ip, r1
addge r0, r1, #32
// otherwise use the lower word, clamping to 0 when it is empty too
subslt r0, ip, r0
movlt r0, #0
bx lr
endfunc
// denoise_dct( dctcoef *, uint32_t *, udctcoef *, int )
// r0 = 16-bit coefficients (updated in place), r1 = 32-bit accumulators,
// r2 = 16-bit offsets, r3 = number of coefficients.
// For each coefficient: the accumulator gains abs(coef), and the coefficient
// becomes sign(coef) * max(abs(coef) - offset, 0).
// Processes 16 coefficients per iteration and always runs at least once.
function denoise_dct_neon
// the flags from this subs survive until the bgt at the bottom
1: subs r3, r3, #16
vld1.16 {q0, q1}, [r0]
// load 16 accumulators, then rewind r1 so the stores below hit the same slots
vld1.32 {q12, q13}, [r1]!
vld1.32 {q14, q15}, [r1]
sub r1, #32
vabs.s16 q8, q0
vabs.s16 q9, q1
vld1.16 {q2, q3}, [r2]!
// q10/q11 = all-ones where the coefficient is negative
vclt.s16 q10, q0, #0
vclt.s16 q11, q1, #0
vaddw.u16 q12, q12, d16
vaddw.u16 q13, q13, d17
// unsigned saturating subtract clamps at zero
vqsub.u16 q0, q8, q2
vqsub.u16 q1, q9, q3
vaddw.u16 q14, q14, d18
vaddw.u16 q15, q15, d19
vneg.s16 q8, q0
vneg.s16 q9, q1
// pick the negated value where the input was negative
vbsl q10, q8, q0
vbsl q11, q9, q1
vst1.32 {q12, q13}, [r1]!
vst1.32 {q14, q15}, [r1]!
vst1.16 {q10, q11}, [r0]!
bgt 1b
bx lr
endfunc

71
common/arm/quant.h Normal file
View File

@@ -0,0 +1,71 @@
/*****************************************************************************
* quant.h: arm quantization and level-run
*****************************************************************************
* Copyright (C) 2005-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_ARM_QUANT_H
#define X264_ARM_QUANT_H
/* Quantisation. Each function quantises dct[] in place and returns an int
 * (nonzero when any coefficient is left nonzero; a per-block bit mask for
 * the 4x4x4 variant).
 * NOTE(review): no quant_2x2_dc_armv6 definition is visible in the reviewed
 * part of quant-a.S -- confirm that this declaration is still backed by one. */
#define x264_quant_2x2_dc_armv6 x264_template(quant_2x2_dc_armv6)
int x264_quant_2x2_dc_armv6( int16_t dct[4], int mf, int bias );
#define x264_quant_2x2_dc_neon x264_template(quant_2x2_dc_neon)
int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias );
#define x264_quant_4x4_dc_neon x264_template(quant_4x4_dc_neon)
int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias );
#define x264_quant_4x4_neon x264_template(quant_4x4_neon)
int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
#define x264_quant_4x4x4_neon x264_template(quant_4x4x4_neon)
int x264_quant_4x4x4_neon( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] );
#define x264_quant_8x8_neon x264_template(quant_8x8_neon)
int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
/* Dequantisation. Each function rescales dct[] in place using the
 * dequant_mf row and shift selected by i_qp. */
#define x264_dequant_4x4_dc_neon x264_template(dequant_4x4_dc_neon)
void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
#define x264_dequant_4x4_neon x264_template(dequant_4x4_neon)
void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
#define x264_dequant_8x8_neon x264_template(dequant_8x8_neon)
void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
/* Decimation scores for blocks of 15, 16 and 64 coefficients. The argument
 * points at the coefficients; the score is returned. */
#define x264_decimate_score15_neon x264_template(decimate_score15_neon)
int x264_decimate_score15_neon( int16_t * );
#define x264_decimate_score16_neon x264_template(decimate_score16_neon)
int x264_decimate_score16_neon( int16_t * );
#define x264_decimate_score64_neon x264_template(decimate_score64_neon)
int x264_decimate_score64_neon( int16_t * );
/* Index of the last nonzero coefficient for blocks of 4, 8, 15, 16 and 64
 * coefficients. The 4 and 8 variants are plain ARM, the rest use NEON. */
#define x264_coeff_last4_arm x264_template(coeff_last4_arm)
int x264_coeff_last4_arm( int16_t * );
#define x264_coeff_last8_arm x264_template(coeff_last8_arm)
int x264_coeff_last8_arm( int16_t * );
#define x264_coeff_last15_neon x264_template(coeff_last15_neon)
int x264_coeff_last15_neon( int16_t * );
#define x264_coeff_last16_neon x264_template(coeff_last16_neon)
int x264_coeff_last16_neon( int16_t * );
#define x264_coeff_last64_neon x264_template(coeff_last64_neon)
int x264_coeff_last64_neon( int16_t * );
/* DCT denoising. Arguments in order: coefficients (modified in place),
 * 32-bit accumulators, per-coefficient offsets, coefficient count. */
#define x264_denoise_dct_neon x264_template(denoise_dct_neon)
void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int );
#endif