x264 source for verification 2026-05-22

This commit is contained in:
2026-05-22 16:45:04 +08:00
commit 4647f166e5
270 changed files with 166522 additions and 0 deletions

View File

@@ -0,0 +1,56 @@
/*****************************************************************************
* asm-offsets.c: check asm offsets for aarch64
*****************************************************************************
* Copyright (C) 2014-2025 x264 project
*
* Authors: Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common/common.h"
#include "asm-offsets.h"
/* Compile-time assertion usable as a struct member declaration: the array
 * length is 1 when x is true and -1 (rejected by the compiler) when it is
 * false. */
#define STATIC_ASSERT(name, x) int assert_##name[2 * !!(x) - 1]
/* Breaks the build unless member m of type s is located at byte offset o.
 * Declares struct check_<s>_<m> as the carrier of the assertion.
 * The offset argument is parenthesized so that an expression containing
 * operators that bind looser than == (for example A | B) is compared as a
 * whole instead of being split by the comparison. */
#define X264_CHECK_OFFSET(s, m, o) struct check_##s##_##m \
{ \
    STATIC_ASSERT(offset_##m, offsetof(s, m) == (o)); \
}
/* Breaks the build unless member b of type s starts exactly sizeof(type)
 * bytes after member a, i.e. a (of the given type) and b are adjacent.
 * Declares struct check_<s>_<a>_<b> as the carrier of the assertion. */
#define X264_CHECK_REL_OFFSET(s, a, type, b) struct check_##s##_##a##_##b \
{ \
    STATIC_ASSERT(rel_offset_##a##_##b, offsetof(s, a) + sizeof(type) == offsetof(s, b)); \
}
// Compile-time verification that every CABAC_* constant from asm-offsets.h
// equals the byte offset of the matching x264_cabac_t member. A mismatch
// yields a negative array size inside the generated struct and therefore a
// compile error.
X264_CHECK_OFFSET(x264_cabac_t, i_low, CABAC_I_LOW);
X264_CHECK_OFFSET(x264_cabac_t, i_range, CABAC_I_RANGE);
X264_CHECK_OFFSET(x264_cabac_t, i_queue, CABAC_I_QUEUE);
X264_CHECK_OFFSET(x264_cabac_t, i_bytes_outstanding, CABAC_I_BYTES_OUTSTANDING);
X264_CHECK_OFFSET(x264_cabac_t, p_start, CABAC_P_START);
X264_CHECK_OFFSET(x264_cabac_t, p, CABAC_P);
X264_CHECK_OFFSET(x264_cabac_t, p_end, CABAC_P_END);
X264_CHECK_OFFSET(x264_cabac_t, f8_bits_encoded, CABAC_F8_BITS_ENCODED);
X264_CHECK_OFFSET(x264_cabac_t, state, CABAC_STATE);
// The aarch64 asm makes the following additional assumptions about the
// x264_cabac_t memory layout: i_range directly follows i_low and
// i_bytes_outstanding directly follows i_queue, each pair being two
// adjacent int members.
X264_CHECK_REL_OFFSET(x264_cabac_t, i_low, int, i_range);
X264_CHECK_REL_OFFSET(x264_cabac_t, i_queue, int, i_bytes_outstanding);

View File

@@ -0,0 +1,39 @@
/*****************************************************************************
* asm-offsets.h: asm offsets for aarch64
*****************************************************************************
* Copyright (C) 2014-2025 x264 project
*
* Authors: Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_AARCH64_ASM_OFFSETS_H
#define X264_AARCH64_ASM_OFFSETS_H
/* Byte offsets of x264_cabac_t members, for use as immediates in the
 * aarch64 assembly. asm-offsets.c checks each value against offsetof() at
 * compile time, so any change to the struct layout must be mirrored here. */
#define CABAC_I_LOW 0x00 /* i_low */
#define CABAC_I_RANGE 0x04 /* i_range */
#define CABAC_I_QUEUE 0x08 /* i_queue */
#define CABAC_I_BYTES_OUTSTANDING 0x0c /* i_bytes_outstanding */
#define CABAC_P_START 0x10 /* p_start */
#define CABAC_P 0x18 /* p */
#define CABAC_P_END 0x20 /* p_end */
#define CABAC_F8_BITS_ENCODED 0x30 /* f8_bits_encoded */
#define CABAC_STATE 0x34 /* state */
#endif

291
common/aarch64/asm.S Normal file
View File

@@ -0,0 +1,291 @@
/*****************************************************************************
* asm.S: AArch64 utility macros
*****************************************************************************
* Copyright (C) 2008-2025 x264 project
*
* Authors: Mans Rullgard <mans@mansr.com>
* David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "config.h"
// Two-level token pasting: JOIN expands its arguments first, GLUE then
// concatenates the expanded tokens.
#define GLUE(a, b) a ## b
#define JOIN(a, b) GLUE(a, b)
// BASE is the x264_ namespace prefix and SYM_PREFIX the bare symbol prefix.
// When PREFIX is defined both carry a leading underscore.
#ifdef PREFIX
# define BASE _x264_
# define SYM_PREFIX _
#else
# define BASE x264_
# define SYM_PREFIX
#endif
// EXTERN_ASM is the prefix put in front of exported function names. With
// BIT_DEPTH defined it is BASE, then the BIT_DEPTH value, then an
// underscore; otherwise it is just BASE.
#ifdef BIT_DEPTH
# define EXTERN_ASM JOIN(JOIN(BASE, BIT_DEPTH), _)
#else
# define EXTERN_ASM BASE
#endif
// X(s):    EXTERN_ASM-prefixed name (bit-depth specific when BIT_DEPTH is set)
// X264(s): BASE-prefixed name (independent of BIT_DEPTH)
// EXT(s):  name with only the platform symbol prefix
#define X(s) JOIN(EXTERN_ASM, s)
#define X264(s) JOIN(BASE, s)
#define EXT(s) JOIN(SYM_PREFIX, s)
// ELF, MACH and FUNC are line prefixes for conditional directives. Each
// expands to nothing when its condition holds, so the directive after it is
// assembled. Otherwise it expands to a hash character, which presumably
// makes the assembler treat the rest of the line as a comment.
#ifdef __ELF__
# define ELF
#else
# define ELF #
#endif
#ifdef __MACH__
# define MACH
#else
# define MACH #
#endif
#if HAVE_AS_FUNC
# define FUNC
#else
# define FUNC #
#endif
// Select the base architecture level; AS_ARCH_LEVEL is not defined in this
// file (presumably provided by config.h).
.arch AS_ARCH_LEVEL
// For each optional extension, ENABLE_x / DISABLE_x expand to the matching
// .arch_extension directive when the HAVE_AS_ARCHEXT_x_DIRECTIVE macro is
// true, and to nothing otherwise.
#if HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE
#define ENABLE_DOTPROD .arch_extension dotprod
#define DISABLE_DOTPROD .arch_extension nodotprod
#else
#define ENABLE_DOTPROD
#define DISABLE_DOTPROD
#endif
#if HAVE_AS_ARCHEXT_I8MM_DIRECTIVE
#define ENABLE_I8MM .arch_extension i8mm
#define DISABLE_I8MM .arch_extension noi8mm
#else
#define ENABLE_I8MM
#define DISABLE_I8MM
#endif
#if HAVE_AS_ARCHEXT_SVE_DIRECTIVE
#define ENABLE_SVE .arch_extension sve
#define DISABLE_SVE .arch_extension nosve
#else
#define ENABLE_SVE
#define DISABLE_SVE
#endif
#if HAVE_AS_ARCHEXT_SVE2_DIRECTIVE
#define ENABLE_SVE2 .arch_extension sve2
#define DISABLE_SVE2 .arch_extension nosve2
#else
#define ENABLE_SVE2
#define DISABLE_SVE2
#endif
/* If we do support the .arch_extension directives, disable support for all
* the extensions that we may use, in case they were implicitly enabled by
* the .arch level. This makes it clear if we try to assemble an instruction
* from an unintended extension set; we only allow assembling such instructions
* within regions where we explicitly enable those extensions. */
DISABLE_DOTPROD
DISABLE_I8MM
DISABLE_SVE
DISABLE_SVE2
// function name, export=0, align=2
// Opens a function in the .text section, aligned as given by the align
// argument. With a non-zero export the label is the EXTERN_ASM-prefixed
// name and is declared .global; otherwise the bare name is used and stays
// local. ELF builds also get a .type directive, and .func is emitted when
// the FUNC prefix is active.
// As a side effect a one-shot macro named endfunc is defined. It emits the
// ELF .size directive for the same label (plus .endfunc when FUNC is
// active) and then purges itself so the next function can define it again.
.macro function name, export=0, align=2
.macro endfunc
.if \export
ELF .size EXTERN_ASM\name, . - EXTERN_ASM\name
.else
ELF .size \name, . - \name
.endif
FUNC .endfunc
.purgem endfunc
.endm
.text
.align \align
.if \export
.global EXTERN_ASM\name
ELF .type EXTERN_ASM\name, %function
FUNC .func EXTERN_ASM\name
EXTERN_ASM\name:
.else
ELF .type \name, %function
FUNC .func \name
\name:
.endif
.endm
// const name, align=2
// Opens a read-only data object: switches to .rodata on ELF or .const_data
// on Mach-O, applies the requested alignment and emits the label.
// Also defines a one-shot endconst macro that emits the ELF .size directive
// for the object and then purges itself.
.macro const name, align=2
.macro endconst
ELF .size \name, . - \name
.purgem endconst
.endm
ELF .section .rodata
MACH .const_data
.align \align
\name:
.endm
// movrel rd, val, offset=0
// Loads the address val+offset into register rd.
//  - Apple:          adrp/add with @PAGE and @PAGEOFF relocations.
//  - PIC on Windows: adrp/add with a :lo12: relocation.
//  - other PIC:      adrp/add with a :lo12: relocation, offset always folded
//                    into the symbol expression.
//  - non-PIC:        literal-pool load (ldr rd, =expr).
// In the Apple and Windows variants a negative offset is not folded into the
// relocation; the symbol address is formed first and the magnitude of the
// offset is subtracted afterwards.
.macro movrel rd, val, offset=0
#if defined(__APPLE__)
.if \offset < 0
adrp \rd, \val@PAGE
add \rd, \rd, \val@PAGEOFF
sub \rd, \rd, -(\offset)
.else
adrp \rd, \val+(\offset)@PAGE
add \rd, \rd, \val+(\offset)@PAGEOFF
.endif
#elif defined(PIC) && defined(_WIN32)
.if \offset < 0
adrp \rd, \val
add \rd, \rd, :lo12:\val
sub \rd, \rd, -(\offset)
.else
adrp \rd, \val+(\offset)
add \rd, \rd, :lo12:\val+(\offset)
.endif
#elif defined(PIC)
adrp \rd, \val+(\offset)
add \rd, \rd, :lo12:\val+(\offset)
#else
ldr \rd, =\val+\offset
#endif
.endm
// Row strides in bytes used by the transform code when stepping through its
// two pixel sources: FENC_STRIDE for the pointer in x1 and FDEC_STRIDE for
// the pointer in x2 (presumably the encode and decode macroblock caches;
// must match the C-side definitions).
#define FDEC_STRIDE 32
#define FENC_STRIDE 16
// sum = a + b, sub = a - b
// sum is written before sub is computed, so sum must not alias a or b;
// sub may alias either input.
.macro SUMSUB_AB sum, sub, a, b
add \sum, \a, \b
sub \sub, \a, \b
.endm
// De-interleave the pair s1:s2: t1 receives the even-indexed elements
// (uzp1) and t2 the odd-indexed elements (uzp2). t1 must not alias s1 or s2.
.macro unzip t1, t2, s1, s2
uzp1 \t1, \s1, \s2
uzp2 \t2, \s1, \s2
.endm
// One transpose step on a register pair: t1 = trn1(s1, s2) and
// t2 = trn2(s1, s2). t1 must not alias s1 or s2.
.macro transpose t1, t2, s1, s2
trn1 \t1, \s1, \s2
trn2 \t2, \s1, \s2
.endm
// Transpose a 4x4 block of 16-bit elements held in the low 64 bits of
// v0..v3. Arguments are bare register names; the macro appends the
// arrangement suffixes. The first stage swaps 32-bit pairs between rows
// 0/2 and 1/3 into t0..t3, the second swaps 16-bit elements back into
// v0..v3. t0..t3 are clobbered.
.macro transpose4x4.h v0, v1, v2, v3, t0, t1, t2, t3
transpose \t0\().2s, \t2\().2s, \v0\().2s, \v2\().2s
transpose \t1\().2s, \t3\().2s, \v1\().2s, \v3\().2s
transpose \v0\().4h, \v1\().4h, \t0\().4h, \t1\().4h
transpose \v2\().4h, \v3\().4h, \t2\().4h, \t3\().4h
.endm
// Same two-stage trn pattern as transpose4x4.h, applied to the full 128-bit
// registers (.4s then .8h arrangements). Arguments are bare register names.
// Results are left in v0..v3; t0..t3 are clobbered.
.macro transpose4x8.h v0, v1, v2, v3, t0, t1, t2, t3
transpose \t0\().4s, \t2\().4s, \v0\().4s, \v2\().4s
transpose \t1\().4s, \t3\().4s, \v1\().4s, \v3\().4s
transpose \v0\().8h, \v1\().8h, \t0\().8h, \t1\().8h
transpose \v2\().8h, \v3\().8h, \t2\().8h, \t3\().8h
.endm
// 8x8 transpose of 16-bit elements held in r0..r7 (bare register names),
// with r8 and r9 as scratch registers. Three rounds of trn1/trn2 are used,
// at 16-bit, 32-bit and 64-bit granularity; the result ends up in r0..r7
// and r8/r9 are clobbered. The instruction order matters because registers
// are reused as soon as their old contents have been consumed.
.macro transpose8x8.h r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
// round 1: interleave 16-bit elements of adjacent rows
trn1 \r8\().8h, \r0\().8h, \r1\().8h
trn2 \r9\().8h, \r0\().8h, \r1\().8h
trn1 \r1\().8h, \r2\().8h, \r3\().8h
trn2 \r3\().8h, \r2\().8h, \r3\().8h
trn1 \r0\().8h, \r4\().8h, \r5\().8h
trn2 \r5\().8h, \r4\().8h, \r5\().8h
trn1 \r2\().8h, \r6\().8h, \r7\().8h
trn2 \r7\().8h, \r6\().8h, \r7\().8h
// round 2: interleave 32-bit pairs
trn1 \r4\().4s, \r0\().4s, \r2\().4s
trn2 \r2\().4s, \r0\().4s, \r2\().4s
trn1 \r6\().4s, \r5\().4s, \r7\().4s
trn2 \r7\().4s, \r5\().4s, \r7\().4s
trn1 \r5\().4s, \r9\().4s, \r3\().4s
trn2 \r9\().4s, \r9\().4s, \r3\().4s
trn1 \r3\().4s, \r8\().4s, \r1\().4s
trn2 \r8\().4s, \r8\().4s, \r1\().4s
// round 3: interleave 64-bit halves, producing the final rows in r0..r7
trn1 \r0\().2d, \r3\().2d, \r4\().2d
trn2 \r4\().2d, \r3\().2d, \r4\().2d
trn1 \r1\().2d, \r5\().2d, \r6\().2d
trn2 \r5\().2d, \r5\().2d, \r6\().2d
trn2 \r6\().2d, \r8\().2d, \r2\().2d
trn1 \r2\().2d, \r8\().2d, \r2\().2d
trn1 \r3\().2d, \r9\().2d, \r7\().2d
trn2 \r7\().2d, \r9\().2d, \r7\().2d
.endm
// Byte-granularity counterpart of transpose8x8.h: the same register
// shuffling schedule, run at 8-bit, 16-bit and 32-bit granularity on eight
// 16-byte registers r0..r7 (bare register names). t0 and t1 are scratch
// registers; the result is left in r0..r7.
.macro transpose_8x16.b r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
// round 1: interleave bytes of adjacent rows
trn1 \t0\().16b, \r0\().16b, \r1\().16b
trn2 \t1\().16b, \r0\().16b, \r1\().16b
trn1 \r1\().16b, \r2\().16b, \r3\().16b
trn2 \r3\().16b, \r2\().16b, \r3\().16b
trn1 \r0\().16b, \r4\().16b, \r5\().16b
trn2 \r5\().16b, \r4\().16b, \r5\().16b
trn1 \r2\().16b, \r6\().16b, \r7\().16b
trn2 \r7\().16b, \r6\().16b, \r7\().16b
// round 2: interleave 16-bit pairs
trn1 \r4\().8h, \r0\().8h, \r2\().8h
trn2 \r2\().8h, \r0\().8h, \r2\().8h
trn1 \r6\().8h, \r5\().8h, \r7\().8h
trn2 \r7\().8h, \r5\().8h, \r7\().8h
trn1 \r5\().8h, \t1\().8h, \r3\().8h
trn2 \t1\().8h, \t1\().8h, \r3\().8h
trn1 \r3\().8h, \t0\().8h, \r1\().8h
trn2 \t0\().8h, \t0\().8h, \r1\().8h
// round 3: interleave 32-bit groups, producing the final rows in r0..r7
trn1 \r0\().4s, \r3\().4s, \r4\().4s
trn2 \r4\().4s, \r3\().4s, \r4\().4s
trn1 \r1\().4s, \r5\().4s, \r6\().4s
trn2 \r5\().4s, \r5\().4s, \r6\().4s
trn2 \r6\().4s, \t0\().4s, \r2\().4s
trn1 \r2\().4s, \t0\().4s, \r2\().4s
trn1 \r3\().4s, \t1\().4s, \r7\().4s
trn2 \r7\().4s, \t1\().4s, \r7\().4s
.endm
// Two-stage byte transpose of four 16-byte registers r0..r3 (bare register
// names): bytes are interleaved into t4..t7 first, then 16-bit pairs are
// interleaved back into r0..r3. t4..t7 are clobbered.
.macro transpose_4x16.b r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().16b, \r0\().16b, \r1\().16b
trn2 \t5\().16b, \r0\().16b, \r1\().16b
trn1 \t6\().16b, \r2\().16b, \r3\().16b
trn2 \t7\().16b, \r2\().16b, \r3\().16b
trn1 \r0\().8h, \t4\().8h, \t6\().8h
trn2 \r2\().8h, \t4\().8h, \t6\().8h
trn1 \r1\().8h, \t5\().8h, \t7\().8h
trn2 \r3\().8h, \t5\().8h, \t7\().8h
.endm
// 64-bit variant of transpose_4x16.b: the same two stages on the low
// 8 bytes of r0..r3 (.8b then .4h arrangements). t4..t7 are clobbered.
.macro transpose_4x8.b r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().8b, \r0\().8b, \r1\().8b
trn2 \t5\().8b, \r0\().8b, \r1\().8b
trn1 \t6\().8b, \r2\().8b, \r3\().8b
trn2 \t7\().8b, \r2\().8b, \r3\().8b
trn1 \r0\().4h, \t4\().4h, \t6\().4h
trn2 \r2\().4h, \t4\().4h, \t6\().4h
trn1 \r1\().4h, \t5\().4h, \t7\().4h
trn2 \r3\().4h, \t5\().4h, \t7\().4h
.endm

View File

@@ -0,0 +1,82 @@
/*****************************************************************************
* bitstream-a.S: aarch64 bitstream functions
*****************************************************************************
* Copyright (C) 2014-2025 x264 project
*
* Authors: Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
// nal_escape_neon: x0 = dst, x1 = src, x2 = end; returns the advanced dst
// in x0.
// Copies the bytes in [src, end) to dst. Whenever a byte with value <= 3
// follows two zero bytes in the output history, a byte of value 3 is
// written in front of it.
// Full 16-byte chunks are screened with NEON and copied verbatim when no
// position qualifies; otherwise the chunk, and any tail shorter than 16
// bytes, goes through the scalar loop at 101.
// v0 carries the previous 16 bytes so the two-byte history is available
// across chunk boundaries. It starts as all 0xff, i.e. no zeros in history.
// NOTE(review): the vector path reads and writes whole 16-byte chunks;
// buffer padding requirements are not visible here.
function nal_escape_neon, export=1
movi v0.16b, #0xff
movi v4.16b, #4
mov w3, #3
// x6 = src - end: negative count of remaining bytes; nothing to do if zero
subs x6, x1, x2
cbz x6, 99f
0:
// at least 16 bytes left (x6 < -15): take the vector path
cmn x6, #15
b.lt 16f
// fewer than 16 left: scalar loop indexes from end with the negative x6
mov x1, x2
b 100f
16:
ld1 {v1.16b}, [x1], #16
// v2 / v3: the bytes two and one position before each byte of v1
ext v2.16b, v0.16b, v1.16b, #14
ext v3.16b, v0.16b, v1.16b, #15
// v7: byte < 4, v5: byte[-2] == 0, v6: byte[-1] == 0
cmhi v7.16b, v4.16b, v1.16b
cmeq v5.16b, v2.16b, #0
cmeq v6.16b, v3.16b, #0
and v5.16b, v5.16b, v7.16b
and v5.16b, v5.16b, v6.16b
// narrow the 128-bit match mask to 64 bits so it can be tested in x7
shrn v7.8b, v5.8h, #4
mov x7, v7.d[0]
cbz x7, 16f
// a match exists: redo these 16 bytes in the scalar loop (x1 is already
// past them, so index from -16)
mov x6, #-16
100:
// w5 = the last two bytes seen, taken from v0
umov w5, v0.b[14]
umov w4, v0.b[15]
orr w5, w4, w5, lsl #8
101:
ldrb w4, [x1, x6]
// w9 <= 3 only if the two history bytes are zero and the new byte is <= 3
orr w9, w4, w5, lsl #16
cmp w9, #3
b.hi 102f
// emit the escape byte and record it in the history
strb w3, [x0], #1
orr w5, w3, w5, lsl #8
102:
adds x6, x6, #1
strb w4, [x0], #1
orr w5, w4, w5, lsl #8
b.lt 101b
// scalar run finished: write the history back to v0 and continue while
// src < end (the flags come from the subs; the following lsr and mov
// instructions do not modify them)
subs x6, x1, x2
lsr w9, w5, #8
mov v0.b[14], w9
mov v0.b[15], w5
b.lt 0b
ret
16:
// no match in this chunk: store it unchanged and keep it as history
subs x6, x1, x2
st1 {v1.16b}, [x0], #16
mov v0.16b, v1.16b
b.lt 0b
99:
ret
endfunc

View File

@@ -0,0 +1,32 @@
/*****************************************************************************
* bitstream.h: aarch64 bitstream functions
*****************************************************************************
* Copyright (C) 2017-2025 x264 project
*
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_AARCH64_BITSTREAM_H
#define X264_AARCH64_BITSTREAM_H
/* The public name is routed through x264_template(), which is defined
 * elsewhere (presumably it adds the bit-depth specific prefix -- confirm). */
#define x264_nal_escape_neon x264_template(nal_escape_neon)
/* NEON implementation in bitstream-a.S. Copies the bytes in [src, end) to
 * dst, inserting a byte of value 3 before any byte <= 3 that follows two
 * zero bytes, and returns the position in dst after the last byte written. */
uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end );
#endif

131
common/aarch64/cabac-a.S Normal file
View File

@@ -0,0 +1,131 @@
/*****************************************************************************
* cabac-a.S: aarch64 cabac
*****************************************************************************
* Copyright (C) 2014-2025 x264 project
*
* Authors: Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
#include "asm-offsets.h"
// cabac_encode_decision_asm: x0 = pointer to x264_cabac_t, w1 = context
// index into the state array, w2 = bin value b (roles inferred from how
// the registers are used; the C prototype is not visible here).
// Register conventions shared with cabac_putbyte, which is also entered
// from the bypass and terminal encoders:
// w11 holds x264_cabac_t.i_low
// w12 holds x264_cabac_t.i_range
// w2 holds x264_cabac_t.i_queue
function cabac_encode_decision_asm, export=1
add w10, w1, #CABAC_STATE
ldrb w3, [x0, w10, uxtw] // i_state
ldr w12, [x0, #CABAC_I_RANGE]
// table base is biased by -4, to be indexed with the unmasked i_range >> 6
movrel x8, X264(cabac_range_lps), -4
movrel x9, X264(cabac_transition)
ubfx x4, x3, #1, #7 // i_state >> 1
asr w5, w12, #6 // i_range >> 6
add x8, x8, x4, lsl #2 // row of 4 bytes per i_state >> 1
orr w14, w2, w3, lsl #1 // transition index: (i_state << 1) | b
ldrb w4, [x8, w5, uxtw] // i_range_lps
ldr w11, [x0, #CABAC_I_LOW]
eor w6, w2, w3 // b ^ i_state
ldrb w9, [x9, w14, uxtw] // next i_state
sub w12, w12, w4 // i_range -= i_range_lps
add w7, w11, w12 // i_low + reduced i_range
tst w6, #1 // (b ^ i_state) & 1
// when the low bits of b and i_state differ: i_range = i_range_lps and
// i_low += the reduced i_range
csel w12, w4, w12, ne
csel w11, w7, w11, ne
strb w9, [x0, w10, uxtw] // i_state
cabac_encode_renorm:
// shift = clz(i_range) - 23; applied to i_low and i_range and added to
// i_queue
ldr w2, [x0, #CABAC_I_QUEUE]
clz w5, w12
sub w5, w5, #23
lsl w11, w11, w5
lsl w12, w12, w5
adds w2, w2, w5
// i_queue >= 0: output is due
b.ge cabac_putbyte
stp w11, w12, [x0, #CABAC_I_LOW] // store i_low, i_range
str w2, [x0, #CABAC_I_QUEUE]
ret
.align 5
// cabac_putbyte: entered with w11 = i_low, w12 = i_range, w2 = i_queue
// (>= 0) and x0 = cabac context. Extracts the bits of i_low above position
// i_queue + 10 as out, clears them from i_low and subtracts 8 from i_queue.
cabac_putbyte:
ldr w6, [x0, #CABAC_I_BYTES_OUTSTANDING]
add w14, w2, #10
mov w13, #-1
sub w2, w2, #8
asr w4, w11, w14 // out
lsl w13, w13, w14
subs w5, w4, #0xff
bic w11, w11, w13 // drop the extracted bits from i_low
// out == 0xff: only count it as outstanding, nothing is written yet
cinc w6, w6, eq
b.eq 0f
1:
ldr x7, [x0, #CABAC_P]
asr w5, w4, #8 // carry
// add the carry into the byte written last
ldurb w8, [x7, #-1]
add w8, w8, w5
// carry - 1: low byte is 0xff without carry and 0x00 with carry
sub w5, w5, #1
sturb w8, [x7, #-1]
cbz w6, 3f
2:
// emit one such byte per outstanding byte
subs w6, w6, #1
strb w5, [x7], #1
b.gt 2b
3:
// emit the low 8 bits of out and write back the advanced pointer
strb w4, [x7], #1
str x7, [x0, #CABAC_P]
0:
stp w11, w12, [x0, #CABAC_I_LOW] // store i_low, i_range
stp w2, w6, [x0, #CABAC_I_QUEUE] // store i_queue, i_bytes_outstanding
ret
endfunc
// cabac_encode_bypass_asm: x0 = pointer to x264_cabac_t, w1 = value that is
// ANDed with i_range (presumably 0 or an all-ones mask derived from the
// bin -- confirm against the caller).
// Computes i_low = 2 * i_low + (w1 & i_range) and increments i_queue, then
// jumps to cabac_putbyte when i_queue becomes >= 0. i_range is loaded only
// because cabac_putbyte stores it back from w12.
function cabac_encode_bypass_asm, export=1, align=5
ldr w12, [x0, #CABAC_I_RANGE]
ldr w11, [x0, #CABAC_I_LOW]
ldr w2, [x0, #CABAC_I_QUEUE]
and w1, w1, w12
add w11, w1, w11, lsl #1
adds w2, w2, #1
b.ge cabac_putbyte
str w11, [x0, #CABAC_I_LOW]
str w2, [x0, #CABAC_I_QUEUE]
ret
endfunc
// cabac_encode_terminal_asm: x0 = pointer to x264_cabac_t.
// Subtracts 2 from i_range. If bit 8 of the result is still set, only
// i_range is written back. Otherwise i_range and i_low are both shifted
// left by one, i_queue is incremented, and cabac_putbyte takes over when
// i_queue becomes >= 0.
function cabac_encode_terminal_asm, export=1, align=5
ldr w12, [x0, #CABAC_I_RANGE]
sub w12, w12, #2
tbz w12, #8, 1f
str w12, [x0, #CABAC_I_RANGE]
ret
1:
ldr w2, [x0, #CABAC_I_QUEUE]
ldr w11, [x0, #CABAC_I_LOW]
lsl w12, w12, #1
adds w2, w2, #1
lsl w11, w11, #1
b.ge cabac_putbyte
stp w11, w12, [x0, #CABAC_I_LOW] // store i_low, i_range
str w2, [x0, #CABAC_I_QUEUE]
ret
endfunc

View File

@@ -0,0 +1,40 @@
/****************************************************************************
* dct-a-common.S: aarch64 transform and zigzag
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
* David Chen <david.chen@myais.com.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
// This file contains the NEON macros that are intended to be used by
// the SVE/SVE2 functions as well

// One-dimensional 4-point forward transform.
// Inputs are the last four arguments (a0..a3 = v4..v7), outputs the first
// four (v0..v3). With s03 = a0 + a3, d03 = a0 - a3, s12 = a1 + a2 and
// d12 = a1 - a2 the results are:
//   v0 = s03 + s12      v1 = 2*d03 + d12
//   v2 = s03 - s12      v3 = d03 - 2*d12
// The input registers v4..v7 are overwritten with intermediates.
.macro DCT_1D v0 v1 v2 v3 v4 v5 v6 v7
SUMSUB_AB \v1, \v6, \v5, \v6
SUMSUB_AB \v3, \v7, \v4, \v7
add \v0, \v3, \v1
add \v4, \v7, \v7
add \v5, \v6, \v6
sub \v2, \v3, \v1
add \v1, \v4, \v6
sub \v3, \v7, \v5
.endm

View File

@@ -0,0 +1,88 @@
/****************************************************************************
* dct-a-sve.S: aarch64 transform and zigzag
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Chen <david.chen@myais.com.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
#include "dct-a-common.S"
// Allow SVE instructions for the remainder of this file.
ENABLE_SVE
// sub4x4_dct_sve: x0 = output coefficients (16 x int16), x1 = first pixel
// block (row stride FENC_STRIDE), x2 = second pixel block (row stride
// FDEC_STRIDE). Argument roles are inferred from register usage.
// Computes the 4x4 forward transform of the difference between the two
// blocks. Each row is loaded with a predicated ld1b that zero-extends four
// bytes into 16-bit lanes, so a plain NEON sub on the aliased low halves of
// the Z registers yields the residual.
function sub4x4_dct_sve, export=1
mov x3, #FENC_STRIDE
mov x4, #FDEC_STRIDE
// p0: first four 16-bit lanes active
ptrue p0.h, vl4
ld1b {z0.h}, p0/z, [x1]
add x1, x1, x3
ld1b {z1.h}, p0/z, [x2]
add x2, x2, x4
ld1b {z2.h}, p0/z, [x1]
add x1, x1, x3
sub v16.4h, v0.4h, v1.4h // residual row 0
ld1b {z3.h}, p0/z, [x2]
add x2, x2, x4
ld1b {z4.h}, p0/z, [x1]
add x1, x1, x3
sub v17.4h, v2.4h, v3.4h // residual row 1
ld1b {z5.h}, p0/z, [x2]
add x2, x2, x4
ld1b {z6.h}, p0/z, [x1]
sub v18.4h, v4.4h, v5.4h // residual row 2
ld1b {z7.h}, p0/z, [x2]
sub v19.4h, v6.4h, v7.4h // residual row 3
// first pass, transpose, second pass
DCT_1D v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h
transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7
DCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
st1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x0]
ret
endfunc
// zigzag_interleave_8x8_cavlc_sve: x0 = dst, x1 = src (64 x 16-bit
// coefficients), x2 = byte array receiving non-zero flags (roles inferred
// from register usage).
// The two ld4 loads de-interleave src so that v0/v4, v1/v5, v2/v6 and v3/v7
// hold the coefficients at index 4*i + k for k = 0..3; these are stored to
// dst as four consecutive groups of 16 coefficients.
// In parallel a flag per 32-bit lane is computed by max-reduction, and the
// flags are written as two bytes at x2 and two bytes at x2 + 8.
function zigzag_interleave_8x8_cavlc_sve, export=1
mov z31.s, #1
// p2: first two 32-bit lanes active
ptrue p2.s, vl2
ld4 {v0.8h,v1.8h,v2.8h,v3.8h}, [x1], #64
ld4 {v4.8h,v5.8h,v6.8h,v7.8h}, [x1], #64
// unsigned max: a lane is non-zero if either input lane is non-zero
umax v16.8h, v0.8h, v4.8h
umax v17.8h, v1.8h, v5.8h
umax v18.8h, v2.8h, v6.8h
umax v19.8h, v3.8h, v7.8h
st1 {v0.8h}, [x0], #16
st1 {v4.8h}, [x0], #16
umaxp v16.8h, v16.8h, v17.8h
umaxp v18.8h, v18.8h, v19.8h
st1 {v1.8h}, [x0], #16
st1 {v5.8h}, [x0], #16
umaxp v16.8h, v16.8h, v18.8h
st1 {v2.8h}, [x0], #16
st1 {v6.8h}, [x0], #16
// each 32-bit lane becomes 1 if any of its bits is set, else 0
cmhs v16.4s, v16.4s, v31.4s
st1 {v3.8h}, [x0], #16
and v16.16b, v16.16b, v31.16b
st1 {v7.8h}, [x0], #16
// low byte of lanes 0 and 1 to x2[0..1]
st1b {z16.s}, p2, [x2]
add x2, x2, #8
// move lanes 2 and 3 down, then store them to the original x2[8..9]
mov v16.d[0], v16.d[1]
st1b {z16.s}, p2, [x2]
ret
endfunc

View File

@@ -0,0 +1,90 @@
/****************************************************************************
* dct-a-sve2.S: aarch64 transform and zigzag
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Chen <david.chen@myais.com.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
#include "dct-a-common.S"
// Allow SVE and SVE2 instructions for the remainder of this file.
ENABLE_SVE
ENABLE_SVE2
// add4x4_idct_sve2: x0 = destination pixels (row stride FDEC_STRIDE),
// x1 = 16 x int16 coefficients (roles inferred from register usage).
// Runs the two-pass 4x4 inverse transform, rounds the result with a shift
// right by 6, adds it to the existing pixels and stores four bytes per row
// with unsigned saturation.
function add4x4_idct_sve2, export=1
mov x2, #FDEC_STRIDE
mov x11, x0 // x11 walks the rows for loading, x0 is kept for storing
ptrue p0.h, vl8
ptrue p1.h, vl4
// v0 = coefficient rows 0 and 1, v1 = rows 2 and 3
ld1 {v0.8h, v1.8h}, [x1]
// first pass, done on 128-bit registers holding two rows each
SUMSUB_AB v4.8h, v5.8h, v0.8h, v1.8h
sshr v7.8h, v0.8h, #1
sshr v6.8h, v1.8h, #1
sub v7.8h, v7.8h, v1.8h
add v6.8h, v6.8h, v0.8h
// only the upper halves of v6/v7 are needed; move them to the lower half
mov v7.d[0], v7.d[1]
mov v6.d[0], v6.d[1]
// pixel rows are loaded as zero-extended bytes in 16-bit lanes
ld1b {z28.h}, p0/z, [x11]
add x11, x11, x2
SUMSUB_AB v0.8h, v2.8h, v4.8h, v6.8h
SUMSUB_AB v1.8h, v3.8h, v5.8h, v7.8h
// note the v3, v2 order: the butterfly outputs are consumed in this order
transpose4x4.h v0, v1, v3, v2, v16, v17, v18, v19
// second pass on 64-bit registers
SUMSUB_AB v4.4h, v5.4h, v0.4h, v3.4h
sshr v7.4h, v1.4h, #1
sshr v6.4h, v2.4h, #1
sub v7.4h, v7.4h, v2.4h
add v6.4h, v6.4h, v1.4h
ld1b {z29.h}, p0/z, [x11]
add x11, x11, x2
SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
// rounding shift right by 6 on the four active lanes
srshr z0.h, p1/m, z0.h, #6
srshr z1.h, p1/m, z1.h, #6
// the third pixel row goes to z31 and the fourth to z30, matching v3
// holding the third output row and v2 the fourth
ld1b {z31.h}, p0/z, [x11]
add x11, x11, x2
srshr z2.h, p1/m, z2.h, #6
srshr z3.h, p1/m, z3.h, #6
ld1b {z30.h}, p0/z, [x11]
// add the residual to the pixels
add v0.8h, v0.8h, v28.8h
add v1.8h, v1.8h, v29.8h
add v2.8h, v2.8h, v30.8h
add v3.8h, v3.8h, v31.8h
// saturate signed 16-bit to unsigned 8-bit, kept in the low byte of each lane
sqxtunb z0.b, z0.h
sqxtunb z1.b, z1.h
sqxtunb z2.b, z2.h
sqxtunb z3.b, z3.h
// store four bytes per row; rows are written in the order z0, z1, z3, z2
st1b {z0.h}, p1, [x0]
add x0, x0, x2
st1b {z1.h}, p1, [x0]
add x0, x0, x2
st1b {z3.h}, p1, [x0]
add x0, x0, x2
st1b {z2.h}, p1, [x0]
ret
endfunc

998
common/aarch64/dct-a.S Normal file
View File

@@ -0,0 +1,998 @@
/****************************************************************************
* dct-a.S: aarch64 transform and zigzag
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
#include "dct-a-common.S"
// 32-entry byte index table. Each pair of consecutive values addresses the
// two bytes of one 16-bit element, giving the element order
// 0,4,1,2, 5,8,12,9, 6,3,7,10, 13,14,11,15.
// Presumably a table-lookup permutation for the 4x4 frame zigzag scan; the
// user of this table is not visible in this part of the file.
const scan4x4_frame, align=4
.byte 0,1, 8,9, 2,3, 4,5
.byte 10,11, 16,17, 24,25, 18,19
.byte 12,13, 6,7, 14,15, 20,21
.byte 26,27, 28,29, 22,23, 30,31
endconst
// 16-entry byte index table. Each pair addresses one 16-bit element, giving
// the element order 0,1,4,2, 3,5,6,7.
// Presumably the permutation for the first part of the 4x4 field scan; the
// user of this table is not visible in this part of the file.
const scan4x4_field, align=4
.byte 0,1, 2,3, 8,9, 4,5
.byte 6,7, 10,11, 12,13, 14,15
endconst
// 16-entry permutation of the byte indices 0..15.
// Presumably the frame scan order applied to byte-sized pixel differences;
// the user of this table is not visible in this part of the file.
const sub4x4_frame, align=4
.byte 0, 1, 4, 8
.byte 5, 2, 3, 6
.byte 9, 12, 13, 10
.byte 7, 11, 14, 15
endconst
// 16-entry permutation of the byte indices 0..15.
// Presumably the field scan order applied to byte-sized pixel differences;
// the user of this table is not visible in this part of the file.
const sub4x4_field, align=4
.byte 0, 4, 1, 8
.byte 12, 5, 9, 13
.byte 2, 6, 10, 14
.byte 3, 7, 11, 15
endconst
// sum = a + (b>>shift) sub = (a>>shift) - b
// Shifts are arithmetic (sshr). t0 and t1 are scratch registers. sum is
// written before b is read for sub, so sum must not alias b.
.macro SUMSUB_SHR shift sum sub a b t0 t1
sshr \t0, \b, #\shift
sshr \t1, \a, #\shift
add \sum, \a, \t0
sub \sub, \t1, \b
.endm
// sum = (a>>shift) + b sub = a - (b>>shift)
// Shifts are arithmetic (sshr). t0 and t1 are scratch registers. sum is
// written before a is read for sub, so sum must not alias a.
.macro SUMSUB_SHR2 shift sum sub a b t0 t1
sshr \t0, \a, #\shift
sshr \t1, \b, #\shift
add \sum, \t0, \b
sub \sub, \a, \t1
.endm
// a += 1.5*ma b -= 1.5*mb
// The factor 1.5*x is formed as x + (x >> 1) with an arithmetic shift.
// t0 and t1 are scratch registers.
.macro SUMSUB_15 a b ma mb t0 t1
sshr \t0, \ma, #1
sshr \t1, \mb, #1
add \t0, \t0, \ma
add \t1, \t1, \mb
add \a, \a, \t0
sub \b, \b, \t1
.endm
// dct4x4dc_neon: x0 = 16 x int16 coefficients, transformed in place.
// Two butterfly passes of add/sub pairs with transposes in between; the
// final stage uses halving instructions so that every output is
// (sum + 1) >> 1.
function dct4x4dc_neon, export=1
ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
movi v31.4h, #1 // rounding constant for the shsub outputs
// first pass
SUMSUB_AB v4.4h, v5.4h, v0.4h, v1.4h
SUMSUB_AB v6.4h, v7.4h, v2.4h, v3.4h
SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
SUMSUB_AB v3.4h, v1.4h, v5.4h, v7.4h
transpose v4.4h, v6.4h, v0.4h, v2.4h
transpose v5.4h, v7.4h, v1.4h, v3.4h
// second pass
SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
transpose v4.2s, v5.2s, v0.2s, v1.2s
transpose v6.2s, v7.2s, v2.2s, v3.2s
// srhadd already rounds; shsub does not, so 1 is added to its minuend first
add v16.4h, v4.4h, v31.4h
add v17.4h, v6.4h, v31.4h
srhadd v0.4h, v4.4h, v5.4h
shsub v1.4h, v16.4h, v5.4h
shsub v2.4h, v17.4h, v7.4h
srhadd v3.4h, v6.4h, v7.4h
st1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
ret
endfunc
// idct4x4dc_neon: x0 = 16 x int16 coefficients, transformed in place.
// Same butterfly and transpose structure as dct4x4dc_neon, but the final
// stage is a plain add/sub without halving or rounding.
function idct4x4dc_neon, export=1
ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
// first pass
SUMSUB_AB v4.4h, v5.4h, v0.4h, v1.4h
SUMSUB_AB v6.4h, v7.4h, v2.4h, v3.4h
SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
SUMSUB_AB v3.4h, v1.4h, v5.4h, v7.4h
transpose v4.4h, v6.4h, v0.4h, v2.4h
transpose v5.4h, v7.4h, v1.4h, v3.4h
// second pass
SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
transpose v4.2s, v5.2s, v0.2s, v1.2s
transpose v6.2s, v7.2s, v2.2s, v3.2s
SUMSUB_AB v0.4h, v1.4h, v4.4h, v5.4h
SUMSUB_AB v3.4h, v2.4h, v6.4h, v7.4h
st1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
ret
endfunc
// sub4x4_dct_neon: x0 = output coefficients (16 x int16), x1 = first pixel
// block (row stride FENC_STRIDE), x2 = second pixel block (row stride
// FDEC_STRIDE). Argument roles are inferred from register usage.
// Computes the 4x4 forward transform of the pixel difference x1 - x2.
function sub4x4_dct_neon, export=1
mov x3, #FENC_STRIDE
mov x4, #FDEC_STRIDE
// load four bytes per row into lane 0; usubl widens the difference to
// 16 bit (only the low four lanes are used afterwards)
ld1 {v0.s}[0], [x1], x3
ld1 {v1.s}[0], [x2], x4
ld1 {v2.s}[0], [x1], x3
usubl v16.8h, v0.8b, v1.8b
ld1 {v3.s}[0], [x2], x4
ld1 {v4.s}[0], [x1], x3
usubl v17.8h, v2.8b, v3.8b
ld1 {v5.s}[0], [x2], x4
ld1 {v6.s}[0], [x1], x3
usubl v18.8h, v4.8b, v5.8b
ld1 {v7.s}[0], [x2], x4
usubl v19.8h, v6.8b, v7.8b
// first pass, transpose, second pass
DCT_1D v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h
transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7
DCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
st1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x0]
ret
endfunc
// sub8x4_dct_neon: file-local helper (not exported) that transforms two
// horizontally adjacent 4x4 blocks from an 8-pixel wide, 4-row strip.
// In:  x0 = output, x1 / x2 = pixel sources, x3 / x4 = their row strides
//      (must be set by the caller).
// Out: x0 advanced by 64 bytes, x1 and x2 advanced by four rows, so the
//      helper can be called back to back for vertically adjacent strips.
function sub8x4_dct_neon
ld1 {v0.8b}, [x1], x3
ld1 {v1.8b}, [x2], x4
usubl v16.8h, v0.8b, v1.8b
ld1 {v2.8b}, [x1], x3
ld1 {v3.8b}, [x2], x4
usubl v17.8h, v2.8b, v3.8b
ld1 {v4.8b}, [x1], x3
ld1 {v5.8b}, [x2], x4
usubl v18.8h, v4.8b, v5.8b
ld1 {v6.8b}, [x1], x3
ld1 {v7.8b}, [x2], x4
usubl v19.8h, v6.8b, v7.8b
// first pass over both blocks at once
DCT_1D v0.8h, v1.8h, v2.8h, v3.8h, v16.8h, v17.8h, v18.8h, v19.8h
transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7
// second pass, written out inline (same arithmetic as DCT_1D)
SUMSUB_AB v16.8h, v19.8h, v0.8h, v3.8h
SUMSUB_AB v17.8h, v18.8h, v1.8h, v2.8h
add v22.8h, v19.8h, v19.8h
add v21.8h, v18.8h, v18.8h
add v0.8h, v16.8h, v17.8h
sub v1.8h, v16.8h, v17.8h
add v2.8h, v22.8h, v18.8h
sub v3.8h, v19.8h, v21.8h
// regroup the 64-bit halves: v4/v5 receive the low halves and v6/v7 the
// high halves, so each block is stored contiguously
zip1 v4.2d, v0.2d, v2.2d
zip2 v6.2d, v0.2d, v2.2d
zip1 v5.2d, v1.2d, v3.2d
zip2 v7.2d, v1.2d, v3.2d
st1 {v4.8h}, [x0], #16
st1 {v5.8h}, [x0], #16
st1 {v6.8h}, [x0], #16
st1 {v7.8h}, [x0], #16
ret
endfunc
// sub8x8_dct_neon: x0 = output, x1 / x2 = pixel sources.
// Processes an 8x8 area as two 8x4 strips. The return address is kept in
// x5 across the first call; the second strip is reached with a tail call,
// so sub8x4_dct_neon returns directly to the caller.
function sub8x8_dct_neon, export=1
mov x5, x30
mov x3, #FENC_STRIDE
mov x4, #FDEC_STRIDE
bl sub8x4_dct_neon
mov x30, x5
b sub8x4_dct_neon
endfunc
// sub16x16_dct_neon: x0 = output, x1 / x2 = pixel sources.
// Processes a 16x16 area as eight 8x4 strips, two per 8x8 quadrant, in the
// order top-left, top-right, bottom-left, bottom-right. The pointer
// adjustments rely on sub8x4_dct_neon advancing x1 and x2 by four rows per
// call. The return address is kept in x5 and the last strip is a tail call.
function sub16x16_dct_neon, export=1
mov x5, x30
mov x3, #FENC_STRIDE
mov x4, #FDEC_STRIDE
// top-left quadrant
bl sub8x4_dct_neon
bl sub8x4_dct_neon
// back up eight rows and move eight pixels right: top-right quadrant
sub x1, x1, #8*FENC_STRIDE-8
sub x2, x2, #8*FDEC_STRIDE-8
bl sub8x4_dct_neon
bl sub8x4_dct_neon
// already eight rows down; move eight pixels left: bottom-left quadrant
sub x1, x1, #8
sub x2, x2, #8
bl sub8x4_dct_neon
bl sub8x4_dct_neon
// back up eight rows and move eight pixels right: bottom-right quadrant
sub x1, x1, #8*FENC_STRIDE-8
sub x2, x2, #8*FDEC_STRIDE-8
bl sub8x4_dct_neon
mov x30, x5
b sub8x4_dct_neon
endfunc
// One-dimensional 8-point forward transform on fixed registers.
// Input rows are v0..v7 and the results are written back to v0..v7;
// v16..v31 are used as temporaries. The type argument is not referenced in
// the body; callers pass row or col purely as a label.
.macro DCT8_1D type
SUMSUB_AB v18.8h, v17.8h, v3.8h, v4.8h // s34/d34
SUMSUB_AB v19.8h, v16.8h, v2.8h, v5.8h // s25/d25
SUMSUB_AB v22.8h, v21.8h, v1.8h, v6.8h // s16/d16
SUMSUB_AB v23.8h, v20.8h, v0.8h, v7.8h // s07/d07
SUMSUB_AB v24.8h, v26.8h, v23.8h, v18.8h // a0/a2
SUMSUB_AB v25.8h, v27.8h, v22.8h, v19.8h // a1/a3
SUMSUB_AB v30.8h, v29.8h, v20.8h, v17.8h // a6/a5
// a6 -= 1.5 * d16, a5 -= 1.5 * d25
sshr v23.8h, v21.8h, #1
sshr v18.8h, v16.8h, #1
add v23.8h, v23.8h, v21.8h
add v18.8h, v18.8h, v16.8h
sub v30.8h, v30.8h, v23.8h
sub v29.8h, v29.8h, v18.8h
SUMSUB_AB v28.8h, v31.8h, v21.8h, v16.8h // a4/a7
// a4 += 1.5 * d07 (result in v22), a7 += 1.5 * d34
sshr v22.8h, v20.8h, #1
sshr v19.8h, v17.8h, #1
add v22.8h, v22.8h, v20.8h
add v19.8h, v19.8h, v17.8h
add v22.8h, v28.8h, v22.8h
add v31.8h, v31.8h, v19.8h
// final combination into the output rows
SUMSUB_AB v0.8h, v4.8h, v24.8h, v25.8h
SUMSUB_SHR 2, v1.8h, v7.8h, v22.8h, v31.8h, v16.8h, v17.8h
SUMSUB_SHR 1, v2.8h, v6.8h, v26.8h, v27.8h, v18.8h, v19.8h
SUMSUB_SHR2 2, v3.8h, v5.8h, v30.8h, v29.8h, v20.8h, v21.8h
.endm
// sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 )
// x0 = dct, x1 = pix1, x2 = pix2 (argument order from dct.h).
// Loads eight rows of eight bytes from each source, forms the widened
// differences pix1 - pix2 in v0-v7, applies DCT8_1D, transposes, applies
// DCT8_1D again and stores 128 bytes at x0.
// On return x0 has advanced by 128 bytes and x1/x2 by eight rows.
function sub8x8_dct8_neon, export=1
mov x3, #FENC_STRIDE // row stride for x1
mov x4, #FDEC_STRIDE // row stride for x2
// loads are interleaved with the widening subtracts of earlier rows
ld1 {v16.8b}, [x1], x3
ld1 {v17.8b}, [x2], x4
ld1 {v18.8b}, [x1], x3
ld1 {v19.8b}, [x2], x4
usubl v0.8h, v16.8b, v17.8b
ld1 {v20.8b}, [x1], x3
ld1 {v21.8b}, [x2], x4
usubl v1.8h, v18.8b, v19.8b
ld1 {v22.8b}, [x1], x3
ld1 {v23.8b}, [x2], x4
usubl v2.8h, v20.8b, v21.8b
ld1 {v24.8b}, [x1], x3
ld1 {v25.8b}, [x2], x4
usubl v3.8h, v22.8b, v23.8b
ld1 {v26.8b}, [x1], x3
ld1 {v27.8b}, [x2], x4
usubl v4.8h, v24.8b, v25.8b
ld1 {v28.8b}, [x1], x3
ld1 {v29.8b}, [x2], x4
usubl v5.8h, v26.8b, v27.8b
ld1 {v30.8b}, [x1], x3
ld1 {v31.8b}, [x2], x4
usubl v6.8h, v28.8b, v29.8b
usubl v7.8h, v30.8b, v31.8b
DCT8_1D row
transpose8x8.h v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
DCT8_1D col
st1 {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], #64
st1 {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], #64
ret
endfunc
// sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 )
// x0 = dct, x1 = pix1, x2 = pix2 (argument order from dct.h).
// Four calls to sub8x8_dct8_neon. Each call leaves x1/x2 eight rows
// further down and x0 128 bytes further on, so the fix-ups below visit
// the quadrants as top-left, top-right, bottom-left, bottom-right.
function sub16x16_dct8_neon, export=1
mov x7, x30 // keep the return address across the bl calls
bl X(sub8x8_dct8_neon)
// back up eight rows and step eight pixels to the right
sub x1, x1, #FENC_STRIDE*8 - 8
sub x2, x2, #FDEC_STRIDE*8 - 8
bl X(sub8x8_dct8_neon)
// eight pixels to the left, rows keep running downwards
sub x1, x1, #8
sub x2, x2, #8
bl X(sub8x8_dct8_neon)
mov x30, x7
// back up eight rows and step eight pixels to the right
sub x1, x1, #FENC_STRIDE*8 - 8
sub x2, x2, #FDEC_STRIDE*8 - 8
b X(sub8x8_dct8_neon) // tail call
endfunc
// First part of IDCT (minus final SUMSUB_BA)
// Inputs d0-d3, outputs d4-d7; the callers in this file finish the pass
// with two SUMSUB_AB butterflies on d4/d6 and d5/d7.
//   d4, d5 = SUMSUB_AB of d0 and d2 (assumed d0 + d2 and d0 - d2)
//   d7     = (d1 >> 1) - d3
//   d6     = (d3 >> 1) + d1
.macro IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3
SUMSUB_AB \d4, \d5, \d0, \d2
sshr \d7, \d1, #1
sshr \d6, \d3, #1
sub \d7, \d7, \d3
add \d6, \d6, \d1
.endm
// add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] )
// x0 = p_dst, x1 = dct (argument order from dct.h).
// Two IDCT passes with a transpose in between, rounding shift right by 6,
// then the result is added to four rows of four destination pixels
// (stride FDEC_STRIDE) and stored back with unsigned saturation.
function add4x4_idct_neon, export=1
mov x2, #FDEC_STRIDE
ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x1]
IDCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
ld1 {v28.s}[0], [x0], x2 // destination row 0
SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
// v3 and v2 are deliberately passed in swapped order from here on
transpose4x4.h v0, v1, v3, v2, v16, v17, v18, v19
IDCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v3.4h, v2.4h
ld1 {v29.s}[0], [x0], x2 // destination row 1
SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
srshr v0.4h, v0.4h, #6
srshr v1.4h, v1.4h, #6
ld1 {v31.s}[0], [x0], x2 // destination row 2, paired with v3 below
srshr v2.4h, v2.4h, #6
srshr v3.4h, v3.4h, #6
ld1 {v30.s}[0], [x0], x2 // destination row 3, paired with v2 below
sub x0, x0, x2, lsl #2 // rewind x0 to row 0 for the stores
uaddw v0.8h, v0.8h, v28.8b
uaddw v1.8h, v1.8h, v29.8b
uaddw v2.8h, v2.8h, v30.8b
uaddw v3.8h, v3.8h, v31.8b
sqxtun v0.8b, v0.8h // clamp to 0..255
sqxtun v1.8b, v1.8h
sqxtun v2.8b, v2.8h
sqxtun v3.8b, v3.8h
// rows are stored in the order v0, v1, v3, v2 to match the loads above
st1 {v0.s}[0], [x0], x2
st1 {v1.s}[0], [x0], x2
st1 {v3.s}[0], [x0], x2
st1 {v2.s}[0], [x0], x2
ret
endfunc
// add8x4_idct_neon: inverse transform of 64 bytes of coefficients at x1,
// added to four rows of eight destination pixels at x0.
// In:  x0 = destination, x1 = coefficients, x2 = destination stride.
//      x2 is NOT set here; the callers in this file load it with
//      FDEC_STRIDE before calling.
// Out: x1 advanced by 64 bytes, x0 advanced by four rows.
// No C prototype for this symbol appears in dct.h.
function add8x4_idct_neon, export=1
ld1 {v0.8h,v1.8h}, [x1], #32
ld1 {v2.8h,v3.8h}, [x1], #32
// regroup the 64-bit halves of the four loaded registers
transpose v20.2d, v21.2d, v0.2d, v2.2d
transpose v22.2d, v23.2d, v1.2d, v3.2d
IDCT_1D v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
SUMSUB_AB v0.8h, v3.8h, v16.8h, v18.8h
SUMSUB_AB v1.8h, v2.8h, v17.8h, v19.8h
transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7
IDCT_1D v16.8h, v17.8h, v18.8h, v19.8h, v0.8h, v1.8h, v2.8h, v3.8h
SUMSUB_AB v0.8h, v3.8h, v16.8h, v18.8h
SUMSUB_AB v1.8h, v2.8h, v17.8h, v19.8h
// rounding shift by 6, interleaved with the destination row loads
srshr v0.8h, v0.8h, #6
ld1 {v28.8b}, [x0], x2
srshr v1.8h, v1.8h, #6
ld1 {v29.8b}, [x0], x2
srshr v2.8h, v2.8h, #6
ld1 {v30.8b}, [x0], x2
srshr v3.8h, v3.8h, #6
ld1 {v31.8b}, [x0], x2
sub x0, x0, x2, lsl #2 // rewind x0 to the first row
uaddw v0.8h, v0.8h, v28.8b
uaddw v1.8h, v1.8h, v29.8b
uaddw v2.8h, v2.8h, v30.8b
uaddw v3.8h, v3.8h, v31.8b
// clamp to 0..255 and store, one row per register
sqxtun v0.8b, v0.8h
sqxtun v1.8b, v1.8h
st1 {v0.8b}, [x0], x2
sqxtun v2.8b, v2.8h
st1 {v1.8b}, [x0], x2
sqxtun v3.8b, v3.8h
st1 {v2.8b}, [x0], x2
st1 {v3.8b}, [x0], x2
ret
endfunc
// add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] )
// x0 = p_dst, x1 = dct (argument order from dct.h).
// Two add8x4_idct_neon calls; the helper leaves x0 four rows further
// down and x1 64 bytes further on, so the second call handles the lower
// half of the 8x8 block.
function add8x8_idct_neon, export=1
mov x2, #FDEC_STRIDE // stride expected by add8x4_idct_neon
mov x5, x30 // keep the return address across the bl
bl X(add8x4_idct_neon)
mov x30, x5
b X(add8x4_idct_neon) // tail call
endfunc
// add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] )
// x0 = p_dst, x1 = dct (argument order from dct.h).
// Eight add8x4_idct_neon calls, two per 8x8 quadrant. Each call moves x0
// four rows down and x1 64 bytes on; the fix-ups below visit the
// quadrants as top-left, top-right, bottom-left, bottom-right.
function add16x16_idct_neon, export=1
mov x2, #FDEC_STRIDE // stride expected by add8x4_idct_neon
mov x5, x30 // keep the return address across the bl calls
bl X(add8x4_idct_neon)
bl X(add8x4_idct_neon)
// back up eight rows and step eight pixels to the right
sub x0, x0, #8*FDEC_STRIDE-8
bl X(add8x4_idct_neon)
bl X(add8x4_idct_neon)
// eight pixels to the left, rows keep running downwards
sub x0, x0, #8
bl X(add8x4_idct_neon)
bl X(add8x4_idct_neon)
// back up eight rows and step eight pixels to the right
sub x0, x0, #8*FDEC_STRIDE-8
bl X(add8x4_idct_neon)
mov x30, x5
b X(add8x4_idct_neon) // tail call
endfunc
// One 1-D pass of the 8-point inverse transform.
// Input: v16-v23, output: v16-v23; v0-v3 are used as scratch.
// When "type" is row the last two input registers (v22, v23) are loaded
// from x1 inside the macro (x1 advances by 32 bytes), so the caller only
// needs to have v16-v21 loaded beforehand.
// NOTE(review): SUMSUB_AB/SUMSUB_SHR/SUMSUB_SHR2/SUMSUB_15 are defined
// elsewhere; their exact arithmetic is not visible from here.
.macro IDCT8_1D type
SUMSUB_AB v0.8h, v1.8h, v16.8h, v20.8h // a0/a2
.ifc \type, row
ld1 {v22.8h,v23.8h}, [x1], #32
.endif
SUMSUB_SHR 1, v2.8h, v3.8h, v18.8h, v22.8h, v16.8h, v20.8h // a6/a4
SUMSUB_AB v16.8h, v18.8h, v21.8h, v19.8h
SUMSUB_15 v16.8h, v18.8h, v17.8h, v23.8h, v20.8h, v22.8h // a7/a1
SUMSUB_AB v22.8h, v23.8h, v23.8h, v17.8h
SUMSUB_15 v23.8h, v22.8h, v21.8h, v19.8h, v20.8h, v17.8h // a5/a3
SUMSUB_SHR 2, v21.8h, v22.8h, v22.8h, v23.8h, v19.8h, v17.8h // b3/b5
SUMSUB_SHR2 2, v20.8h, v23.8h, v16.8h, v18.8h, v19.8h, v17.8h // b1/b7
SUMSUB_AB v18.8h, v2.8h, v0.8h, v2.8h // b0/b6
SUMSUB_AB v19.8h, v3.8h, v1.8h, v3.8h // b2/b4
// final butterflies produce the eight outputs in v16-v23
SUMSUB_AB v16.8h, v23.8h, v18.8h, v23.8h
SUMSUB_AB v17.8h, v22.8h, v19.8h, v22.8h
SUMSUB_AB v18.8h, v21.8h, v3.8h, v21.8h
SUMSUB_AB v19.8h, v20.8h, v2.8h, v20.8h
.endm
// add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] )
// x0 = p_dst, x1 = dct (argument order from dct.h).
// Two IDCT8_1D passes with a transpose in between, rounding shift right
// by 6, then the result is added to eight rows of eight destination
// pixels (stride FDEC_STRIDE) and stored back with unsigned saturation.
// On return x1 has advanced by 128 bytes and x0 by eight rows.
function add8x8_idct8_neon, export=1
mov x2, #FDEC_STRIDE
// only 96 of the 128 coefficient bytes are loaded here; the remaining
// 32 bytes (v22, v23) are loaded by IDCT8_1D row
ld1 {v16.8h,v17.8h}, [x1], #32
ld1 {v18.8h,v19.8h}, [x1], #32
ld1 {v20.8h,v21.8h}, [x1], #32
IDCT8_1D row
transpose8x8.h v16, v17, v18, v19, v20, v21, v22, v23, v30, v31
IDCT8_1D col
// destination row loads interleaved with the rounding shifts
ld1 {v0.8b}, [x0], x2
srshr v16.8h, v16.8h, #6
ld1 {v1.8b}, [x0], x2
srshr v17.8h, v17.8h, #6
ld1 {v2.8b}, [x0], x2
srshr v18.8h, v18.8h, #6
ld1 {v3.8b}, [x0], x2
srshr v19.8h, v19.8h, #6
ld1 {v4.8b}, [x0], x2
srshr v20.8h, v20.8h, #6
ld1 {v5.8b}, [x0], x2
srshr v21.8h, v21.8h, #6
ld1 {v6.8b}, [x0], x2
srshr v22.8h, v22.8h, #6
ld1 {v7.8b}, [x0], x2
srshr v23.8h, v23.8h, #6
sub x0, x0, x2, lsl #3 // rewind x0 to the first row
// add residual, clamp to 0..255 and store; adds, narrows and stores of
// different rows are interleaved
uaddw v16.8h, v16.8h, v0.8b
uaddw v17.8h, v17.8h, v1.8b
uaddw v18.8h, v18.8h, v2.8b
sqxtun v0.8b, v16.8h
sqxtun v1.8b, v17.8h
sqxtun v2.8b, v18.8h
uaddw v19.8h, v19.8h, v3.8b
st1 {v0.8b}, [x0], x2
uaddw v20.8h, v20.8h, v4.8b
st1 {v1.8b}, [x0], x2
uaddw v21.8h, v21.8h, v5.8b
st1 {v2.8b}, [x0], x2
sqxtun v3.8b, v19.8h
sqxtun v4.8b, v20.8h
uaddw v22.8h, v22.8h, v6.8b
uaddw v23.8h, v23.8h, v7.8b
st1 {v3.8b}, [x0], x2
sqxtun v5.8b, v21.8h
st1 {v4.8b}, [x0], x2
sqxtun v6.8b, v22.8h
sqxtun v7.8b, v23.8h
st1 {v5.8b}, [x0], x2
st1 {v6.8b}, [x0], x2
st1 {v7.8b}, [x0], x2
ret
endfunc
// add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] )
// x0 = p_dst, x1 = dct (argument order from dct.h).
// Four add8x8_idct8_neon calls. Each call leaves x0 eight rows further
// down and x1 128 bytes further on; the fix-ups below visit the
// quadrants as top-left, top-right, bottom-left, bottom-right.
function add16x16_idct8_neon, export=1
mov x7, x30 // keep the return address across the bl calls
bl X(add8x8_idct8_neon)
sub x0, x0, #8*FDEC_STRIDE-8 // up eight rows, right eight pixels
bl X(add8x8_idct8_neon)
sub x0, x0, #8 // left eight pixels, rows keep running downwards
bl X(add8x8_idct8_neon)
sub x0, x0, #8*FDEC_STRIDE-8 // up eight rows, right eight pixels
mov x30, x7
b X(add8x8_idct8_neon) // tail call
endfunc
// add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] )
// x0 = p_dst, x1 = dct (argument order from dct.h).
// Adds four DC values, each rounded with (dc + 32) >> 6, to an 8x8 pixel
// block: dct[0]/dct[1] cover the left/right four columns of rows 0-3,
// dct[2]/dct[3] the left/right four columns of rows 4-7.
// The signed add is done with unsigned saturating byte ops: the value
// and its negation are each clamped to 0..255, then the first is added
// with uqadd and the second subtracted with uqsub (one of the two is 0).
function add8x8_idct_dc_neon, export=1
mov x2, #FDEC_STRIDE
ld1 {v16.4h}, [x1]
ld1 {v0.8b}, [x0], x2
srshr v16.4h, v16.4h, #6
ld1 {v1.8b}, [x0], x2
dup v20.8h, v16.h[0]
dup v21.8h, v16.h[1]
ld1 {v2.8b}, [x0], x2
dup v22.8h, v16.h[2]
dup v23.8h, v16.h[3]
ld1 {v3.8b}, [x0], x2
trn1 v20.2d, v20.2d, v21.2d // v20 = dc0 x4 | dc1 x4
ld1 {v4.8b}, [x0], x2
trn1 v21.2d, v22.2d, v23.2d // v21 = dc2 x4 | dc3 x4
ld1 {v5.8b}, [x0], x2
neg v22.8h, v20.8h
ld1 {v6.8b}, [x0], x2
neg v23.8h, v21.8h
ld1 {v7.8b}, [x0], x2
sub x0, x0, #8*FDEC_STRIDE // rewind x0 to the first row
// clamp to bytes: v20/v21 keep the positive part, v22/v23 the negated part
sqxtun v20.8b, v20.8h
sqxtun v21.8b, v21.8h
sqxtun v22.8b, v22.8h
sqxtun v23.8b, v23.8h
uqadd v0.8b, v0.8b, v20.8b
uqadd v1.8b, v1.8b, v20.8b
uqadd v2.8b, v2.8b, v20.8b
uqadd v3.8b, v3.8b, v20.8b
uqadd v4.8b, v4.8b, v21.8b
uqadd v5.8b, v5.8b, v21.8b
uqadd v6.8b, v6.8b, v21.8b
uqadd v7.8b, v7.8b, v21.8b
uqsub v0.8b, v0.8b, v22.8b
uqsub v1.8b, v1.8b, v22.8b
uqsub v2.8b, v2.8b, v22.8b
uqsub v3.8b, v3.8b, v22.8b
uqsub v4.8b, v4.8b, v23.8b
uqsub v5.8b, v5.8b, v23.8b
uqsub v6.8b, v6.8b, v23.8b
uqsub v7.8b, v7.8b, v23.8b
st1 {v0.8b}, [x0], x2
st1 {v1.8b}, [x0], x2
st1 {v2.8b}, [x0], x2
st1 {v3.8b}, [x0], x2
st1 {v4.8b}, [x0], x2
st1 {v5.8b}, [x0], x2
st1 {v6.8b}, [x0], x2
st1 {v7.8b}, [x0], x2
ret
endfunc
// Adds four DC values to four rows of sixteen pixels.
// "dc" names a register with halfword lanes (the caller passes e.g.
// v0.h); lanes 0-3 supply the DC for columns 0-3, 4-7, 8-11 and 12-15.
// Rows are read through x0 and written through x2, both stepping by x3.
// The signed add is done as uqadd of the clamped positive part followed
// by uqsub of the clamped negated part.
// Scratch: v4-v7, v20, v21, v24-v27.
.macro ADD16x4_IDCT_DC dc
ld1 {v4.16b}, [x0], x3
dup v24.8h, \dc[0]
dup v25.8h, \dc[1]
ld1 {v5.16b}, [x0], x3
dup v26.8h, \dc[2]
dup v27.8h, \dc[3]
ld1 {v6.16b}, [x0], x3
trn1 v24.2d, v24.2d, v25.2d // lane 0 x4 | lane 1 x4
ld1 {v7.16b}, [x0], x3
trn1 v25.2d, v26.2d, v27.2d // lane 2 x4 | lane 3 x4
neg v26.8h, v24.8h
neg v27.8h, v25.8h
// v20 = positive parts, v21 = negated parts, both clamped to 0..255
sqxtun v20.8b, v24.8h
sqxtun v21.8b, v26.8h
sqxtun2 v20.16b, v25.8h
sqxtun2 v21.16b, v27.8h
uqadd v4.16b, v4.16b, v20.16b
uqadd v5.16b, v5.16b, v20.16b
uqadd v6.16b, v6.16b, v20.16b
uqadd v7.16b, v7.16b, v20.16b
uqsub v4.16b, v4.16b, v21.16b
uqsub v5.16b, v5.16b, v21.16b
uqsub v6.16b, v6.16b, v21.16b
st1 {v4.16b}, [x2], x3
uqsub v7.16b, v7.16b, v21.16b
st1 {v5.16b}, [x2], x3
st1 {v6.16b}, [x2], x3
st1 {v7.16b}, [x2], x3
.endm
// add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] )
// x0 = p_dst, x1 = dct (argument order from dct.h).
// Rounds the sixteen DC values with (dc + 32) >> 6 and applies them four
// at a time to successive groups of four rows via ADD16x4_IDCT_DC.
function add16x16_idct_dc_neon, export=1
mov x2, x0 // x0 is the read pointer, x2 the write pointer
mov x3, #FDEC_STRIDE
ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x1]
srshr v0.4h, v0.4h, #6
srshr v1.4h, v1.4h, #6
ADD16x4_IDCT_DC v0.h
srshr v2.4h, v2.4h, #6
ADD16x4_IDCT_DC v1.h
srshr v3.4h, v3.4h, #6
ADD16x4_IDCT_DC v2.h
ADD16x4_IDCT_DC v3.h
ret
endfunc
// Column sums of the pixel differences over four rows of eight pixels.
// Reads four 8-byte rows through x1 (stride x3) and through x2 (stride
// x4), widens row(x1) - row(x2) to 16 bit and accumulates the four rows,
// leaving eight 16-bit column sums in dst.
// t0-t7 are scratch registers; x1 and x2 end up four rows further down.
.macro sub4x4x2_dct_dc, dst, t0, t1, t2, t3, t4, t5, t6, t7
ld1 {\t0\().8b}, [x1], x3
ld1 {\t1\().8b}, [x2], x4
ld1 {\t2\().8b}, [x1], x3
ld1 {\t3\().8b}, [x2], x4
usubl \t0\().8h, \t0\().8b, \t1\().8b
ld1 {\t4\().8b}, [x1], x3
ld1 {\t5\().8b}, [x2], x4
usubl \t1\().8h, \t2\().8b, \t3\().8b
ld1 {\t6\().8b}, [x1], x3
ld1 {\t7\().8b}, [x2], x4
add \dst\().8h, \t0\().8h, \t1\().8h
usubl \t2\().8h, \t4\().8b, \t5\().8b
usubl \t3\().8h, \t6\().8b, \t7\().8b
add \dst\().8h, \dst\().8h, \t2\().8h
add \dst\().8h, \dst\().8h, \t3\().8h
.endm
// sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 )
// x0 = dct, x1 = pix1, x2 = pix2 (argument order from dct.h).
// Builds column sums of pix1 - pix2 for rows 0-3 (v0) and rows 4-7 (v1),
// combines them with transposes and sum/difference butterflies, reduces
// with pairwise adds and stores four 16-bit results at x0.
function sub8x8_dct_dc_neon, export=1
mov x3, #FENC_STRIDE
mov x4, #FDEC_STRIDE
sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23
sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31
transpose v2.2d, v3.2d, v0.2d, v1.2d
SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
transpose v2.2d, v3.2d, v0.2d, v1.2d
SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
transpose v2.2d, v3.2d, v0.2d, v1.2d
// two pairwise-add rounds fold the lanes down to four results
addp v0.8h, v2.8h, v3.8h
addp v0.8h, v0.8h, v0.8h
st1 {v0.4h}, [x0]
ret
endfunc
// sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 )
// x0 = dct, x1 = pix1, x2 = pix2 (argument order from dct.h).
// Builds column sums of pix1 - pix2 for four consecutive groups of four
// rows (v0-v3), combines them with pairwise adds, transposes and
// sum/difference butterflies and stores eight 16-bit results at x0.
function sub8x16_dct_dc_neon, export=1
mov x3, #FENC_STRIDE
mov x4, #FDEC_STRIDE
// rows 0-3, 4-7, 8-11 and 12-15; the scratch registers are reused
sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23
sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31
sub4x4x2_dct_dc v2, v16, v17, v18, v19, v20, v21, v22, v23
sub4x4x2_dct_dc v3, v24, v25, v26, v27, v28, v29, v30, v31
addp v4.8h, v0.8h, v2.8h
addp v5.8h, v1.8h, v3.8h
transpose v2.4s, v3.4s, v4.4s, v5.4s
SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
transpose v2.4s, v3.4s, v0.4s, v1.4s
SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
transpose v2.2d, v3.2d, v0.2d, v1.2d
SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
// note the swapped operand order of the trn2 compared with the trn1
trn1 v2.2d, v0.2d, v1.2d
trn2 v3.2d, v1.2d, v0.2d
addp v0.8h, v2.8h, v3.8h
st1 {v0.8h}, [x0]
ret
endfunc
// zigzag_interleave_8x8_cavlc_neon( dctcoef *dst, dctcoef *src, uint8_t *nnz )
// x0 = dst, x1 = src, x2 = nnz (argument order from dct.h).
// Splits 64 16-bit coefficients into four groups of 16 by taking every
// fourth element (ld4 de-interleave) and stores the groups back to back
// at dst. For each group a flag byte (1 if any coefficient is non-zero,
// else 0) is written to nnz[0], nnz[1], nnz[8] and nnz[9].
function zigzag_interleave_8x8_cavlc_neon, export=1
mov x3, #7 // post-increment that takes x2 from nnz+1 to nnz+8
movi v31.4s, #1
// v0/v4 = elements 0,4,8,...  v1/v5 = 1,5,9,...  and so on
ld4 {v0.8h,v1.8h,v2.8h,v3.8h}, [x1], #64
ld4 {v4.8h,v5.8h,v6.8h,v7.8h}, [x1], #64
// unsigned max reductions: a group is non-zero iff its max is non-zero
umax v16.8h, v0.8h, v4.8h
umax v17.8h, v1.8h, v5.8h
umax v18.8h, v2.8h, v6.8h
umax v19.8h, v3.8h, v7.8h
st1 {v0.8h}, [x0], #16
st1 {v4.8h}, [x0], #16
umaxp v16.8h, v16.8h, v17.8h
umaxp v18.8h, v18.8h, v19.8h
st1 {v1.8h}, [x0], #16
st1 {v5.8h}, [x0], #16
umaxp v16.8h, v16.8h, v18.8h // one 32-bit lane per group now
st1 {v2.8h}, [x0], #16
st1 {v6.8h}, [x0], #16
cmhs v16.4s, v16.4s, v31.4s // lane >= 1 -> all ones
st1 {v3.8h}, [x0], #16
and v16.16b, v16.16b, v31.16b // reduce the mask to 0 or 1
st1 {v7.8h}, [x0], #16
st1 {v16.b}[0], [x2], #1
st1 {v16.b}[4], [x2], x3
st1 {v16.b}[8], [x2], #1
st1 {v16.b}[12], [x2]
ret
endfunc
// zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] )
// x0 = level, x1 = dct (argument order from dct.h).
// Reorders the 32 bytes at dct with two byte-table lookups driven by the
// 32-byte index table scan4x4_frame (defined elsewhere) and stores the
// result at level.
function zigzag_scan_4x4_frame_neon, export=1
movrel x2, scan4x4_frame
ld1 {v0.16b,v1.16b}, [x1]
ld1 {v16.16b,v17.16b}, [x2]
tbl v2.16b, {v0.16b,v1.16b}, v16.16b
tbl v3.16b, {v0.16b,v1.16b}, v17.16b
st1 {v2.16b,v3.16b}, [x0]
ret
endfunc
// Generates zigzag_sub_4x4<ac>_<f>_neon, where f is field or frame and
// ac is either empty or "ac". Prototypes (from dct.h):
//   int fn( dctcoef level[16], const pixel *p_src, pixel *p_dst );
//   int fn( dctcoef level[16], const pixel *p_src, pixel *p_dst,
//           dctcoef *dc );                               (ac variants)
// x0 = level, x1 = p_src, x2 = p_dst, x3 = dc.
// Loads a 4x4 block from p_src (stride FENC_STRIDE) and from p_dst
// (stride FDEC_STRIDE), permutes both with the byte table sub4x4_<f>
// (defined elsewhere), stores the 16-bit differences src - dst at level
// and copies the unpermuted source pixels over the p_dst block.
// The ac variants additionally write the first difference to *dc and
// replace it with zero in level.
// Returns 1 in w0 if any stored difference is non-zero, else 0.
.macro zigzag_sub_4x4 f ac
function zigzag_sub_4x4\ac\()_\f\()_neon, export=1
mov x9, #FENC_STRIDE
mov x4, #FDEC_STRIDE
movrel x5, sub4x4_\f
mov x6, x2 // second p_dst pointer, used for the copy-back stores
ld1 {v0.s}[0], [x1], x9
ld1 {v0.s}[1], [x1], x9
ld1 {v0.s}[2], [x1], x9
ld1 {v0.s}[3], [x1], x9
ld1 {v16.16b}, [x5]
ld1 {v1.s}[0], [x2], x4
ld1 {v1.s}[1], [x2], x4
ld1 {v1.s}[2], [x2], x4
ld1 {v1.s}[3], [x2], x4
tbl v2.16b, {v0.16b}, v16.16b
tbl v3.16b, {v1.16b}, v16.16b
st1 {v0.s}[0], [x6], x4
usubl v4.8h, v2.8b, v3.8b
.ifc \ac, ac
// pull out the first coefficient, zero it in place, store it to *dc
dup h7, v4.h[0]
ins v4.h[0], wzr
fmov w5, s7
strh w5, [x3]
.endif
usubl2 v5.8h, v2.16b, v3.16b
st1 {v0.s}[1], [x6], x4
// unsigned max across all 16 differences is zero iff all are zero
umax v6.8h, v4.8h, v5.8h
umaxv h6, v6.8h
st1 {v0.s}[2], [x6], x4
fmov w7, s6
st1 {v0.s}[3], [x6], x4
cmp w7, #0
st1 {v4.8h,v5.8h}, [x0]
cset w0, ne
ret
endfunc
.endm
zigzag_sub_4x4 field
zigzag_sub_4x4 field, ac
zigzag_sub_4x4 frame
zigzag_sub_4x4 frame, ac
// zigzag_scan_4x4_field_neon( int16_t level[16], int16_t dct[16] )
// x0 = level, x1 = dct (argument order from dct.h).
// Only the first eight coefficients (v0) are permuted, using the 16-byte
// index table scan4x4_field (defined elsewhere); the second eight (v1)
// are copied through unchanged.
function zigzag_scan_4x4_field_neon, export=1
movrel x2, scan4x4_field
ld1 {v0.8h,v1.8h}, [x1]
ld1 {v16.16b}, [x2]
tbl v0.16b, {v0.16b}, v16.16b
st1 {v0.8h,v1.8h}, [x0]
ret
endfunc
// zigzag_scan_8x8_frame_neon( int16_t level[64], int16_t dct[64] )
// x0 = level, x1 = dct (argument order from dct.h).
// dct is held in v0-v7 (eight coefficients per register). Each output
// register v24-v31 is produced by one tbl over a window of four input
// registers, using the index table scan8x8_frame. Output lanes whose
// source lies outside the chosen window are patched afterwards with
// single-lane moves.
function zigzag_scan_8x8_frame_neon, export=1
movrel x2, scan8x8_frame
ld1 {v0.8h,v1.8h}, [x1], #32
ld1 {v2.8h,v3.8h}, [x1], #32
ld1 {v4.8h,v5.8h}, [x1], #32
ld1 {v6.8h,v7.8h}, [x1]
ld1 {v16.16b,v17.16b}, [x2], #32
ld1 {v18.16b,v19.16b}, [x2], #32
ld1 {v20.16b,v21.16b}, [x2], #32
ld1 {v22.16b,v23.16b}, [x2], #32
// the window of each lookup must match the index rebasing used for the
// corresponding 16 bytes of scan8x8_frame
tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
tbl v27.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v19.16b
tbl v28.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v20.16b
tbl v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v21.16b
tbl v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v22.16b
tbl v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v23.16b
// patch the lanes that fell outside the lookup windows
mov v25.h[6], v4.h[0]
mov v25.h[7], v5.h[0]
mov v26.h[0], v4.h[1]
mov v27.h[4], v7.h[0]
mov v28.h[7], v4.h[4]
mov v29.h[7], v3.h[6]
mov v30.h[0], v2.h[7]
mov v30.h[1], v3.h[7]
st1 {v24.8h,v25.8h}, [x0], #32
st1 {v26.8h,v27.8h}, [x0], #32
st1 {v28.8h,v29.8h}, [x0], #32
st1 {v30.8h,v31.8h}, [x0]
ret
endfunc
// Byte index table for zigzag_scan_8x8_frame_neon.
// Z(z) expands to the two byte indices of 16-bit element z.
// T(x,y) addresses element x*8+y; it is redefined before each group of
// rows so that indices are relative to the first register of the tbl
// window used for that group (v0, v3, v0 and v4 respectively, i.e. an
// offset of 0, 3, 0 and 4 registers of eight elements).
// Entries that fall outside their window are fixed up by the mov
// instructions in the function.
#define Z(z) 2*(z), 2*(z)+1
#define T(x,y) Z(x*8+y)
const scan8x8_frame, align=5
.byte T(0,0), T(1,0), T(0,1), T(0,2)
.byte T(1,1), T(2,0), T(3,0), T(2,1)
.byte T(1,2), T(0,3), T(0,4), T(1,3)
.byte T(2,2), T(3,1), T(4,0), T(5,0)
.byte T(4,1), T(3,2), T(2,3), T(1,4)
.byte T(0,5), T(0,6), T(1,5), T(2,4)
#undef T
#define T(x,y) Z((x-3)*8+y)
.byte T(3,3), T(4,2), T(5,1), T(6,0)
.byte T(7,0), T(6,1), T(5,2), T(4,3)
#undef T
#define T(x,y) Z((x-0)*8+y)
.byte T(3,4), T(2,5), T(1,6), T(0,7)
.byte T(1,7), T(2,6), T(3,5), T(4,4)
#undef T
#define T(x,y) Z((x-4)*8+y)
.byte T(5,3), T(6,2), T(7,1), T(7,2)
.byte T(6,3), T(5,4), T(4,5), T(3,6)
.byte T(2,7), T(3,7), T(4,6), T(5,5)
.byte T(6,4), T(7,3), T(7,4), T(6,5)
.byte T(5,6), T(4,7), T(5,7), T(6,6)
.byte T(7,5), T(7,6), T(6,7), T(7,7)
endconst
// zigzag_scan_8x8_field_neon( int16_t level[64], int16_t dct[64] )
// x0 = level, x1 = dct (argument order from dct.h).
// dct is held in v0-v7. Output registers v24-v30 each come from one tbl
// over a sliding window of input registers, using the 112-byte index
// table scan8x8_field. The last output register v31 is built with two
// ext instructions instead of a lookup.
function zigzag_scan_8x8_field_neon, export=1
movrel x2, scan8x8_field
ld1 {v0.8h,v1.8h}, [x1], #32
ld1 {v2.8h,v3.8h}, [x1], #32
ld1 {v4.8h,v5.8h}, [x1], #32
ld1 {v6.8h,v7.8h}, [x1]
ld1 {v16.16b,v17.16b}, [x2], #32
ld1 {v18.16b,v19.16b}, [x2], #32
ld1 {v20.16b,v21.16b}, [x2], #32
ld1 {v22.16b}, [x2]
ext v31.16b, v7.16b, v7.16b, #4 // rotate v7 so h[2..7] come first
// the window start moves down by one register for each table row pair
tbl v24.16b, {v0.16b,v1.16b}, v16.16b
tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
tbl v26.16b, {v1.16b,v2.16b,v3.16b,v4.16b}, v18.16b
tbl v27.16b, {v2.16b,v3.16b,v4.16b,v5.16b}, v19.16b
tbl v28.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v20.16b
tbl v29.16b, {v4.16b,v5.16b,v6.16b}, v21.16b
tbl v30.16b, {v5.16b,v6.16b,v7.16b}, v22.16b
ext v31.16b, v6.16b, v31.16b, #12 // v31 = v6.h[6..7], v7.h[2..7]
st1 {v24.8h,v25.8h}, [x0], #32
st1 {v26.8h,v27.8h}, [x0], #32
st1 {v28.8h,v29.8h}, [x0], #32
st1 {v30.8h,v31.8h}, [x0]
ret
endfunc
// Generates zigzag_sub_8x8_<f>_neon, where f is field or frame.
// x0 = level, x1 = p_src, x2 = p_dst (argument order from dct.h).
// Loads an 8x8 block from p_src (stride FENC_STRIDE) and from p_dst
// (stride FDEC_STRIDE), permutes both with the 64-byte table sub8x8_<f>,
// stores the 64 16-bit differences src - dst at level (128 bytes) and
// copies the unpermuted source pixels over the p_dst block.
// Returns 1 in w0 if any difference is non-zero, else 0.
.macro zigzag_sub8x8 f
function zigzag_sub_8x8_\f\()_neon, export=1
movrel x4, sub8x8_\f
mov x5, #FENC_STRIDE
mov x6, #FDEC_STRIDE
mov x7, x2 // second p_dst pointer, used for the copy-back stores
// source rows, two per register, into v0-v3
ld1 {v0.d}[0], [x1], x5
ld1 {v0.d}[1], [x1], x5
ld1 {v1.d}[0], [x1], x5
ld1 {v1.d}[1], [x1], x5
ld1 {v2.d}[0], [x1], x5
ld1 {v2.d}[1], [x1], x5
ld1 {v3.d}[0], [x1], x5
ld1 {v3.d}[1], [x1]
// destination rows, two per register, into v4-v7
ld1 {v4.d}[0], [x2], x6
ld1 {v4.d}[1], [x2], x6
ld1 {v5.d}[0], [x2], x6
ld1 {v5.d}[1], [x2], x6
ld1 {v6.d}[0], [x2], x6
ld1 {v6.d}[1], [x2], x6
ld1 {v7.d}[0], [x2], x6
ld1 {v7.d}[1], [x2]
ld1 {v16.16b,v17.16b}, [x4], #32
ld1 {v18.16b,v19.16b}, [x4], #32
// same permutation applied to source (v24-v27) and destination (v28-v31)
tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
tbl v27.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v19.16b
tbl v28.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v16.16b
tbl v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v17.16b
tbl v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v18.16b
tbl v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v19.16b
usubl v4.8h, v24.8b, v28.8b
usubl2 v5.8h, v24.16b, v28.16b
usubl v6.8h, v25.8b, v29.8b
usubl2 v7.8h, v25.16b, v29.16b
usubl v16.8h, v26.8b, v30.8b
usubl2 v17.8h, v26.16b, v30.16b
usubl v18.8h, v27.8b, v31.8b
usubl2 v19.8h, v27.16b, v31.16b
// unsigned max over all 64 differences is zero iff all are zero
umax v20.8h, v4.8h, v5.8h
umax v21.8h, v6.8h, v7.8h
umax v22.8h, v16.8h, v17.8h
umax v23.8h, v18.8h, v19.8h
umax v20.8h, v20.8h, v21.8h
umax v21.8h, v22.8h, v23.8h
umax v20.8h, v20.8h, v21.8h
umaxv h22, v20.8h
// copy the source pixels over the destination block
st1 {v0.d}[0], [x7], x6
st1 {v0.d}[1], [x7], x6
st1 {v1.d}[0], [x7], x6
st1 {v1.d}[1], [x7], x6
st1 {v2.d}[0], [x7], x6
st1 {v2.d}[1], [x7], x6
st1 {v3.d}[0], [x7], x6
st1 {v3.d}[1], [x7]
st1 {v4.8h,v5.8h}, [x0], #32
st1 {v6.8h,v7.8h}, [x0], #32
st1 {v16.8h,v17.8h}, [x0], #32
st1 {v18.8h,v19.8h}, [x0]
fmov w9, s22
cmp w9, #0
cset w0, ne
ret
endfunc
.endm
zigzag_sub8x8 field
zigzag_sub8x8 frame
// Byte index table for zigzag_scan_8x8_field_neon (112 bytes, seven
// 16-byte lookups). T(x,y) addresses 16-bit element x*8+y through Z();
// it is redefined before each 16-byte group so that indices are relative
// to the first register of the tbl window used for that group. The first
// two groups share the unshifted definition, later groups subtract 1 to
// 5 registers of eight elements.
#undef T
#define T(x,y) Z(x*8+y)
const scan8x8_field, align=5
.byte T(0,0), T(0,1), T(0,2), T(1,0)
.byte T(1,1), T(0,3), T(0,4), T(1,2)
.byte T(2,0), T(1,3), T(0,5), T(0,6)
.byte T(0,7), T(1,4), T(2,1), T(3,0)
#undef T
#define T(x,y) Z((x-1)*8+y)
.byte T(2,2), T(1,5), T(1,6), T(1,7)
.byte T(2,3), T(3,1), T(4,0), T(3,2)
#undef T
#define T(x,y) Z((x-2)*8+y)
.byte T(2,4), T(2,5), T(2,6), T(2,7)
.byte T(3,3), T(4,1), T(5,0), T(4,2)
#undef T
#define T(x,y) Z((x-3)*8+y)
.byte T(3,4), T(3,5), T(3,6), T(3,7)
.byte T(4,3), T(5,1), T(6,0), T(5,2)
#undef T
#define T(x,y) Z((x-4)*8+y)
.byte T(4,4), T(4,5), T(4,6), T(4,7)
.byte T(5,3), T(6,1), T(6,2), T(5,4)
#undef T
#define T(x,y) Z((x-5)*8+y)
.byte T(5,5), T(5,6), T(5,7), T(6,3)
.byte T(7,0), T(7,1), T(6,4), T(6,5)
endconst
// Byte permutation for zigzag_sub_8x8_frame_neon (64 entries).
// Here T takes its arguments as (y,x) and yields the single byte index
// x*8+y into the 64 pixel bytes held in four consecutive registers.
// This definition of T stays in effect for the sub8x8_field table too.
#undef T
#define T(y,x) x*8+y
const sub8x8_frame, align=5
.byte T(0,0), T(1,0), T(0,1), T(0,2)
.byte T(1,1), T(2,0), T(3,0), T(2,1)
.byte T(1,2), T(0,3), T(0,4), T(1,3)
.byte T(2,2), T(3,1), T(4,0), T(5,0)
.byte T(4,1), T(3,2), T(2,3), T(1,4)
.byte T(0,5), T(0,6), T(1,5), T(2,4)
.byte T(3,3), T(4,2), T(5,1), T(6,0)
.byte T(7,0), T(6,1), T(5,2), T(4,3)
.byte T(3,4), T(2,5), T(1,6), T(0,7)
.byte T(1,7), T(2,6), T(3,5), T(4,4)
.byte T(5,3), T(6,2), T(7,1), T(7,2)
.byte T(6,3), T(5,4), T(4,5), T(3,6)
.byte T(2,7), T(3,7), T(4,6), T(5,5)
.byte T(6,4), T(7,3), T(7,4), T(6,5)
.byte T(5,6), T(4,7), T(5,7), T(6,6)
.byte T(7,5), T(7,6), T(6,7), T(7,7)
endconst
// Byte permutation for zigzag_sub_8x8_field_neon (64 entries), written
// with the T macro in effect at this point; the definition immediately
// preceding this table maps (y,x) to byte index x*8+y.
const sub8x8_field, align=5
.byte T(0,0), T(0,1), T(0,2), T(1,0)
.byte T(1,1), T(0,3), T(0,4), T(1,2)
.byte T(2,0), T(1,3), T(0,5), T(0,6)
.byte T(0,7), T(1,4), T(2,1), T(3,0)
.byte T(2,2), T(1,5), T(1,6), T(1,7)
.byte T(2,3), T(3,1), T(4,0), T(3,2)
.byte T(2,4), T(2,5), T(2,6), T(2,7)
.byte T(3,3), T(4,1), T(5,0), T(4,2)
.byte T(3,4), T(3,5), T(3,6), T(3,7)
.byte T(4,3), T(5,1), T(6,0), T(5,2)
.byte T(4,4), T(4,5), T(4,6), T(4,7)
.byte T(5,3), T(6,1), T(6,2), T(5,4)
.byte T(5,5), T(5,6), T(5,7), T(6,3)
.byte T(7,0), T(7,1), T(6,4), T(6,5)
.byte T(6,6), T(6,7), T(7,2), T(7,3)
.byte T(7,4), T(7,5), T(7,6), T(7,7)
endconst

103
common/aarch64/dct.h Normal file
View File

@@ -0,0 +1,103 @@
/*****************************************************************************
* dct.h: aarch64 transform and zigzag
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_AARCH64_DCT_H
#define X264_AARCH64_DCT_H
/* Prototypes for the aarch64 assembly transform and zigzag routines.
 * Every symbol name is routed through x264_template() (defined elsewhere).
 * Array bounds in the parameter lists document how many elements the
 * routine reads or writes; they do not affect the calling convention. */

/* In-place 4x4 DC transforms. */
#define x264_dct4x4dc_neon x264_template(dct4x4dc_neon)
void x264_dct4x4dc_neon( int16_t d[16] );
#define x264_idct4x4dc_neon x264_template(idct4x4dc_neon)
void x264_idct4x4dc_neon( int16_t d[16] );

/* Forward 4x4-based transforms of the difference pix1 - pix2. */
#define x264_sub4x4_dct_neon x264_template(sub4x4_dct_neon)
void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub8x8_dct_neon x264_template(sub8x8_dct_neon)
void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub16x16_dct_neon x264_template(sub16x16_dct_neon)
void x264_sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );

/* Inverse 4x4-based transforms, result added to the pixels at p_dst. */
#define x264_add4x4_idct_neon x264_template(add4x4_idct_neon)
void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] );
#define x264_add8x8_idct_neon x264_template(add8x8_idct_neon)
void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );
#define x264_add16x16_idct_neon x264_template(add16x16_idct_neon)
void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );

/* DC-only variants. */
#define x264_add8x8_idct_dc_neon x264_template(add8x8_idct_dc_neon)
void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );
#define x264_add16x16_idct_dc_neon x264_template(add16x16_idct_dc_neon)
void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );
#define x264_sub8x8_dct_dc_neon x264_template(sub8x8_dct_dc_neon)
void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub8x16_dct_dc_neon x264_template(sub8x16_dct_dc_neon)
void x264_sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 );

/* 8x8-based forward and inverse transforms. */
#define x264_sub8x8_dct8_neon x264_template(sub8x8_dct8_neon)
void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub16x16_dct8_neon x264_template(sub16x16_dct8_neon)
void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
#define x264_add8x8_idct8_neon x264_template(add8x8_idct8_neon)
void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] );
#define x264_add16x16_idct8_neon x264_template(add16x16_idct8_neon)
void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] );

/* Zigzag reordering of an existing coefficient block. */
#define x264_zigzag_scan_4x4_frame_neon x264_template(zigzag_scan_4x4_frame_neon)
void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] );
#define x264_zigzag_scan_4x4_field_neon x264_template(zigzag_scan_4x4_field_neon)
void x264_zigzag_scan_4x4_field_neon( int16_t level[16], int16_t dct[16] );
#define x264_zigzag_scan_8x8_frame_neon x264_template(zigzag_scan_8x8_frame_neon)
void x264_zigzag_scan_8x8_frame_neon( int16_t level[64], int16_t dct[64] );
#define x264_zigzag_scan_8x8_field_neon x264_template(zigzag_scan_8x8_field_neon)
void x264_zigzag_scan_8x8_field_neon( int16_t level[64], int16_t dct[64] );

/* Zigzag-ordered difference p_src - p_dst written to level; p_src is then
 * copied over p_dst. Returns non-zero if any stored difference is non-zero.
 * The "ac" variants also write the first difference to *dc and store zero
 * in its place. */
#define x264_zigzag_sub_4x4_field_neon x264_template(zigzag_sub_4x4_field_neon)
int x264_zigzag_sub_4x4_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst );
#define x264_zigzag_sub_4x4ac_field_neon x264_template(zigzag_sub_4x4ac_field_neon)
int x264_zigzag_sub_4x4ac_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
#define x264_zigzag_sub_4x4_frame_neon x264_template(zigzag_sub_4x4_frame_neon)
int x264_zigzag_sub_4x4_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst );
#define x264_zigzag_sub_4x4ac_frame_neon x264_template(zigzag_sub_4x4ac_frame_neon)
int x264_zigzag_sub_4x4ac_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
/* The 8x8 variants store 64 coefficients (128 bytes), so the documented
 * bound is 64, not 16. */
#define x264_zigzag_sub_8x8_field_neon x264_template(zigzag_sub_8x8_field_neon)
int x264_zigzag_sub_8x8_field_neon( dctcoef level[64], const pixel *p_src, pixel *p_dst );
#define x264_zigzag_sub_8x8_frame_neon x264_template(zigzag_sub_8x8_frame_neon)
int x264_zigzag_sub_8x8_frame_neon( dctcoef level[64], const pixel *p_src, pixel *p_dst );

/* Splits 64 coefficients at src into four groups of 16 (every fourth
 * element) stored back to back at dst, and writes one non-zero flag per
 * group to nnz[0], nnz[1], nnz[8] and nnz[9]. */
#define x264_zigzag_interleave_8x8_cavlc_neon x264_template(zigzag_interleave_8x8_cavlc_neon)
void x264_zigzag_interleave_8x8_cavlc_neon( dctcoef *dst, dctcoef *src, uint8_t *nnz );

/* SVE / SVE2 counterparts of the NEON routines with the same base name. */
#define x264_sub4x4_dct_sve x264_template(sub4x4_dct_sve)
void x264_sub4x4_dct_sve( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
#define x264_add4x4_idct_sve2 x264_template(add4x4_idct_sve2)
void x264_add4x4_idct_sve2( uint8_t *p_dst, int16_t dct[16] );
#define x264_zigzag_interleave_8x8_cavlc_sve x264_template(zigzag_interleave_8x8_cavlc_sve)
void x264_zigzag_interleave_8x8_cavlc_sve( dctcoef *dst, dctcoef *src, uint8_t *nnz );
#endif

View File

@@ -0,0 +1,43 @@
/*****************************************************************************
* deblock-a-common.S: aarch64 deblocking
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: Mans Rullgard <mans@mansr.com>
* Janne Grunau <janne-x264@jannau.net>
* David Chen <david.chen@myais.com.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
// This file contains the NEON macros that are intended to be used by
// the SVE/SVE2 functions as well
// Common prologue of the non-intra deblocking functions.
// In:  w2 = alpha, w3 = beta, x4 = pointer to a 32-bit word of four
//      signed bytes (presumably the tc0 values -- confirm with callers).
// Out: v24.s[0] = the loaded word; w6 and w8 are clobbered.
// The macro contains a ret, so it returns from the ENCLOSING function:
//   - when alpha is non-zero and beta is zero (b.eq after the ccmp), or
//   - when all four bytes of the word have their sign bit set.
// When alpha is zero the ccmp condition fails and NZCV becomes 0, so the
// b.eq is not taken and only the sign-bit test decides.
// Execution continues after label 2 otherwise.
.macro h264_loop_filter_start
cmp w2, #0
ldr w6, [x4]
ccmp w3, #0, #0, ne
mov v24.s[0], w6
and w8, w6, w6, lsl #16 // does not touch the flags set by ccmp
b.eq 1f
ands w8, w8, w8, lsl #8 // bit 31 = AND of the four sign bits
b.ge 2f // N clear: at least one byte is non-negative
1:
ret
2:
.endm

View File

@@ -0,0 +1,98 @@
/*****************************************************************************
* deblock-a-sve.S: aarch64 deblocking
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Chen <david.chen@myais.com.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
#include "deblock-a-common.S"
ENABLE_SVE
// Chroma edge filter core, NEON arithmetic with SVE predicate generation.
// In:  v18 = p1, v16 = p0, v0 = q0, v2 = q1 (16 bytes each),
//      w2 = alpha, w3 = beta, v24.s[0] = four clamp bytes as left by
//      h264_loop_filter_start.
// Out: v16 = filtered p0, v0 = filtered q0 (computed for every lane),
//      p1  = per-byte predicate: alpha > |p0-q0|, beta > |p1-p0| and
//            beta > |q1-q0|. The caller stores v16/v0 under p1.
// The z-register names used by cmphi refer to the same registers as the
// v-registers written just before (low 128 bits).
// Scratch: v4, v5, v22-v26, v28-v30, p0-p3.
.macro h264_loop_filter_chroma_sve
ptrue p0.b, vl16 // first 16 byte lanes active
dup v22.16b, w2 // alpha
uxtl v24.8h, v24.8b
uabd v26.16b, v16.16b, v0.16b // abs(p0 - q0)
uxtl v4.8h, v0.8b
uxtl2 v5.8h, v0.16b
uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0)
usubw v4.8h, v4.8h, v16.8b
usubw2 v5.8h, v5.8h, v16.16b
sli v24.8h, v24.8h, #8 // each clamp byte duplicated within its halfword
shl v4.8h, v4.8h, #2
shl v5.8h, v5.8h, #2
uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0)
uxtl v24.4s, v24.4h
uaddw v4.8h, v4.8h, v18.8b
uaddw2 v5.8h, v5.8h, v18.16b
cmphi p1.b, p0/z, z22.b, z26.b
usubw v4.8h, v4.8h, v2.8b
usubw2 v5.8h, v5.8h, v2.16b
sli v24.4s, v24.4s, #16 // each clamp byte now fills four byte lanes
dup v22.16b, w3 // beta
// delta = ((q0 - p0) * 4 + p1 - q1 + 4) >> 3, narrowed to bytes
rshrn v4.8b, v4.8h, #3
rshrn2 v4.16b, v5.8h, #3
cmphi p2.b, p0/z, z22.b, z28.b
cmphi p3.b, p0/z, z22.b, z30.b
// clamp delta to [-clamp, +clamp]
smin v4.16b, v4.16b, v24.16b
neg v25.16b, v24.16b
and p1.b, p0/z, p1.b, p2.b
smax v4.16b, v4.16b, v25.16b
and p1.b, p0/z, p1.b, p3.b
uxtl v22.8h, v0.8b
uxtl2 v23.8h, v0.16b
uxtl v28.8h, v16.8b
uxtl2 v29.8h, v16.16b
// p0 + delta and q0 - delta, saturated back to bytes
saddw v28.8h, v28.8h, v4.8b
saddw2 v29.8h, v29.8h, v4.16b
ssubw v22.8h, v22.8h, v4.8b
ssubw2 v23.8h, v23.8h, v4.16b
sqxtun v16.8b, v28.8h
sqxtun v0.8b, v22.8h
sqxtun2 v16.16b, v29.8h
sqxtun2 v0.16b, v23.8h
.endm
// deblock_v_chroma_sve: filters a horizontal chroma edge.
// In: x0 = address of the first row below the edge (q0 row), x1 = row
//     stride, w2 = alpha, w3 = beta, x4 = pointer consumed by
//     h264_loop_filter_start.
// NOTE(review): no C prototype is visible in this chunk; the argument
// roles are read off the register usage below.
// Loads the rows p1, p0, q0, q1 (16 bytes each), runs the filter and
// writes back p0 and q0 only in the byte lanes selected by predicate p1.
function deblock_v_chroma_sve, export=1
h264_loop_filter_start // may return early
sub x0, x0, x1, lsl #1 // two rows up: address of the p1 row
// No performance improvement if sve load is used. So, continue using
// NEON load here
ld1 {v18.16b}, [x0], x1
ld1 {v16.16b}, [x0], x1
ld1 {v0.16b}, [x0], x1
ld1 {v2.16b}, [x0]
h264_loop_filter_chroma_sve
sub x0, x0, x1, lsl #1 // from the q1 row back to the p0 row
st1b {z16.b}, p1, [x0]
add x0, x0, x1
st1b {z0.b}, p1, [x0]
ret
endfunc

800
common/aarch64/deblock-a.S Normal file
View File

@@ -0,0 +1,800 @@
/*****************************************************************************
* deblock-a.S: aarch64 deblocking
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: Mans Rullgard <mans@mansr.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
#include "deblock-a-common.S"
// Luma edge filter core (non-intra), 16 pixels at once.
// In:  v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2,
//      w2 = alpha, w3 = beta, v24.s[0] = four signed clamp bytes as left
//      by h264_loop_filter_start (each covers four adjacent pixels).
// Out: v17 = new p1, v16 = new p0, v0 = new q0, v19 = new q1.
// Lanes that fail the alpha/beta tests, or whose clamp byte is negative,
// end up with a zero clamp and keep their p1/q1 through the bsl selects.
// Scratch: v4, v20-v24, v28, v30 are overwritten.
.macro h264_loop_filter_luma
dup v22.16b, w2 // alpha
uxtl v24.8h, v24.8b
uabd v21.16b, v16.16b, v0.16b // abs(p0 - q0)
uxtl v24.4s, v24.4h
uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0)
sli v24.8h, v24.8h, #8
uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0)
sli v24.4s, v24.4s, #16 // each clamp byte now fills four byte lanes
cmhi v21.16b, v22.16b, v21.16b // < alpha
dup v22.16b, w3 // beta
cmlt v23.16b, v24.16b, #0 // lanes with a negative clamp value
cmhi v28.16b, v22.16b, v28.16b // < beta
cmhi v30.16b, v22.16b, v30.16b // < beta
bic v21.16b, v21.16b, v23.16b // drop lanes with a negative clamp
uabd v17.16b, v20.16b, v16.16b // abs(p2 - p0)
and v21.16b, v21.16b, v28.16b
uabd v19.16b, v4.16b, v0.16b // abs(q2 - q0)
cmhi v17.16b, v22.16b, v17.16b // < beta
and v21.16b, v21.16b, v30.16b // v21 = final per-lane filter mask
cmhi v19.16b, v22.16b, v19.16b // < beta
and v17.16b, v17.16b, v21.16b // mask for updating p1
and v19.16b, v19.16b, v21.16b // mask for updating q1
and v24.16b, v24.16b, v21.16b // clamp value, zero where not filtered
urhadd v28.16b, v16.16b, v0.16b // (p0 + q0 + 1) >> 1
// the masks are 0 or -1, so subtracting them adds 1 per set mask
sub v21.16b, v24.16b, v17.16b
uqadd v23.16b, v18.16b, v24.16b
uhadd v20.16b, v20.16b, v28.16b
sub v21.16b, v21.16b, v19.16b // clamp used for p0/q0
uhadd v28.16b, v4.16b, v28.16b
// candidate p1 clamped to [p1 - clamp, p1 + clamp] -> v23
umin v23.16b, v23.16b, v20.16b
uqsub v22.16b, v18.16b, v24.16b
uqadd v4.16b, v2.16b, v24.16b
umax v23.16b, v23.16b, v22.16b
// candidate q1 clamped to [q1 - clamp, q1 + clamp] -> v28
uqsub v22.16b, v2.16b, v24.16b
umin v28.16b, v4.16b, v28.16b
uxtl v4.8h, v0.8b
umax v28.16b, v28.16b, v22.16b
// delta = ((q0 - p0) * 4 + p1 - q1 + 4) >> 3
uxtl2 v20.8h, v0.16b
usubw v4.8h, v4.8h, v16.8b
usubw2 v20.8h, v20.8h, v16.16b
shl v4.8h, v4.8h, #2
shl v20.8h, v20.8h, #2
uaddw v4.8h, v4.8h, v18.8b
uaddw2 v20.8h, v20.8h, v18.16b
usubw v4.8h, v4.8h, v2.8b
usubw2 v20.8h, v20.8h, v2.16b
rshrn v4.8b, v4.8h, #3
rshrn2 v4.16b, v20.8h, #3
bsl v17.16b, v23.16b, v18.16b // new p1 where its mask is set, else p1
bsl v19.16b, v28.16b, v2.16b // new q1 where its mask is set, else q1
neg v23.16b, v21.16b
uxtl v28.8h, v16.8b
smin v4.16b, v4.16b, v21.16b // clamp delta from above
uxtl2 v21.8h, v16.16b
smax v4.16b, v4.16b, v23.16b // clamp delta from below
uxtl v22.8h, v0.8b
uxtl2 v24.8h, v0.16b
// p0 + delta and q0 - delta, saturated back to bytes
saddw v28.8h, v28.8h, v4.8b
saddw2 v21.8h, v21.8h, v4.16b
ssubw v22.8h, v22.8h, v4.8b
ssubw2 v24.8h, v24.8h, v4.16b
sqxtun v16.8b, v28.8h
sqxtun2 v16.16b, v21.8h
sqxtun v0.8b, v22.8h
sqxtun2 v0.16b, v24.8h
.endm
// Filters a horizontal luma edge (rows above/below pix), 16 pixels wide.
// Per the C prototype (pix, stride, alpha, beta, tc0): x0 = pix, x1 = stride,
// w2 = alpha, w3 = beta, x4 = tc0.
// NOTE(review): h264_loop_filter_start is defined outside this section;
// presumably it loads tc0 into v24 and handles the early exit - confirm in
// deblock-a-common.S.
function deblock_v_luma_neon, export=1
h264_loop_filter_start
// Load q0, q1, q2 from pix, pix + stride, pix + 2 * stride.
ld1 {v0.16b}, [x0], x1
ld1 {v2.16b}, [x0], x1
ld1 {v4.16b}, [x0], x1
// Step back six rows in total to pix - 3 * stride and load p2, p1, p0.
sub x0, x0, x1, lsl #2
sub x0, x0, x1, lsl #1
ld1 {v20.16b}, [x0], x1
ld1 {v18.16b}, [x0], x1
ld1 {v16.16b}, [x0], x1
h264_loop_filter_luma
// x0 is back at pix here; rewind to pix - 2 * stride and store the new
// p1, p0, q0, q1 rows.
sub x0, x0, x1, lsl #1
st1 {v17.16b}, [x0], x1
st1 {v16.16b}, [x0], x1
st1 {v0.16b}, [x0], x1
st1 {v19.16b}, [x0]
ret
endfunc
// Filters a vertical luma edge (columns left/right of pix) over 16 rows.
// Per the C prototype (pix, stride, alpha, beta, tc0): x0 = pix, x1 = stride,
// w2 = alpha, w3 = beta, x4 = tc0.
// NOTE(review): h264_loop_filter_start and the transpose macros are defined
// outside this section; their exact behaviour should be confirmed there.
function deblock_h_luma_neon, export=1
h264_loop_filter_start
// Start four pixels left of the edge and read 8 bytes from each of 16 rows:
// rows 0-7 fill the low halves, rows 8-15 the high halves.
sub x0, x0, #4
ld1 {v6.8b}, [x0], x1
ld1 {v20.8b}, [x0], x1
ld1 {v18.8b}, [x0], x1
ld1 {v16.8b}, [x0], x1
ld1 {v0.8b}, [x0], x1
ld1 {v2.8b}, [x0], x1
ld1 {v4.8b}, [x0], x1
ld1 {v26.8b}, [x0], x1
ld1 {v6.d}[1], [x0], x1
ld1 {v20.d}[1], [x0], x1
ld1 {v18.d}[1], [x0], x1
ld1 {v16.d}[1], [x0], x1
ld1 {v0.d}[1], [x0], x1
ld1 {v2.d}[1], [x0], x1
ld1 {v4.d}[1], [x0], x1
ld1 {v26.d}[1], [x0], x1
// Presumably converts the 16 loaded rows into per-column vectors so that
// v20/v18/v16 hold p2/p1/p0 and v0/v2/v4 hold q0/q1/q2 as the filter macro
// expects (v21 and v23 are temporaries).
transpose_8x16.b v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
h264_loop_filter_luma
// Bring the four modified columns (p1, p0, q0, q1) back to row order.
transpose_4x16.b v17, v16, v0, v19, v21, v23, v25, v27
// Rewind 16 rows and move to pix - 2, then write 4 bytes per row. Rows are
// taken from the same 32-bit lane of v17, v16, v0, v19 in turn.
sub x0, x0, x1, lsl #4
add x0, x0, #2
st1 {v17.s}[0], [x0], x1
st1 {v16.s}[0], [x0], x1
st1 {v0.s}[0], [x0], x1
st1 {v19.s}[0], [x0], x1
st1 {v17.s}[1], [x0], x1
st1 {v16.s}[1], [x0], x1
st1 {v0.s}[1], [x0], x1
st1 {v19.s}[1], [x0], x1
st1 {v17.s}[2], [x0], x1
st1 {v16.s}[2], [x0], x1
st1 {v0.s}[2], [x0], x1
st1 {v19.s}[2], [x0], x1
st1 {v17.s}[3], [x0], x1
st1 {v16.s}[3], [x0], x1
st1 {v0.s}[3], [x0], x1
st1 {v19.s}[3], [x0], x1
ret
endfunc
// Common prologue of the intra (alpha/beta only) filters.
// Executes ret in the enclosing function when (alpha | beta) == 0, i.e. when
// both thresholds are zero; otherwise broadcasts alpha into v30 and beta into
// v31. Overwrites w4 and the condition flags and uses local label 1.
.macro h264_loop_filter_start_intra
orr w4, w2, w3
cmp w4, #0
b.ne 1f
ret
1:
dup v30.16b, w2 // alpha
dup v31.16b, w3 // beta
.endm
// Strong (intra) luma edge filter for 16 pixels along the edge.
// Inputs:  v4 = p3, v5 = p2, v6 = p1, v7 = p0, v0 = q0, v1 = q1, v2 = q2,
//          v3 = q3, v30 = alpha (all bytes), v31 = beta (all bytes).
// Outputs: v5, v6, v7 (p2, p1, p0) and v0, v1, v2 (q0, q1, q2) are updated in
//          place where the per-pixel conditions hold.
// Branches to local label 9 when no pixel passes the basic conditions; the
// code invoking this macro must provide that label.
// Scratch: x4, v16-v29; v30 and v31 are overwritten as well.
.macro h264_loop_filter_luma_intra
uabd v16.16b, v7.16b, v0.16b // abs(p0 - q0)
uabd v17.16b, v6.16b, v7.16b // abs(p1 - p0)
uabd v18.16b, v1.16b, v0.16b // abs(q1 - q0)
cmhi v19.16b, v30.16b, v16.16b // < alpha
cmhi v17.16b, v31.16b, v17.16b // < beta
cmhi v18.16b, v31.16b, v18.16b // < beta
movi v29.16b, #2
ushr v30.16b, v30.16b, #2 // alpha >> 2
add v30.16b, v30.16b, v29.16b // (alpha >> 2) + 2
cmhi v16.16b, v30.16b, v16.16b // < (alpha >> 2) + 2
// v19 = if_1: all three basic threshold tests pass.
and v19.16b, v19.16b, v17.16b
and v19.16b, v19.16b, v18.16b
// Narrow the 128-bit mask to 64 bits (4 bits per byte lane) so one register
// test can detect that nothing needs filtering.
shrn v20.8b, v19.8h, #4
mov x4, v20.d[0]
cbz x4, 9f
// v20/v21 = 2 * p1 + p0 + q1 and v22/v23 = 2 * q1 + q0 + p1 in 16-bit lanes
// (low/high halves); these sums are reused by the stronger filter below.
ushll v20.8h, v6.8b, #1
ushll v22.8h, v1.8b, #1
ushll2 v21.8h, v6.16b, #1
ushll2 v23.8h, v1.16b, #1
uaddw v20.8h, v20.8h, v7.8b
uaddw v22.8h, v22.8h, v0.8b
uaddw2 v21.8h, v21.8h, v7.16b
uaddw2 v23.8h, v23.8h, v0.16b
uaddw v20.8h, v20.8h, v1.8b
uaddw v22.8h, v22.8h, v6.8b
uaddw2 v21.8h, v21.8h, v1.16b
uaddw2 v23.8h, v23.8h, v6.16b
rshrn v24.8b, v20.8h, #2 // p0'_1
rshrn v25.8b, v22.8h, #2 // q0'_1
rshrn2 v24.16b, v21.8h, #2 // p0'_1
rshrn2 v25.16b, v23.8h, #2 // q0'_1
uabd v17.16b, v5.16b, v7.16b // abs(p2 - p0)
uabd v18.16b, v2.16b, v0.16b // abs(q2 - q0)
cmhi v17.16b, v31.16b, v17.16b // < beta
cmhi v18.16b, v31.16b, v18.16b // < beta
and v17.16b, v16.16b, v17.16b // if_2 && if_3
and v18.16b, v16.16b, v18.16b // if_2 && if_4
// From here v30/v31 no longer hold alpha/beta: they become the masks that
// select the weaker (_1) results.
not v30.16b, v17.16b
not v31.16b, v18.16b
and v30.16b, v30.16b, v19.16b // if_1 && !(if_2 && if_3)
and v31.16b, v31.16b, v19.16b // if_1 && !(if_2 && if_4)
and v17.16b, v19.16b, v17.16b // if_1 && if_2 && if_3
and v18.16b, v19.16b, v18.16b // if_1 && if_2 && if_4
//calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4
// v26/v27 = p2 + p0 + q0; adding it and q0 to the earlier sum gives
// p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1, rounded and shifted by 3.
uaddl v26.8h, v5.8b, v7.8b
uaddl2 v27.8h, v5.16b, v7.16b
uaddw v26.8h, v26.8h, v0.8b
uaddw2 v27.8h, v27.8h, v0.16b
add v20.8h, v20.8h, v26.8h
add v21.8h, v21.8h, v27.8h
uaddw v20.8h, v20.8h, v0.8b
uaddw2 v21.8h, v21.8h, v0.16b
rshrn v20.8b, v20.8h, #3 // p0'_2
rshrn2 v20.16b, v21.8h, #3 // p0'_2
// v26/v27 = p2 + p1 + p0 + q0
uaddw v26.8h, v26.8h, v6.8b
uaddw2 v27.8h, v27.8h, v6.16b
rshrn v21.8b, v26.8h, #2 // p1'_2
rshrn2 v21.16b, v27.8h, #2 // p1'_2
// v28/v29 = 2 * (p3 + p2) + (p2 + p1 + p0 + q0)
uaddl v28.8h, v4.8b, v5.8b
uaddl2 v29.8h, v4.16b, v5.16b
shl v28.8h, v28.8h, #1
shl v29.8h, v29.8h, #1
add v28.8h, v28.8h, v26.8h
add v29.8h, v29.8h, v27.8h
rshrn v19.8b, v28.8h, #3 // p2'_2
rshrn2 v19.16b, v29.8h, #3 // p2'_2
//calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3
// Mirror image of the p-side computation with the roles of p and q swapped.
uaddl v26.8h, v2.8b, v0.8b
uaddl2 v27.8h, v2.16b, v0.16b
uaddw v26.8h, v26.8h, v7.8b
uaddw2 v27.8h, v27.8h, v7.16b
add v22.8h, v22.8h, v26.8h
add v23.8h, v23.8h, v27.8h
uaddw v22.8h, v22.8h, v7.8b
uaddw2 v23.8h, v23.8h, v7.16b
rshrn v22.8b, v22.8h, #3 // q0'_2
rshrn2 v22.16b, v23.8h, #3 // q0'_2
uaddw v26.8h, v26.8h, v1.8b
uaddw2 v27.8h, v27.8h, v1.16b
rshrn v23.8b, v26.8h, #2 // q1'_2
rshrn2 v23.16b, v27.8h, #2 // q1'_2
uaddl v28.8h, v2.8b, v3.8b
uaddl2 v29.8h, v2.16b, v3.16b
shl v28.8h, v28.8h, #1
shl v29.8h, v29.8h, #1
add v28.8h, v28.8h, v26.8h
add v29.8h, v29.8h, v27.8h
rshrn v26.8b, v28.8h, #3 // q2'_2
rshrn2 v26.16b, v29.8h, #3 // q2'_2
// Insert results under their masks; lanes with no mask set keep the
// original pixel values.
bit v7.16b, v24.16b, v30.16b // p0'_1
bit v0.16b, v25.16b, v31.16b // q0'_1
bit v7.16b, v20.16b, v17.16b // p0'_2
bit v6.16b, v21.16b, v17.16b // p1'_2
bit v5.16b, v19.16b, v17.16b // p2'_2
bit v0.16b, v22.16b, v18.16b // q0'_2
bit v1.16b, v23.16b, v18.16b // q1'_2
bit v2.16b, v26.16b, v18.16b // q2'_2
.endm
// Strong filter for a horizontal luma edge, 16 pixels wide.
// Per the C prototype (pix, stride, alpha, beta): x0 = pix, x1 = stride,
// w2 = alpha, w3 = beta.
function deblock_v_luma_intra_neon, export=1
h264_loop_filter_start_intra
ld1 {v0.16b}, [x0], x1 // q0
ld1 {v1.16b}, [x0], x1 // q1
ld1 {v2.16b}, [x0], x1 // q2
ld1 {v3.16b}, [x0], x1 // q3
// x0 is at pix + 4 * stride; go back eight rows to pix - 4 * stride.
sub x0, x0, x1, lsl #3
ld1 {v4.16b}, [x0], x1 // p3
ld1 {v5.16b}, [x0], x1 // p2
ld1 {v6.16b}, [x0], x1 // p1
ld1 {v7.16b}, [x0] // p0
h264_loop_filter_luma_intra
// x0 still points at the p0 row; rewind to the p2 row and write back the
// six rows that may have changed.
sub x0, x0, x1, lsl #1
st1 {v5.16b}, [x0], x1 // p2
st1 {v6.16b}, [x0], x1 // p1
st1 {v7.16b}, [x0], x1 // p0
st1 {v0.16b}, [x0], x1 // q0
st1 {v1.16b}, [x0], x1 // q1
st1 {v2.16b}, [x0] // q2
// Early-out target used by h264_loop_filter_luma_intra: skips the stores.
9:
ret
endfunc
// Strong filter for a vertical luma edge over 16 rows.
// Per the C prototype (pix, stride, alpha, beta): x0 = pix, x1 = stride,
// w2 = alpha, w3 = beta.
// NOTE(review): transpose_8x16.b is defined outside this section; it is
// presumably its own inverse for this operand layout, since the same
// invocation is used before and after the filter - confirm in asm.S.
function deblock_h_luma_intra_neon, export=1
h264_loop_filter_start_intra
// Start four pixels left of the edge and read 8 bytes from each of 16 rows:
// rows 0-7 fill the low halves, rows 8-15 the high halves.
sub x0, x0, #4
ld1 {v4.8b}, [x0], x1
ld1 {v5.8b}, [x0], x1
ld1 {v6.8b}, [x0], x1
ld1 {v7.8b}, [x0], x1
ld1 {v0.8b}, [x0], x1
ld1 {v1.8b}, [x0], x1
ld1 {v2.8b}, [x0], x1
ld1 {v3.8b}, [x0], x1
ld1 {v4.d}[1], [x0], x1
ld1 {v5.d}[1], [x0], x1
ld1 {v6.d}[1], [x0], x1
ld1 {v7.d}[1], [x0], x1
ld1 {v0.d}[1], [x0], x1
ld1 {v1.d}[1], [x0], x1
ld1 {v2.d}[1], [x0], x1
ld1 {v3.d}[1], [x0], x1
transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
h264_loop_filter_luma_intra
transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
// Rewind 16 rows and write all 8 bytes of every row back.
sub x0, x0, x1, lsl #4
st1 {v4.8b}, [x0], x1
st1 {v5.8b}, [x0], x1
st1 {v6.8b}, [x0], x1
st1 {v7.8b}, [x0], x1
st1 {v0.8b}, [x0], x1
st1 {v1.8b}, [x0], x1
st1 {v2.8b}, [x0], x1
st1 {v3.8b}, [x0], x1
st1 {v4.d}[1], [x0], x1
st1 {v5.d}[1], [x0], x1
st1 {v6.d}[1], [x0], x1
st1 {v7.d}[1], [x0], x1
st1 {v0.d}[1], [x0], x1
st1 {v1.d}[1], [x0], x1
st1 {v2.d}[1], [x0], x1
st1 {v3.d}[1], [x0], x1
// Early-out target used by h264_loop_filter_luma_intra: skips the stores.
9:
ret
endfunc
// Normal-strength chroma edge filter on 16 bytes along the edge.
// Inputs:  v18 = p1, v16 = p0, v0 = q0, v2 = q1, w2 = alpha, w3 = beta,
//          low four bytes of v24 = four tc values (set up by the invoking
//          code before this macro).
// Outputs: v16 = filtered p0, v0 = filtered q0 (p1 and q1 are not modified).
// Scratch: v4, v5, v22-v26, v28-v30.
// The uxtl/sli/uxtl/sli sequence on v24 replicates each tc byte four times.
.macro h264_loop_filter_chroma
dup v22.16b, w2 // alpha
uxtl v24.8h, v24.8b
uabd v26.16b, v16.16b, v0.16b // abs(p0 - q0)
// v4/v5 accumulate (q0 - p0) * 4 + p1 - q1 in 16-bit lanes (low/high halves).
uxtl v4.8h, v0.8b
uxtl2 v5.8h, v0.16b
uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0)
usubw v4.8h, v4.8h, v16.8b
usubw2 v5.8h, v5.8h, v16.16b
sli v24.8h, v24.8h, #8
shl v4.8h, v4.8h, #2
shl v5.8h, v5.8h, #2
uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0)
uxtl v24.4s, v24.4h
uaddw v4.8h, v4.8h, v18.8b
uaddw2 v5.8h, v5.8h, v18.16b
cmhi v26.16b, v22.16b, v26.16b // < alpha
usubw v4.8h, v4.8h, v2.8b
usubw2 v5.8h, v5.8h, v2.16b
sli v24.4s, v24.4s, #16
dup v22.16b, w3 // beta
// delta = (sum + 4) >> 3, narrowed to bytes in v4.
rshrn v4.8b, v4.8h, #3
rshrn2 v4.16b, v5.8h, #3
cmhi v28.16b, v22.16b, v28.16b // < beta
cmhi v30.16b, v22.16b, v30.16b // < beta
// Clamp delta to [-tc, tc] (v24 = tc, v25 = -tc) and build the filter mask
// in v26 from the alpha and both beta tests.
smin v4.16b, v4.16b, v24.16b
neg v25.16b, v24.16b
and v26.16b, v26.16b, v28.16b
smax v4.16b, v4.16b, v25.16b
and v26.16b, v26.16b, v30.16b
uxtl v22.8h, v0.8b
uxtl2 v23.8h, v0.16b
// Zero delta in lanes that fail the mask.
and v4.16b, v4.16b, v26.16b
uxtl v28.8h, v16.8b
uxtl2 v29.8h, v16.16b
// p0 += delta, q0 -= delta in 16 bits, then saturate back to unsigned bytes.
saddw v28.8h, v28.8h, v4.8b
saddw2 v29.8h, v29.8h, v4.16b
ssubw v22.8h, v22.8h, v4.8b
ssubw2 v23.8h, v23.8h, v4.16b
sqxtun v16.8b, v28.8h
sqxtun v0.8b, v22.8h
sqxtun2 v16.16b, v29.8h
sqxtun2 v0.16b, v23.8h
.endm
// Filters a horizontal chroma edge, 16 bytes wide.
// Per the C prototype (pix, stride, alpha, beta, tc0): x0 = pix, x1 = stride,
// w2 = alpha, w3 = beta, x4 = tc0.
// NOTE(review): h264_loop_filter_start is defined outside this section;
// presumably it loads tc0 into v24 and handles the early exit.
function deblock_v_chroma_neon, export=1
h264_loop_filter_start
// Load p1, p0, q0, q1 starting at pix - 2 * stride.
sub x0, x0, x1, lsl #1
ld1 {v18.16b}, [x0], x1
ld1 {v16.16b}, [x0], x1
ld1 {v0.16b}, [x0], x1
ld1 {v2.16b}, [x0]
h264_loop_filter_chroma
// x0 is at the q1 row; rewind two rows to p0 and store the new p0 and q0.
sub x0, x0, x1, lsl #1
st1 {v16.16b}, [x0], x1
st1 {v0.16b}, [x0], x1
ret
endfunc
// Filters a vertical chroma edge over 8 rows.
// Per the C prototype (pix, stride, alpha, beta, tc0): x0 = pix, x1 = stride,
// w2 = alpha, w3 = beta, x4 = tc0.
// NOTE(review): the transpose works on 16-bit units, which suggests two-byte
// (interleaved) chroma samples - confirm against transpose4x8.h in asm.S.
function deblock_h_chroma_neon, export=1
h264_loop_filter_start
sub x0, x0, #4
// Secondary entry point used by deblock_h_chroma_422_neon. It expects x0 to
// point 4 bytes left of the edge and v24, w2, w3, x1 to be already set up.
deblock_h_chroma:
// Read 8 bytes from each of 8 rows: rows 0-3 into the low halves,
// rows 4-7 into the high halves.
ld1 {v18.d}[0], [x0], x1
ld1 {v16.d}[0], [x0], x1
ld1 {v0.d}[0], [x0], x1
ld1 {v2.d}[0], [x0], x1
ld1 {v18.d}[1], [x0], x1
ld1 {v16.d}[1], [x0], x1
ld1 {v0.d}[1], [x0], x1
ld1 {v2.d}[1], [x0], x1
transpose4x8.h v18, v16, v0, v2, v28, v29, v30, v31
h264_loop_filter_chroma
// The same transpose is applied again to return to row order before storing.
transpose4x8.h v18, v16, v0, v2, v28, v29, v30, v31
// Rewind 8 rows and write every row back.
sub x0, x0, x1, lsl #3
st1 {v18.d}[0], [x0], x1
st1 {v16.d}[0], [x0], x1
st1 {v0.d}[0], [x0], x1
st1 {v2.d}[0], [x0], x1
st1 {v18.d}[1], [x0], x1
st1 {v16.d}[1], [x0], x1
st1 {v0.d}[1], [x0], x1
st1 {v2.d}[1], [x0], x1
ret
endfunc
// Vertical chroma edge filter covering 16 rows, implemented as two passes of
// the 8-row deblock_h_chroma body with the stride doubled: the first pass
// handles rows 0, 2, ..., 14 and the second rows 1, 3, ..., 15.
// Per the C prototype (pix, stride, alpha, beta, tc0): x0 = pix, x1 = stride,
// w2 = alpha, w3 = beta, x4 = tc0.
function deblock_h_chroma_422_neon, export=1
// x5 = address of the second row, kept for the second pass.
add x5, x0, x1
sub x0, x0, #4
add x1, x1, x1
h264_loop_filter_start
// Preserve the return address around the nested call.
mov x7, x30
bl deblock_h_chroma
mov x30, x7
sub x0, x5, #4
// The filter macro overwrites v24, so it is reloaded from w6.
// NOTE(review): w6 presumably still holds the tc0 word loaded by
// h264_loop_filter_start - confirm in deblock-a-common.S.
mov v24.s[0], w6
// Tail call: the ret inside deblock_h_chroma returns to our caller.
b deblock_h_chroma
endfunc
// 8-byte variant of the normal-strength chroma edge filter.
// Inputs:  v18 = p1, v16 = p0, v17 = q0, v19 = q1 (low 8 bytes), w2 = alpha,
//          w3 = beta, low four bytes of v24 = four tc values (set up by the
//          invoking code).
// Outputs: v16 = filtered p0, v17 = filtered q0.
// Scratch: v4, v22, v24-v26, v28, v30.
// Here the uxtl/sli pair replicates each tc byte twice (8 bytes in total).
.macro h264_loop_filter_chroma8
dup v22.8b, w2 // alpha
uxtl v24.8h, v24.8b
uabd v26.8b, v16.8b, v17.8b // abs(p0 - q0)
// v4 accumulates (q0 - p0) * 4 + p1 - q1 in 16-bit lanes.
uxtl v4.8h, v17.8b
uabd v28.8b, v18.8b, v16.8b // abs(p1 - p0)
usubw v4.8h, v4.8h, v16.8b
sli v24.8h, v24.8h, #8
shl v4.8h, v4.8h, #2
uabd v30.8b, v19.8b, v17.8b // abs(q1 - q0)
uaddw v4.8h, v4.8h, v18.8b
cmhi v26.8b, v22.8b, v26.8b // < alpha
usubw v4.8h, v4.8h, v19.8b
dup v22.8b, w3 // beta
// delta = (sum + 4) >> 3, narrowed to bytes.
rshrn v4.8b, v4.8h, #3
cmhi v28.8b, v22.8b, v28.8b // < beta
cmhi v30.8b, v22.8b, v30.8b // < beta
// Clamp delta to [-tc, tc] and build the filter mask in v26.
smin v4.8b, v4.8b, v24.8b
neg v25.8b, v24.8b
and v26.8b, v26.8b, v28.8b
smax v4.8b, v4.8b, v25.8b
and v26.8b, v26.8b, v30.8b
uxtl v22.8h, v17.8b
// Zero delta in lanes that fail the mask.
and v4.8b, v4.8b, v26.8b
uxtl v28.8h, v16.8b
// p0 += delta, q0 -= delta, then saturate back to unsigned bytes.
saddw v28.8h, v28.8h, v4.8b
ssubw v22.8h, v22.8h, v4.8b
sqxtun v16.8b, v28.8h
sqxtun v17.8b, v22.8h
.endm
// Filters a vertical chroma edge over 4 rows.
// Per the C prototype (pix, stride, alpha, beta, tc0): x0 = pix, x1 = stride,
// w2 = alpha, w3 = beta, x4 = tc0.
// NOTE(review): h264_loop_filter_start must have consumed x4 (tc0) before x4
// is reused as the load pointer below - confirm in deblock-a-common.S.
function deblock_h_chroma_mbaff_neon, export=1
h264_loop_filter_start
// x4 = load pointer (4 bytes left of the edge),
// x0 = store pointer (2 bytes left of the edge).
sub x4, x0, #4
sub x0, x0, #2
ld1 {v18.8b}, [x4], x1
ld1 {v16.8b}, [x4], x1
ld1 {v17.8b}, [x4], x1
ld1 {v19.8b}, [x4]
transpose4x4.h v18, v16, v17, v19, v28, v29, v30, v31
h264_loop_filter_chroma8
// st2 interleaves one halfword of p0 (v16) and one of q0 (v17) per row,
// which writes the 4 modified bytes of each row without a transpose back.
st2 {v16.h,v17.h}[0], [x0], x1
st2 {v16.h,v17.h}[1], [x0], x1
st2 {v16.h,v17.h}[2], [x0], x1
st2 {v16.h,v17.h}[3], [x0]
ret
endfunc
// Strong (intra) chroma edge filter.
// Inputs:  v18 = p1, v16 = p0, v17 = q0, v19 = q1, v30 = alpha (all bytes),
//          v31 = beta (all bytes).
// Outputs: v16 = new p0 = (2 * p1 + p0 + q1 + 2) >> 2 and
//          v17 = new q0 = (2 * q1 + q0 + p1 + 2) >> 2 where the alpha/beta
//          mask in v26 is set; other lanes keep their value.
// width:   16 (default) processes all 16 bytes. Any other value skips the
//          arithmetic for the upper 8 bytes, so only the low 8 bytes of
//          v16/v17 are meaningful afterwards.
// Scratch: v4-v7, v20-v28. v30 and v31 are left untouched, so the macro can
//          be invoked repeatedly with the same thresholds.
.macro h264_loop_filter_chroma_intra width=16
uabd v26.16b, v16.16b, v17.16b // abs(p0 - q0)
uabd v27.16b, v18.16b, v16.16b // abs(p1 - p0)
uabd v28.16b, v19.16b, v17.16b // abs(q1 - q0)
cmhi v26.16b, v30.16b, v26.16b // < alpha
cmhi v27.16b, v31.16b, v27.16b // < beta
cmhi v28.16b, v31.16b, v28.16b // < beta
and v26.16b, v26.16b, v27.16b
and v26.16b, v26.16b, v28.16b
// v4/v5 = 2 * p1, v6/v7 = 2 * q1 (low/high halves, 16-bit lanes).
ushll v4.8h, v18.8b, #1
ushll v6.8h, v19.8b, #1
.ifc \width, 16
ushll2 v5.8h, v18.16b, #1
ushll2 v7.8h, v19.16b, #1
uaddl2 v21.8h, v16.16b, v19.16b
uaddl2 v23.8h, v17.16b, v18.16b
.endif
// v20/v21 = p0 + q1, v22/v23 = q0 + p1.
uaddl v20.8h, v16.8b, v19.8b
uaddl v22.8h, v17.8b, v18.8b
add v20.8h, v20.8h, v4.8h // mlal?
add v22.8h, v22.8h, v6.8h
.ifc \width, 16
add v21.8h, v21.8h, v5.8h
add v23.8h, v23.8h, v7.8h
.endif
// Rounding shift by 2 with saturating narrow back to bytes.
uqrshrn v24.8b, v20.8h, #2
uqrshrn v25.8b, v22.8h, #2
.ifc \width, 16
uqrshrn2 v24.16b, v21.8h, #2
uqrshrn2 v25.16b, v23.8h, #2
.endif
bit v16.16b, v24.16b, v26.16b
bit v17.16b, v25.16b, v26.16b
.endm
// Strong filter for a horizontal chroma edge, 16 bytes wide.
// Per the C prototype (pix, stride, alpha, beta): x0 = pix, x1 = stride,
// w2 = alpha, w3 = beta.
function deblock_v_chroma_intra_neon, export=1
h264_loop_filter_start_intra
// Load p1, p0, q0, q1 starting at pix - 2 * stride.
sub x0, x0, x1, lsl #1
ld1 {v18.16b}, [x0], x1
ld1 {v16.16b}, [x0], x1
ld1 {v17.16b}, [x0], x1
ld1 {v19.16b}, [x0]
h264_loop_filter_chroma_intra
// x0 is at the q1 row; rewind two rows to p0 and store the new p0 and q0.
sub x0, x0, x1, lsl #1
st1 {v16.16b}, [x0], x1
st1 {v17.16b}, [x0], x1
ret
endfunc
// Strong filter for a vertical chroma edge over 4 rows.
// Per the C prototype (pix, stride, alpha, beta): x0 = pix, x1 = stride,
// w2 = alpha, w3 = beta.
function deblock_h_chroma_intra_mbaff_neon, export=1
h264_loop_filter_start_intra
// x4 = load pointer (4 bytes left of the edge),
// x0 = store pointer (2 bytes left of the edge).
sub x4, x0, #4
sub x0, x0, #2
ld1 {v18.8b}, [x4], x1
ld1 {v16.8b}, [x4], x1
ld1 {v17.8b}, [x4], x1
ld1 {v19.8b}, [x4], x1
transpose4x4.h v18, v16, v17, v19, v26, v27, v28, v29
// Only 8 bytes per register are in use, so the high-half work is skipped.
h264_loop_filter_chroma_intra width=8
// st2 interleaves one halfword of p0 (v16) and one of q0 (v17) per row.
st2 {v16.h,v17.h}[0], [x0], x1
st2 {v16.h,v17.h}[1], [x0], x1
st2 {v16.h,v17.h}[2], [x0], x1
st2 {v16.h,v17.h}[3], [x0], x1
ret
endfunc
// Strong filter for a vertical chroma edge over 8 rows.
// Per the C prototype (pix, stride, alpha, beta): x0 = pix, x1 = stride,
// w2 = alpha, w3 = beta.
function deblock_h_chroma_intra_neon, export=1
h264_loop_filter_start_intra
// x4 = load pointer (4 bytes left of the edge),
// x0 = store pointer (2 bytes left of the edge).
sub x4, x0, #4
sub x0, x0, #2
// Rows 0-3 go to the low halves, rows 4-7 to the high halves.
ld1 {v18.d}[0], [x4], x1
ld1 {v16.d}[0], [x4], x1
ld1 {v17.d}[0], [x4], x1
ld1 {v19.d}[0], [x4], x1
ld1 {v18.d}[1], [x4], x1
ld1 {v16.d}[1], [x4], x1
ld1 {v17.d}[1], [x4], x1
ld1 {v19.d}[1], [x4], x1
transpose4x8.h v18, v16, v17, v19, v26, v27, v28, v29
h264_loop_filter_chroma_intra
// st2 interleaves one halfword of p0 (v16) and one of q0 (v17) per row,
// writing the 4 modified bytes of each of the 8 rows.
st2 {v16.h,v17.h}[0], [x0], x1
st2 {v16.h,v17.h}[1], [x0], x1
st2 {v16.h,v17.h}[2], [x0], x1
st2 {v16.h,v17.h}[3], [x0], x1
st2 {v16.h,v17.h}[4], [x0], x1
st2 {v16.h,v17.h}[5], [x0], x1
st2 {v16.h,v17.h}[6], [x0], x1
st2 {v16.h,v17.h}[7], [x0], x1
ret
endfunc
// Strong filter for a vertical chroma edge over 16 rows, done as two
// consecutive 8-row passes. x4 (load pointer) and x0 (store pointer) keep
// advancing by one row per access, so the second pass continues directly
// below the first.
// Per the C prototype (pix, stride, alpha, beta): x0 = pix, x1 = stride,
// w2 = alpha, w3 = beta.
function deblock_h_chroma_422_intra_neon, export=1
h264_loop_filter_start_intra
sub x4, x0, #4
sub x0, x0, #2
// First pass: rows 0-7.
ld1 {v18.d}[0], [x4], x1
ld1 {v16.d}[0], [x4], x1
ld1 {v17.d}[0], [x4], x1
ld1 {v19.d}[0], [x4], x1
ld1 {v18.d}[1], [x4], x1
ld1 {v16.d}[1], [x4], x1
ld1 {v17.d}[1], [x4], x1
ld1 {v19.d}[1], [x4], x1
transpose4x8.h v18, v16, v17, v19, v26, v27, v28, v29
h264_loop_filter_chroma_intra
st2 {v16.h,v17.h}[0], [x0], x1
st2 {v16.h,v17.h}[1], [x0], x1
st2 {v16.h,v17.h}[2], [x0], x1
st2 {v16.h,v17.h}[3], [x0], x1
st2 {v16.h,v17.h}[4], [x0], x1
st2 {v16.h,v17.h}[5], [x0], x1
st2 {v16.h,v17.h}[6], [x0], x1
st2 {v16.h,v17.h}[7], [x0], x1
// Second pass: rows 8-15. v30/v31 (alpha/beta) are not modified by the
// filter macro, so they can be reused as they are.
ld1 {v18.d}[0], [x4], x1
ld1 {v16.d}[0], [x4], x1
ld1 {v17.d}[0], [x4], x1
ld1 {v19.d}[0], [x4], x1
ld1 {v18.d}[1], [x4], x1
ld1 {v16.d}[1], [x4], x1
ld1 {v17.d}[1], [x4], x1
ld1 {v19.d}[1], [x4], x1
transpose4x8.h v18, v16, v17, v19, v26, v27, v28, v29
h264_loop_filter_chroma_intra
st2 {v16.h,v17.h}[0], [x0], x1
st2 {v16.h,v17.h}[1], [x0], x1
st2 {v16.h,v17.h}[2], [x0], x1
st2 {v16.h,v17.h}[3], [x0], x1
st2 {v16.h,v17.h}[4], [x0], x1
st2 {v16.h,v17.h}[5], [x0], x1
st2 {v16.h,v17.h}[6], [x0], x1
st2 {v16.h,v17.h}[7], [x0], x1
ret
endfunc
// void deblock_strength( uint8_t nnz[X264_SCAN8_SIZE],
// int8_t ref[2][X264_SCAN8_LUMA_SIZE],
// int16_t mv[2][X264_SCAN8_LUMA_SIZE][2],
// uint8_t bs[2][8][4], int mvy_limit,
// int bframe )
//
// Register use on entry: x0 = nnz, x1 = ref, x2 = mv, x3 = bs,
// w4 = mvy_limit, w5 = bframe.
// v4 accumulates the flags that end up in bs[0] (after a 4x4 byte transpose),
// v5 accumulates the flags stored to bs[1]. Each store writes 16 bytes.
// Resulting byte values: 2 where either of the compared nnz bytes is
// nonzero, otherwise 1 where a ref or mv difference was flagged,
// otherwise 0.
// NOTE(review): unzip and movrel are macros defined outside this section;
// which neighbour (left/top) each unzip output represents should be
// confirmed against their definitions.
function deblock_strength_neon, export=1
movi v4.16b, #0
// w4 becomes (mvy_limit << 8) - 253. As a halfword its low byte is 3 and its
// high byte is mvy_limit - 1 (for 1 <= mvy_limit <= 256), so after the dup
// the bytes of v6 alternate between those two thresholds.
lsl w4, w4, #8
// x3 = address of bs[1] (bs[0] occupies 8 * 4 = 32 bytes).
add x3, x3, #32
sub w4, w4, #(1<<8)-3
movi v5.16b, #0
dup v6.8h, w4
// Post-index used by the first store to step from bs[1] back to bs[0].
mov x6, #-32
// One pass of this body consumes 40 bytes of ref and 160 bytes of mv
// (16 skipped plus nine 16-byte loads), presumably one reference list.
bframe:
// load bytes ref
add x2, x2, #16
ld1 {v31.d}[1], [x1], #8
ld1 {v1.16b}, [x1], #16
movi v0.16b, #0
ld1 {v2.16b}, [x1], #16
ext v3.16b, v0.16b, v1.16b, #15
ext v0.16b, v0.16b, v2.16b, #15
unzip v21.4s, v22.4s, v1.4s, v2.4s
unzip v23.4s, v20.4s, v3.4s, v0.4s
ext v21.16b, v31.16b, v22.16b, #12
// XOR is nonzero exactly where the two compared ref bytes differ.
eor v0.16b, v20.16b, v22.16b
eor v1.16b, v21.16b, v22.16b
orr v4.16b, v4.16b, v0.16b
orr v5.16b, v5.16b, v1.16b
ld1 {v21.8h}, [x2], #16 // mv + 0x10
ld1 {v19.8h}, [x2], #16 // mv + 0x20
ld1 {v22.8h}, [x2], #16 // mv + 0x30
ld1 {v18.8h}, [x2], #16 // mv + 0x40
ld1 {v23.8h}, [x2], #16 // mv + 0x50
// First group: each 16-byte mv row is compared against the same data
// shifted by one 4-byte mv (ext #12 pulls in the last mv of the preceding
// 16 bytes). Absolute differences are narrowed to bytes with saturation.
ext v19.16b, v19.16b, v22.16b, #12
ext v18.16b, v18.16b, v23.16b, #12
sabd v0.8h, v22.8h, v19.8h
ld1 {v19.8h}, [x2], #16 // mv + 0x60
sabd v1.8h, v23.8h, v18.8h
ld1 {v24.8h}, [x2], #16 // mv + 0x70
uqxtn v0.8b, v0.8h
ld1 {v18.8h}, [x2], #16 // mv + 0x80
ld1 {v25.8h}, [x2], #16 // mv + 0x90
uqxtn2 v0.16b, v1.8h
ext v19.16b, v19.16b, v24.16b, #12
ext v18.16b, v18.16b, v25.16b, #12
sabd v1.8h, v24.8h, v19.8h
sabd v2.8h, v25.8h, v18.8h
uqxtn v1.8b, v1.8h
uqxtn2 v1.16b, v2.8h
// Saturating subtract of the thresholds: even bytes (first mv component)
// stay nonzero only if the difference exceeds 3, odd bytes (second
// component) only if it exceeds mvy_limit - 1. The following narrow
// collapses each component pair into one byte that is nonzero if either
// component exceeded its threshold.
uqsub v0.16b, v0.16b, v6.16b
uqsub v1.16b, v1.16b, v6.16b
uqxtn v0.8b, v0.8h
uqxtn2 v0.16b, v1.8h
// Second group: differences between successive mv rows, handled the same way.
sabd v1.8h, v22.8h, v23.8h
orr v4.16b, v4.16b, v0.16b
sabd v0.8h, v21.8h, v22.8h
sabd v2.8h, v23.8h, v24.8h
sabd v3.8h, v24.8h, v25.8h
uqxtn v0.8b, v0.8h
uqxtn2 v0.16b, v1.8h
uqxtn v1.8b, v2.8h
uqxtn2 v1.16b, v3.8h
uqsub v0.16b, v0.16b, v6.16b
uqsub v1.16b, v1.16b, v6.16b
uqxtn v0.8b, v0.8h
uqxtn2 v0.16b, v1.8h
// The body is repeated exactly once more when w5 was 1 on entry (the
// decrement then yields zero); x1 and x2 already point past the data just
// processed.
subs w5, w5, #1
orr v5.16b, v5.16b, v0.16b
b.eq bframe
// v6 is reused from here on as the constant 1.
movi v6.16b, #1
// load bytes nnz
ld1 {v31.d}[1], [x0], #8
ld1 {v1.16b}, [x0], #16
movi v0.16b, #0
ld1 {v2.16b}, [x0], #16
ext v3.16b, v0.16b, v1.16b, #15
ext v0.16b, v0.16b, v2.16b, #15
unzip v21.4s, v22.4s, v1.4s, v2.4s
unzip v23.4s, v20.4s, v3.4s, v0.4s
ext v21.16b, v31.16b, v22.16b, #12
movrel x7, transpose_table
ld1 {v7.16b}, [x7]
// OR is nonzero where either of the two compared nnz bytes is nonzero.
orr v0.16b, v20.16b, v22.16b
orr v1.16b, v21.16b, v22.16b
umin v0.16b, v0.16b, v6.16b
umin v1.16b, v1.16b, v6.16b
umin v4.16b, v4.16b, v6.16b // mv ? 1 : 0
umin v5.16b, v5.16b, v6.16b
add v0.16b, v0.16b, v0.16b // nnz ? 2 : 0
add v1.16b, v1.16b, v1.16b
// The maximum merges both sources so that the nnz result (2) wins over the
// mv/ref result (1).
umax v4.16b, v4.16b, v0.16b
umax v5.16b, v5.16b, v1.16b
// Transpose the 4x4 byte matrix destined for bs[0].
tbl v6.16b, {v4.16b}, v7.16b
st1 {v5.16b}, [x3], x6 // bs[1]
st1 {v6.16b}, [x3] // bs[0]
ret
endfunc
// tbl index vector that transposes a 4x4 byte matrix: output byte 4 * i + j
// is taken from source byte 4 * j + i. Loaded by deblock_strength_neon.
const transpose_table
.byte 0, 4, 8, 12
.byte 1, 5, 9, 13
.byte 2, 6, 10, 14
.byte 3, 7, 11, 15
endconst

61
common/aarch64/deblock.h Normal file
View File

@@ -0,0 +1,61 @@
/*****************************************************************************
* deblock.h: aarch64 deblocking
*****************************************************************************
* Copyright (C) 2017-2025 x264 project
*
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_AARCH64_DEBLOCK_H
#define X264_AARCH64_DEBLOCK_H
/* Every entry below is a pair: a #define that routes the public name through
 * x264_template() (defined elsewhere), followed by the prototype. The #define
 * has to stay ahead of its prototype so that the prototype is declared under
 * the mapped name. */

/* Normal-strength edge filters: take per-edge tc0 values in addition to the
 * alpha/beta thresholds. */
#define x264_deblock_v_luma_neon x264_template(deblock_v_luma_neon)
void x264_deblock_v_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_luma_neon x264_template(deblock_h_luma_neon)
void x264_deblock_h_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_v_chroma_neon x264_template(deblock_v_chroma_neon)
void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_chroma_neon x264_template(deblock_h_chroma_neon)
void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
/* Boundary-strength computation: fills bs[2][8][4] from nnz, ref and mv. */
#define x264_deblock_strength_neon x264_template(deblock_strength_neon)
void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
#define x264_deblock_h_chroma_422_neon x264_template(deblock_h_chroma_422_neon)
void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_chroma_mbaff_neon x264_template(deblock_h_chroma_mbaff_neon)
void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
/* Intra (strong) edge filters: alpha/beta thresholds only, no tc0. */
#define x264_deblock_h_chroma_intra_mbaff_neon x264_template(deblock_h_chroma_intra_mbaff_neon)
void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_chroma_intra_neon x264_template(deblock_h_chroma_intra_neon)
void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_chroma_422_intra_neon x264_template(deblock_h_chroma_422_intra_neon)
void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_v_chroma_intra_neon x264_template(deblock_v_chroma_intra_neon)
void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_luma_intra_neon x264_template(deblock_h_luma_intra_neon)
void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_v_luma_intra_neon x264_template(deblock_v_luma_intra_neon)
void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
/* SVE variant of the normal-strength vertical chroma filter. */
#define x264_deblock_v_chroma_sve x264_template(deblock_v_chroma_sve)
void x264_deblock_v_chroma_sve( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#endif

View File

@@ -0,0 +1,66 @@
/****************************************************************************
* mc-a-common.S: aarch64 motion compensation
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
* Mans Rullgard <mans@mansr.com>
* Stefan Groenroos <stefan.gronroos@gmail.com>
* David Chen <david.chen@myais.com.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
// This file contains the NEON macros and functions that are intended to be used by
// the SVE/SVE2 functions as well
#if BIT_DEPTH == 8
// Per-variant weight fix-ups, selected by name through load_weights_<ext>.
// They operate on w6 and w7; the weighted-average code visible in
// mc-a-sve.S computes w7 = 64 - w6 before invoking them, and its weight
// macros then add or subtract the two products, so both registers are made
// non-negative magnitudes here.
// 0 < weight < 64
// Both weights are already non-negative; the mov leaves the value unchanged.
.macro load_weights_add_add
mov w6, w6
.endm
// weight > 64
// w7 is negative in this case; negate it so it can be subtracted.
.macro load_weights_add_sub
neg w7, w7
.endm
// weight < 0
// w6 is negative in this case; negate it so it can be subtracted.
.macro load_weights_sub_add
neg w6, w6
.endm
// Unweighted rounding average of two 4-byte-wide sources, two rows per loop
// iteration.
// x0 = dst, x1 = dst stride, x2 = src1, x3 = src1 stride, x4 = src2,
// x5 = src2 stride, w9 = number of rows (set by the code branching here;
// the loop handles rows in pairs).
function pixel_avg_w4_neon
1: subs w9, w9, #2
ld1 {v0.s}[0], [x2], x3
ld1 {v2.s}[0], [x4], x5
// urhadd computes (a + b + 1) >> 1 per byte.
urhadd v0.8b, v0.8b, v2.8b
ld1 {v1.s}[0], [x2], x3
ld1 {v3.s}[0], [x4], x5
urhadd v1.8b, v1.8b, v3.8b
st1 {v0.s}[0], [x0], x1
st1 {v1.s}[0], [x0], x1
// Flags are still those of the subs at the top of the loop.
b.gt 1b
ret
endfunc
#else // BIT_DEPTH == 10
#endif

108
common/aarch64/mc-a-sve.S Normal file
View File

@@ -0,0 +1,108 @@
/*****************************************************************************
* mc-a-sve.S: aarch64 motion compensation
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Chen <david.chen@myais.com.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
#include "mc-a-common.S"
ENABLE_SVE
#if BIT_DEPTH == 8
// void pixel_avg( uint8_t *dst, intptr_t dst_stride,
// uint8_t *src1, intptr_t src1_stride,
// uint8_t *src2, intptr_t src2_stride, int weight );
//
// AVGH_SVE w h emits the exported entry point pixel_avg_<w>x<h>_sve. It only
// dispatches on the weight (w6) and tail-branches to the implementation:
//   weight == 32       -> plain rounding average (pixel_avg_w<w>_neon)
//   weight > 64        -> add_sub variant
//   0 <= weight <= 64  -> add_add variant (other than 32)
//   weight < 0         -> sub_add variant
// It leaves w9 = height and w7 = 64 - weight for the target.
.macro AVGH_SVE w h
function pixel_avg_\w\()x\h\()_sve, export=1
mov w10, #64
cmp w6, #32
mov w9, #\h
b.eq pixel_avg_w\w\()_neon
// w7 = 64 - weight; negative exactly when weight > 64.
subs w7, w10, w6
b.lt pixel_avg_weight_w\w\()_add_sub_sve // weight > 64
cmp w6, #0
b.ge pixel_avg_weight_w\w\()_add_add_sve
b pixel_avg_weight_w\w\()_sub_add_sve // weight < 0
endfunc
.endm
AVGH_SVE 4, 2
AVGH_SVE 4, 4
AVGH_SVE 4, 8
AVGH_SVE 4, 16
// Weighted-sum helpers. v30 and v31 hold the two weight magnitudes (set up
// by AVG_WEIGHT_SVE from w6 and w7); dst, s1 and s2 are 16-bit vector
// operands. The h parameter is accepted but not used by these bodies.
// 0 < weight < 64
// dst = s1 * v30 + s2 * v31
.macro weight_add_add_sve dst, s1, s2, h=
mul \dst, \s1, v30.8h
mla \dst, \s2, v31.8h
.endm
// weight > 64
// dst = s1 * v30 - s2 * v31
.macro weight_add_sub_sve dst, s1, s2, h=
mul \dst, \s1, v30.8h
mls \dst, \s2, v31.8h
.endm
// weight < 0
// dst = s2 * v31 - s1 * v30
.macro weight_sub_add_sve dst, s1, s2, h=
mul \dst, \s2, v31.8h
mls \dst, \s1, v30.8h
.endm
// AVG_WEIGHT_SVE ext emits pixel_avg_weight_w4_<ext>_sve, the 4-pixel-wide
// weighted average for one sign combination of the weights.
// On entry: x0 = dst, x1 = dst stride, x2 = src1, x3 = src1 stride,
// x4 = src2, x5 = src2 stride, w6 = weight, w7 = 64 - weight, w9 = height
// (w7 and w9 are set by the AVGH_SVE entry point). Two rows are produced per
// loop iteration.
.macro AVG_WEIGHT_SVE ext
function pixel_avg_weight_w4_\ext\()_sve
// Turn w6/w7 into the magnitudes expected by the weight macro.
load_weights_\ext
// p0 is the governing predicate for the ld1b loads below.
ptrue p0.b, vl8
dup v30.8h, w6
dup v31.8h, w7
1: // height loop
subs w9, w9, #2
// ld1b with .h elements loads bytes and widens them to 16-bit lanes, so the
// data is ready for the 16-bit multiplies without a separate extend.
ld1b {z0.h}, p0/z, [x2]
add x2, x2, x3
ld1b {z1.h}, p0/z, [x4]
add x4, x4, x5
weight_\ext\()_sve v4.8h, v0.8h, v1.8h
ld1b {z2.h}, p0/z, [x2]
add x2, x2, x3
ld1b {z3.h}, p0/z, [x4]
add x4, x4, x5
// Rounding shift right by 6 with saturation to unsigned 8 bits.
sqrshrun v0.8b, v4.8h, #6
weight_\ext\()_sve v5.8h, v2.8h, v3.8h
st1 {v0.s}[0], [x0], x1
sqrshrun v1.8b, v5.8h, #6
st1 {v1.s}[0], [x0], x1
// Flags are still those of the subs at the top of the loop.
b.gt 1b
ret
endfunc
.endm
AVG_WEIGHT_SVE add_add
AVG_WEIGHT_SVE add_sub
AVG_WEIGHT_SVE sub_add
#else // BIT_DEPTH == 10
#endif

3935
common/aarch64/mc-a.S Normal file

File diff suppressed because it is too large Load Diff

371
common/aarch64/mc-c.c Normal file
View File

@@ -0,0 +1,371 @@
/*****************************************************************************
* mc-c.c: aarch64 motion compensation
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common/common.h"
#include "mc.h"
/* Prototypes of routines implemented outside this C file. Every entry is a
 * pair: a #define that routes the name through x264_template() (defined
 * elsewhere), followed by the prototype; the #define must stay ahead of its
 * prototype so the prototype is declared under the mapped name. */

/* Prefetch and aligned memory helpers. */
#define x264_prefetch_ref_aarch64 x264_template(prefetch_ref_aarch64)
void x264_prefetch_ref_aarch64( pixel *, intptr_t, int );
#define x264_prefetch_fenc_420_aarch64 x264_template(prefetch_fenc_420_aarch64)
void x264_prefetch_fenc_420_aarch64( pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_prefetch_fenc_422_aarch64 x264_template(prefetch_fenc_422_aarch64)
void x264_prefetch_fenc_422_aarch64( pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_memcpy_aligned_neon x264_template(memcpy_aligned_neon)
void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n );
#define x264_memzero_aligned_neon x264_template(memzero_aligned_neon)
void x264_memzero_aligned_neon( void *dst, size_t n );
/* pixel_avg_<W>x<H>: seven arguments, the last one being an int. */
#define x264_pixel_avg_16x16_neon x264_template(pixel_avg_16x16_neon)
void x264_pixel_avg_16x16_neon( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_16x8_neon x264_template(pixel_avg_16x8_neon)
void x264_pixel_avg_16x8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_8x16_neon x264_template(pixel_avg_8x16_neon)
void x264_pixel_avg_8x16_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_8x8_neon x264_template(pixel_avg_8x8_neon)
void x264_pixel_avg_8x8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_8x4_neon x264_template(pixel_avg_8x4_neon)
void x264_pixel_avg_8x4_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_4x16_neon x264_template(pixel_avg_4x16_neon)
void x264_pixel_avg_4x16_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_4x8_neon x264_template(pixel_avg_4x8_neon)
void x264_pixel_avg_4x8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_4x4_neon x264_template(pixel_avg_4x4_neon)
void x264_pixel_avg_4x4_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon)
void x264_pixel_avg_4x2_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
/* SVE versions exist only for the 4-pixel-wide block sizes. */
#define x264_pixel_avg_4x16_sve x264_template(pixel_avg_4x16_sve)
void x264_pixel_avg_4x16_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_4x8_sve x264_template(pixel_avg_4x8_sve)
void x264_pixel_avg_4x8_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_4x4_sve x264_template(pixel_avg_4x4_sve)
void x264_pixel_avg_4x4_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_4x2_sve x264_template(pixel_avg_4x2_sve)
void x264_pixel_avg_4x2_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
/* pixel_avg2_w<W>: six arguments (note the different signature from
 * pixel_avg_<W>x<H> above). */
#define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon)
void x264_pixel_avg2_w4_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, int );
#define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon)
void x264_pixel_avg2_w8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, int );
#define x264_pixel_avg2_w16_neon x264_template(pixel_avg2_w16_neon)
void x264_pixel_avg2_w16_neon( pixel *, intptr_t, pixel *, intptr_t, pixel *, int );
#define x264_pixel_avg2_w20_neon x264_template(pixel_avg2_w20_neon)
void x264_pixel_avg2_w20_neon( pixel *, intptr_t, pixel *, intptr_t, pixel *, int );
/* Plane copy, interleave and deinterleave routines. */
#define x264_plane_copy_core_neon x264_template(plane_copy_core_neon)
void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
pixel *src, intptr_t i_src, int w, int h );
#define x264_plane_copy_swap_core_neon x264_template(plane_copy_swap_core_neon)
void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst,
pixel *src, intptr_t i_src, int w, int h );
#define x264_plane_copy_deinterleave_neon x264_template(plane_copy_deinterleave_neon)
void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
pixel *src, intptr_t i_src, int w, int h );
#define x264_plane_copy_deinterleave_rgb_neon x264_template(plane_copy_deinterleave_rgb_neon)
void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
pixel *dstb, intptr_t i_dstb,
pixel *dstc, intptr_t i_dstc,
pixel *src, intptr_t i_src, int pw, int w, int h );
#define x264_plane_copy_interleave_core_neon x264_template(plane_copy_interleave_core_neon)
void x264_plane_copy_interleave_core_neon( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
/* Chroma store/load helpers that interleave or deinterleave two planes. */
#define x264_store_interleave_chroma_neon x264_template(store_interleave_chroma_neon)
void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
#define x264_load_deinterleave_chroma_fdec_neon x264_template(load_deinterleave_chroma_fdec_neon)
void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
#define x264_load_deinterleave_chroma_fenc_neon x264_template(load_deinterleave_chroma_fenc_neon)
void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
/* Name mappings for all sixteen weighted-prediction kernels (four widths
 * times four variants). They precede MC_WEIGHT so that the prototypes and
 * table entries generated by the macro pick up the mapped names. */
#define x264_mc_weight_w16_neon x264_template(mc_weight_w16_neon)
#define x264_mc_weight_w16_nodenom_neon x264_template(mc_weight_w16_nodenom_neon)
#define x264_mc_weight_w16_offsetadd_neon x264_template(mc_weight_w16_offsetadd_neon)
#define x264_mc_weight_w16_offsetsub_neon x264_template(mc_weight_w16_offsetsub_neon)
#define x264_mc_weight_w20_neon x264_template(mc_weight_w20_neon)
#define x264_mc_weight_w20_nodenom_neon x264_template(mc_weight_w20_nodenom_neon)
#define x264_mc_weight_w20_offsetadd_neon x264_template(mc_weight_w20_offsetadd_neon)
#define x264_mc_weight_w20_offsetsub_neon x264_template(mc_weight_w20_offsetsub_neon)
#define x264_mc_weight_w4_neon x264_template(mc_weight_w4_neon)
#define x264_mc_weight_w4_nodenom_neon x264_template(mc_weight_w4_nodenom_neon)
#define x264_mc_weight_w4_offsetadd_neon x264_template(mc_weight_w4_offsetadd_neon)
#define x264_mc_weight_w4_offsetsub_neon x264_template(mc_weight_w4_offsetsub_neon)
#define x264_mc_weight_w8_neon x264_template(mc_weight_w8_neon)
#define x264_mc_weight_w8_nodenom_neon x264_template(mc_weight_w8_nodenom_neon)
#define x264_mc_weight_w8_offsetadd_neon x264_template(mc_weight_w8_offsetadd_neon)
#define x264_mc_weight_w8_offsetsub_neon x264_template(mc_weight_w8_offsetsub_neon)
/* MC_WEIGHT(func) declares the four width-specific kernels of one variant
 * (func is the name suffix, empty for the generic variant) and defines a
 * file-local six-entry dispatch table mc<func>_wtab_neon. Table entries 0
 * and 1 use the w4 kernel, entry 2 the w8 kernel, entries 3 and 4 the w16
 * kernel and entry 5 the w20 kernel. How an index is derived from the block
 * width is decided by the code that reads the table (not shown here). */
#define MC_WEIGHT(func)\
void x264_mc_weight_w20##func##_neon( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w16##func##_neon( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w8##func##_neon ( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w4##func##_neon ( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\
\
static void (* mc##func##_wtab_neon[6])( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ) =\
{\
x264_mc_weight_w4##func##_neon,\
x264_mc_weight_w4##func##_neon,\
x264_mc_weight_w8##func##_neon,\
x264_mc_weight_w16##func##_neon,\
x264_mc_weight_w16##func##_neon,\
x264_mc_weight_w20##func##_neon,\
};
/* One instantiation per variant: generic, _nodenom, _offsetadd, _offsetsub. */
MC_WEIGHT()
MC_WEIGHT(_nodenom)
MC_WEIGHT(_offsetadd)
MC_WEIGHT(_offsetsub)
/* Prototypes for the remaining NEON motion-compensation helpers.  Bodies are
 * not visible in this file; all of them are installed into
 * x264_mc_functions_t by x264_mc_init_aarch64() below. */

/* Block copies for widths 4, 8 and 16 (also collected in mc_copy_wtab_neon). */
#define x264_mc_copy_w4_neon x264_template(mc_copy_w4_neon)
void x264_mc_copy_w4_neon ( pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_mc_copy_w8_neon x264_template(mc_copy_w8_neon)
void x264_mc_copy_w8_neon ( pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_mc_copy_w16_neon x264_template(mc_copy_w16_neon)
void x264_mc_copy_w16_neon( pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_mc_chroma_neon x264_template(mc_chroma_neon)
void x264_mc_chroma_neon( pixel *, pixel *, intptr_t, pixel *, intptr_t, int, int, int, int );
/* Integral-image initialisers.  Note that the 8v variant takes one pointer,
 * whereas the 4v variant takes two. */
#define x264_integral_init4h_neon x264_template(integral_init4h_neon)
void x264_integral_init4h_neon( uint16_t *, pixel *, intptr_t );
#define x264_integral_init4v_neon x264_template(integral_init4v_neon)
void x264_integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
#define x264_integral_init8h_neon x264_template(integral_init8h_neon)
void x264_integral_init8h_neon( uint16_t *, pixel *, intptr_t );
#define x264_integral_init8v_neon x264_template(integral_init8v_neon)
void x264_integral_init8v_neon( uint16_t *, intptr_t );
#define x264_frame_init_lowres_core_neon x264_template(frame_init_lowres_core_neon)
void x264_frame_init_lowres_core_neon( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, intptr_t, int, int );
/* MB-tree helpers. */
#define x264_mbtree_propagate_cost_neon x264_template(mbtree_propagate_cost_neon)
void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
#define x264_mbtree_fix8_pack_neon x264_template(mbtree_fix8_pack_neon)
void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count );
#define x264_mbtree_fix8_unpack_neon x264_template(mbtree_fix8_unpack_neon)
void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count );
/* Two-source averaging kernels, selected by block width.
 * The callers in this file index the table with i_width>>2. */
static void (* const pixel_avg_wtab_neon[6])( pixel *, intptr_t, pixel *, intptr_t, pixel *, int ) =
{
    [0] = NULL,                     /* no kernel for this slot */
    [1] = x264_pixel_avg2_w4_neon,
    [2] = x264_pixel_avg2_w8_neon,
    [3] = x264_pixel_avg2_w16_neon, /* w12 shares the w16 kernel: it is no slower, so a separate function is pointless */
    [4] = x264_pixel_avg2_w16_neon,
    [5] = x264_pixel_avg2_w20_neon,
};
/* Plain block-copy kernels, selected by block width.
 * The caller in this file indexes the table with i_width>>2; only the slots
 * for widths 4, 8 and 16 are populated. */
static void (* const mc_copy_wtab_neon[5])( pixel *, intptr_t, pixel *, intptr_t, int ) =
{
    [0] = NULL,
    [1] = x264_mc_copy_w4_neon,
    [2] = x264_mc_copy_w8_neon,
    [3] = NULL,
    [4] = x264_mc_copy_w16_neon,
};
/* Select the weighted-prediction kernel table for *w.
 *
 *   i_scale == 1<<i_denom : unit scale, only the offset applies.  The
 *                           offsetadd/offsetsub table is chosen by the sign of
 *                           i_offset and |i_offset| is stored in cachea[0].
 *   i_denom == 0          : mc_nodenom_wtab_neon.
 *   otherwise             : mc_wtab_neon.
 *
 * h is not used by this implementation. */
static void weight_cache_neon( x264_t *h, x264_weight_t *w )
{
    /* Non-unit scale: cachea[] is left untouched on this path. */
    if( w->i_scale != 1<<w->i_denom )
    {
        w->weightfn = w->i_denom ? mc_wtab_neon : mc_nodenom_wtab_neon;
        return;
    }

    /* Unit scale: pick add/sub by sign and cache the offset magnitude. */
    int offset_is_negative = w->i_offset < 0;
    w->weightfn  = offset_is_negative ? mc_offsetsub_wtab_neon : mc_offsetadd_wtab_neon;
    w->cachea[0] = offset_is_negative ? -w->i_offset : w->i_offset;
}
/* Luma motion compensation into dst.
 *
 * mvx/mvy carry a 2-bit fractional part in their low bits and the integer
 * displacement in the remaining bits.  src[] holds the candidate source
 * planes; x264_hpel_ref0/x264_hpel_ref1 pick which planes to use for a given
 * fractional position.
 *
 * If either fractional component is odd (qpel_idx & 5), two planes are
 * averaged into dst and the weight kernel, if any, is then applied in place.
 * Otherwise a single plane is either weighted or copied into dst. */
static void mc_luma_neon( pixel *dst, intptr_t i_dst_stride,
                          pixel *src[4], intptr_t i_src_stride,
                          int mvx, int mvy,
                          int i_width, int i_height, const x264_weight_t *weight )
{
    const int frac_x   = mvx&3;
    const int frac_y   = mvy&3;
    const int qpel_idx = (frac_y<<2) + frac_x;
    const int wtab_idx = i_width>>2;
    const intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);

    pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
    if( frac_y == 3 ) // explicit if() to force conditional add
        src1 += i_src_stride;

    if( !(qpel_idx & 5) )
    {
        /* No interpolation between planes: weight or copy directly from src1. */
        if( weight->weightfn )
            weight->weightfn[wtab_idx]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
        else
            mc_copy_wtab_neon[wtab_idx]( dst, i_dst_stride, src1, i_src_stride, i_height );
        return;
    }

    /* qpel interpolation needed: average the two selected planes. */
    pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + (frac_x == 3);
    pixel_avg_wtab_neon[wtab_idx]( dst, i_dst_stride, src1, i_src_stride, src2, i_height );

    /* Weighting runs in place on the averaged result. */
    if( weight->weightfn )
        weight->weightfn[wtab_idx]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
}
/* Like mc_luma_neon, but avoids the copy when no processing is required.
 *
 * Returns a pointer to the predicted block:
 *   - dst, with *i_dst_stride unchanged, when averaging and/or weighting had
 *     to be performed;
 *   - a pointer straight into the source plane, with *i_dst_stride set to
 *     i_src_stride, when the position needs no interpolation and no weight
 *     kernel is set. */
static pixel *get_ref_neon( pixel *dst, intptr_t *i_dst_stride,
                            pixel *src[4], intptr_t i_src_stride,
                            int mvx, int mvy,
                            int i_width, int i_height, const x264_weight_t *weight )
{
    const int frac_x   = mvx&3;
    const int frac_y   = mvy&3;
    const int qpel_idx = (frac_y<<2) + frac_x;
    const int wtab_idx = i_width>>2;
    const intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);

    pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
    if( frac_y == 3 ) // explicit if() to force conditional add
        src1 += i_src_stride;

    if( !(qpel_idx & 5) )
    {
        if( !weight->weightfn )
        {
            /* Nothing to compute: hand back the source plane itself. */
            *i_dst_stride = i_src_stride;
            return src1;
        }
        weight->weightfn[wtab_idx]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
        return dst;
    }

    /* qpel interpolation needed: average the two selected planes. */
    pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + (frac_x == 3);
    pixel_avg_wtab_neon[wtab_idx]( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );

    /* Weighting runs in place on the averaged result. */
    if( weight->weightfn )
        weight->weightfn[wtab_idx]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
    return dst;
}
/* Half-pel filter, NEON version; prototype only, the body is not visible here. */
#define x264_hpel_filter_neon x264_template(hpel_filter_neon)
void x264_hpel_filter_neon( pixel *dsth, pixel *dstv, pixel *dstc,
pixel *src, intptr_t stride, int width,
int height, int16_t *buf );
/* I8MM variant with the same signature; declared only when HIGH_BIT_DEPTH is
 * off and HAVE_I8MM is set. */
#if !HIGH_BIT_DEPTH && HAVE_I8MM
#define x264_hpel_filter_neon_i8mm x264_template(hpel_filter_neon_i8mm)
void x264_hpel_filter_neon_i8mm( pixel *dsth, pixel *dstv, pixel *dstc,
pixel *src, intptr_t stride, int width,
int height, int16_t *buf );
#endif // !HIGH_BIT_DEPTH && HAVE_I8MM
/* The macros below are defined elsewhere.  x264_mc_init_aarch64() in this file
 * uses plane_copy_neon, plane_copy_swap_neon, plane_copy_interleave_neon and
 * mbtree_propagate_list_neon, none of which is defined explicitly here, so
 * these expansions presumably provide them -- confirm against the macro
 * definitions. */
PLANE_COPY(16, neon)
PLANE_COPY_SWAP(16, neon)
PLANE_INTERLEAVE(neon)
PROPAGATE_LIST(neon)
/* Install the aarch64 motion-compensation implementations into *pf.
 *
 * cpu is a bitmask of X264_CPU_* flags.  Each feature block only overwrites
 * the entries it implements, and the blocks run from the most generic to the
 * most specific feature, so a later block overrides an earlier one for the
 * same entry (SVE replaces some avg[] slots, I8MM replaces hpel_filter).
 * The SVE and I8MM blocks exist only when HIGH_BIT_DEPTH is off and the
 * matching HAVE_* option is set. */
void x264_mc_init_aarch64( uint32_t cpu, x264_mc_functions_t *pf )
{
    if( cpu&X264_CPU_ARMV8 )
    {
        pf->prefetch_ref      = x264_prefetch_ref_aarch64;
        pf->prefetch_fenc_420 = x264_prefetch_fenc_420_aarch64;
        pf->prefetch_fenc_422 = x264_prefetch_fenc_422_aarch64;
    }

    if( cpu&X264_CPU_NEON )
    {
        /* Core luma/chroma motion compensation. */
        pf->mc_luma   = mc_luma_neon;
        pf->get_ref   = get_ref_neon;
        pf->mc_chroma = x264_mc_chroma_neon;
        pf->hpel_filter = x264_hpel_filter_neon;

        /* Weighted prediction. */
        pf->weight       = mc_wtab_neon;
        pf->offsetadd    = mc_offsetadd_wtab_neon;
        pf->offsetsub    = mc_offsetsub_wtab_neon;
        pf->weight_cache = weight_cache_neon;

        /* Averaging, one entry per partition size. */
        pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
        pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_neon;
        pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_neon;
        pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_neon;
        pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_neon;
        pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_neon;
        pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_neon;
        pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_neon;
        pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_neon;

        /* Block copies. */
        pf->copy[PIXEL_16x16]    = x264_mc_copy_w16_neon;
        pf->copy[PIXEL_8x8]      = x264_mc_copy_w8_neon;
        pf->copy[PIXEL_4x4]      = x264_mc_copy_w4_neon;
        pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;

        /* Plane copies and chroma (de)interleaving. */
        pf->plane_copy                  = plane_copy_neon;
        pf->plane_copy_swap             = plane_copy_swap_neon;
        pf->plane_copy_interleave       = plane_copy_interleave_neon;
        pf->plane_copy_deinterleave     = x264_plane_copy_deinterleave_neon;
        pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
        pf->store_interleave_chroma       = x264_store_interleave_chroma_neon;
        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;

        /* Integral images and lowres frame. */
        pf->integral_init4h = x264_integral_init4h_neon;
        pf->integral_init4v = x264_integral_init4v_neon;
        pf->integral_init8h = x264_integral_init8h_neon;
        pf->integral_init8v = x264_integral_init8v_neon;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;

        /* MB-tree. */
        pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
        pf->mbtree_propagate_list = mbtree_propagate_list_neon;
        pf->mbtree_fix8_pack      = x264_mbtree_fix8_pack_neon;
        pf->mbtree_fix8_unpack    = x264_mbtree_fix8_unpack_neon;

        /* Aligned memory helpers. */
        pf->memcpy_aligned  = x264_memcpy_aligned_neon;
        pf->memzero_aligned = x264_memzero_aligned_neon;
    }

#if !HIGH_BIT_DEPTH
#if HAVE_SVE
    if( cpu&X264_CPU_SVE )
    {
        /* SVE versions replace the 4-wide NEON averaging kernels. */
        pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_sve;
        pf->avg[PIXEL_4x8]  = x264_pixel_avg_4x8_sve;
        pf->avg[PIXEL_4x4]  = x264_pixel_avg_4x4_sve;
        pf->avg[PIXEL_4x2]  = x264_pixel_avg_4x2_sve;
    }
#endif // HAVE_SVE

#if HAVE_I8MM
    if( cpu&X264_CPU_I8MM )
    {
        pf->hpel_filter = x264_hpel_filter_neon_i8mm;
    }
#endif // HAVE_I8MM
#endif // !HIGH_BIT_DEPTH
}

32
common/aarch64/mc.h Normal file
View File

@@ -0,0 +1,32 @@
/*****************************************************************************
* mc.h: aarch64 motion compensation
*****************************************************************************
* Copyright (C) 2014-2025 x264 project
*
* Authors: Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_AARCH64_MC_H
#define X264_AARCH64_MC_H
/* Fills *pf with the aarch64 motion-compensation implementations that match
 * the X264_CPU_* feature bits set in cpu.  The name is routed through
 * x264_template(), which is defined elsewhere. */
#define x264_mc_init_aarch64 x264_template(mc_init_aarch64)
void x264_mc_init_aarch64( uint32_t cpu, x264_mc_functions_t *pf );
#endif

View File

@@ -0,0 +1,44 @@
/****************************************************************************
* pixel-a-common.S: aarch64 pixel metrics
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
* David Chen <david.chen@myais.com.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
// This file contains the NEON macros and constants that are intended to be used by
// the SVE/SVE2 functions as well
// Two 8-lane halfword masks used by the hadamard_ac code.
// Row 0 clears lanes 0 and 4, row 1 clears lane 0 only; every other lane is
// all ones.  The SVE hadamard_ac entry points load row 0 into v30 and row 1
// into v31 and AND them into intermediate sums.
// NOTE(review): presumably this removes the DC terms from the sums - confirm.
const mask_ac_4_8
.short 0, -1, -1, -1, 0, -1, -1, -1
.short 0, -1, -1, -1, -1, -1, -1, -1
endconst
// Applies SUMSUB_AB to two independent operand pairs:
//   (s1, d1) from (a, b)   and   (s2, d2) from (c, d).
// SUMSUB_AB itself is defined elsewhere; presumably it writes a+b to its
// first operand and a-b to its second - confirm.
.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
SUMSUB_AB \s1, \d1, \a, \b
SUMSUB_AB \s2, \d2, \c, \d
.endm
// Two chained SUMSUB_ABCD stages over four vectors.
// Stage 1 combines (r1, r2) and (r3, r4) into the temporaries t1..t4;
// stage 2 combines (t1, t3) and (t2, t4) and writes the results back to
// r1, r3, r2, r4 (in that operand order).  t1..t4 are clobbered.
// NOTE(review): by its name this is a 4-point Hadamard butterfly applied
// across registers - confirm.
.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
.endm

View File

@@ -0,0 +1,523 @@
/*****************************************************************************
* pixel-a-sve.S: aarch64 pixel metrics
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Chen <david.chen@myais.com.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
#include "pixel-a-common.S"
ENABLE_SVE
#if BIT_DEPTH == 8
// SSD (sum of squared differences), 4 pixels wide, 8-bit input: first step.
// Register use, shared with SSD_SVE_4 and SSD_END_SVE_4:
//   x0, x2    row pointers of the two inputs;  x1, x3  their strides
//   p0        predicate enabling the first 4 halfword lanes
//   z16, z17  one row of each input, bytes widened to halfwords by ld1b
//   v2        row difference;  v0.4s  running sum of squared differences
// This step squares the difference of the first row (smull initialises v0)
// and leaves the second row loaded in z16/z17 for the next step.
.macro SSD_START_SVE_4
ptrue p0.h, vl4
ld1b {z16.h}, p0/z, [x0]
ld1b {z17.h}, p0/z, [x2]
add x0, x0, x1
add x2, x2, x3
sub v2.4h, v16.4h, v17.4h
ld1b {z16.h}, p0/z, [x0]
ld1b {z17.h}, p0/z, [x2]
add x0, x0, x1
add x2, x2, x3
smull v0.4s, v2.4h, v2.4h
.endm
// SSD, 4 wide, 8-bit: middle step.
// Takes the difference of the row already held in z16/z17, loads the
// following row of both inputs, and accumulates the squared difference
// into v0.4s.  Expects p0, x0..x3 and v0 as left by SSD_START_SVE_4.
.macro SSD_SVE_4
sub v2.4h, v16.4h, v17.4h
ld1b {z16.h}, p0/z, [x0]
ld1b {z17.h}, p0/z, [x2]
add x0, x0, x1
add x2, x2, x3
smlal v0.4s, v2.4h, v2.4h
.endm
// SSD, 4 wide, 8-bit: last step.
// Accumulates the squared difference of the final row held in z16/z17;
// performs no further loads.
.macro SSD_END_SVE_4
sub v2.4h, v16.4h, v17.4h
smlal v0.4s, v2.4h, v2.4h
.endm
// SSD, 8 pixels wide, 8-bit input: first step.
// Same scheme as the 4-wide variant, with p0 enabling 8 halfword lanes.
// The 8 differences in v2.8h are squared in two halves: smull handles lanes
// 0-3 and initialises v0.4s, smlal2 adds lanes 4-7.  The loads of the second
// row are interleaved with the multiplies.
.macro SSD_START_SVE_8
ptrue p0.h, vl8
ld1b {z16.h}, p0/z, [x0]
ld1b {z17.h}, p0/z, [x2]
add x0, x0, x1
add x2, x2, x3
sub v2.8h, v16.8h, v17.8h
ld1b {z16.h}, p0/z, [x0]
smull v0.4s, v2.4h, v2.4h
ld1b {z17.h}, p0/z, [x2]
smlal2 v0.4s, v2.8h, v2.8h
add x0, x0, x1
add x2, x2, x3
.endm
// SSD, 8 wide, 8-bit: middle step.
// Takes the difference of the row held in z16/z17, loads the following row,
// and accumulates both halves of the squared difference into v0.4s.
.macro SSD_SVE_8
sub v2.8h, v16.8h, v17.8h
ld1b {z16.h}, p0/z, [x0]
smlal v0.4s, v2.4h, v2.4h
ld1b {z17.h}, p0/z, [x2]
smlal2 v0.4s, v2.8h, v2.8h
add x0, x0, x1
add x2, x2, x3
.endm
// SSD, 8 wide, 8-bit: last step.
// Accumulates the squared difference of the final row; no further loads.
.macro SSD_END_SVE_8
sub v2.8h, v16.8h, v17.8h
smlal v0.4s, v2.4h, v2.4h
smlal2 v0.4s, v2.8h, v2.8h
.endm
// Emits the exported function pixel_ssd_<w>x<h>_sve (8-bit build).
// Row accounting: the START macro consumes row 0 and preloads row 1, each of
// the h-2 repeated middle steps consumes one row and preloads the next, and
// the END macro consumes the last row, for h rows in total.
// The four 32-bit partial sums in v0 are then added across lanes and the
// result is returned in w0.
.macro SSD_FUNC_SVE w h
function pixel_ssd_\w\()x\h\()_sve, export=1
SSD_START_SVE_\w
.rept \h-2
SSD_SVE_\w
.endr
SSD_END_SVE_\w
addv s0, v0.4s
mov w0, v0.s[0]
ret
endfunc
.endm
// Loads an 8x8 block from each of two inputs, forms the eight row
// differences, and performs the first butterfly on rows 0-3.
// In:   x0/x1 and x2/x3  pointer and stride of the two inputs
//       p0               predicate for the ld1b loads; not set here, the
//                        caller must have initialised it
// Out:  v16..v23         row differences (input at x0 minus input at x2)
//       v0, v1           SUMSUB_AB of v16 and v17
//       v2, v3           SUMSUB_AB of v18 and v19
// Both pointers are advanced by eight rows; z0..z7 are used as load scratch.
.macro load_diff_fly_sve_8x8
ld1b {z1.h}, p0/z, [x2]
ld1b {z0.h}, p0/z, [x0]
add x2, x2, x3
add x0, x0, x1
ld1b {z3.h}, p0/z, [x2]
ld1b {z2.h}, p0/z, [x0]
add x2, x2, x3
add x0, x0, x1
sub v16.8h, v0.8h, v1.8h
sub v17.8h, v2.8h, v3.8h
ld1b {z5.h}, p0/z, [x2]
ld1b {z4.h}, p0/z, [x0]
add x2, x2, x3
add x0, x0, x1
ld1b {z7.h}, p0/z, [x2]
ld1b {z6.h}, p0/z, [x0]
add x2, x2, x3
add x0, x0, x1
sub v18.8h, v4.8h, v5.8h
sub v19.8h, v6.8h, v7.8h
ld1b {z1.h}, p0/z, [x2]
ld1b {z0.h}, p0/z, [x0]
add x2, x2, x3
add x0, x0, x1
ld1b {z3.h}, p0/z, [x2]
ld1b {z2.h}, p0/z, [x0]
add x2, x2, x3
add x0, x0, x1
sub v20.8h, v0.8h, v1.8h
sub v21.8h, v2.8h, v3.8h
ld1b {z5.h}, p0/z, [x2]
ld1b {z4.h}, p0/z, [x0]
add x2, x2, x3
add x0, x0, x1
ld1b {z7.h}, p0/z, [x2]
ld1b {z6.h}, p0/z, [x0]
add x2, x2, x3
add x0, x0, x1
SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h
SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h
sub v22.8h, v4.8h, v5.8h
sub v23.8h, v6.8h, v7.8h
.endm
// Emits the exported function pixel_var_8x<h>_sve.
// In:  x0 = pixel pointer, x1 = stride.
// Accumulators: v0.8h  per-lane pixel sums,
//               v1.4s and v2.4s  two interleaved sums of squared pixels.
// Schedule: rows 0-3 are loaded before the loop and rows 0-1 are consumed
// there; each loop iteration consumes four rows and loads four more (x2
// counts h-4 down in steps of 4); the two rows still pending after the loop
// are consumed in the tail.  The final reduction and the return are shared
// through a branch to var_end.
.macro pixel_var_sve_8 h
function pixel_var_8x\h\()_sve, export=1
ptrue p0.h, vl8
ld1b {z16.h}, p0/z, [x0]
add x0, x0, x1
ld1b {z17.h}, p0/z, [x0]
add x0, x0, x1
mov x2, \h - 4
mul v1.8h, v16.8h, v16.8h
mul v2.8h, v17.8h, v17.8h
add v0.8h, v16.8h, v17.8h
ld1b {z18.h}, p0/z, [x0]
add x0, x0, x1
uaddlp v1.4s, v1.8h
uaddlp v2.4s, v2.8h
ld1b {z19.h}, p0/z, [x0]
add x0, x0, x1
1: subs x2, x2, #4
add v0.8h, v0.8h, v18.8h
mul v24.8h, v18.8h, v18.8h
ld1b {z20.h}, p0/z, [x0]
add x0, x0, x1
add v0.8h, v0.8h, v19.8h
mul v25.8h, v19.8h, v19.8h
uadalp v1.4s, v24.8h
ld1b {z21.h}, p0/z, [x0]
add x0, x0, x1
add v0.8h, v0.8h, v20.8h
mul v26.8h, v20.8h, v20.8h
uadalp v2.4s, v25.8h
ld1b {z18.h}, p0/z, [x0]
add x0, x0, x1
add v0.8h, v0.8h, v21.8h
mul v27.8h, v21.8h, v21.8h
uadalp v1.4s, v26.8h
ld1b {z19.h}, p0/z, [x0]
add x0, x0, x1
uadalp v2.4s, v27.8h
b.gt 1b
add v0.8h, v0.8h, v18.8h
mul v28.8h, v18.8h, v18.8h
add v0.8h, v0.8h, v19.8h
mul v29.8h, v19.8h, v19.8h
uadalp v1.4s, v28.8h
uadalp v2.4s, v29.8h
b var_end
endfunc
.endm
// Shared tail of the pixel_var_8x*_sve functions (reached by branch, so the
// ret here returns to the original caller).
// In:   v0.8h          per-lane pixel sums
//       v1.4s, v2.4s   partial sums of squared pixels
// Out:  x0  bits 0-31 = sum of pixels, bits 32-63 = sum of squared pixels
function var_end
add v1.4s, v1.4s, v2.4s
uaddlv s0, v0.8h
uaddlv d1, v1.4s
mov w0, v0.s[0]
mov x1, v1.d[0]
orr x0, x0, x1, lsl #32
ret
endfunc
// sum = a + b, sub = a - b.
// The destinations are written in that order, so sum must not be the same
// register as a or b.
.macro SUMSUBL_AB_SVE sum, sub, a, b
add \sum, \a, \b
sub \sub, \a, \b
.endm
// Exported 8x8 SA8D entry point.
// Sets p0 for 8 halfword lanes, calls the helper that leaves partial sums in
// v0 and v1, adds them, reduces across lanes, and returns (sum + 1) >> 1 in
// w0.  The link register is kept in x4 across the call and the function
// returns through it.
// NOTE(review): the bl target is spelled exactly like this function.
// Presumably the function macro (defined elsewhere) prefixes exported
// symbols, so the bl resolves to the non-exported helper emitted by
// sa8d_satd_sve_8x8 rather than recursing - confirm.
function pixel_sa8d_8x8_sve, export=1
ptrue p0.h, vl8
mov x4, x30
bl pixel_sa8d_8x8_sve
add v0.8h, v0.8h, v1.8h
uaddlv s0, v0.8h
mov w0, v0.s[0]
add w0, w0, #1
lsr w0, w0, #1
ret x4
endfunc
// Emits the non-exported helper pixel_sa8d_<satd>8x8_sve.
// In:   x0/x1 and x2/x3 pointer and stride of the two inputs; p0 must
//       already be set by the caller (it is consumed by the loads in
//       load_diff_fly_sve_8x8).
// Out:  v0.8h and v1.8h hold partial sums that the caller adds and reduces.
// When the macro argument is satd_, the .ifc section is assembled as well
// and leaves two additional partial sums in v26 and v27.
// NOTE(review): presumably those are the SATD partial sums - confirm.
// The transpose macro used below is defined elsewhere.
.macro sa8d_satd_sve_8x8 satd=
function pixel_sa8d_\satd\()8x8_sve
load_diff_fly_sve_8x8
SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h
SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h
HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
.ifc \satd, satd_
transpose v0.8h, v1.8h, v16.8h, v17.8h
transpose v2.8h, v3.8h, v18.8h, v19.8h
transpose v4.8h, v5.8h, v20.8h, v21.8h
transpose v6.8h, v7.8h, v22.8h, v23.8h
SUMSUB_AB v24.8h, v25.8h, v0.8h, v1.8h
SUMSUB_AB v26.8h, v27.8h, v2.8h, v3.8h
SUMSUB_AB v0.8h, v1.8h, v4.8h, v5.8h
SUMSUB_AB v2.8h, v3.8h, v6.8h, v7.8h
transpose v4.4s, v6.4s, v24.4s, v26.4s
transpose v5.4s, v7.4s, v25.4s, v27.4s
transpose v24.4s, v26.4s, v0.4s, v2.4s
transpose v25.4s, v27.4s, v1.4s, v3.4s
abs v0.8h, v4.8h
abs v1.8h, v5.8h
abs v2.8h, v6.8h
abs v3.8h, v7.8h
abs v4.8h, v24.8h
abs v5.8h, v25.8h
abs v6.8h, v26.8h
abs v7.8h, v27.8h
umax v0.8h, v0.8h, v2.8h
umax v1.8h, v1.8h, v3.8h
umax v2.8h, v4.8h, v6.8h
umax v3.8h, v5.8h, v7.8h
add v26.8h, v0.8h, v1.8h
add v27.8h, v2.8h, v3.8h
.endif
// Common path: combine the two groups of four rows, then continue the
// butterflies along the other axis using transposes at 16-, 32- and 64-bit
// granularity.
SUMSUB_AB v0.8h, v16.8h, v16.8h, v20.8h
SUMSUB_AB v1.8h, v17.8h, v17.8h, v21.8h
SUMSUB_AB v2.8h, v18.8h, v18.8h, v22.8h
SUMSUB_AB v3.8h, v19.8h, v19.8h, v23.8h
transpose v20.8h, v21.8h, v16.8h, v17.8h
transpose v4.8h, v5.8h, v0.8h, v1.8h
transpose v22.8h, v23.8h, v18.8h, v19.8h
transpose v6.8h, v7.8h, v2.8h, v3.8h
SUMSUB_AB v2.8h, v3.8h, v20.8h, v21.8h
SUMSUB_AB v24.8h, v25.8h, v4.8h, v5.8h
SUMSUB_AB v0.8h, v1.8h, v22.8h, v23.8h
SUMSUB_AB v4.8h, v5.8h, v6.8h, v7.8h
transpose v20.4s, v22.4s, v2.4s, v0.4s
transpose v21.4s, v23.4s, v3.4s, v1.4s
transpose v16.4s, v18.4s, v24.4s, v4.4s
transpose v17.4s, v19.4s, v25.4s, v5.4s
SUMSUB_AB v0.8h, v2.8h, v20.8h, v22.8h
SUMSUB_AB v1.8h, v3.8h, v21.8h, v23.8h
SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h
SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h
transpose v16.2d, v20.2d, v0.2d, v4.2d
transpose v17.2d, v21.2d, v1.2d, v5.2d
transpose v18.2d, v22.2d, v2.2d, v6.2d
transpose v19.2d, v23.2d, v3.2d, v7.2d
abs v16.8h, v16.8h
abs v20.8h, v20.8h
abs v17.8h, v17.8h
abs v21.8h, v21.8h
abs v18.8h, v18.8h
abs v22.8h, v22.8h
abs v19.8h, v19.8h
abs v23.8h, v23.8h
// The last stage is taken as a per-lane maximum of absolute values rather
// than an explicit sum/difference pair.
umax v16.8h, v16.8h, v20.8h
umax v17.8h, v17.8h, v21.8h
umax v18.8h, v18.8h, v22.8h
umax v19.8h, v19.8h, v23.8h
add v0.8h, v16.8h, v17.8h
add v1.8h, v18.8h, v19.8h
ret
endfunc
.endm
// Emits the exported function pixel_hadamard_ac_<w>x<h>_sve.
// In:  x0 = pixel pointer, x1 = stride.
// Setup: p0 enables 8 halfword lanes, v30/v31 receive the two rows of
// mask_ac_4_8, the accumulators v28 and v29 are cleared, and the link
// register is kept in x4 across the helper calls.
// hadamard_ac_8x8_sve is called once per 8x8 sub-block.  Each call advances
// x0 by eight rows, so x0 is stepped back and/or moved 8 pixels to the right
// between calls as required by the block shape.
// Out: x0 bits 0-31  = (lane sum of v28) >> 1
//      x0 bits 32-63 = (lane sum of v29) >> 2
.macro HADAMARD_AC_SVE w h
function pixel_hadamard_ac_\w\()x\h\()_sve, export=1
ptrue p0.h, vl8
movrel x5, mask_ac_4_8
mov x4, x30
ld1 {v30.8h,v31.8h}, [x5]
movi v28.16b, #0
movi v29.16b, #0
bl hadamard_ac_8x8_sve
.if \h > 8
bl hadamard_ac_8x8_sve
.endif
.if \w > 8
sub x0, x0, x1, lsl #3
add x0, x0, #8
bl hadamard_ac_8x8_sve
.endif
.if \w * \h == 256
sub x0, x0, x1, lsl #4
bl hadamard_ac_8x8_sve
.endif
addv s1, v29.4s
addv s0, v28.4s
mov w1, v1.s[0]
mov w0, v0.s[0]
lsr w1, w1, #2
lsr w0, w0, #1
orr x0, x0, x1, lsl #32
ret x4
endfunc
.endm
// v28: satd v29: sa8d v30: mask_ac4 v31: mask_ac8
// Processes one 8x8 block for the hadamard_ac entry points.
// In:   x0 = pixel pointer, x1 = stride, p0 = predicate for the ld1b loads,
//       v30/v31 = masks, v28/v29 = running accumulators (all set up by the
//       caller).
// Out:  v28.4s and v29.4s are updated with the contributions of this block;
//       x0 has advanced by eight rows.
// Clobbers v0-v7 and v16-v23.
function hadamard_ac_8x8_sve
// Load the eight rows, interleaved with the first butterfly stage.
ld1b {z16.h}, p0/z, [x0]
add x0, x0, x1
ld1b {z17.h}, p0/z, [x0]
add x0, x0, x1
ld1b {z18.h}, p0/z, [x0]
add x0, x0, x1
ld1b {z19.h}, p0/z, [x0]
add x0, x0, x1
SUMSUBL_AB_SVE v0.8h, v1.8h, v16.8h, v17.8h
ld1b {z20.h}, p0/z, [x0]
add x0, x0, x1
ld1b {z21.h}, p0/z, [x0]
add x0, x0, x1
SUMSUBL_AB_SVE v2.8h, v3.8h, v18.8h, v19.8h
ld1b {z22.h}, p0/z, [x0]
add x0, x0, x1
ld1b {z23.h}, p0/z, [x0]
add x0, x0, x1
SUMSUBL_AB_SVE v4.8h, v5.8h, v20.8h, v21.8h
SUMSUBL_AB_SVE v6.8h, v7.8h, v22.8h, v23.8h
SUMSUB_ABCD v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h
transpose v0.8h, v1.8h, v16.8h, v17.8h
transpose v2.8h, v3.8h, v18.8h, v19.8h
transpose v4.8h, v5.8h, v20.8h, v21.8h
transpose v6.8h, v7.8h, v22.8h, v23.8h
SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h
SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h
SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h
SUMSUB_AB v22.8h, v23.8h, v6.8h, v7.8h
transpose v0.4s, v2.4s, v16.4s, v18.4s
transpose v1.4s, v3.4s, v17.4s, v19.4s
transpose v4.4s, v6.4s, v20.4s, v22.4s
transpose v5.4s, v7.4s, v21.4s, v23.4s
SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h
SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h
SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h
// First accumulator (v28): sum of absolute values of v16..v23, with the
// mask in v30 applied to the v0 partial sum before it is folded in.
abs v0.8h, v16.8h
abs v4.8h, v20.8h
abs v1.8h, v17.8h
abs v5.8h, v21.8h
abs v2.8h, v18.8h
abs v6.8h, v22.8h
abs v3.8h, v19.8h
abs v7.8h, v23.8h
add v0.8h, v0.8h, v4.8h
add v1.8h, v1.8h, v5.8h
and v0.16b, v0.16b, v30.16b
add v2.8h, v2.8h, v6.8h
add v3.8h, v3.8h, v7.8h
add v0.8h, v0.8h, v2.8h
add v1.8h, v1.8h, v3.8h
uadalp v28.4s, v0.8h
uadalp v28.4s, v1.8h
// Second accumulator (v29): one more butterfly level across the two groups
// of four rows (v16..v19 against v20..v23), then 64-bit transposes.
SUMSUB_AB v6.8h, v7.8h, v23.8h, v19.8h
SUMSUB_AB v4.8h, v5.8h, v22.8h, v18.8h
SUMSUB_AB v2.8h, v3.8h, v21.8h, v17.8h
SUMSUB_AB v1.8h, v0.8h, v16.8h, v20.8h
transpose v16.2d, v17.2d, v6.2d, v7.2d
transpose v18.2d, v19.2d, v4.2d, v5.2d
transpose v20.2d, v21.2d, v2.2d, v3.2d
abs v16.8h, v16.8h
abs v17.8h, v17.8h
abs v18.8h, v18.8h
abs v19.8h, v19.8h
abs v20.8h, v20.8h
abs v21.8h, v21.8h
transpose v7.2d, v6.2d, v1.2d, v0.2d
umax v3.8h, v16.8h, v17.8h
umax v2.8h, v18.8h, v19.8h
umax v1.8h, v20.8h, v21.8h
SUMSUB_AB v4.8h, v5.8h, v7.8h, v6.8h
add v2.8h, v2.8h, v3.8h
add v2.8h, v2.8h, v1.8h
// The mask in v31 is applied to the v4 term only; the umax-based partial
// sum in v2 is doubled before the remaining two terms are added.
and v4.16b, v4.16b, v31.16b
add v2.8h, v2.8h, v2.8h
abs v5.8h, v5.8h
abs v4.8h, v4.8h
add v2.8h, v2.8h, v5.8h
add v2.8h, v2.8h, v4.8h
uadalp v29.4s, v2.8h
ret
endfunc
// Instantiate the 8-bit SVE functions.
// SSD: 4x4, 4x8, 4x16, 8x4 and 8x8.
SSD_FUNC_SVE 4, 4
SSD_FUNC_SVE 4, 8
SSD_FUNC_SVE 4, 16
SSD_FUNC_SVE 8, 4
SSD_FUNC_SVE 8, 8
// Variance: 8x8 and 8x16.
pixel_var_sve_8 8
pixel_var_sve_8 16
// Called with no argument, so only the plain sa8d helper is emitted.
sa8d_satd_sve_8x8
// hadamard_ac: 8x8, 8x16, 16x8 and 16x16.
HADAMARD_AC_SVE 8, 8
HADAMARD_AC_SVE 8, 16
HADAMARD_AC_SVE 16, 8
HADAMARD_AC_SVE 16, 16
#else /* BIT_DEPTH == 10 */
// SSD, 4 pixels wide, high-bit-depth build: first step.
// Same scheme as the 8-bit variant, but pixels are loaded as halfwords and
// widened to 32-bit lanes (ld1h into .s elements, p0 enabling 4 word lanes),
// so the squares are formed directly with 32-bit mul/mla.
// The strides in x1/x3 are shifted left by one when added to the pointers.
// NOTE(review): presumably because the strides are counted in 16-bit
// pixels - confirm.
.macro SSD_START_SVE_4
ptrue p0.s, vl4
ld1h {z16.s}, p0/z, [x0]
ld1h {z17.s}, p0/z, [x2]
add x0, x0, x1, lsl #1
add x2, x2, x3, lsl #1
sub v2.4s, v16.4s, v17.4s
ld1h {z16.s}, p0/z, [x0]
ld1h {z17.s}, p0/z, [x2]
add x0, x0, x1, lsl #1
add x2, x2, x3, lsl #1
mul v0.4s, v2.4s, v2.4s
.endm
// SSD, 4 wide, high bit depth: middle step.
// Takes the difference of the row held in z16/z17, loads the following row,
// and accumulates the squared difference into v0.4s.
.macro SSD_SVE_4
sub v2.4s, v16.4s, v17.4s
ld1h {z16.s}, p0/z, [x0]
ld1h {z17.s}, p0/z, [x2]
add x0, x0, x1, lsl #1
add x2, x2, x3, lsl #1
mla v0.4s, v2.4s, v2.4s
.endm
// SSD, 4 wide, high bit depth: last step.
// Accumulates the squared difference of the final row; no further loads.
.macro SSD_END_SVE_4
sub v2.4s, v16.4s, v17.4s
mla v0.4s, v2.4s, v2.4s
.endm
// Emits the exported function pixel_ssd_<w>x<h>_sve (high-bit-depth build).
// Row accounting is the same as in the 8-bit version: START consumes row 0
// and preloads row 1, h-2 middle steps follow, END consumes the last row.
// The lane sum of v0 is returned in w0 (moved out of s0 with fmov).
.macro SSD_FUNC_SVE w h
function pixel_ssd_\w\()x\h\()_sve, export=1
SSD_START_SVE_\w
.rept \h-2
SSD_SVE_\w
.endr
SSD_END_SVE_\w
addv s0, v0.4s
fmov w0, s0
ret
endfunc
.endm
// Only the 4-wide SSD sizes are instantiated in the high-bit-depth build;
// no 8-wide step macros are defined in this branch.
SSD_FUNC_SVE 4, 4
SSD_FUNC_SVE 4, 8
SSD_FUNC_SVE 4, 16
#endif /* BIT_DEPTH == 8 */

3040
common/aarch64/pixel-a.S Normal file

File diff suppressed because it is too large Load Diff

191
common/aarch64/pixel.h Normal file
View File

@@ -0,0 +1,191 @@
/*****************************************************************************
* pixel.h: aarch64 pixel metrics
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_AARCH64_PIXEL_H
#define X264_AARCH64_PIXEL_H
/* Symbol renaming for the aarch64 pixel-metric functions: every public name
 * is routed through x264_template(), which is defined elsewhere.  The
 * matching prototypes are generated by the DECL_* macros that follow these
 * defines. */

/* SAD: single reference, then the x3 and x4 multi-reference variants. */
#define x264_pixel_sad_16x16_neon x264_template(pixel_sad_16x16_neon)
#define x264_pixel_sad_16x8_neon x264_template(pixel_sad_16x8_neon)
#define x264_pixel_sad_4x16_neon x264_template(pixel_sad_4x16_neon)
#define x264_pixel_sad_4x4_neon x264_template(pixel_sad_4x4_neon)
#define x264_pixel_sad_4x8_neon x264_template(pixel_sad_4x8_neon)
#define x264_pixel_sad_8x16_neon x264_template(pixel_sad_8x16_neon)
#define x264_pixel_sad_8x4_neon x264_template(pixel_sad_8x4_neon)
#define x264_pixel_sad_8x8_neon x264_template(pixel_sad_8x8_neon)
#define x264_pixel_sad_x3_16x16_neon x264_template(pixel_sad_x3_16x16_neon)
#define x264_pixel_sad_x3_16x8_neon x264_template(pixel_sad_x3_16x8_neon)
#define x264_pixel_sad_x3_4x4_neon x264_template(pixel_sad_x3_4x4_neon)
#define x264_pixel_sad_x3_4x8_neon x264_template(pixel_sad_x3_4x8_neon)
#define x264_pixel_sad_x3_8x16_neon x264_template(pixel_sad_x3_8x16_neon)
#define x264_pixel_sad_x3_8x4_neon x264_template(pixel_sad_x3_8x4_neon)
#define x264_pixel_sad_x3_8x8_neon x264_template(pixel_sad_x3_8x8_neon)
#define x264_pixel_sad_x4_16x16_neon x264_template(pixel_sad_x4_16x16_neon)
#define x264_pixel_sad_x4_16x8_neon x264_template(pixel_sad_x4_16x8_neon)
#define x264_pixel_sad_x4_4x4_neon x264_template(pixel_sad_x4_4x4_neon)
#define x264_pixel_sad_x4_4x8_neon x264_template(pixel_sad_x4_4x8_neon)
#define x264_pixel_sad_x4_8x16_neon x264_template(pixel_sad_x4_8x16_neon)
#define x264_pixel_sad_x4_8x4_neon x264_template(pixel_sad_x4_8x4_neon)
#define x264_pixel_sad_x4_8x8_neon x264_template(pixel_sad_x4_8x8_neon)
/* SATD. */
#define x264_pixel_satd_16x16_neon x264_template(pixel_satd_16x16_neon)
#define x264_pixel_satd_16x8_neon x264_template(pixel_satd_16x8_neon)
#define x264_pixel_satd_4x16_neon x264_template(pixel_satd_4x16_neon)
#define x264_pixel_satd_4x4_neon x264_template(pixel_satd_4x4_neon)
#define x264_pixel_satd_4x8_neon x264_template(pixel_satd_4x8_neon)
#define x264_pixel_satd_8x16_neon x264_template(pixel_satd_8x16_neon)
#define x264_pixel_satd_8x4_neon x264_template(pixel_satd_8x4_neon)
#define x264_pixel_satd_8x8_neon x264_template(pixel_satd_8x8_neon)
/* SSD. */
#define x264_pixel_ssd_16x16_neon x264_template(pixel_ssd_16x16_neon)
#define x264_pixel_ssd_16x8_neon x264_template(pixel_ssd_16x8_neon)
#define x264_pixel_ssd_4x16_neon x264_template(pixel_ssd_4x16_neon)
#define x264_pixel_ssd_4x4_neon x264_template(pixel_ssd_4x4_neon)
#define x264_pixel_ssd_4x8_neon x264_template(pixel_ssd_4x8_neon)
#define x264_pixel_ssd_8x16_neon x264_template(pixel_ssd_8x16_neon)
#define x264_pixel_ssd_8x4_neon x264_template(pixel_ssd_8x4_neon)
#define x264_pixel_ssd_8x8_neon x264_template(pixel_ssd_8x8_neon)
/* DotProd variants; only a subset of the block sizes is renamed here. */
#if HAVE_DOTPROD
#define x264_pixel_sad_16x8_neon_dotprod x264_template(pixel_sad_16x8_neon_dotprod)
#define x264_pixel_sad_16x16_neon_dotprod x264_template(pixel_sad_16x16_neon_dotprod)
#define x264_pixel_sad_x3_16x16_neon_dotprod x264_template(pixel_sad_x3_16x16_neon_dotprod)
#define x264_pixel_sad_x3_16x8_neon_dotprod x264_template(pixel_sad_x3_16x8_neon_dotprod)
#define x264_pixel_sad_x4_16x16_neon_dotprod x264_template(pixel_sad_x4_16x16_neon_dotprod)
#define x264_pixel_sad_x4_16x8_neon_dotprod x264_template(pixel_sad_x4_16x8_neon_dotprod)
#define x264_pixel_ssd_16x16_neon_dotprod x264_template(pixel_ssd_16x16_neon_dotprod)
#define x264_pixel_ssd_16x8_neon_dotprod x264_template(pixel_ssd_16x8_neon_dotprod)
#define x264_pixel_ssd_8x16_neon_dotprod x264_template(pixel_ssd_8x16_neon_dotprod)
#define x264_pixel_ssd_8x4_neon_dotprod x264_template(pixel_ssd_8x4_neon_dotprod)
#define x264_pixel_ssd_8x8_neon_dotprod x264_template(pixel_ssd_8x8_neon_dotprod)
#endif // HAVE_DOTPROD
/* SVE SSD variants; these defines are not wrapped in a HAVE_SVE check. */
#define x264_pixel_ssd_4x16_sve x264_template(pixel_ssd_4x16_sve)
#define x264_pixel_ssd_4x4_sve x264_template(pixel_ssd_4x4_sve)
#define x264_pixel_ssd_4x8_sve x264_template(pixel_ssd_4x8_sve)
#define x264_pixel_ssd_8x4_sve x264_template(pixel_ssd_8x4_sve)
#define x264_pixel_ssd_8x8_sve x264_template(pixel_ssd_8x8_sve)
/* DECL_PIXELS declares x264_pixel_<name>_<W>x<H>_<suffix> for the eight block
 * sizes 16x16, 16x8, 8x16, 8x8, 8x4, 4x16, 4x8 and 4x4, all with return type
 * ret and parameter list args. */
#define DECL_PIXELS( ret, name, suffix, args ) \
ret x264_pixel_##name##_16x16_##suffix args;\
ret x264_pixel_##name##_16x8_##suffix args;\
ret x264_pixel_##name##_8x16_##suffix args;\
ret x264_pixel_##name##_8x8_##suffix args;\
ret x264_pixel_##name##_8x4_##suffix args;\
ret x264_pixel_##name##_4x16_##suffix args;\
ret x264_pixel_##name##_4x8_##suffix args;\
ret x264_pixel_##name##_4x4_##suffix args;
/* SVE SSD exists only for 8x8, 8x4, 4x16, 4x8 and 4x4, hence a dedicated
 * declaration macro instead of DECL_PIXELS. */
#define DECL_PIXELS_SSD_SVE( ret, args ) \
ret x264_pixel_ssd_8x8_sve args;\
ret x264_pixel_ssd_8x4_sve args;\
ret x264_pixel_ssd_4x16_sve args;\
ret x264_pixel_ssd_4x8_sve args;\
ret x264_pixel_ssd_4x4_sve args;
/* Single-reference metric: int f( pixel *, intptr_t, pixel *, intptr_t ). */
#define DECL_X1( name, suffix ) \
DECL_PIXELS( int, name, suffix, ( pixel *, intptr_t, pixel *, intptr_t ) )
#define DECL_X1_SSD_SVE( ) \
DECL_PIXELS_SSD_SVE( int, ( pixel *, intptr_t, pixel *, intptr_t ) )
/* Multi-reference metrics: declares both the <name>_x3 and <name>_x4 families.
 * They return void and take a trailing int * in addition to the pixel
 * pointers and the stride. */
#define DECL_X4( name, suffix ) \
DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )\
DECL_PIXELS( void, name##_x4, suffix, ( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )
DECL_X1( sad, neon )
DECL_X4( sad, neon )
DECL_X1( satd, neon )
DECL_X1( ssd, neon )
DECL_X1_SSD_SVE( )
#if HAVE_DOTPROD
DECL_X1( sad, neon_dotprod )
DECL_X4( sad, neon_dotprod )
DECL_X1( ssd, neon_dotprod )
#endif // HAVE_DOTPROD
/* Prototypes for the pixel metrics that do not fit the DECL_* size families.
 * Bodies are in the assembly sources; only the SVE ones referenced below are
 * visible alongside this header. */
#define x264_pixel_ssd_nv12_core_neon x264_template(pixel_ssd_nv12_core_neon)
void x264_pixel_ssd_nv12_core_neon( pixel *, intptr_t, pixel *, intptr_t, int, int, uint64_t *, uint64_t * );
#define x264_pixel_vsad_neon x264_template(pixel_vsad_neon)
int x264_pixel_vsad_neon( pixel *, intptr_t, int );
#if HAVE_DOTPROD
#define x264_pixel_vsad_neon_dotprod x264_template(pixel_vsad_neon_dotprod)
int x264_pixel_vsad_neon_dotprod( pixel *, intptr_t, int );
#endif // HAVE_DOTPROD
/* SA8D, plus the combined SA8D+SATD 16x16 variant returning a 64-bit value. */
#define x264_pixel_sa8d_8x8_neon x264_template(pixel_sa8d_8x8_neon)
int x264_pixel_sa8d_8x8_neon ( pixel *, intptr_t, pixel *, intptr_t );
#define x264_pixel_sa8d_16x16_neon x264_template(pixel_sa8d_16x16_neon)
int x264_pixel_sa8d_16x16_neon( pixel *, intptr_t, pixel *, intptr_t );
#define x264_pixel_sa8d_satd_16x16_neon x264_template(pixel_sa8d_satd_16x16_neon)
uint64_t x264_pixel_sa8d_satd_16x16_neon( pixel *, intptr_t, pixel *, intptr_t );
#define x264_pixel_sa8d_8x8_sve x264_template(pixel_sa8d_8x8_sve)
int x264_pixel_sa8d_8x8_sve ( pixel *, intptr_t, pixel *, intptr_t );
/* Variance.  The SVE implementations in pixel-a-sve.S pack the sum of pixels
 * into bits 0-31 and the sum of squared pixels into bits 32-63 of the result;
 * presumably the NEON versions use the same packing -- confirm. */
#define x264_pixel_var_8x8_neon x264_template(pixel_var_8x8_neon)
uint64_t x264_pixel_var_8x8_neon ( pixel *, intptr_t );
#define x264_pixel_var_8x16_neon x264_template(pixel_var_8x16_neon)
uint64_t x264_pixel_var_8x16_neon ( pixel *, intptr_t );
#define x264_pixel_var_16x16_neon x264_template(pixel_var_16x16_neon)
uint64_t x264_pixel_var_16x16_neon( pixel *, intptr_t );
#define x264_pixel_var2_8x8_neon x264_template(pixel_var2_8x8_neon)
int x264_pixel_var2_8x8_neon ( pixel *, pixel *, int * );
#define x264_pixel_var2_8x16_neon x264_template(pixel_var2_8x16_neon)
int x264_pixel_var2_8x16_neon( pixel *, pixel *, int * );
#define x264_pixel_var_8x8_sve x264_template(pixel_var_8x8_sve)
uint64_t x264_pixel_var_8x8_sve ( pixel *, intptr_t );
#define x264_pixel_var_8x16_sve x264_template(pixel_var_8x16_sve)
uint64_t x264_pixel_var_8x16_sve ( pixel *, intptr_t );
/* hadamard_ac.  The SVE implementations in pixel-a-sve.S return two 32-bit
 * values packed into one 64-bit result (low half from the accumulator they
 * label satd, high half from the one they label sa8d). */
#define x264_pixel_hadamard_ac_8x8_neon x264_template(pixel_hadamard_ac_8x8_neon)
uint64_t x264_pixel_hadamard_ac_8x8_neon ( pixel *, intptr_t );
#define x264_pixel_hadamard_ac_8x16_neon x264_template(pixel_hadamard_ac_8x16_neon)
uint64_t x264_pixel_hadamard_ac_8x16_neon ( pixel *, intptr_t );
#define x264_pixel_hadamard_ac_16x8_neon x264_template(pixel_hadamard_ac_16x8_neon)
uint64_t x264_pixel_hadamard_ac_16x8_neon ( pixel *, intptr_t );
#define x264_pixel_hadamard_ac_16x16_neon x264_template(pixel_hadamard_ac_16x16_neon)
uint64_t x264_pixel_hadamard_ac_16x16_neon( pixel *, intptr_t );
#define x264_pixel_hadamard_ac_8x8_sve x264_template(pixel_hadamard_ac_8x8_sve)
uint64_t x264_pixel_hadamard_ac_8x8_sve ( pixel *, intptr_t );
#define x264_pixel_hadamard_ac_8x16_sve x264_template(pixel_hadamard_ac_8x16_sve)
uint64_t x264_pixel_hadamard_ac_8x16_sve ( pixel *, intptr_t );
#define x264_pixel_hadamard_ac_16x8_sve x264_template(pixel_hadamard_ac_16x8_sve)
uint64_t x264_pixel_hadamard_ac_16x8_sve ( pixel *, intptr_t );
#define x264_pixel_hadamard_ac_16x16_sve x264_template(pixel_hadamard_ac_16x16_sve)
uint64_t x264_pixel_hadamard_ac_16x16_sve( pixel *, intptr_t );
/* SSIM helpers and asd8. */
#define x264_pixel_ssim_4x4x2_core_neon x264_template(pixel_ssim_4x4x2_core_neon)
void x264_pixel_ssim_4x4x2_core_neon( const pixel *, intptr_t,
const pixel *, intptr_t,
int sums[2][4] );
#define x264_pixel_ssim_end4_neon x264_template(pixel_ssim_end4_neon)
float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
#define x264_pixel_asd8_neon x264_template(pixel_asd8_neon)
int x264_pixel_asd8_neon( pixel *, intptr_t, pixel *, intptr_t, int );
#endif

908
common/aarch64/predict-a.S Normal file
View File

@@ -0,0 +1,908 @@
/*****************************************************************************
* predict-a.S: aarch64 intra prediction
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Mans Rullgard <mans@mansr.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
// Eight 16-bit weights: 1..4 repeated twice.  predict_8x8c_p_neon multiplies
// its eight pixel differences by this table.
const p8weight, align=4
.short 1, 2, 3, 4, 1, 2, 3, 4
endconst
// Eight 16-bit weights 1..8, loaded by the plane (_p_) predictors in this
// file; they also rotate it into 0..7 to form per-column multipliers.
const p16weight, align=4
.short 1, 2, 3, 4, 5, 6, 7, 8
endconst
// ldcol.8 vd, xn, xm, n=8, hi=0
// Gather bytes into lanes of vd, one byte per load, post-incrementing xn by
// xm after every load (so xn is advanced on exit).
//   n == 8          : lanes 0-7 are loaded (8 loads)
//   n != 8, hi == 0 : only lanes 0-3 are loaded
//   n != 8, hi == 1 : only lanes 4-7 are loaded
// Lanes that are not loaded keep their previous contents.
.macro ldcol.8 vd, xn, xm, n=8, hi=0
.if \n == 8 || \hi == 0
ld1 {\vd\().b}[0], [\xn], \xm
ld1 {\vd\().b}[1], [\xn], \xm
ld1 {\vd\().b}[2], [\xn], \xm
ld1 {\vd\().b}[3], [\xn], \xm
.endif
.if \n == 8 || \hi == 1
ld1 {\vd\().b}[4], [\xn], \xm
ld1 {\vd\().b}[5], [\xn], \xm
ld1 {\vd\().b}[6], [\xn], \xm
ld1 {\vd\().b}[7], [\xn], \xm
.endif
.endm
// ldcol.16 vd, xn, xm
// Gather 16 bytes into lanes 0-15 of vd: lanes 0-7 through ldcol.8, lanes
// 8-15 here.  xn is post-incremented by xm after each of the 16 loads.
.macro ldcol.16 vd, xn, xm
ldcol.8 \vd, \xn, \xm
ld1 {\vd\().b}[ 8], [\xn], \xm
ld1 {\vd\().b}[ 9], [\xn], \xm
ld1 {\vd\().b}[10], [\xn], \xm
ld1 {\vd\().b}[11], [\xn], \xm
ld1 {\vd\().b}[12], [\xn], \xm
ld1 {\vd\().b}[13], [\xn], \xm
ld1 {\vd\().b}[14], [\xn], \xm
ld1 {\vd\().b}[15], [\xn], \xm
.endm
// 4x4 horizontal prediction using general-purpose registers only.
// x0 = src.  For each of the four rows, the byte at column -1 is multiplied
// by 0x01010101 so that it fills all four bytes of a 32-bit word, which is
// then stored at the start of that row.  Loads, multiplies and stores of
// different rows are interleaved.
function predict_4x4_h_aarch64, export=1
ldurb w1, [x0, #0*FDEC_STRIDE-1]
mov w5, #0x01010101 // byte-replication multiplier
ldrb w2, [x0, #1*FDEC_STRIDE-1]
ldrb w3, [x0, #2*FDEC_STRIDE-1]
mul w1, w1, w5
ldrb w4, [x0, #3*FDEC_STRIDE-1]
mul w2, w2, w5
str w1, [x0, #0*FDEC_STRIDE]
mul w3, w3, w5
str w2, [x0, #1*FDEC_STRIDE]
mul w4, w4, w5
str w3, [x0, #2*FDEC_STRIDE]
str w4, [x0, #3*FDEC_STRIDE]
ret
endfunc
// 4x4 vertical prediction using general-purpose registers only.
// x0 = src.  The four bytes of the row above (src - FDEC_STRIDE) are loaded
// as one 32-bit word and copied to rows 0..3.
function predict_4x4_v_aarch64, export=1
ldur w1, [x0, #0 - 1 * FDEC_STRIDE]
str w1, [x0, #0 + 0 * FDEC_STRIDE]
str w1, [x0, #0 + 1 * FDEC_STRIDE]
str w1, [x0, #0 + 2 * FDEC_STRIDE]
str w1, [x0, #0 + 3 * FDEC_STRIDE]
ret
endfunc
// 4x4 DC prediction from both the top row and the left column.
// x0 = src.  w4 accumulates the four bytes at column -1 of rows 0..3, h0 the
// four bytes of the row above.  The block is filled with
// (top_sum + left_sum + 4) >> 3.
function predict_4x4_dc_neon, export=1
sub x1, x0, #FDEC_STRIDE
ldurb w4, [x0, #-1 + 0 * FDEC_STRIDE]
ldrb w5, [x0, #-1 + 1 * FDEC_STRIDE]
ldrb w6, [x0, #-1 + 2 * FDEC_STRIDE]
ldrb w7, [x0, #-1 + 3 * FDEC_STRIDE]
add w4, w4, w5
// scalar load clears the rest of v0, so the 8-lane sum below only sees
// the four loaded bytes
ldr s0, [x1]
add w6, w6, w7
uaddlv h0, v0.8b
add w4, w4, w6
dup v0.4h, v0.h[0]
dup v1.4h, w4
add v0.4h, v0.4h, v1.4h
rshrn v0.8b, v0.8h, #3 // rounding shift: (sum + 4) >> 3
str s0, [x0]
str s0, [x0, #1 * FDEC_STRIDE]
str s0, [x0, #2 * FDEC_STRIDE]
str s0, [x0, #3 * FDEC_STRIDE]
ret
endfunc
// 4x4 DC prediction from the top row only.
// x0 = src.  The four bytes of the row above are summed and the block is
// filled with (top_sum + 2) >> 2.
function predict_4x4_dc_top_neon, export=1
sub x1, x0, #FDEC_STRIDE
// scalar load clears the rest of v0, so the 8-lane sum below only sees
// the four loaded bytes
ldr s0, [x1]
uaddlv h0, v0.8b
dup v0.4h, v0.h[0]
rshrn v0.8b, v0.8h, #2 // rounding shift: (sum + 2) >> 2
str s0, [x0]
str s0, [x0, #1 * FDEC_STRIDE]
str s0, [x0, #2 * FDEC_STRIDE]
str s0, [x0, #3 * FDEC_STRIDE]
ret
endfunc
// 4x4 diagonal-down-right prediction.
// x0 = src.  Eight bytes starting at src - FDEC_STRIDE - 1 are loaded, then
// the bytes at column -1 of rows 0..3 are shifted in one at a time in front
// of them (ext #7 with a broadcast load).  a, b, c are that neighbour vector
// at three consecutive shifts; the result is (a + 2*b + c + 2) >> 2 and each
// row stores a 4-byte window of it, row 0 starting at lane 3 and row 3 at
// lane 0.
function predict_4x4_ddr_neon, export=1
sub x1, x0, #FDEC_STRIDE+1
mov x7, #FDEC_STRIDE
ld1 {v0.8b}, [x1], x7 // # -FDEC_STRIDE-1
ld1r {v1.8b}, [x1], x7 // #0*FDEC_STRIDE-1
ld1r {v2.8b}, [x1], x7 // #1*FDEC_STRIDE-1
ext v0.8b, v1.8b, v0.8b, #7
ld1r {v3.8b}, [x1], x7 // #2*FDEC_STRIDE-1
ext v0.8b, v2.8b, v0.8b, #7 // a
ld1r {v4.8b}, [x1], x7 // #3*FDEC_STRIDE-1
ext v1.8b, v3.8b, v0.8b, #7 // b
ext v2.8b, v4.8b, v1.8b, #7 // c
uaddl v0.8h, v0.8b, v1.8b
uaddl v1.8h, v1.8b, v2.8b
add v0.8h, v0.8h, v1.8h
rshrn v0.8b, v0.8h, #2
ext v3.8b, v0.8b, v0.8b, #3
ext v2.8b, v0.8b, v0.8b, #2
ext v1.8b, v0.8b, v0.8b, #1
str s3, [x0], #FDEC_STRIDE
str s2, [x0], #FDEC_STRIDE
str s1, [x0], #FDEC_STRIDE
str s0, [x0]
ret
endfunc
// 4x4 diagonal-down-left prediction.
// x0 = src.  Eight bytes t[0..7] of the row above are loaded (the
// post-increment brings x0 back to src).  With t[8] taken as t[7], lane i of
// the filtered vector is (t[i] + 2*t[i+1] + t[i+2] + 2) >> 2, computed as a
// truncating halving add followed by a rounding halving add.  Row r stores
// lanes r..r+3.
function predict_4x4_ddl_neon, export=1
sub x0, x0, #FDEC_STRIDE
mov x7, #FDEC_STRIDE
ld1 {v0.8b}, [x0], x7
dup v3.8b, v0.b[7] // padding value for the last lane
ext v1.8b, v0.8b, v0.8b, #1
ext v2.8b, v0.8b, v3.8b, #2
uhadd v0.8b, v0.8b, v2.8b
urhadd v0.8b, v0.8b, v1.8b
str s0, [x0], #FDEC_STRIDE
ext v1.8b, v0.8b, v0.8b, #1
ext v2.8b, v0.8b, v0.8b, #2
str s1, [x0], #FDEC_STRIDE
ext v3.8b, v0.8b, v0.8b, #3
str s2, [x0], #FDEC_STRIDE
str s3, [x0]
ret
endfunc
// 8x8 DC prediction from the edge buffer.
// x0 = src, x1 = edge.  Sums edge[7..14] and edge[16..23] (16 bytes in
// total) and fills all eight rows with (sum + 8) >> 4.
// NOTE(review): presumably edge[7..14] is the left column and edge[16..23]
// the top row -- confirm against the edge-filter code.
function predict_8x8_dc_neon, export=1
mov x7, #FDEC_STRIDE
ld1 {v0.16b}, [x1], #16
ld1 {v1.8b}, [x1]
ext v0.16b, v0.16b, v0.16b, #7 // bring edge[7..14] into the low 8 lanes
uaddlv h1, v1.8b
uaddlv h0, v0.8b
add v0.8h, v0.8h, v1.8h
dup v0.8h, v0.h[0]
rshrn v0.8b, v0.8h, #4
.rept 8
st1 {v0.8b}, [x0], x7
.endr
ret
endfunc
// 8x8 horizontal prediction from the edge buffer.
// x0 = src, x1 = edge.  Row r (0..7) is filled with the single byte
// edge[14 - r]; broadcasts and stores are interleaved.
function predict_8x8_h_neon, export=1
mov x7, #FDEC_STRIDE
ld1 {v16.16b}, [x1]
dup v0.8b, v16.b[14]
dup v1.8b, v16.b[13]
st1 {v0.8b}, [x0], x7
dup v2.8b, v16.b[12]
st1 {v1.8b}, [x0], x7
dup v3.8b, v16.b[11]
st1 {v2.8b}, [x0], x7
dup v4.8b, v16.b[10]
st1 {v3.8b}, [x0], x7
dup v5.8b, v16.b[9]
st1 {v4.8b}, [x0], x7
dup v6.8b, v16.b[8]
st1 {v5.8b}, [x0], x7
dup v7.8b, v16.b[7]
st1 {v6.8b}, [x0], x7
st1 {v7.8b}, [x0], x7
ret
endfunc
// 8x8 vertical prediction from the edge buffer.
// x0 = src, x1 = edge.  The eight bytes edge[16..23] are copied to each of
// the eight rows.
function predict_8x8_v_neon, export=1
add x1, x1, #16
mov x7, #FDEC_STRIDE
ld1 {v0.8b}, [x1]
.rept 8
st1 {v0.8b}, [x0], x7
.endr
ret
endfunc
// 8x8 diagonal-down-left prediction from the edge buffer.
// x0 = src, x1 = edge.  t[0..15] = edge[16..31].  Lane i of the filtered
// vector is (t[i-1] + 2*t[i] + t[i+1] + 2) >> 2, with t[16] taken as t[15].
// Lane 0 uses a zero for t[-1] but is never stored.  Row r (0..7) stores
// lanes r+1..r+8.
function predict_8x8_ddl_neon, export=1
add x1, x1, #16
mov x7, #FDEC_STRIDE
ld1 {v0.16b}, [x1]
movi v3.16b, #0
dup v2.16b, v0.b[15]
ext v4.16b, v3.16b, v0.16b, #15 // t[i-1], zero in lane 0
ext v2.16b, v0.16b, v2.16b, #1 // t[i+1], t[15] repeated in lane 15
uhadd v4.16b, v4.16b, v2.16b
urhadd v0.16b, v0.16b, v4.16b
ext v1.16b, v0.16b, v0.16b, #1
ext v2.16b, v0.16b, v0.16b, #2
st1 {v1.8b}, [x0], x7
ext v3.16b, v0.16b, v0.16b, #3
st1 {v2.8b}, [x0], x7
ext v4.16b, v0.16b, v0.16b, #4
st1 {v3.8b}, [x0], x7
ext v5.16b, v0.16b, v0.16b, #5
st1 {v4.8b}, [x0], x7
ext v6.16b, v0.16b, v0.16b, #6
st1 {v5.8b}, [x0], x7
ext v7.16b, v0.16b, v0.16b, #7
st1 {v6.8b}, [x0], x7
ext v0.16b, v0.16b, v0.16b, #8
st1 {v7.8b}, [x0], x7
st1 {v0.8b}, [x0], x7
ret
endfunc
// 8x8 diagonal-down-right prediction from the edge buffer.
// x0 = src, x1 = edge.  Loads edge[0..31] and filters around edge[8 + i]:
// lane i = (edge[7+i] + 2*edge[8+i] + edge[9+i] + 2) >> 2.  Rows are written
// bottom-up with a negative stride: row 7 stores lanes 0..7, row 0 stores
// lanes 7..14.
function predict_8x8_ddr_neon, export=1
ld1 {v0.16b,v1.16b}, [x1]
ext v2.16b, v0.16b, v1.16b, #7
ext v4.16b, v0.16b, v1.16b, #9
ext v3.16b, v0.16b, v1.16b, #8
uhadd v2.16b, v2.16b, v4.16b
urhadd v7.16b, v3.16b, v2.16b
add x0, x0, #7*FDEC_STRIDE // start at the last row
mov x7, #-1*FDEC_STRIDE
ext v6.16b, v7.16b, v7.16b, #1
st1 {v7.8b}, [x0], x7
ext v5.16b, v7.16b, v7.16b, #2
st1 {v6.8b}, [x0], x7
ext v4.16b, v7.16b, v7.16b, #3
st1 {v5.8b}, [x0], x7
ext v3.16b, v7.16b, v7.16b, #4
st1 {v4.8b}, [x0], x7
ext v2.16b, v7.16b, v7.16b, #5
st1 {v3.8b}, [x0], x7
ext v1.16b, v7.16b, v7.16b, #6
st1 {v2.8b}, [x0], x7
ext v0.16b, v7.16b, v7.16b, #7
st1 {v1.8b}, [x0], x7
st1 {v0.8b}, [x0], x7
ret
endfunc
// 8x8 vertical-left prediction from the edge buffer.
// x0 = src, x1 = edge.  t[0..15] = edge[16..31].
//   v3 = rounding average of t[i] and t[i+1]          (even rows)
//   v0 = (t[i-1] + 2*t[i] + t[i+1] + 2) >> 2          (odd rows)
// Even row 2k stores v3 lanes k..k+7, odd row 2k+1 stores v0 lanes
// k+1..k+8.
// The two ext instructions read v1 and v2 before this function writes them.
// The stale bytes only reach lane 0 of v0 and lane 15 of v0/v3, and none of
// the eight stores below reads those lanes.
function predict_8x8_vl_neon, export=1
add x1, x1, #16
mov x7, #FDEC_STRIDE
ld1 {v0.16b}, [x1]
ext v1.16b, v1.16b, v0.16b, #15
ext v2.16b, v0.16b, v2.16b, #1
uhadd v1.16b, v1.16b, v2.16b
urhadd v3.16b, v0.16b, v2.16b
urhadd v0.16b, v0.16b, v1.16b
ext v4.16b, v0.16b, v0.16b, #1
st1 {v3.8b}, [x0], x7
ext v5.16b, v3.16b, v3.16b, #1
st1 {v4.8b}, [x0], x7
ext v6.16b, v0.16b, v0.16b, #2
st1 {v5.8b}, [x0], x7
ext v7.16b, v3.16b, v3.16b, #2
st1 {v6.8b}, [x0], x7
ext v4.16b, v0.16b, v0.16b, #3
st1 {v7.8b}, [x0], x7
ext v5.16b, v3.16b, v3.16b, #3
st1 {v4.8b}, [x0], x7
ext v6.16b, v0.16b, v0.16b, #4
st1 {v5.8b}, [x0], x7
st1 {v6.8b}, [x0], x7
ret
endfunc
// 8x8 vertical-right prediction from the edge buffer.
// x0 = src, x1 = edge.  e[0..15] = edge[8..23].
//   2-tap: rounding average of e[i] and e[i-1]
//   3-tap: (e[i-2] + 2*e[i-1] + e[i] + 2) >> 2
// Row 0 stores lanes 8..15 of the 2-tap result, row 1 lanes 8..15 of the
// 3-tap result.  Each following pair of rows is the previous pair shifted
// right by one byte, with the next byte taken from the de-interleaved
// (uzp1/uzp2) low half of the 3-tap result.
function predict_8x8_vr_neon, export=1
add x1, x1, #8
mov x7, #FDEC_STRIDE
ld1 {v2.16b}, [x1]
ext v1.16b, v2.16b, v2.16b, #14
ext v0.16b, v2.16b, v2.16b, #15
uhadd v3.16b, v2.16b, v1.16b
urhadd v2.16b, v2.16b, v0.16b
urhadd v0.16b, v0.16b, v3.16b
ext v1.16b, v2.16b, v2.16b, #8
uzp1 v2.8b, v0.8b, v0.8b
uzp2 v3.8b, v0.8b, v0.8b
ext v0.16b, v0.16b, v0.16b, #8
st1 {v1.8b}, [x0], x7
st1 {v0.8b}, [x0], x7
ext v4.8b, v3.8b, v1.8b, #7
ext v5.8b, v2.8b, v0.8b, #7
st1 {v4.8b}, [x0], x7
st1 {v5.8b}, [x0], x7
ext v6.8b, v3.8b, v1.8b, #6
ext v7.8b, v2.8b, v0.8b, #6
st1 {v6.8b}, [x0], x7
st1 {v7.8b}, [x0], x7
ext v1.8b, v3.8b, v1.8b, #5
ext v0.8b, v2.8b, v0.8b, #5
st1 {v1.8b}, [x0], x7
st1 {v0.8b}, [x0], x7
ret
endfunc
// 8x8 horizontal-down prediction from the edge buffer.
// x0 = src, x1 = edge.  e[0..15] = edge[7..22].
//   v4 = rounding average of e[i] and e[i+1]           (2-tap)
//   v0 = (e[i] + 2*e[i+1] + e[i+2] + 2) >> 2           (3-tap)
// v16/v17 hold the low eight lanes of both results interleaved (zip), v7
// the high half of the 3-tap result.  Rows 0..3 are 8-byte windows over
// v17:v7 and rows 4..7 windows over v16:v17, each row starting two bytes
// earlier than the row above it.
function predict_8x8_hd_neon, export=1
add x1, x1, #7
mov x7, #FDEC_STRIDE
ld1 {v1.16b}, [x1]
ext v3.16b, v1.16b, v1.16b, #1
ext v2.16b, v1.16b, v1.16b, #2
urhadd v4.16b, v1.16b, v3.16b
uhadd v1.16b, v1.16b, v2.16b
urhadd v0.16b, v1.16b, v3.16b
zip1 v16.8b, v4.8b, v0.8b
zip2 v17.8b, v4.8b, v0.8b
ext v7.16b, v0.16b, v0.16b, #8
ext v0.8b, v17.8b, v7.8b, #6
ext v1.8b, v17.8b, v7.8b, #4
st1 {v0.8b}, [x0], x7
ext v2.8b, v17.8b, v7.8b, #2
st1 {v1.8b}, [x0], x7
st1 {v2.8b}, [x0], x7
ext v3.8b, v16.8b, v17.8b, #6
st1 {v17.8b}, [x0], x7
ext v4.8b, v16.8b, v17.8b, #4
st1 {v3.8b}, [x0], x7
ext v5.8b, v16.8b, v17.8b, #2
st1 {v4.8b}, [x0], x7
st1 {v5.8b}, [x0], x7
st1 {v16.8b}, [x0], x7
ret
endfunc
// 8x8 horizontal-up prediction from the edge buffer.
// x0 = src, x1 = edge.  Loads edge[7..14] and reverses it so that lane i
// holds l[i] = edge[14 - i]; positions past lane 7 are padded with edge[7].
//   v0 = rounding average of l[i] and l[i+1]           (2-tap)
//   v1 = (l[i] + 2*l[i+1] + l[i+2] + 2) >> 2           (3-tap)
// v16/v17 hold both results interleaved.  Rows 0..3 are windows over
// v16:v17 and rows 4..7 windows over v17:v18, each advancing two bytes per
// row; v18 repeats the last halfword of v17 as padding.
function predict_8x8_hu_neon, export=1
add x1, x1, #7
mov x7, #FDEC_STRIDE
ld1 {v7.8b}, [x1]
dup v6.8b, v7.b[0]
rev64 v7.8b, v7.8b
ext v4.8b, v7.8b, v6.8b, #2
ext v2.8b, v7.8b, v6.8b, #1
uhadd v5.8b, v7.8b, v4.8b
urhadd v0.8b, v2.8b, v7.8b
urhadd v1.8b, v5.8b, v2.8b
zip1 v16.8b, v0.8b, v1.8b
zip2 v17.8b, v0.8b, v1.8b
dup v18.4h, v17.h[3]
ext v0.8b, v16.8b, v17.8b, #2
ext v1.8b, v16.8b, v17.8b, #4
ext v2.8b, v16.8b, v17.8b, #6
st1 {v16.8b}, [x0], x7
st1 {v0.8b}, [x0], x7
st1 {v1.8b}, [x0], x7
st1 {v2.8b}, [x0], x7
ext v4.8b, v17.8b, v18.8b, #2
ext v5.8b, v17.8b, v18.8b, #4
ext v6.8b, v17.8b, v18.8b, #6
st1 {v17.8b}, [x0], x7
st1 {v4.8b}, [x0], x7
st1 {v5.8b}, [x0], x7
st1 {v6.8b}, [x0]
ret
endfunc
// 8x8 chroma DC prediction from the top row only.
// x0 = src.  s0 / s1 are the sums of the left / right four bytes of the row
// above; dc0 = (s0 + 2) >> 2 fills columns 0-3 and dc1 = (s1 + 2) >> 2
// fills columns 4-7 of every row.
// Does not return itself: it branches to pred8x8c_dc_end (the store tail
// inside predict_8x8c_dc_neon) with x1 = FDEC_STRIDE, v0 = pattern for rows
// 0-3 and v1 = pattern for rows 4-7.
// The transpose macro is defined outside this excerpt; presumably it places
// trn1/trn2 of its last two operands in its first two -- confirm in asm.S.
function predict_8x8c_dc_top_neon, export=1
sub x2, x0, #FDEC_STRIDE
mov x1, #FDEC_STRIDE
ld1 {v0.8b}, [x2]
uaddlp v0.4h, v0.8b
addp v0.4h, v0.4h, v0.4h
rshrn v0.8b, v0.8h, #2
dup v3.8b, v0.b[1]
dup v2.8b, v0.b[0]
transpose v0.2s, v1.2s, v2.2s, v3.2s
b pred8x8c_dc_end
endfunc
// 8x8 chroma DC prediction from the left column only.
// x0 = src.  w2 = sum of the bytes at column -1 of rows 0-3, w6 = the same
// for rows 4-7.  Rows 0-3 are filled with (w2 + 2) >> 2 and rows 4-7 with
// (w6 + 2) >> 2.
// Does not return itself: it branches to pred8x8c_dc_end (the store tail
// inside predict_8x8c_dc_neon) with x1 = FDEC_STRIDE, v0 = pattern for rows
// 0-3 and v1 = pattern for rows 4-7.
function predict_8x8c_dc_left_neon, export=1
ldurb w2, [x0, #0 * FDEC_STRIDE - 1]
ldrb w3, [x0, #1 * FDEC_STRIDE - 1]
ldrb w4, [x0, #2 * FDEC_STRIDE - 1]
ldrb w5, [x0, #3 * FDEC_STRIDE - 1]
mov x1, #FDEC_STRIDE
add w2, w2, w3
add w3, w4, w5
ldrb w6, [x0, #4 * FDEC_STRIDE - 1]
ldrb w7, [x0, #5 * FDEC_STRIDE - 1]
ldrb w8, [x0, #6 * FDEC_STRIDE - 1]
ldrb w9, [x0, #7 * FDEC_STRIDE - 1]
add w6, w6, w7
add w7, w8, w9
add w2, w2, w3
add w6, w6, w7
dup v0.8h, w2
dup v1.8h, w6
rshrn v0.8b, v0.8h, #2
rshrn v1.8b, v1.8h, #2
b pred8x8c_dc_end
endfunc
// 8x8 chroma DC prediction from both the top row and the left column.
// x0 = src.  With s0/s1 = sums of the left/right half of the row above and
// s2/s3 = sums of column -1 over rows 0-3 / rows 4-7, the four 4x4
// quadrants are filled with:
//   top-left     (s0 + s2 + 4) >> 3      top-right    (s1 + 2) >> 2
//   bottom-left  (s3 + 2) >> 2           bottom-right (s1 + s3 + 4) >> 3
// The left-column pair sums are packed as four 16-bit fields of x10 so that
// one move places them in v1.
// pred8x8c_dc_end is a store tail also reached from the dc_top and dc_left
// variants; it expects x0 = src, x1 = FDEC_STRIDE, v0 = pattern for rows
// 0-3 and v1 = pattern for rows 4-7.
function predict_8x8c_dc_neon, export=1
mov x1, #FDEC_STRIDE
sub x2, x0, #FDEC_STRIDE
ldurb w10, [x0, #0 * FDEC_STRIDE - 1]
ldrb w11, [x0, #1 * FDEC_STRIDE - 1]
ldrb w12, [x0, #2 * FDEC_STRIDE - 1]
ldrb w13, [x0, #3 * FDEC_STRIDE - 1]
add w10, w10, w11
ldrb w4, [x0, #4 * FDEC_STRIDE - 1]
ldrb w5, [x0, #5 * FDEC_STRIDE - 1]
add w12, w12, w13
ldrb w6, [x0, #6 * FDEC_STRIDE - 1]
ldrb w7, [x0, #7 * FDEC_STRIDE - 1]
add w4, w4, w5
add w6, w6, w7
add w10, w10, w12, lsl #16
add w4, w4, w6, lsl #16
ld1 {v0.8b}, [x2]
add x10, x10, x4, lsl #32
uaddlp v0.4h, v0.8b // s0, s1
mov v1.d[0], x10 // s2, s3
add v3.4h, v0.4h, v1.4h
addp v0.4h, v0.4h, v1.4h // s0, s1, s2, s3
addp v1.4h, v3.4h, v3.4h // s0+s2, s1+s3, s0+s2, s1+s3
uzp2 v0.4h, v0.4h, v0.4h // s1, s3, s1, s3
uzp1 v1.2d, v1.2d, v1.2d
uzp1 v0.2d, v0.2d, v0.2d
rshrn v3.8b, v1.8h, #3
rshrn v2.8b, v0.8h, #2
uzp1 v0.8b, v3.8b, v2.8b // rows 0-3: top-left x4, top-right x4
uzp2 v1.8b, v2.8b, v3.8b // rows 4-7: bottom-left x4, bottom-right x4
pred8x8c_dc_end:
add x2, x0, #2 * FDEC_STRIDE
add x4, x0, #4 * FDEC_STRIDE
add x5, x0, #6 * FDEC_STRIDE
st1 {v0.8b}, [x0], x1
st1 {v0.8b}, [x2], x1
st1 {v0.8b}, [x0]
st1 {v0.8b}, [x2]
st1 {v1.8b}, [x4], x1
st1 {v1.8b}, [x5], x1
st1 {v1.8b}, [x4]
st1 {v1.8b}, [x5]
ret
endfunc
// 8x8 chroma horizontal prediction.
// x0 = src.  x1 walks down column -1; each row is filled with eight copies
// of its own column -1 byte.  Two rows are handled per repetition.
function predict_8x8c_h_neon, export=1
sub x1, x0, #1
mov x7, #FDEC_STRIDE
.rept 4
ld1r {v0.8b}, [x1], x7
ld1r {v1.8b}, [x1], x7
st1 {v0.8b}, [x0], x7
st1 {v1.8b}, [x0], x7
.endr
ret
endfunc
// 8x8 chroma vertical prediction using general-purpose registers only.
// x0 = src.  The eight bytes of the row above are loaded as one 64-bit word
// and stored to rows 0..7.
function predict_8x8c_v_aarch64, export=1
ldur x1, [x0, #-FDEC_STRIDE]
.irp c, 0,1,2,3,4,5,6,7
str x1, [x0, #\c * FDEC_STRIDE]
.endr
ret
endfunc
// 8x8 chroma plane prediction.
// x0 = src.  Gradients are built from weighted (p8weight) differences of
// the row above and of column -1: b is the horizontal step, c the vertical
// step (lanes 0 and 1 of v5).  v1 starts as the first row's pre-shift
// values, pix + x*b for x = 0..7; every loop iteration narrows it with a
// saturating >> 5, stores one row and adds c.  The rounding constant 16 is
// folded into pix (p16weight lane 0 == 1 is added before the << 4), which
// is why the loop uses the non-rounding sqshrun.
// Note: the result of "sub v6.4h, v5.4h, v3.4h" is not read anywhere later
// in this function.
function predict_8x8c_p_neon, export=1
sub x3, x0, #FDEC_STRIDE
mov x1, #FDEC_STRIDE
add x2, x3, #4
sub x3, x3, #1
ld1 {v0.s}[0], [x3]
ld1 {v2.s}[0], [x2], x1
ldcol.8 v0, x3, x1, 4, hi=1
add x3, x3, x1
ldcol.8 v3, x3, x1, 4
movrel x4, p8weight
movrel x5, p16weight
uaddl v4.8h, v2.8b, v3.8b
rev32 v0.8b, v0.8b
trn1 v2.2s, v2.2s, v3.2s
ld1 {v7.8h}, [x4]
usubl v2.8h, v2.8b, v0.8b
mul v2.8h, v2.8h, v7.8h
ld1 {v0.8h}, [x5]
saddlp v2.4s, v2.8h
addp v2.4s, v2.4s, v2.4s
shl v3.2s, v2.2s, #4
add v2.2s, v2.2s, v3.2s
rshrn v5.4h, v2.4s, #5 // b, c, x, x
addp v2.4h, v5.4h, v5.4h
shl v3.4h, v2.4h, #2
sub v3.4h, v3.4h, v2.4h // 3 * (b + c)
rev64 v4.4h, v4.4h
add v4.4h, v4.4h, v0.4h
shl v2.4h, v4.4h, #4 // a
sub v2.4h, v2.4h, v3.4h // a - 3 * (b + c) + 16
ext v0.16b, v0.16b, v0.16b, #14
sub v6.4h, v5.4h, v3.4h
mov v0.h[0], wzr
mul v0.8h, v0.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b
dup v1.8h, v2.h[0] // pix
dup v2.8h, v5.h[1] // c
add v1.8h, v1.8h, v0.8h // pix + x*b
mov x3, #8
1:
subs x3, x3, #1
sqshrun v0.8b, v1.8h, #5
add v1.8h, v1.8h, v2.8h
st1 {v0.8b}, [x0], x1
b.ne 1b
ret
endfunc
// loadsum4 wd, t1, t2, t3, x, idx
// Loads the four bytes at column -1 of rows idx..idx+3 (relative to x, rows
// FDEC_STRIDE apart) and leaves their sum in wd.  t1, t2 and t3 are
// scratch registers and are overwritten.  For idx == 0 the first offset is
// -1, so the unscaled-offset ldurb form is used for that load.
.macro loadsum4 wd, t1, t2, t3, x, idx
.if \idx == 0
ldurb \wd, [\x, #(\idx + 0) * FDEC_STRIDE - 1]
.else
ldrb \wd, [\x, #(\idx + 0) * FDEC_STRIDE - 1]
.endif
ldrb \t1, [\x, #(\idx + 1) * FDEC_STRIDE - 1]
ldrb \t2, [\x, #(\idx + 2) * FDEC_STRIDE - 1]
ldrb \t3, [\x, #(\idx + 3) * FDEC_STRIDE - 1]
add \wd, \wd, \t1
add \t1, \t2, \t3
add \wd, \wd, \t1
.endm
// 8x16 chroma horizontal prediction.
// x0 = src.  Two read pointers (x2 even rows, x3 odd rows) walk column -1
// and two write pointers (x0 even rows, x1 odd rows) walk the block, all
// stepping by 2 * FDEC_STRIDE.  Each row is filled with eight copies of its
// own column -1 byte; four rows per repetition, 16 rows in total.
function predict_8x16c_h_neon, export=1
sub x2, x0, #1
add x3, x0, #FDEC_STRIDE - 1
mov x7, #2 * FDEC_STRIDE
add x1, x0, #FDEC_STRIDE
.rept 4
ld1r {v0.8b}, [x2], x7
ld1r {v1.8b}, [x3], x7
ld1r {v2.8b}, [x2], x7
ld1r {v3.8b}, [x3], x7
st1 {v0.8b}, [x0], x7
st1 {v1.8b}, [x1], x7
st1 {v2.8b}, [x0], x7
st1 {v3.8b}, [x1], x7
.endr
ret
endfunc
// 8x16 chroma vertical prediction.
// x0 = src.  The eight bytes of the row above are loaded through x1; the
// post-increment of 2 * FDEC_STRIDE leaves x1 at row 1.  x0 then writes the
// even rows and x1 the odd rows, 16 rows in total.
function predict_8x16c_v_neon, export=1
sub x1, x0, #FDEC_STRIDE
mov x2, #2 * FDEC_STRIDE
ld1 {v0.8b}, [x1], x2
.rept 8
st1 {v0.8b}, [x0], x2
st1 {v0.8b}, [x1], x2
.endr
ret
endfunc
// 8x16 chroma plane prediction.
// x0 = src.  H and V are weighted (p16weight) sums of differences taken
// along the row above and along column -1; b = (17*H + 16) >> 5 is the
// horizontal step and c = (5*V + 32) >> 6 the vertical step.  v1 starts as
// i00 + x*b for x = 0..7 and gains c after every row.  Each loop iteration
// emits two rows, narrowing with a rounding, saturating >> 5 (sqrshrun).
function predict_8x16c_p_neon, export=1
movrel x4, p16weight
ld1 {v17.8h}, [x4]
sub x3, x0, #FDEC_STRIDE
mov x1, #FDEC_STRIDE
add x2, x3, #4
sub x3, x3, #1
ld1 {v0.8b}, [x3]
ld1 {v2.8b}, [x2], x1
ldcol.8 v1, x3, x1
add x3, x3, x1
ldcol.8 v3, x3, x1
ext v4.8b, v2.8b, v2.8b, #3
ext v5.8b, v3.8b, v3.8b, #7
rev32 v0.8b, v0.8b
rev64 v1.8b, v1.8b
uaddl v4.8h, v5.8b, v4.8b // a * 1/16
usubl v2.8h, v2.8b, v0.8b
mul v2.8h, v2.8h, v17.8h
saddlp v2.4s, v2.8h
addp v2.4s, v2.4s, v2.4s // H
usubl v3.8h, v3.8b, v1.8b
mul v3.8h, v3.8h, v17.8h
saddlp v3.4s, v3.8h
addp v3.4s, v3.4s, v3.4s
addp v3.4s, v3.4s, v3.4s // V
ext v17.16b, v17.16b, v17.16b, #14
shl v4.4h, v4.4h, #4 // a
shl v6.2s, v2.2s, #4 // 16 * H
shl v7.2s, v3.2s, #2 // 4 * V
add v2.2s, v2.2s, v6.2s // 17 * H
add v3.2s, v3.2s, v7.2s // 5 * V
rshrn v2.4h, v2.4s, #5 // b
rshrn v3.4h, v3.4s, #6 // c
mov v17.h[0], wzr
sub v4.4h, v4.4h, v2.4h // a - b
shl v6.4h, v2.4h, #1 // 2 * b
add v4.4h, v4.4h, v3.4h // a - b + c
shl v7.4h, v3.4h, #3 // 8 * c
sub v4.4h, v4.4h, v6.4h // a - 3b + c
sub v4.4h, v4.4h, v7.4h // a - 3b - 7c
mul v0.8h, v17.8h, v2.h[0] // 0,1,2,3,4,5,6,7 * b
dup v1.8h, v4.h[0] // i00
dup v2.8h, v3.h[0] // c
add v1.8h, v1.8h, v0.8h // pix + {0..7}*b
mov x3, #16
1:
subs x3, x3, #2
sqrshrun v4.8b, v1.8h, #5
add v1.8h, v1.8h, v2.8h
sqrshrun v5.8b, v1.8h, #5
st1 {v4.8b}, [x0], x1
add v1.8h, v1.8h, v2.8h
st1 {v5.8b}, [x0], x1
b.ne 1b
ret
endfunc
// 8x16 chroma DC prediction from both the top row and the left column.
// x0 = src.  s0/s1 = sums of the left/right half of the row above; s2..s5 =
// sums of column -1 over rows 0-3, 4-7, 8-11 and 12-15 (via loadsum4).
// Fill values, each applied to a 4x4 quadrant:
//   rows 0-3   : left (s0 + s2 + 4) >> 3    right (s1 + 2) >> 2
//   rows 4-7   : left (s3 + 2) >> 2         right (s1 + s3 + 4) >> 3
//   rows 8-11  : left (s4 + 2) >> 2         right (s1 + s4 + 4) >> 3
//   rows 12-15 : left (s5 + 2) >> 2         right (s1 + s5 + 4) >> 3
// The single-sum cases are formed as (2*s + 4) >> 3 so that one >> 3
// narrowing serves every lane.
function predict_8x16c_dc_neon, export=1
mov x1, #FDEC_STRIDE
sub x10, x0, #FDEC_STRIDE
loadsum4 w2, w3, w4, w5, x0, 0
ld1 {v6.8b}, [x10]
loadsum4 w6, w7, w8, w9, x0, 4
uaddlp v6.4h, v6.8b
dup v22.8h, w2 // s2
dup v23.8h, w6 // s3
loadsum4 w2, w3, w4, w5, x0, 8
addp v6.4h, v6.4h, v6.4h // s0, s1
loadsum4 w6, w7, w8, w9, x0, 12
dup v20.8h, v6.h[0] // s0
dup v21.8h, v6.h[1] // s1
dup v24.8h, w2 // s4
dup v25.8h, w6 // s5
ext v16.16b, v20.16b, v21.16b, #8
ext v17.16b, v22.16b, v21.16b, #8
ext v1.16b, v23.16b, v21.16b, #8
ext v2.16b, v24.16b, v21.16b, #8
ext v3.16b, v25.16b, v21.16b, #8
add v0.8h, v16.8h, v17.8h
add v1.8h, v1.8h, v23.8h
add v2.8h, v2.8h, v24.8h
add v3.8h, v3.8h, v25.8h
rshrn v0.8b, v0.8h, #3
rshrn v1.8b, v1.8h, #3
rshrn v2.8b, v2.8h, #3
rshrn v3.8b, v3.8h, #3
add x11, x0, #4 * FDEC_STRIDE
add x12, x0, #8 * FDEC_STRIDE
add x13, x0, #12 * FDEC_STRIDE
.rept 4
st1 {v0.8b}, [x0], x1
st1 {v1.8b}, [x11], x1
st1 {v2.8b}, [x12], x1
st1 {v3.8b}, [x13], x1
.endr
ret
endfunc
// 8x16 chroma DC prediction from the left column only.
// x0 = src.  Column -1 is summed in four groups of four rows (0-3, 4-7,
// 8-11, 12-15); each group of four rows is filled with (group_sum + 2) >> 2
// held in v0..v3 respectively.  Scalar loads/adds, vector narrowing and the
// first stores are interleaved.
function predict_8x16c_dc_left_neon, export=1
mov x1, #FDEC_STRIDE
ldurb w2, [x0, # 0 * FDEC_STRIDE - 1]
ldrb w3, [x0, # 1 * FDEC_STRIDE - 1]
ldrb w4, [x0, # 2 * FDEC_STRIDE - 1]
ldrb w5, [x0, # 3 * FDEC_STRIDE - 1]
add w2, w2, w3
ldrb w6, [x0, # 4 * FDEC_STRIDE - 1]
add w4, w4, w5
ldrb w7, [x0, # 5 * FDEC_STRIDE - 1]
add w2, w2, w4
ldrb w8, [x0, # 6 * FDEC_STRIDE - 1]
ldrb w9, [x0, # 7 * FDEC_STRIDE - 1]
dup v0.8h, w2
add w6, w6, w7
rshrn v0.8b, v0.8h, #2
add w8, w8, w9
ldrb w10, [x0, # 8 * FDEC_STRIDE - 1]
ldrb w11, [x0, # 9 * FDEC_STRIDE - 1]
add w6, w6, w8
ldrb w12, [x0, #10 * FDEC_STRIDE - 1]
ldrb w13, [x0, #11 * FDEC_STRIDE - 1]
dup v1.8h, w6
add w10, w10, w11
rshrn v1.8b, v1.8h, #2
add w12, w12, w13
ldrb w2, [x0, #12 * FDEC_STRIDE - 1]
ldrb w3, [x0, #13 * FDEC_STRIDE - 1]
add w10, w10, w12
ldrb w4, [x0, #14 * FDEC_STRIDE - 1]
ldrb w5, [x0, #15 * FDEC_STRIDE - 1]
dup v2.8h, w10
add w2, w2, w3
rshrn v2.8b, v2.8h, #2
add w4, w4, w5
// all column -1 loads are done; x0 may now advance with the stores
st1 {v0.8b}, [x0], x1
st1 {v0.8b}, [x0], x1
add w2, w2, w4
st1 {v0.8b}, [x0], x1
dup v3.8h, w2
st1 {v0.8b}, [x0], x1
rshrn v3.8b, v3.8h, #2
.irp idx, 1, 2, 3
.rept 4
st1 {v\idx\().8b}, [x0], x1
.endr
.endr
ret
endfunc
// 8x16 chroma DC prediction from the top row only.
// x0 = src.  s0 / s1 are the sums of the left / right four bytes of the row
// above.  Every one of the 16 rows is filled with (s0 + 2) >> 2 in columns
// 0-3 and (s1 + 2) >> 2 in columns 4-7.
function predict_8x16c_dc_top_neon, export=1
sub x2, x0, #FDEC_STRIDE
mov x1, #FDEC_STRIDE
ld1 {v0.8b}, [x2]
uaddlp v0.4h, v0.8b
addp v0.4h, v0.4h, v0.4h
rshrn v4.8b, v0.8h, #2
dup v0.8b, v4.b[0]
dup v1.8b, v4.b[1]
ext v0.8b, v0.8b, v1.8b, #4 // left value x4, right value x4
.rept 16
st1 {v0.8b}, [x0], x1
.endr
ret
endfunc
// 16x16 DC prediction from the top row only.
// x0 = src.  The 16 bytes of the row above are summed and the fill value is
// (sum + 8) >> 4.  Does not return itself: it branches to pred16x16_dc_end
// (the store tail inside predict_16x16_dc_neon) with x1 = FDEC_STRIDE and
// v0 holding the fill value in all 16 lanes.
function predict_16x16_dc_top_neon, export=1
sub x2, x0, #FDEC_STRIDE
mov x1, #FDEC_STRIDE
ld1 {v0.16b}, [x2]
uaddlv h0, v0.16b
rshrn v0.8b, v0.8h, #4
dup v0.16b, v0.b[0]
b pred16x16_dc_end
endfunc
// 16x16 DC prediction from the left column only.
// x0 = src.  The 16 bytes at column -1 of rows 0..15 are gathered with
// ldcol.16 and summed; the fill value is (sum + 8) >> 4.  Does not return
// itself: it branches to pred16x16_dc_end (the store tail inside
// predict_16x16_dc_neon) with x1 = FDEC_STRIDE and v0 holding the fill
// value in all 16 lanes.
function predict_16x16_dc_left_neon, export=1
sub x2, x0, #1
mov x1, #FDEC_STRIDE
ldcol.16 v0, x2, x1
uaddlv h0, v0.16b
rshrn v0.8b, v0.8h, #4
dup v0.16b, v0.b[0]
b pred16x16_dc_end
endfunc
// 16x16 DC prediction from both the top row and the left column.
// x0 = src.  The 16 bytes of the row above and the 16 bytes at column -1
// are summed; the fill value is (sum + 16) >> 5.
// pred16x16_dc_end is a store tail also reached from the dc_top and dc_left
// variants; it expects x0 = src, x1 = FDEC_STRIDE and v0 = the 16-byte row
// pattern, and writes it to 16 rows.
function predict_16x16_dc_neon, export=1
sub x3, x0, #FDEC_STRIDE
sub x2, x0, #1
mov x1, #FDEC_STRIDE
ld1 {v0.16b}, [x3]
ldcol.16 v1, x2, x1
uaddlv h0, v0.16b
uaddlv h1, v1.16b
add v0.4h, v0.4h, v1.4h
rshrn v0.8b, v0.8h, #5
dup v0.16b, v0.b[0]
pred16x16_dc_end:
.rept 16
st1 {v0.16b}, [x0], x1
.endr
ret
endfunc
// 16x16 horizontal prediction.
// x0 = src.  x1 walks down column -1; each row is filled with 16 copies of
// its own column -1 byte.  Two rows per repetition, 16 rows in total.
function predict_16x16_h_neon, export=1
sub x1, x0, #1
mov x7, #FDEC_STRIDE
.rept 8
ld1r {v0.16b}, [x1], x7
ld1r {v1.16b}, [x1], x7
st1 {v0.16b}, [x0], x7
st1 {v1.16b}, [x0], x7
.endr
ret
endfunc
// 16x16 vertical prediction.
// x0 = src.  x0 is moved up one row, the 16 bytes there are loaded (the
// post-increment brings x0 back to row 0) and copied to rows 0..15.
function predict_16x16_v_neon, export=1
sub x0, x0, #FDEC_STRIDE
mov x7, #FDEC_STRIDE
ld1 {v0.16b}, [x0], x7
.rept 16
st1 {v0.16b}, [x0], x7
.endr
ret
endfunc
// 16x16 plane prediction.
// x0 = src.  Gradients are built from weighted (p16weight) differences of
// the row above and of column -1; b and c (lanes 0 and 1 of v5) are the
// horizontal and vertical steps, each (5*sum + 32) >> 6.  v1 holds the
// pre-shift values for columns 0-7 and v3 those for columns 8-15 (v1 plus
// 8*b); every loop iteration narrows both with a saturating >> 5, stores
// one 16-byte row and adds c.  The rounding constant 16 is folded into pix
// (a weight of 1 is added to the corner sum before the << 4), which is why
// the loop uses the non-rounding sqshrun.
function predict_16x16_p_neon, export=1
sub x3, x0, #FDEC_STRIDE
mov x1, #FDEC_STRIDE
add x2, x3, #8
sub x3, x3, #1
ld1 {v0.8b}, [x3]
ld1 {v2.8b}, [x2], x1
ldcol.8 v1, x3, x1
add x3, x3, x1
ldcol.8 v3, x3, x1
rev64 v0.8b, v0.8b
rev64 v1.8b, v1.8b
movrel x4, p16weight
uaddl v4.8h, v2.8b, v3.8b
ld1 {v7.8h}, [x4]
usubl v2.8h, v2.8b, v0.8b
usubl v3.8h, v3.8b, v1.8b
mul v2.8h, v2.8h, v7.8h
mul v3.8h, v3.8h, v7.8h
saddlp v2.4s, v2.8h
saddlp v3.4s, v3.8h
addp v2.4s, v2.4s, v3.4s
addp v2.4s, v2.4s, v2.4s
shl v3.2s, v2.2s, #2
add v2.2s, v2.2s, v3.2s
rshrn v5.4h, v2.4s, #6 // b, c, x, x
addp v2.4h, v5.4h, v5.4h
shl v3.4h, v2.4h, #3
sub v3.4h, v3.4h, v2.4h // 7 * (b + c)
ext v4.16b, v4.16b, v4.16b, #14
add v4.4h, v4.4h, v7.4h
shl v2.4h, v4.4h, #4 // a
sub v2.4h, v2.4h, v3.4h // a - 7 * (b + c) + 16
ext v7.16b, v7.16b, v7.16b, #14
mov v7.h[0], wzr
dup v3.8h, v5.h[0]
mul v0.8h, v7.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b
dup v1.8h, v2.h[0] // pix
dup v2.8h, v5.h[1] // c
shl v3.8h, v3.8h, #3
add v1.8h, v1.8h, v0.8h // pix + x*b
add v3.8h, v3.8h, v1.8h // pix + x{8-15}*b
mov x3, #16
1:
subs x3, x3, #1
sqshrun v0.8b, v1.8h, #5
add v1.8h, v1.8h, v2.8h
sqshrun2 v0.16b, v3.8h, #5
add v3.8h, v3.8h, v2.8h
st1 {v0.16b}, [x0], x1
b.ne 1b
ret
endfunc

116
common/aarch64/predict-c.c Normal file
View File

@@ -0,0 +1,116 @@
/*****************************************************************************
* predict-c.c: aarch64 intra prediction
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common/common.h"
#include "predict.h"
#include "pixel.h"
/* Install the aarch64 4x4 intra predictors that `cpu` supports into the
 * I_PRED_4x4_* slots of pf[].  Slots without an assembly version are left
 * as the caller set them.  Nothing is installed in HIGH_BIT_DEPTH builds. */
void x264_predict_4x4_init_aarch64( uint32_t cpu, x264_predict_t pf[12] )
{
#if !HIGH_BIT_DEPTH
    /* H and V only need the base ARMv8 flag. */
    if( cpu&X264_CPU_ARMV8 )
    {
        pf[I_PRED_4x4_V] = x264_predict_4x4_v_aarch64;
        pf[I_PRED_4x4_H] = x264_predict_4x4_h_aarch64;
    }

    if( !(cpu&X264_CPU_NEON) )
        return;

    pf[I_PRED_4x4_DDR]    = x264_predict_4x4_ddr_neon;
    pf[I_PRED_4x4_DDL]    = x264_predict_4x4_ddl_neon;
    pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_neon;
    pf[I_PRED_4x4_DC]     = x264_predict_4x4_dc_neon;
#endif // !HIGH_BIT_DEPTH
}
/* Install the aarch64 8x8 chroma intra predictors that `cpu` supports into
 * the I_PRED_CHROMA_* slots of pf[].  The V predictor only needs the base
 * ARMv8 flag; all others need NEON.  Nothing is installed in HIGH_BIT_DEPTH
 * builds. */
void x264_predict_8x8c_init_aarch64( uint32_t cpu, x264_predict_t pf[7] )
{
#if !HIGH_BIT_DEPTH
    if( cpu&X264_CPU_ARMV8 )
        pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_aarch64;

    if( cpu&X264_CPU_NEON )
    {
        pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_neon;
        pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_neon;
        pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon;
        pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_neon;
        pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_neon;
    }
#endif // !HIGH_BIT_DEPTH
}
/* Install the NEON 8x16 chroma intra predictors into the I_PRED_CHROMA_*
 * slots of pf[] when `cpu` reports NEON.  pf[] is left untouched without
 * NEON and in HIGH_BIT_DEPTH builds. */
void x264_predict_8x16c_init_aarch64( uint32_t cpu, x264_predict_t pf[7] )
{
#if !HIGH_BIT_DEPTH
    if( cpu&X264_CPU_NEON )
    {
        pf[I_PRED_CHROMA_DC]      = x264_predict_8x16c_dc_neon;
        pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x16c_dc_top_neon;
        pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x16c_dc_left_neon;
        pf[I_PRED_CHROMA_H]       = x264_predict_8x16c_h_neon;
        pf[I_PRED_CHROMA_V]       = x264_predict_8x16c_v_neon;
        pf[I_PRED_CHROMA_P]       = x264_predict_8x16c_p_neon;
    }
#endif // !HIGH_BIT_DEPTH
}
/* Install the NEON 8x8 luma intra predictors into the I_PRED_8x8_* slots of
 * pf[] when `cpu` reports NEON.  pf[] is left untouched without NEON and in
 * HIGH_BIT_DEPTH builds.  predict_filter is accepted but never written by
 * this function. */
void x264_predict_8x8_init_aarch64( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
{
#if !HIGH_BIT_DEPTH
    if( cpu&X264_CPU_NEON )
    {
        /* straight copies and DC */
        pf[I_PRED_8x8_V]   = x264_predict_8x8_v_neon;
        pf[I_PRED_8x8_H]   = x264_predict_8x8_h_neon;
        pf[I_PRED_8x8_DC]  = x264_predict_8x8_dc_neon;
        /* directional modes */
        pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_neon;
        pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_neon;
        pf[I_PRED_8x8_VR]  = x264_predict_8x8_vr_neon;
        pf[I_PRED_8x8_HD]  = x264_predict_8x8_hd_neon;
        pf[I_PRED_8x8_VL]  = x264_predict_8x8_vl_neon;
        pf[I_PRED_8x8_HU]  = x264_predict_8x8_hu_neon;
    }
#endif // !HIGH_BIT_DEPTH
}
/* Install the NEON 16x16 intra predictors into the I_PRED_16x16_* slots of
 * pf[] when `cpu` reports NEON.  pf[] is left untouched without NEON and in
 * HIGH_BIT_DEPTH builds. */
void x264_predict_16x16_init_aarch64( uint32_t cpu, x264_predict_t pf[7] )
{
#if !HIGH_BIT_DEPTH
    if( cpu&X264_CPU_NEON )
    {
        pf[I_PRED_16x16_V]       = x264_predict_16x16_v_neon;
        pf[I_PRED_16x16_H]       = x264_predict_16x16_h_neon;
        pf[I_PRED_16x16_P]       = x264_predict_16x16_p_neon;
        pf[I_PRED_16x16_DC]      = x264_predict_16x16_dc_neon;
        pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_neon;
        pf[I_PRED_16x16_DC_TOP]  = x264_predict_16x16_dc_top_neon;
    }
#endif // !HIGH_BIT_DEPTH
}

119
common/aarch64/predict.h Normal file
View File

@@ -0,0 +1,119 @@
/*****************************************************************************
* predict.h: aarch64 intra prediction
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_AARCH64_PREDICT_H
#define X264_AARCH64_PREDICT_H
#define x264_predict_4x4_h_aarch64 x264_template(predict_4x4_h_aarch64)
void x264_predict_4x4_h_aarch64( uint8_t *src );
#define x264_predict_4x4_v_aarch64 x264_template(predict_4x4_v_aarch64)
void x264_predict_4x4_v_aarch64( uint8_t *src );
#define x264_predict_8x8c_v_aarch64 x264_template(predict_8x8c_v_aarch64)
void x264_predict_8x8c_v_aarch64( uint8_t *src );
// for the merged 4x4 intra sad/satd which expects unified suffix
#define x264_predict_4x4_h_neon x264_predict_4x4_h_aarch64
#define x264_predict_4x4_v_neon x264_predict_4x4_v_aarch64
#define x264_predict_8x8c_v_neon x264_predict_8x8c_v_aarch64
#define x264_predict_4x4_dc_top_neon x264_template(predict_4x4_dc_top_neon)
void x264_predict_4x4_dc_top_neon( uint8_t *src );
#define x264_predict_4x4_ddr_neon x264_template(predict_4x4_ddr_neon)
void x264_predict_4x4_ddr_neon( uint8_t *src );
#define x264_predict_4x4_ddl_neon x264_template(predict_4x4_ddl_neon)
void x264_predict_4x4_ddl_neon( uint8_t *src );
#define x264_predict_8x8c_dc_top_neon x264_template(predict_8x8c_dc_top_neon)
void x264_predict_8x8c_dc_top_neon( uint8_t *src );
#define x264_predict_8x8c_dc_left_neon x264_template(predict_8x8c_dc_left_neon)
void x264_predict_8x8c_dc_left_neon( uint8_t *src );
#define x264_predict_8x8c_p_neon x264_template(predict_8x8c_p_neon)
void x264_predict_8x8c_p_neon( uint8_t *src );
#define x264_predict_8x16c_dc_left_neon x264_template(predict_8x16c_dc_left_neon)
void x264_predict_8x16c_dc_left_neon( uint8_t *src );
#define x264_predict_8x16c_dc_top_neon x264_template(predict_8x16c_dc_top_neon)
void x264_predict_8x16c_dc_top_neon( uint8_t *src );
#define x264_predict_8x16c_p_neon x264_template(predict_8x16c_p_neon)
void x264_predict_8x16c_p_neon( uint8_t *src );
#define x264_predict_8x8_ddl_neon x264_template(predict_8x8_ddl_neon)
void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_ddr_neon x264_template(predict_8x8_ddr_neon)
void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_vl_neon x264_template(predict_8x8_vl_neon)
void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_vr_neon x264_template(predict_8x8_vr_neon)
void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_hd_neon x264_template(predict_8x8_hd_neon)
void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_hu_neon x264_template(predict_8x8_hu_neon)
void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_16x16_dc_top_neon x264_template(predict_16x16_dc_top_neon)
void x264_predict_16x16_dc_top_neon( uint8_t *src );
#define x264_predict_16x16_dc_left_neon x264_template(predict_16x16_dc_left_neon)
void x264_predict_16x16_dc_left_neon( uint8_t *src );
#define x264_predict_16x16_p_neon x264_template(predict_16x16_p_neon)
void x264_predict_16x16_p_neon( uint8_t *src );
#define x264_predict_4x4_dc_neon x264_template(predict_4x4_dc_neon)
void x264_predict_4x4_dc_neon( uint8_t *src );
#define x264_predict_8x8_v_neon x264_template(predict_8x8_v_neon)
void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_h_neon x264_template(predict_8x8_h_neon)
void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_dc_neon x264_template(predict_8x8_dc_neon)
void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8c_dc_neon x264_template(predict_8x8c_dc_neon)
void x264_predict_8x8c_dc_neon( uint8_t *src );
#define x264_predict_8x8c_h_neon x264_template(predict_8x8c_h_neon)
void x264_predict_8x8c_h_neon( uint8_t *src );
#define x264_predict_8x16c_v_neon x264_template(predict_8x16c_v_neon)
void x264_predict_8x16c_v_neon( uint8_t *src );
#define x264_predict_8x16c_h_neon x264_template(predict_8x16c_h_neon)
void x264_predict_8x16c_h_neon( uint8_t *src );
#define x264_predict_8x16c_dc_neon x264_template(predict_8x16c_dc_neon)
void x264_predict_8x16c_dc_neon( uint8_t *src );
#define x264_predict_16x16_v_neon x264_template(predict_16x16_v_neon)
void x264_predict_16x16_v_neon( uint8_t *src );
#define x264_predict_16x16_h_neon x264_template(predict_16x16_h_neon)
void x264_predict_16x16_h_neon( uint8_t *src );
#define x264_predict_16x16_dc_neon x264_template(predict_16x16_dc_neon)
void x264_predict_16x16_dc_neon( uint8_t *src );
/* aarch64 initialisers, one per block shape.  Each takes a uint32_t cpu
 * argument and an array pf of x264_predict_t (x264_predict8x8_t for 8x8);
 * the 8x8 variant additionally takes a pointer to x264_predict_8x8_filter_t.
 * The array bounds in the prototypes (12 or 7) are adjusted to plain
 * pointers by the compiler and are therefore documentation only.
 * NOTE(review): presumably cpu is a capability bitmask and pf is a table of
 * function pointers that gets filled with the routines supported by that
 * CPU -- confirm against the definitions. */
#define x264_predict_4x4_init_aarch64 x264_template(predict_4x4_init_aarch64)
void x264_predict_4x4_init_aarch64( uint32_t cpu, x264_predict_t pf[12] );
#define x264_predict_8x8_init_aarch64 x264_template(predict_8x8_init_aarch64)
void x264_predict_8x8_init_aarch64( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
#define x264_predict_8x8c_init_aarch64 x264_template(predict_8x8c_init_aarch64)
void x264_predict_8x8c_init_aarch64( uint32_t cpu, x264_predict_t pf[7] );
#define x264_predict_8x16c_init_aarch64 x264_template(predict_8x16c_init_aarch64)
void x264_predict_8x16c_init_aarch64( uint32_t cpu, x264_predict_t pf[7] );
#define x264_predict_16x16_init_aarch64 x264_template(predict_16x16_init_aarch64)
void x264_predict_16x16_init_aarch64( uint32_t cpu, x264_predict_t pf[7] );
#endif /* X264_AARCH64_PREDICT_H */

1169
common/aarch64/quant-a.S Normal file

File diff suppressed because it is too large Load Diff

95
common/aarch64/quant.h Normal file
View File

@@ -0,0 +1,95 @@
/*****************************************************************************
* quant.h: aarch64 quantization and level-run
*****************************************************************************
* Copyright (C) 2005-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_AARCH64_QUANT_H
#define X264_AARCH64_QUANT_H
/* Quantization entry points.  All return int.
 * The dc variants take scalar mf/bias values; the 4x4, 4x4x4 and 8x8 variants
 * take per-coefficient udctcoef arrays instead.
 * Array bounds on the parameters are adjusted to pointers by the compiler:
 * dct[16] becomes "dctcoef *" and dct[4][16] becomes "dctcoef (*)[16]", so
 * only the inner dimension of the 4x4x4 variant is part of the type.
 * Note that the _aarch64 variant is declared with int16_t while the _neon
 * variants use dctcoef (declared elsewhere).
 * NOTE(review): presumably the _aarch64 variant is only meant for builds in
 * which dctcoef is int16_t -- confirm. */
#define x264_quant_2x2_dc_aarch64 x264_template(quant_2x2_dc_aarch64)
int x264_quant_2x2_dc_aarch64( int16_t dct[4], int mf, int bias );
#define x264_quant_2x2_dc_neon x264_template(quant_2x2_dc_neon)
int x264_quant_2x2_dc_neon( dctcoef dct[4], int mf, int bias );
#define x264_quant_4x4_dc_neon x264_template(quant_4x4_dc_neon)
int x264_quant_4x4_dc_neon( dctcoef dct[16], int mf, int bias );
#define x264_quant_4x4_neon x264_template(quant_4x4_neon)
int x264_quant_4x4_neon( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
#define x264_quant_4x4x4_neon x264_template(quant_4x4x4_neon)
int x264_quant_4x4x4_neon( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
#define x264_quant_8x8_neon x264_template(quant_8x8_neon)
int x264_quant_8x8_neon( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
/* NEON dequantization entry points.  Each returns void and takes the
 * coefficient block, a two-dimensional int table dequant_mf and an int i_qp.
 * "int dequant_mf[6][16]" is adjusted to "int (*)[16]" (or "(*)[64]" for
 * 8x8): the row length is part of the type, the leading 6 is not enforced.
 * NOTE(review): presumably i_qp selects the row of dequant_mf and a shift
 * amount -- confirm against the assembly. */
#define x264_dequant_4x4_dc_neon x264_template(dequant_4x4_dc_neon)
void x264_dequant_4x4_dc_neon( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
#define x264_dequant_4x4_neon x264_template(dequant_4x4_neon)
void x264_dequant_4x4_neon( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
#define x264_dequant_8x8_neon x264_template(dequant_8x8_neon)
void x264_dequant_8x8_neon( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
/* NEON decimate-score entry points for blocks of 15, 16 and 64 coefficients
 * (sizes taken from the function names).  Each takes a pointer to dctcoef
 * and returns an int score.
 * The parameter is named dct to match the quant/dequant prototypes in this
 * header; naming it does not change the declared type.
 * NOTE(review): the meaning and range of the returned score are not visible
 * from this header -- confirm against the implementation. */
#define x264_decimate_score15_neon x264_template(decimate_score15_neon)
int x264_decimate_score15_neon( dctcoef *dct );
#define x264_decimate_score16_neon x264_template(decimate_score16_neon)
int x264_decimate_score16_neon( dctcoef *dct );
#define x264_decimate_score64_neon x264_template(decimate_score64_neon)
int x264_decimate_score64_neon( dctcoef *dct );
/* coeff_last entry points for blocks of 4, 8, 15, 16 and 64 coefficients
 * (sizes taken from the function names).  Each takes a pointer to dctcoef
 * and returns int.
 * NOTE(review): presumably the return value is the index of the last
 * non-zero coefficient -- confirm against the implementation.
 * The markers below record which BIT_DEPTH a variant was written for.
 * NOTE(review): it is not visible here how far each marker extends (in
 * particular whether the 15/16/64 NEON variants are limited to 10-bit) --
 * confirm before relying on them. */
// BIT_DEPTH = 8
#define x264_coeff_last4_aarch64 x264_template(coeff_last4_aarch64)
int x264_coeff_last4_aarch64( dctcoef *dct );
#define x264_coeff_last8_aarch64 x264_template(coeff_last8_aarch64)
int x264_coeff_last8_aarch64( dctcoef *dct );
// BIT_DEPTH = 10
#define x264_coeff_last4_neon x264_template(coeff_last4_neon)
int x264_coeff_last4_neon( dctcoef *dct );
#define x264_coeff_last8_neon x264_template(coeff_last8_neon)
int x264_coeff_last8_neon( dctcoef *dct );
#define x264_coeff_last15_neon x264_template(coeff_last15_neon)
int x264_coeff_last15_neon( dctcoef *dct );
#define x264_coeff_last16_neon x264_template(coeff_last16_neon)
int x264_coeff_last16_neon( dctcoef *dct );
#define x264_coeff_last64_neon x264_template(coeff_last64_neon)
int x264_coeff_last64_neon( dctcoef *dct );
/* coeff_level_run entry points for blocks of 4, 8, 15 and 16 coefficients
 * (sizes taken from the function names).  Each takes the coefficient block
 * and a pointer to an x264_run_level_t (declared elsewhere) and returns int.
 * Parameters are named here for readability; the declared types are
 * unchanged.
 * NOTE(review): presumably runlevel is an output structure filled by the
 * routine and the return value is a coefficient count -- confirm against the
 * implementation.  How far each BIT_DEPTH marker extends is likewise not
 * visible from this header. */
// BIT_DEPTH = 8
#define x264_coeff_level_run4_aarch64 x264_template(coeff_level_run4_aarch64)
int x264_coeff_level_run4_aarch64( dctcoef *dct, x264_run_level_t *runlevel );
// BIT_DEPTH = 10
#define x264_coeff_level_run4_neon x264_template(coeff_level_run4_neon)
int x264_coeff_level_run4_neon( dctcoef *dct, x264_run_level_t *runlevel );
#define x264_coeff_level_run8_neon x264_template(coeff_level_run8_neon)
int x264_coeff_level_run8_neon( dctcoef *dct, x264_run_level_t *runlevel );
#define x264_coeff_level_run15_neon x264_template(coeff_level_run15_neon)
int x264_coeff_level_run15_neon( dctcoef *dct, x264_run_level_t *runlevel );
#define x264_coeff_level_run16_neon x264_template(coeff_level_run16_neon)
int x264_coeff_level_run16_neon( dctcoef *dct, x264_run_level_t *runlevel );
/* NEON denoise_dct entry point.  Returns void and takes, in order, a dctcoef
 * pointer, a uint32_t pointer, a udctcoef pointer and an int.
 * NOTE(review): the parameters are unnamed, so their roles cannot be read
 * from this header; presumably they are the coefficient block, a running
 * accumulator, a per-coefficient offset table and the coefficient count --
 * confirm against the generic C implementation before relying on this. */
#define x264_denoise_dct_neon x264_template(denoise_dct_neon)
void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int );
#endif /* X264_AARCH64_QUANT_H */